diff options
field | value | date
---|---|---
author | Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com> | 2021-05-04 15:03:14 +0300
committer | Henrik Gramner <henrik@gramner.com> | 2021-05-04 18:00:07 +0300
commit | ae4b53fb4c9e8b616ea2f5ca92f9b9ee4ff33116 (patch) |
tree | a6d3736a1416502f13c314b22f1449cdff84d6b9 |
parent | 787862dbd7b6fc3d48b24ba09039fd33ab652811 (diff) |
x86: Add high bitdepth ipred_dc AVX2 asm
-rw-r--r-- | src/meson.build | 1 |
-rw-r--r-- | src/x86/ipred16_avx2.asm | 356 |
-rw-r--r-- | src/x86/ipred_init_tmpl.c | 188 |
3 files changed, 451 insertions, 94 deletions
diff --git a/src/meson.build b/src/meson.build
index 8ecbd95..dded3e2 100644
--- a/src/meson.build
+++ b/src/meson.build
@@ -214,6 +214,7 @@ if is_asm_enabled
         if dav1d_bitdepths.contains('16')
             libdav1d_sources_asm += files(
                 'x86/cdef16_avx2.asm',
+                'x86/ipred16_avx2.asm',
                 'x86/looprestoration16_avx2.asm',
                 'x86/mc16_avx2.asm',
                 'x86/cdef16_sse.asm',
diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm
new file mode 100644
index 0000000..e99ae0b
--- /dev/null
+++ b/src/x86/ipred16_avx2.asm
@@ -0,0 +1,356 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA
+
+pw_512:  times 2 dw 512
+pw_2048: times 2 dw 2048
+
+%macro JMP_TABLE 3-*
+    %xdefine %1_%2_table (%%table - 2*4)
+    %xdefine %%base mangle(private_prefix %+ _%1_%2)
+    %%table:
+    %rep %0 - 2
+        dd %%base %+ .%3 - (%%table - 2*4)
+        %rotate 1
+    %endrep
+%endmacro
+
+%define ipred_dc_splat_16bpc_avx2_table (ipred_dc_16bpc_avx2_table + 10*4)
+
+JMP_TABLE ipred_dc_16bpc,      avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+                               s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left_16bpc, avx2, h4, h8, h16, h32, h64
+
+SECTION .text
+
+INIT_YMM avx2
+
+cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
+    movifnidn     hd, hm
+    add           tlq, 2
+    movd          xm4, wd
+    pxor          xm3, xm3
+    pavgw         xm4, xm3
+    tzcnt         wd, wd
+    movd          xm5, wd
+    movu          m0, [tlq]
+    lea           r5, [ipred_dc_left_16bpc_avx2_table]
+    movsxd        r6, [r5+wq*4]
+    add           r6, r5
+    add           r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
+    movsxd        wq, [r5+wq*4]
+    add           wq, r5
+    jmp           r6
+
+cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+    mov           hd, hm
+    sub           tlq, hq
+    movd          xm4, hd
+    sub           tlq, hq
+    pxor          xm3, xm3
+    pavgw         xm4, xm3
+    tzcnt         r6d, hd
+    movd          xm5, r6d
+    movu          m0, [tlq]
+    lea           r5, [ipred_dc_left_16bpc_avx2_table]
+    movsxd        r6, [r5+r6*4]
+    add           r6, r5
+    add           r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
+    tzcnt         wd, wd
+    movsxd        wq, [r5+wq*4]
+    add           wq, r5
+    jmp           r6
+.h64:
+    paddw         m0, [tlq+96]
+    paddw         m0, [tlq+64]
+.h32:
+    paddw         m0, [tlq+32]
+.h16:
+    vextracti128  xm1, m0, 1
+    paddw         xm0, xm1
+.h8:
+    psrldq        xm1, xm0, 8
+    paddw         xm0, xm1
+.h4:
+    punpcklwd     xm0, xm3
+    psrlq         xm1, xm0, 32
+    paddd         xm0, xm1
+    psrldq        xm1, xm0, 8
+    paddd         xm0, xm1
+    paddd         xm0, xm4
+    psrld         xm0, xm5
+    lea           stride3q, [strideq*3]
+    vpbroadcastw  m0, xm0
+    mova          m1, m0
+    mova          m2, m0
+    mova          m3, m0
+    jmp           wq
+
+cglobal ipred_dc_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+    movifnidn     hd, hm
+    tzcnt         r6d, hd
+    lea           r5d, [wq+hq]
+    movd          xm4, r5d
+    tzcnt         r5d, r5d
+    movd          xm5, r5d
+    lea           r5, [ipred_dc_16bpc_avx2_table]
+    tzcnt         wd, wd
+    movsxd        r6, [r5+r6*4]
+    movsxd        wq, [r5+wq*4+5*4]
+    pxor          m3, m3
+    psrlw         xm4, 1
+    add           r6, r5
+    add           wq, r5
+    lea           stride3q, [strideq*3]
+    jmp           r6
+.h4:
+    movq          xm0, [tlq-8]
+    jmp           wq
+.w4:
+    movq          xm1, [tlq+2]
+    paddw         m0, m4
+    paddw         m0, m1
+    psrlq         m1, m0, 32
+    paddw         m0, m1
+    psrld         m1, m0, 16
+    paddw         m0, m1
+    cmp           hd, 4
+    jg            .w4_mul
+    psrlw         xm0, 3
+    jmp           .w4_end
+.w4_mul:
+    vextracti128  xm1, m0, 1
+    paddw         xm0, xm1
+    lea           r2d, [hq*2]
+    mov           r6d, 0xAAAB6667
+    shrx          r6d, r6d, r2d
+    punpckhwd     xm1, xm0, xm3
+    punpcklwd     xm0, xm3
+    paddd         xm0, xm1
+    movd          xm1, r6d
+    psrld         xm0, 2
+    pmulhuw       xm0, xm1
+    psrlw         xm0, 1
+.w4_end:
+    vpbroadcastw  xm0, xm0
+.s4:
+    movq          [dstq+strideq*0], xm0
+    movq          [dstq+strideq*1], xm0
+    movq          [dstq+strideq*2], xm0
+    movq          [dstq+stride3q ], xm0
+    lea           dstq, [dstq+strideq*4]
+    sub           hd, 4
+    jg            .s4
+    RET
+ALIGN function_align
+.h8:
+    mova          xm0, [tlq-16]
+    jmp           wq
+.w8:
+    vextracti128  xm1, m0, 1
+    paddw         xm0, [tlq+2]
+    paddw         xm0, xm4
+    paddw         xm0, xm1
+    psrld         xm1, xm0, 16
+    paddw         xm0, xm1
+    pblendw       xm0, xm3, 0xAA
+    psrlq         xm1, xm0, 32
+    paddd         xm0, xm1
+    psrldq        xm1, xm0, 8
+    paddd         xm0, xm1
+    psrld         xm0, xm5
+    cmp           hd, 8
+    je            .w8_end
+    mov           r6d, 0xAAAB
+    mov           r2d, 0x6667
+    cmp           hd, 32
+    cmovz         r6d, r2d
+    movd          xm1, r6d
+    pmulhuw       xm0, xm1
+    psrlw         xm0, 1
+.w8_end:
+    vpbroadcastw  xm0, xm0
+.s8:
+    mova          [dstq+strideq*0], xm0
+    mova          [dstq+strideq*1], xm0
+    mova          [dstq+strideq*2], xm0
+    mova          [dstq+stride3q ], xm0
+    lea           dstq, [dstq+strideq*4]
+    sub           hd, 4
+    jg            .s8
+    RET
+ALIGN function_align
+.h16:
+    mova          m0, [tlq-32]
+    jmp           wq
+.w16:
+    paddw         m0, [tlq+2]
+    vextracti128  xm1, m0, 1
+    paddw         xm0, xm4
+    paddw         xm0, xm1
+    punpckhwd     xm1, xm0, xm3
+    punpcklwd     xm0, xm3
+    paddd         xm0, xm1
+    psrlq         xm1, xm0, 32
+    paddd         xm0, xm1
+    psrldq        xm1, xm0, 8
+    paddd         xm0, xm1
+    psrld         xm0, xm5
+    cmp           hd, 16
+    je            .w16_end
+    mov           r6d, 0xAAAB
+    mov           r2d, 0x6667
+    test          hb, 8|32
+    cmovz         r6d, r2d
+    movd          xm1, r6d
+    pmulhuw       xm0, xm1
+    psrlw         xm0, 1
+.w16_end:
+    vpbroadcastw  m0, xm0
+.s16:
+    mova          [dstq+strideq*0], m0
+    mova          [dstq+strideq*1], m0
+    mova          [dstq+strideq*2], m0
+    mova          [dstq+stride3q ], m0
+    lea           dstq, [dstq+strideq*4]
+    sub           hd, 4
+    jg            .s16
+    RET
+ALIGN function_align
+.h32:
+    mova          m0, [tlq-64]
+    paddw         m0, [tlq-32]
+    jmp           wq
+.w32:
+    paddw         m0, [tlq+ 2]
+    paddw         m0, [tlq+34]
+    vextracti128  xm1, m0, 1
+    paddw         xm0, xm4
+    paddw         xm0, xm1
+    punpcklwd     xm1, xm0, xm3
+    punpckhwd     xm0, xm3
+    paddd         xm0, xm1
+    psrlq         xm1, xm0, 32
+    paddd         xm0, xm1
+    psrldq        xm1, xm0, 8
+    paddd         xm0, xm1
+    psrld         xm0, xm5
+    cmp           hd, 32
+    je            .w32_end
+    lea           r2d, [hq*2]
+    mov           r6d, 0x6667AAAB
+    shrx          r6d, r6d, r2d
+    movd          xm1, r6d
+    pmulhuw       xm0, xm1
+    psrlw         xm0, 1
+.w32_end:
+    vpbroadcastw  m0, xm0
+    mova          m1, m0
+.s32:
+    mova          [dstq+strideq*0+32*0], m0
+    mova          [dstq+strideq*0+32*1], m1
+    mova          [dstq+strideq*1+32*0], m0
+    mova          [dstq+strideq*1+32*1], m1
+    mova          [dstq+strideq*2+32*0], m0
+    mova          [dstq+strideq*2+32*1], m1
+    mova          [dstq+stride3q +32*0], m0
+    mova          [dstq+stride3q +32*1], m1
+    lea           dstq, [dstq+strideq*4]
+    sub           hd, 4
+    jg            .s32
+    RET
+ALIGN function_align
+.h64:
+    mova          m0, [tlq-128]
+    mova          m1, [tlq- 96]
+    paddw         m0, [tlq- 64]
+    paddw         m1, [tlq- 32]
+    paddw         m0, m1
+    jmp           wq
+.w64:
+    movu          m1, [tlq+ 2]
+    paddw         m0, [tlq+34]
+    paddw         m1, [tlq+66]
+    paddw         m0, [tlq+98]
+    paddw         m0, m1
+    vextracti128  xm1, m0, 1
+    paddw         xm0, xm1
+    punpcklwd     xm1, xm0, xm3
+    punpckhwd     xm0, xm3
+    paddd         xm1, xm4
+    paddd         xm0, xm1
+    psrlq         xm1, xm0, 32
+    paddd         xm0, xm1
+    psrldq        xm1, xm0, 8
+    paddd         xm0, xm1
+    psrld         xm0, xm5
+    cmp           hd, 64
+    je            .w64_end
+    mov           r6d, 0x6667AAAB
+    shrx          r6d, r6d, hd
+    movd          xm1, r6d
+    pmulhuw       xm0, xm1
+    psrlw         xm0, 1
+.w64_end:
+    vpbroadcastw  m0, xm0
+    mova          m1, m0
+    mova          m2, m0
+    mova          m3, m0
+.s64:
+    mova          [dstq+strideq*0+32*0], m0
+    mova          [dstq+strideq*0+32*1], m1
+    mova          [dstq+strideq*0+32*2], m2
+    mova          [dstq+strideq*0+32*3], m3
+    mova          [dstq+strideq*1+32*0], m0
+    mova          [dstq+strideq*1+32*1], m1
+    mova          [dstq+strideq*1+32*2], m2
+    mova          [dstq+strideq*1+32*3], m3
+    lea           dstq, [dstq+strideq*2]
+    sub           hd, 2
+    jg            .s64
+    RET
+
+cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
+    mov           r6d, r8m
+    shr           r6d, 11
+    lea           r5, [ipred_dc_splat_16bpc_avx2_table]
+    tzcnt         wd, wd
+    movifnidn     hd, hm
+    movsxd        wq, [r5+wq*4]
+    vpbroadcastd  m0, [r5-ipred_dc_splat_16bpc_avx2_table+pw_512+r6*4]
+    mova          m1, m0
+    mova          m2, m0
+    mova          m3, m0
+    add           wq, r5
+    lea           stride3q, [strideq*3]
+    jmp           wq
+
+%endif
diff --git a/src/x86/ipred_init_tmpl.c b/src/x86/ipred_init_tmpl.c
index 4219ab8..fb7e917 100644
--- a/src/x86/ipred_init_tmpl.c
+++ b/src/x86/ipred_init_tmpl.c
@@ -28,54 +28,52 @@
 #include "src/cpu.h"
 #include "src/ipred.h"
 
-decl_angular_ipred_fn(dav1d_ipred_dc_avx2);
-decl_angular_ipred_fn(dav1d_ipred_dc_128_avx2);
-decl_angular_ipred_fn(dav1d_ipred_dc_top_avx2);
-decl_angular_ipred_fn(dav1d_ipred_dc_left_avx2);
-decl_angular_ipred_fn(dav1d_ipred_h_avx2);
-decl_angular_ipred_fn(dav1d_ipred_v_avx2);
-decl_angular_ipred_fn(dav1d_ipred_paeth_avx2);
-decl_angular_ipred_fn(dav1d_ipred_smooth_avx2);
-decl_angular_ipred_fn(dav1d_ipred_smooth_v_avx2);
-decl_angular_ipred_fn(dav1d_ipred_smooth_h_avx2);
-decl_angular_ipred_fn(dav1d_ipred_z1_avx2);
-decl_angular_ipred_fn(dav1d_ipred_z2_avx2);
-decl_angular_ipred_fn(dav1d_ipred_z3_avx2);
-decl_angular_ipred_fn(dav1d_ipred_filter_avx2);
-
-decl_cfl_pred_fn(dav1d_ipred_cfl_avx2);
-decl_cfl_pred_fn(dav1d_ipred_cfl_128_avx2);
-decl_cfl_pred_fn(dav1d_ipred_cfl_top_avx2);
-decl_cfl_pred_fn(dav1d_ipred_cfl_left_avx2);
-
-decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_avx2);
-decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_avx2);
-decl_cfl_ac_fn(dav1d_ipred_cfl_ac_444_avx2);
-
-decl_pal_pred_fn(dav1d_pal_pred_avx2);
-
-decl_angular_ipred_fn(dav1d_ipred_dc_ssse3);
-decl_angular_ipred_fn(dav1d_ipred_dc_128_ssse3);
-decl_angular_ipred_fn(dav1d_ipred_dc_top_ssse3);
-decl_angular_ipred_fn(dav1d_ipred_dc_left_ssse3);
-decl_angular_ipred_fn(dav1d_ipred_h_ssse3);
-decl_angular_ipred_fn(dav1d_ipred_v_ssse3);
-decl_angular_ipred_fn(dav1d_ipred_paeth_ssse3);
-decl_angular_ipred_fn(dav1d_ipred_smooth_ssse3);
-decl_angular_ipred_fn(dav1d_ipred_smooth_v_ssse3);
-decl_angular_ipred_fn(dav1d_ipred_smooth_h_ssse3);
-decl_angular_ipred_fn(dav1d_ipred_filter_ssse3);
-
-decl_cfl_pred_fn(dav1d_ipred_cfl_ssse3);
-decl_cfl_pred_fn(dav1d_ipred_cfl_128_ssse3);
-decl_cfl_pred_fn(dav1d_ipred_cfl_top_ssse3);
-decl_cfl_pred_fn(dav1d_ipred_cfl_left_ssse3);
-
-decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_ssse3);
-decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_ssse3);
-decl_cfl_ac_fn(dav1d_ipred_cfl_ac_444_ssse3);
-
-decl_pal_pred_fn(dav1d_pal_pred_ssse3);
+#if BITDEPTH == 8
+#define decl_fn(type, name) \
+    decl_##type##_fn(dav1d_##name##_ssse3); \
+    decl_##type##_fn(dav1d_##name##_avx2)
+#define init_fn(type0, type1, name, suffix) \
+    c->type0[type1] = dav1d_##name##_##suffix
+#else
+#define decl_fn(type, name) \
+    decl_##type##_fn(dav1d_##name##_16bpc_ssse3); \
+    decl_##type##_fn(dav1d_##name##_16bpc_avx2)
+#define init_fn(type0, type1, name, suffix) \
+    c->type0[type1] = dav1d_##name##_16bpc_##suffix
+#endif
+
+#define init_angular_ipred_fn(type, name, suffix) \
+    init_fn(intra_pred, type, name, suffix)
+#define init_cfl_pred_fn(type, name, suffix) \
+    init_fn(cfl_pred, type, name, suffix)
+#define init_cfl_ac_fn(type, name, suffix) \
+    init_fn(cfl_ac, type, name, suffix)
+
+decl_fn(angular_ipred, ipred_dc);
+decl_fn(angular_ipred, ipred_dc_128);
+decl_fn(angular_ipred, ipred_dc_top);
+decl_fn(angular_ipred, ipred_dc_left);
+decl_fn(angular_ipred, ipred_h);
+decl_fn(angular_ipred, ipred_v);
+decl_fn(angular_ipred, ipred_paeth);
+decl_fn(angular_ipred, ipred_smooth);
+decl_fn(angular_ipred, ipred_smooth_h);
+decl_fn(angular_ipred, ipred_smooth_v);
+decl_fn(angular_ipred, ipred_z1);
+decl_fn(angular_ipred, ipred_z2);
+decl_fn(angular_ipred, ipred_z3);
+decl_fn(angular_ipred, ipred_filter);
+
+decl_fn(cfl_pred, ipred_cfl);
+decl_fn(cfl_pred, ipred_cfl_128);
+decl_fn(cfl_pred, ipred_cfl_top);
+decl_fn(cfl_pred, ipred_cfl_left);
+
+decl_fn(cfl_ac, ipred_cfl_ac_420);
+decl_fn(cfl_ac, ipred_cfl_ac_422);
+decl_fn(cfl_ac, ipred_cfl_ac_444);
+
+decl_fn(pal_pred, pal_pred);
 
 COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
     const unsigned flags = dav1d_get_cpu_flags();
@@ -83,57 +81,59 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c
     if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
 
 #if BITDEPTH == 8
-    c->intra_pred[DC_PRED]       = dav1d_ipred_dc_ssse3;
-    c->intra_pred[DC_128_PRED]   = dav1d_ipred_dc_128_ssse3;
-    c->intra_pred[TOP_DC_PRED]   = dav1d_ipred_dc_top_ssse3;
-    c->intra_pred[LEFT_DC_PRED]  = dav1d_ipred_dc_left_ssse3;
-    c->intra_pred[HOR_PRED]      = dav1d_ipred_h_ssse3;
-    c->intra_pred[VERT_PRED]     = dav1d_ipred_v_ssse3;
-    c->intra_pred[PAETH_PRED]    = dav1d_ipred_paeth_ssse3;
-    c->intra_pred[SMOOTH_PRED]   = dav1d_ipred_smooth_ssse3;
-    c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_ssse3;
-    c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_ssse3;
-    c->intra_pred[FILTER_PRED]   = dav1d_ipred_filter_ssse3;
-
-    c->cfl_pred[DC_PRED]      = dav1d_ipred_cfl_ssse3;
-    c->cfl_pred[DC_128_PRED]  = dav1d_ipred_cfl_128_ssse3;
-    c->cfl_pred[TOP_DC_PRED]  = dav1d_ipred_cfl_top_ssse3;
-    c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_ssse3;
-
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_ssse3;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_ssse3;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_ipred_cfl_ac_444_ssse3;
-
-    c->pal_pred                  = dav1d_pal_pred_ssse3;
+    init_angular_ipred_fn(DC_PRED,       ipred_dc,       ssse3);
+    init_angular_ipred_fn(DC_128_PRED,   ipred_dc_128,   ssse3);
+    init_angular_ipred_fn(TOP_DC_PRED,   ipred_dc_top,   ssse3);
+    init_angular_ipred_fn(LEFT_DC_PRED,  ipred_dc_left,  ssse3);
+    init_angular_ipred_fn(HOR_PRED,      ipred_h,        ssse3);
+    init_angular_ipred_fn(VERT_PRED,     ipred_v,        ssse3);
+    init_angular_ipred_fn(PAETH_PRED,    ipred_paeth,    ssse3);
+    init_angular_ipred_fn(SMOOTH_PRED,   ipred_smooth,   ssse3);
+    init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, ssse3);
+    init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, ssse3);
+    init_angular_ipred_fn(FILTER_PRED,   ipred_filter,   ssse3);
+
+    init_cfl_pred_fn(DC_PRED,      ipred_cfl,      ssse3);
+    init_cfl_pred_fn(DC_128_PRED,  ipred_cfl_128,  ssse3);
+    init_cfl_pred_fn(TOP_DC_PRED,  ipred_cfl_top,  ssse3);
+    init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, ssse3);
+
+    init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, ssse3);
+    init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, ssse3);
+    init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, ssse3);
+
+    c->pal_pred = dav1d_pal_pred_ssse3;
 #endif
 
+#if ARCH_X86_64
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
 
-#if BITDEPTH == 8 && ARCH_X86_64
-    c->intra_pred[DC_PRED]       = dav1d_ipred_dc_avx2;
-    c->intra_pred[DC_128_PRED]   = dav1d_ipred_dc_128_avx2;
-    c->intra_pred[TOP_DC_PRED]   = dav1d_ipred_dc_top_avx2;
-    c->intra_pred[LEFT_DC_PRED]  = dav1d_ipred_dc_left_avx2;
-    c->intra_pred[HOR_PRED]      = dav1d_ipred_h_avx2;
-    c->intra_pred[VERT_PRED]     = dav1d_ipred_v_avx2;
-    c->intra_pred[PAETH_PRED]    = dav1d_ipred_paeth_avx2;
-    c->intra_pred[SMOOTH_PRED]   = dav1d_ipred_smooth_avx2;
-    c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_avx2;
-    c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_avx2;
-    c->intra_pred[Z1_PRED]       = dav1d_ipred_z1_avx2;
-    c->intra_pred[Z2_PRED]       = dav1d_ipred_z2_avx2;
-    c->intra_pred[Z3_PRED]       = dav1d_ipred_z3_avx2;
-    c->intra_pred[FILTER_PRED]   = dav1d_ipred_filter_avx2;
-
-    c->cfl_pred[DC_PRED]      = dav1d_ipred_cfl_avx2;
-    c->cfl_pred[DC_128_PRED]  = dav1d_ipred_cfl_128_avx2;
-    c->cfl_pred[TOP_DC_PRED]  = dav1d_ipred_cfl_top_avx2;
-    c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_avx2;
-
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_avx2;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_avx2;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_ipred_cfl_ac_444_avx2;
+    init_angular_ipred_fn(DC_PRED,       ipred_dc,       avx2);
+    init_angular_ipred_fn(DC_128_PRED,   ipred_dc_128,   avx2);
+    init_angular_ipred_fn(TOP_DC_PRED,   ipred_dc_top,   avx2);
+    init_angular_ipred_fn(LEFT_DC_PRED,  ipred_dc_left,  avx2);
+#if BITDEPTH == 8
+    init_angular_ipred_fn(HOR_PRED,      ipred_h,        avx2);
+    init_angular_ipred_fn(VERT_PRED,     ipred_v,        avx2);
+    init_angular_ipred_fn(PAETH_PRED,    ipred_paeth,    avx2);
+    init_angular_ipred_fn(SMOOTH_PRED,   ipred_smooth,   avx2);
+    init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx2);
+    init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx2);
+    init_angular_ipred_fn(Z1_PRED,       ipred_z1,       avx2);
+    init_angular_ipred_fn(Z2_PRED,       ipred_z2,       avx2);
+    init_angular_ipred_fn(Z3_PRED,       ipred_z3,       avx2);
+    init_angular_ipred_fn(FILTER_PRED,   ipred_filter,   avx2);
+
+    init_cfl_pred_fn(DC_PRED,      ipred_cfl,      avx2);
+    init_cfl_pred_fn(DC_128_PRED,  ipred_cfl_128,  avx2);
+    init_cfl_pred_fn(TOP_DC_PRED,  ipred_cfl_top,  avx2);
+    init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, avx2);
+
+    init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, avx2);
+    init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, avx2);
+    init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, avx2);
 
     c->pal_pred = dav1d_pal_pred_avx2;
 #endif
+#endif
 }