github.com/videolan/dav1d.git
commit ae4b53fb4c9e8b616ea2f5ca92f9b9ee4ff33116
parent 787862dbd7b6fc3d48b24ba09039fd33ab652811
tree   a6d3736a1416502f13c314b22f1449cdff84d6b9
author    Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com>  2021-05-04 15:03:14 +0300
committer Henrik Gramner <henrik@gramner.com>  2021-05-04 18:00:07 +0300

    x86: Add high bitdepth ipred_dc AVX2 asm

 src/meson.build           |   1 +
 src/x86/ipred16_avx2.asm  | 356 ++++++++++++++++++++++++++++++++++++++++++++
 src/x86/ipred_init_tmpl.c | 188 ++++++++++++++++++++----------------------
 3 files changed, 451 insertions(+), 94 deletions(-)
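For context before the diff: DC intra prediction fills a w×h block with the rounded average of the already-decoded pixels bordering it (the row above and/or the column to the left). A minimal scalar sketch of the 16bpc case follows; ipred_dc_sketch is a hypothetical stand-alone helper, not dav1d's actual C template, with stride in pixels for simplicity. The neighbour layout mirrors what the assembly addresses through tlq ([tlq+2] is the first pixel above, the left column sits below tlq).

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of 16bpc DC prediction: tl points at the top-left corner
     * sample, the row above the block is tl[1..w], the left column is
     * tl[-1..-h]. */
    static void ipred_dc_sketch(uint16_t *dst, ptrdiff_t stride,
                                const uint16_t *tl, int w, int h)
    {
        unsigned sum = (unsigned)(w + h) >> 1;          /* rounding bias */
        for (int x = 0; x < w; x++) sum += tl[1 + x];
        for (int y = 0; y < h; y++) sum += tl[-(1 + y)];
        const uint16_t dc = (uint16_t)(sum / (w + h));  /* asm avoids this div */
        for (int y = 0; y < h; y++, dst += stride)
            for (int x = 0; x < w; x++)
                dst[x] = dc;
    }

When w + h is a power of two the division is a plain shift; the interesting cases in the assembly are the rectangular blocks where w + h is 3·2^k or 5·2^k (see the note after the new file).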
diff --git a/src/meson.build b/src/meson.build
index 8ecbd95..dded3e2 100644
--- a/src/meson.build
+++ b/src/meson.build
@@ -214,6 +214,7 @@ if is_asm_enabled
if dav1d_bitdepths.contains('16')
libdav1d_sources_asm += files(
'x86/cdef16_avx2.asm',
+ 'x86/ipred16_avx2.asm',
'x86/looprestoration16_avx2.asm',
'x86/mc16_avx2.asm',
'x86/cdef16_sse.asm',
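A note on the dispatch scheme used throughout the new file below: block widths and heights are powers of two in {4, ..., 64}, so the code takes tzcnt of the dimension and jumps through a table of 32-bit label offsets built by the JMP_TABLE macro. The -2*4 bias folds the starting index tzcnt(4) = 2 into the base address, the dc table packs the h, w and store stages into one table, and the splat table aliases its last five entries at +10*4. In C the idea reduces to an indexed function-pointer table, roughly as sketched here (stage_fn, w_table and dispatch are illustrative names, not dav1d API):

    /* Roughly what "movsxd wq, [r5+wq*4]; add wq, r5; jmp wq" amounts to:
     * dispatch on log2 of the block width. */
    typedef void (*stage_fn)(void);
    static stage_fn w_table[5];             /* handlers for w = 4..64 */

    static void dispatch(int w) {           /* w is a power of two, 4..64 */
        w_table[__builtin_ctz(w) - 2]();    /* tzcnt(w) - 2 -> index 0..4 */
    }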
diff --git a/src/x86/ipred16_avx2.asm b/src/x86/ipred16_avx2.asm
new file mode 100644
index 0000000..e99ae0b
--- /dev/null
+++ b/src/x86/ipred16_avx2.asm
@@ -0,0 +1,356 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA
+
+pw_512: times 2 dw 512
+pw_2048: times 2 dw 2048
+
+%macro JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - 2*4)
+ %xdefine %%base mangle(private_prefix %+ _%1_%2)
+ %%table:
+ %rep %0 - 2
+ dd %%base %+ .%3 - (%%table - 2*4)
+ %rotate 1
+ %endrep
+%endmacro
+
+%define ipred_dc_splat_16bpc_avx2_table (ipred_dc_16bpc_avx2_table + 10*4)
+
+JMP_TABLE ipred_dc_16bpc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+JMP_TABLE ipred_dc_left_16bpc, avx2, h4, h8, h16, h32, h64
+
+SECTION .text
+
+INIT_YMM avx2
+
+cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
+ movifnidn hd, hm
+ add tlq, 2
+ movd xm4, wd
+ pxor xm3, xm3
+ pavgw xm4, xm3
+ tzcnt wd, wd
+ movd xm5, wd
+ movu m0, [tlq]
+ lea r5, [ipred_dc_left_16bpc_avx2_table]
+ movsxd r6, [r5+wq*4]
+ add r6, r5
+ add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+
+cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ mov hd, hm
+ sub tlq, hq
+ movd xm4, hd
+ sub tlq, hq
+ pxor xm3, xm3
+ pavgw xm4, xm3
+ tzcnt r6d, hd
+ movd xm5, r6d
+ movu m0, [tlq]
+ lea r5, [ipred_dc_left_16bpc_avx2_table]
+ movsxd r6, [r5+r6*4]
+ add r6, r5
+ add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
+ tzcnt wd, wd
+ movsxd wq, [r5+wq*4]
+ add wq, r5
+ jmp r6
+.h64:
+ paddw m0, [tlq+96]
+ paddw m0, [tlq+64]
+.h32:
+ paddw m0, [tlq+32]
+.h16:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+.h8:
+ psrldq xm1, xm0, 8
+ paddw xm0, xm1
+.h4:
+ punpcklwd xm0, xm3
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ paddd xm0, xm4
+ psrld xm0, xm5
+ lea stride3q, [strideq*3]
+ vpbroadcastw m0, xm0
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ jmp wq
+
+cglobal ipred_dc_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ tzcnt r6d, hd
+ lea r5d, [wq+hq]
+ movd xm4, r5d
+ tzcnt r5d, r5d
+ movd xm5, r5d
+ lea r5, [ipred_dc_16bpc_avx2_table]
+ tzcnt wd, wd
+ movsxd r6, [r5+r6*4]
+ movsxd wq, [r5+wq*4+5*4]
+ pxor m3, m3
+ psrlw xm4, 1
+ add r6, r5
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp r6
+.h4:
+ movq xm0, [tlq-8]
+ jmp wq
+.w4:
+ movq xm1, [tlq+2]
+ paddw m0, m4
+ paddw m0, m1
+ psrlq m1, m0, 32
+ paddw m0, m1
+ psrld m1, m0, 16
+ paddw m0, m1
+ cmp hd, 4
+ jg .w4_mul
+ psrlw xm0, 3
+ jmp .w4_end
+.w4_mul:
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+ lea r2d, [hq*2]
+ mov r6d, 0xAAAB6667
+ shrx r6d, r6d, r2d
+ punpckhwd xm1, xm0, xm3
+ punpcklwd xm0, xm3
+ paddd xm0, xm1
+ movd xm1, r6d
+ psrld xm0, 2
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w4_end:
+ vpbroadcastw xm0, xm0
+.s4:
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm0
+ movq [dstq+strideq*2], xm0
+ movq [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4
+ RET
+ALIGN function_align
+.h8:
+ mova xm0, [tlq-16]
+ jmp wq
+.w8:
+ vextracti128 xm1, m0, 1
+ paddw xm0, [tlq+2]
+ paddw xm0, xm4
+ paddw xm0, xm1
+ psrld xm1, xm0, 16
+ paddw xm0, xm1
+ pblendw xm0, xm3, 0xAA
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 8
+ je .w8_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ cmp hd, 32
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w8_end:
+ vpbroadcastw xm0, xm0
+.s8:
+ mova [dstq+strideq*0], xm0
+ mova [dstq+strideq*1], xm0
+ mova [dstq+strideq*2], xm0
+ mova [dstq+stride3q ], xm0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s8
+ RET
+ALIGN function_align
+.h16:
+ mova m0, [tlq-32]
+ jmp wq
+.w16:
+ paddw m0, [tlq+2]
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm4
+ paddw xm0, xm1
+ punpckhwd xm1, xm0, xm3
+ punpcklwd xm0, xm3
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 16
+ je .w16_end
+ mov r6d, 0xAAAB
+ mov r2d, 0x6667
+ test hb, 8|32
+ cmovz r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w16_end:
+ vpbroadcastw m0, xm0
+.s16:
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s16
+ RET
+ALIGN function_align
+.h32:
+ mova m0, [tlq-64]
+ paddw m0, [tlq-32]
+ jmp wq
+.w32:
+ paddw m0, [tlq+ 2]
+ paddw m0, [tlq+34]
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm4
+ paddw xm0, xm1
+ punpcklwd xm1, xm0, xm3
+ punpckhwd xm0, xm3
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 32
+ je .w32_end
+ lea r2d, [hq*2]
+ mov r6d, 0x6667AAAB
+ shrx r6d, r6d, r2d
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w32_end:
+ vpbroadcastw m0, xm0
+ mova m1, m0
+.s32:
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*2+32*0], m0
+ mova [dstq+strideq*2+32*1], m1
+ mova [dstq+stride3q +32*0], m0
+ mova [dstq+stride3q +32*1], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s32
+ RET
+ALIGN function_align
+.h64:
+ mova m0, [tlq-128]
+ mova m1, [tlq- 96]
+ paddw m0, [tlq- 64]
+ paddw m1, [tlq- 32]
+ paddw m0, m1
+ jmp wq
+.w64:
+ movu m1, [tlq+ 2]
+ paddw m0, [tlq+34]
+ paddw m1, [tlq+66]
+ paddw m0, [tlq+98]
+ paddw m0, m1
+ vextracti128 xm1, m0, 1
+ paddw xm0, xm1
+ punpcklwd xm1, xm0, xm3
+ punpckhwd xm0, xm3
+ paddd xm1, xm4
+ paddd xm0, xm1
+ psrlq xm1, xm0, 32
+ paddd xm0, xm1
+ psrldq xm1, xm0, 8
+ paddd xm0, xm1
+ psrld xm0, xm5
+ cmp hd, 64
+ je .w64_end
+ mov r6d, 0x6667AAAB
+ shrx r6d, r6d, hd
+ movd xm1, r6d
+ pmulhuw xm0, xm1
+ psrlw xm0, 1
+.w64_end:
+ vpbroadcastw m0, xm0
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+.s64:
+ mova [dstq+strideq*0+32*0], m0
+ mova [dstq+strideq*0+32*1], m1
+ mova [dstq+strideq*0+32*2], m2
+ mova [dstq+strideq*0+32*3], m3
+ mova [dstq+strideq*1+32*0], m0
+ mova [dstq+strideq*1+32*1], m1
+ mova [dstq+strideq*1+32*2], m2
+ mova [dstq+strideq*1+32*3], m3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .s64
+ RET
+
+cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
+ mov r6d, r8m
+ shr r6d, 11
+ lea r5, [ipred_dc_splat_16bpc_avx2_table]
+ tzcnt wd, wd
+ movifnidn hd, hm
+ movsxd wq, [r5+wq*4]
+ vpbroadcastd m0, [r5-ipred_dc_splat_16bpc_avx2_table+pw_512+r6*4]
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ add wq, r5
+ lea stride3q, [strideq*3]
+ jmp wq
+
+%endif
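A note on the magic constants above: the DC value divides the pixel sum by w + h, which for rectangular blocks is 3·2^k or 5·2^k. The power-of-two factor is removed with psrld by tzcnt(w + h); the remaining /3 or /5 becomes a pmulhuw by a fixed-point reciprocal, 0xAAAB = ceil(2^17/3) or 0x6667 = ceil(2^17/5), followed by psrlw 1. The packed constants 0xAAAB6667 and 0x6667AAAB exploit shrx's mod-32 shift count so a single shift by 2*h (or h) selects the right 16-bit half; e.g. in .w32 with h = 16 the count 32 wraps to 0, leaving 0xAAAB in the low word for the /3 case. A self-checking C sketch of the arithmetic (mulhuw is a hypothetical scalar model of the instruction):

    #include <assert.h>
    #include <stdint.h>

    /* Per-lane pmulhuw: high 16 bits of an unsigned 16x16-bit multiply. */
    static uint16_t mulhuw(uint16_t x, uint16_t m) {
        return (uint16_t)(((uint32_t)x * m) >> 16);
    }

    int main(void) {
        /* After the psrld, the reduced sum is at most about
         * 80 * 4095 >> 4 ~= 20475 (w64 x h16 at 12 bpc), well inside the
         * range where these reciprocals divide exactly. */
        for (uint32_t x = 0; x <= 20500; x++) {
            assert(mulhuw((uint16_t)x, 0xAAAB) >> 1 == x / 3);
            assert(mulhuw((uint16_t)x, 0x6667) >> 1 == x / 5);
        }
        return 0;
    }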
diff --git a/src/x86/ipred_init_tmpl.c b/src/x86/ipred_init_tmpl.c
index 4219ab8..fb7e917 100644
--- a/src/x86/ipred_init_tmpl.c
+++ b/src/x86/ipred_init_tmpl.c
@@ -28,54 +28,52 @@
#include "src/cpu.h"
#include "src/ipred.h"
-decl_angular_ipred_fn(dav1d_ipred_dc_avx2);
-decl_angular_ipred_fn(dav1d_ipred_dc_128_avx2);
-decl_angular_ipred_fn(dav1d_ipred_dc_top_avx2);
-decl_angular_ipred_fn(dav1d_ipred_dc_left_avx2);
-decl_angular_ipred_fn(dav1d_ipred_h_avx2);
-decl_angular_ipred_fn(dav1d_ipred_v_avx2);
-decl_angular_ipred_fn(dav1d_ipred_paeth_avx2);
-decl_angular_ipred_fn(dav1d_ipred_smooth_avx2);
-decl_angular_ipred_fn(dav1d_ipred_smooth_v_avx2);
-decl_angular_ipred_fn(dav1d_ipred_smooth_h_avx2);
-decl_angular_ipred_fn(dav1d_ipred_z1_avx2);
-decl_angular_ipred_fn(dav1d_ipred_z2_avx2);
-decl_angular_ipred_fn(dav1d_ipred_z3_avx2);
-decl_angular_ipred_fn(dav1d_ipred_filter_avx2);
-
-decl_cfl_pred_fn(dav1d_ipred_cfl_avx2);
-decl_cfl_pred_fn(dav1d_ipred_cfl_128_avx2);
-decl_cfl_pred_fn(dav1d_ipred_cfl_top_avx2);
-decl_cfl_pred_fn(dav1d_ipred_cfl_left_avx2);
-
-decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_avx2);
-decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_avx2);
-decl_cfl_ac_fn(dav1d_ipred_cfl_ac_444_avx2);
-
-decl_pal_pred_fn(dav1d_pal_pred_avx2);
-
-decl_angular_ipred_fn(dav1d_ipred_dc_ssse3);
-decl_angular_ipred_fn(dav1d_ipred_dc_128_ssse3);
-decl_angular_ipred_fn(dav1d_ipred_dc_top_ssse3);
-decl_angular_ipred_fn(dav1d_ipred_dc_left_ssse3);
-decl_angular_ipred_fn(dav1d_ipred_h_ssse3);
-decl_angular_ipred_fn(dav1d_ipred_v_ssse3);
-decl_angular_ipred_fn(dav1d_ipred_paeth_ssse3);
-decl_angular_ipred_fn(dav1d_ipred_smooth_ssse3);
-decl_angular_ipred_fn(dav1d_ipred_smooth_v_ssse3);
-decl_angular_ipred_fn(dav1d_ipred_smooth_h_ssse3);
-decl_angular_ipred_fn(dav1d_ipred_filter_ssse3);
-
-decl_cfl_pred_fn(dav1d_ipred_cfl_ssse3);
-decl_cfl_pred_fn(dav1d_ipred_cfl_128_ssse3);
-decl_cfl_pred_fn(dav1d_ipred_cfl_top_ssse3);
-decl_cfl_pred_fn(dav1d_ipred_cfl_left_ssse3);
-
-decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_ssse3);
-decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_ssse3);
-decl_cfl_ac_fn(dav1d_ipred_cfl_ac_444_ssse3);
-
-decl_pal_pred_fn(dav1d_pal_pred_ssse3);
+#if BITDEPTH == 8
+#define decl_fn(type, name) \
+ decl_##type##_fn(dav1d_##name##_ssse3); \
+ decl_##type##_fn(dav1d_##name##_avx2)
+#define init_fn(type0, type1, name, suffix) \
+ c->type0[type1] = dav1d_##name##_##suffix
+#else
+#define decl_fn(type, name) \
+ decl_##type##_fn(dav1d_##name##_16bpc_ssse3); \
+ decl_##type##_fn(dav1d_##name##_16bpc_avx2)
+#define init_fn(type0, type1, name, suffix) \
+ c->type0[type1] = dav1d_##name##_16bpc_##suffix
+#endif
+
+#define init_angular_ipred_fn(type, name, suffix) \
+ init_fn(intra_pred, type, name, suffix)
+#define init_cfl_pred_fn(type, name, suffix) \
+ init_fn(cfl_pred, type, name, suffix)
+#define init_cfl_ac_fn(type, name, suffix) \
+ init_fn(cfl_ac, type, name, suffix)
+
+decl_fn(angular_ipred, ipred_dc);
+decl_fn(angular_ipred, ipred_dc_128);
+decl_fn(angular_ipred, ipred_dc_top);
+decl_fn(angular_ipred, ipred_dc_left);
+decl_fn(angular_ipred, ipred_h);
+decl_fn(angular_ipred, ipred_v);
+decl_fn(angular_ipred, ipred_paeth);
+decl_fn(angular_ipred, ipred_smooth);
+decl_fn(angular_ipred, ipred_smooth_h);
+decl_fn(angular_ipred, ipred_smooth_v);
+decl_fn(angular_ipred, ipred_z1);
+decl_fn(angular_ipred, ipred_z2);
+decl_fn(angular_ipred, ipred_z3);
+decl_fn(angular_ipred, ipred_filter);
+
+decl_fn(cfl_pred, ipred_cfl);
+decl_fn(cfl_pred, ipred_cfl_128);
+decl_fn(cfl_pred, ipred_cfl_top);
+decl_fn(cfl_pred, ipred_cfl_left);
+
+decl_fn(cfl_ac, ipred_cfl_ac_420);
+decl_fn(cfl_ac, ipred_cfl_ac_422);
+decl_fn(cfl_ac, ipred_cfl_ac_444);
+
+decl_fn(pal_pred, pal_pred);
COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
@@ -83,57 +81,59 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
- c->intra_pred[DC_PRED] = dav1d_ipred_dc_ssse3;
- c->intra_pred[DC_128_PRED] = dav1d_ipred_dc_128_ssse3;
- c->intra_pred[TOP_DC_PRED] = dav1d_ipred_dc_top_ssse3;
- c->intra_pred[LEFT_DC_PRED] = dav1d_ipred_dc_left_ssse3;
- c->intra_pred[HOR_PRED] = dav1d_ipred_h_ssse3;
- c->intra_pred[VERT_PRED] = dav1d_ipred_v_ssse3;
- c->intra_pred[PAETH_PRED] = dav1d_ipred_paeth_ssse3;
- c->intra_pred[SMOOTH_PRED] = dav1d_ipred_smooth_ssse3;
- c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_ssse3;
- c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_ssse3;
- c->intra_pred[FILTER_PRED] = dav1d_ipred_filter_ssse3;
-
- c->cfl_pred[DC_PRED] = dav1d_ipred_cfl_ssse3;
- c->cfl_pred[DC_128_PRED] = dav1d_ipred_cfl_128_ssse3;
- c->cfl_pred[TOP_DC_PRED] = dav1d_ipred_cfl_top_ssse3;
- c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_ssse3;
-
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_ssse3;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_ssse3;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_ipred_cfl_ac_444_ssse3;
-
- c->pal_pred = dav1d_pal_pred_ssse3;
+ init_angular_ipred_fn(DC_PRED, ipred_dc, ssse3);
+ init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, ssse3);
+ init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, ssse3);
+ init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, ssse3);
+ init_angular_ipred_fn(HOR_PRED, ipred_h, ssse3);
+ init_angular_ipred_fn(VERT_PRED, ipred_v, ssse3);
+ init_angular_ipred_fn(PAETH_PRED, ipred_paeth, ssse3);
+ init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, ssse3);
+ init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, ssse3);
+ init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, ssse3);
+ init_angular_ipred_fn(FILTER_PRED, ipred_filter, ssse3);
+
+ init_cfl_pred_fn(DC_PRED, ipred_cfl, ssse3);
+ init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, ssse3);
+ init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, ssse3);
+ init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, ssse3);
+
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, ssse3);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, ssse3);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, ssse3);
+
+ c->pal_pred = dav1d_pal_pred_ssse3;
#endif
+#if ARCH_X86_64
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
-#if BITDEPTH == 8 && ARCH_X86_64
- c->intra_pred[DC_PRED] = dav1d_ipred_dc_avx2;
- c->intra_pred[DC_128_PRED] = dav1d_ipred_dc_128_avx2;
- c->intra_pred[TOP_DC_PRED] = dav1d_ipred_dc_top_avx2;
- c->intra_pred[LEFT_DC_PRED] = dav1d_ipred_dc_left_avx2;
- c->intra_pred[HOR_PRED] = dav1d_ipred_h_avx2;
- c->intra_pred[VERT_PRED] = dav1d_ipred_v_avx2;
- c->intra_pred[PAETH_PRED] = dav1d_ipred_paeth_avx2;
- c->intra_pred[SMOOTH_PRED] = dav1d_ipred_smooth_avx2;
- c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_avx2;
- c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_avx2;
- c->intra_pred[Z1_PRED] = dav1d_ipred_z1_avx2;
- c->intra_pred[Z2_PRED] = dav1d_ipred_z2_avx2;
- c->intra_pred[Z3_PRED] = dav1d_ipred_z3_avx2;
- c->intra_pred[FILTER_PRED] = dav1d_ipred_filter_avx2;
-
- c->cfl_pred[DC_PRED] = dav1d_ipred_cfl_avx2;
- c->cfl_pred[DC_128_PRED] = dav1d_ipred_cfl_128_avx2;
- c->cfl_pred[TOP_DC_PRED] = dav1d_ipred_cfl_top_avx2;
- c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_avx2;
-
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_avx2;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_avx2;
- c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_ipred_cfl_ac_444_avx2;
+ init_angular_ipred_fn(DC_PRED, ipred_dc, avx2);
+ init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, avx2);
+ init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, avx2);
+ init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx2);
+#if BITDEPTH == 8
+ init_angular_ipred_fn(HOR_PRED, ipred_h, avx2);
+ init_angular_ipred_fn(VERT_PRED, ipred_v, avx2);
+ init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx2);
+ init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx2);
+ init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx2);
+ init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx2);
+ init_angular_ipred_fn(Z1_PRED, ipred_z1, avx2);
+ init_angular_ipred_fn(Z2_PRED, ipred_z2, avx2);
+ init_angular_ipred_fn(Z3_PRED, ipred_z3, avx2);
+ init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx2);
+
+ init_cfl_pred_fn(DC_PRED, ipred_cfl, avx2);
+ init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, avx2);
+ init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, avx2);
+ init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, avx2);
+
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I420 - 1, ipred_cfl_ac_420, avx2);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I422 - 1, ipred_cfl_ac_422, avx2);
+ init_cfl_ac_fn(DAV1D_PIXEL_LAYOUT_I444 - 1, ipred_cfl_ac_444, avx2);
c->pal_pred = dav1d_pal_pred_avx2;
#endif
+#endif
}
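The init-template refactor above is behaviour-preserving: each function is now declared once through decl_fn, which appends the bitdepth suffix automatically, and assigned through init_fn. For example, when BITDEPTH is not 8 the macros expand as follows (expansion shown for illustration, derived directly from the definitions above):

    decl_fn(angular_ipred, ipred_dc);
    /* expands to:
     *   decl_angular_ipred_fn(dav1d_ipred_dc_16bpc_ssse3);
     *   decl_angular_ipred_fn(dav1d_ipred_dc_16bpc_avx2);
     */

    init_angular_ipred_fn(DC_PRED, ipred_dc, avx2);
    /* expands to:
     *   c->intra_pred[DC_PRED] = dav1d_ipred_dc_16bpc_avx2;
     */

This lets the four new 16bpc DC functions share the same init lines as their 8bpc counterparts, with only the assignments that still lack 16bpc implementations kept under #if BITDEPTH == 8.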