From 949b8902b4cfd1505eb5885b57546b25f38c5bd5 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Mon, 7 Mar 2022 22:04:34 +0100 Subject: x86: Add high bit-depth film grain AVX-512 (Ice Lake) asm --- src/meson.build | 1 + src/x86/filmgrain16_avx2.asm | 41 +- src/x86/filmgrain16_avx512.asm | 932 +++++++++++++++++++++++++++++++++++++++++ src/x86/filmgrain_avx2.asm | 12 +- src/x86/filmgrain_init_tmpl.c | 2 - 5 files changed, 957 insertions(+), 31 deletions(-) create mode 100644 src/x86/filmgrain16_avx512.asm diff --git a/src/meson.build b/src/meson.build index b06aee6..a9ce159 100644 --- a/src/meson.build +++ b/src/meson.build @@ -226,6 +226,7 @@ if is_asm_enabled if dav1d_bitdepths.contains('16') libdav1d_sources_asm += files( + 'x86/filmgrain16_avx512.asm', 'x86/ipred16_avx512.asm', 'x86/looprestoration16_avx512.asm', 'x86/mc16_avx512.asm', diff --git a/src/x86/filmgrain16_avx2.asm b/src/x86/filmgrain16_avx2.asm index 5c2e386..a1d4c41 100644 --- a/src/x86/filmgrain16_avx2.asm +++ b/src/x86/filmgrain16_avx2.asm @@ -29,7 +29,7 @@ %if ARCH_X86_64 -SECTION_RODATA 32 +SECTION_RODATA 16 pb_mask: db 0,128,128, 0,128, 0, 0,128,128, 0, 0,128, 0,128,128, 0 gen_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 gen_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 @@ -877,7 +877,7 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \ mov r6d, r9m ; bdmax mov r9d, [fg_dataq+FGData.clip_to_restricted_range] mov r7d, [fg_dataq+FGData.scaling_shift] - movifnidn sbyd, sbym + mov sbyd, sbym vpbroadcastd m8, r9m shr r6d, 11 ; is_12bpc vpbroadcastd m9, [base+grain_min+r6*4] @@ -890,7 +890,7 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \ test sbyd, sbyd setnz r7b vpbroadcastd m14, [base+pd_16] - test r7b, byte [fg_dataq+FGData.overlap_flag] + test r7b, [fg_dataq+FGData.overlap_flag] jnz .vertical_overlap imul seed, sbyd, (173 << 24) | 37 @@ -986,7 +986,7 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, src_bak, left_offxy - lea left_offxyd, [offyd+32] ; previous column's offy*stride+offx + lea left_offxyd, [offyq+32] ; previous column's offy*stride+offx rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf @@ -1059,8 +1059,6 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \ cmp dword r8m, 0 ; sby jne .loop_x_hv_overlap jmp .loop_x_h_overlap -.end: - RET .vertical_overlap: DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ @@ -1194,7 +1192,7 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \ jmp .loop_y .end_y_v_overlap: add wq, 32 - jge .end_hv + jge .end lea srcq, [src_bakq+wq*2] ; since fg_dataq.overlap is guaranteed to be set, we never jump @@ -1220,8 +1218,8 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy - lea topleft_offxyq, [top_offxyq+32] - lea left_offxyq, [offyq+32] + lea topleft_offxyd, [top_offxyq+32] + lea left_offxyd, [offyq+32] rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f @@ -1271,7 +1269,7 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \ pmaddwd xm6, xm4 paddd xm6, xm14 psrad xm6, 5 - packssdw xm6, xm4 + packssdw xm6, xm6 pmaxsw xm6, xm9 pminsw xm6, xm10 pshuflw xm4, xm6, q1032 @@ -1293,7 +1291,6 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \ punpckhwd m6, m7 pmaddwd m5, m15 pmaddwd m6, m15 - vpbroadcastd m7, r9m paddd m5, m14 paddd m6, m14 psrad m5, 5 @@ -1337,7 +1334,7 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \ add wq, 32 lea srcq, [src_bakq+wq*2] jl .loop_x_hv_overlap -.end_hv: +.end: RET %macro FGUV_FN 3 ; name, ss_hor, ss_ver @@ -1348,7 +1345,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling mov r9d, r13m ; bdmax mov r7d, [fg_dataq+FGData.scaling_shift] mov r11d, is_idm - movifnidn sbyd, sbym + mov sbyd, sbym vpbroadcastw m11, [base+mul_bits+r7*2-12] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] shr r9d, 11 ; is_12bpc @@ -1385,7 +1382,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling %endif vpbroadcastd m14, [base+pd_16] %endif - test r7b, byte [fg_dataq+FGData.overlap_flag] + test r7b, [fg_dataq+FGData.overlap_flag] jnz %%vertical_overlap imul seed, sbyd, (173 << 24) | 37 @@ -1538,7 +1535,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling %endif jg %%loop_y add wq, 32>>%2 - jge %%end + jge .end mov srcq, r9mp mov dstq, r11mp mov lumaq, r12mp @@ -1561,7 +1558,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, left_offxy, unused1, unused2, luma, lstride - lea left_offxyd, [offyd+(32>>%2)] ; previous column's offy*stride+offx + lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf @@ -1717,7 +1714,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling %endif jg %%loop_y_h_overlap add wq, 32>>%2 - jge %%end + jge .end mov srcq, r9mp mov dstq, r11mp mov lumaq, r12mp @@ -1727,8 +1724,6 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling cmp dword r8m, 0 ; sby jne %%loop_x_hv_overlap jmp %%loop_x_h_overlap -%%end: - RET %%vertical_overlap: DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ @@ -1957,7 +1952,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling %endif %%end_y_v_overlap: add wq, 32>>%2 - jge %%end_hv + jge .end mov srcq, r9mp mov dstq, r11mp mov lumaq, r12mp @@ -2226,7 +2221,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling %endif %%end_y_hv_overlap: add wq, 32>>%2 - jge %%end_hv + jge .end mov srcq, r9mp mov dstq, r11mp mov lumaq, r12mp @@ -2234,13 +2229,13 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*(2<<%2)] jmp %%loop_x_hv_overlap -%%end_hv: - RET %endmacro %%FGUV_32x32xN_LOOP 1, %2, %3 .csfl: %%FGUV_32x32xN_LOOP 0, %2, %3 +.end: + RET %endmacro GEN_GRAIN_UV_FN 420, 1, 1 diff --git a/src/x86/filmgrain16_avx512.asm b/src/x86/filmgrain16_avx512.asm new file mode 100644 index 0000000..00dd6af --- /dev/null +++ b/src/x86/filmgrain16_avx512.asm @@ -0,0 +1,932 @@ +; Copyright © 2022, VideoLAN and dav1d authors +; Copyright © 2022, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" +%include "x86/filmgrain_common.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 +pb_0to63: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47 + db 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 +scale_mask: db -1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1 +scale_shift: dw 7, 7, 6, 6, 5, 5, 4, 4 +pw_27_17_17_27: dw 108, 68, 68, 108, 27, 17, 17, 27 +pw_23_22: dw 92, 88, 0, 128, 23, 22, 0, 32 +fg_min: times 2 dw 0 + times 2 dw 64 + times 2 dw 256 +fg_max: times 2 dw 1023 + times 2 dw 4095 + times 2 dw 960 + times 2 dw 3840 + times 2 dw 940 + times 2 dw 3760 +scale_rnd: dd 64 + dd 16 +uv_offset_mul: dd 256 + dd 1024 +pb_8_9_0_1: db 8, 9, 0, 1 + +SECTION .text + +INIT_ZMM avx512icl +cglobal fgy_32x32xn_16bpc, 6, 15, 21, dst, src, stride, fg_data, w, scaling, \ + grain_lut, offx, sby, see, offy, src_bak +%define base r11-fg_min + lea r11, [fg_min] + mov r6d, r9m ; bdmax + mov r9d, [fg_dataq+FGData.clip_to_restricted_range] + mov r7d, [fg_dataq+FGData.scaling_shift] + mov sbyd, sbym + vpbroadcastd m6, r9m + shr r6d, 11 ; is_12bpc + vbroadcasti32x4 m7, [base+scale_mask] + shlx r10d, r9d, r6d + vpbroadcastd m10, [base+scale_shift+r7*4-32] + lea r9d, [r6+r9*4] + vpbroadcastd m8, [base+fg_min+r10*4] + kxnorw k1, k1, k1 ; 0xffff + vpbroadcastd m9, [base+fg_max+r9*4] + mov r12, 0xeeeeeeeeeeeeeeee + vpbroadcastd m19, [base+scale_rnd+r6*4] + kshiftrb k2, k1, 4 ; 0xf + vpbroadcastq xm20, [base+pw_27_17_17_27+r6*8] + kmovq k3, r12 + vpbroadcastd m11, [base+scale_shift+r6*8+4] + test sbyd, sbyd + setnz r7b + vpbroadcastd m12, [base+pw_27_17_17_27+r6*8+0] + vpbroadcastd m13, [base+pw_27_17_17_27+r6*8+4] + test r7b, [fg_dataq+FGData.overlap_flag] + jnz .v_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + lea src_bakq, [srcq+wq*2] + neg wq + sub dstq, srcq + +.loop_x: + rorx r6, seeq, 1 + or seed, 0xeff4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offyd, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \ + sby, see, offxy, src_bak + + mov grain_lutq, grain_lutmp + mov hd, hm +.loop_y: + movu m4, [grain_lutq+offxyq*2+82*0] + movu m5, [grain_lutq+offxyq*2+82*2] + call .add_noise + sub hb, 2 + jg .loop_y + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + cmp byte [fg_dataq+FGData.overlap_flag], 0 + je .loop_x + test sbyd, sbyd + jnz .hv_overlap + + ; horizontal overlap (without vertical overlap) +.loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xeff4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \ + sby, see, offy, src_bak, left_offxy + + lea left_offxyd, [offyq+73] ; previous column's offy*stride+offx + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164 + lea offyd, [offyq+offxq*2+747] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \ + sby, see, offxy, src_bak, left_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm +.loop_y_h_overlap: + movu m4, [grain_lutq+offxyq*2+82*0] + movu m5, [grain_lutq+offxyq*2+82*2] + movd xm17, [grain_lutq+left_offxyq*2-82*1] + pinsrd xm17, [grain_lutq+left_offxyq*2+82*1], 1 + punpckldq xm16, xm4, xm5 + punpcklwd xm17, xm16 + mova xm16, xm19 + vpdpwssd xm16, xm20, xm17 + psrad xm16, 1 + packssdw xm16, xm16 + vpsravw xm16, xm11 + vmovdqu8 m4{k2}, m16 + vpalignr m5{k2}, m16, m16, 4 + call .add_noise + sub hb, 2 + jg .loop_y_h_overlap + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + test sbyd, sbyd + jnz .hv_overlap + jmp .loop_x_h_overlap + +.v_overlap: + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + lea src_bakq, [srcq+wq*2] + neg wq + sub dstq, srcq + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \ + sby, see, offy, src_bak, _, top_offxy + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \ + sby, see, offxy, src_bak, _, top_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + + movu m16, [grain_lutq+offxyq*2+82*0] + movu m0, [grain_lutq+top_offxyq*2+82*0] + movu m17, [grain_lutq+offxyq*2+82*2] + movu m1, [grain_lutq+top_offxyq*2+82*2] + punpckhwd m4, m0, m16 + punpcklwd m0, m16 + punpckhwd m5, m1, m17 + punpcklwd m1, m17 + call .add_noise_v + sub hb, 2 + jg .loop_y + add wq, 32 + jge .end + lea srcq, [src_bakq+wq*2] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump back + ; to .v_overlap, and instead always fall-through to .hv_overlap +.hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \ + sby, see, offy, src_bak, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyd, [top_offxyq+73] + lea left_offxyd, [offyq+73] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*2+0x10001*747+32*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \ + sby, see, offxy, src_bak, left_offxy, top_offxy, topleft_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + + movu m5, [grain_lutq+offxyq*2+82*0] + movu m0, [grain_lutq+top_offxyq*2+82*0] + movd xm17, [grain_lutq+left_offxyq*2-82*1] + pinsrd xm17, [grain_lutq+topleft_offxyq*2-82*1], 1 + movu m2, [grain_lutq+offxyq*2+82*2] + movu m1, [grain_lutq+top_offxyq*2+82*2] + movd xm18, [grain_lutq+left_offxyq*2+82*1] + pinsrd xm18, [grain_lutq+topleft_offxyq*2+82*1], 1 + punpckldq xm16, xm5, xm0 + punpcklwd xm17, xm16 + mova xm16, xm19 + vpdpwssd xm16, xm20, xm17 + punpckldq xm17, xm2, xm1 + punpcklwd xm18, xm17 + mova xm17, xm19 + vpdpwssd xm17, xm20, xm18 + punpckhwd m4, m0, m5 + punpcklwd m0, m5 + punpckhwd m5, m1, m2 + punpcklwd m1, m2 + psrad xm16, 1 + psrad xm17, 1 + packssdw xm16, xm17 + vpsravw xm16, xm11 + vpshuflw m0{k2}, m16, q1302 + punpckhqdq xm16, xm16 + vpshuflw m1{k2}, m16, q1302 + call .add_noise_v + sub hb, 2 + jg .loop_y_h_overlap + add wq, 32 + lea srcq, [src_bakq+wq*2] + jl .hv_overlap +.end: + RET +ALIGN function_align +.add_noise_v: + mova m2, m19 + vpdpwssd m2, m12, m4 + mova m3, m19 + vpdpwssd m3, m13, m5 + mova m4, m19 + vpdpwssd m4, m12, m0 + mova m5, m19 + vpdpwssd m5, m13, m1 + REPX {psrad x, 1}, m2, m3, m4, m5 + packssdw m4, m2 + packssdw m5, m3 + vpsravw m4, m11 + vpsravw m5, m11 +.add_noise: + mova m0, [srcq+strideq*0] + mova m1, [srcq+strideq*1] + kmovw k4, k1 + pand m16, m6, m0 + psrld m3, m0, 16 + vpgatherdd m2{k4}, [scalingq+m16] + vpcmpud k4, m3, m6, 2 ; px <= bdmax + vpgatherdd m16{k4}, [scalingq+m3] + kmovw k4, k1 + pand m17, m6, m1 + vpgatherdd m3{k4}, [scalingq+m17] + vpshufb m2{k3}, m16, m7 + psrld m16, m1, 16 + vpcmpud k4, m16, m6, 2 + vpgatherdd m17{k4}, [scalingq+m16] + vpshufb m3{k3}, m17, m7 + vpsllvw m2, m10 + vpsllvw m3, m10 + pmulhrsw m4, m2 + pmulhrsw m5, m3 + add grain_lutq, 82*4 + paddw m0, m4 + paddw m1, m5 + pmaxsw m0, m8 + pmaxsw m1, m8 + pminsw m0, m9 + pminsw m1, m9 + mova [dstq+srcq], m0 + add srcq, strideq + mova [dstq+srcq], m1 + add srcq, strideq + ret + +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 22, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, luma, lstride, uv_pl, is_id +%define base r12-fg_min + lea r12, [fg_min] + mov r9d, r13m ; bdmax + mov r7d, [fg_dataq+FGData.scaling_shift] + mov r6d, [fg_dataq+FGData.clip_to_restricted_range] + mov r11d, is_idm + kxnorw k1, k1, k1 ; 0xffff + vpbroadcastd m5, r13m + mov r13, 0xeeeeeeeeeeeeeeee + vbroadcasti32x4 m6, [base+scale_mask] + shr r9d, 11 ; is_12bpc + vpbroadcastd m7, [base+scale_shift+r7*4-32] + shlx r10d, r6d, r9d + mov sbyd, sbym + shlx r6d, r6d, r11d + vpbroadcastd m8, [base+fg_min+r10*4] + lea r6d, [r9+r6*2] + vpbroadcastd m9, [base+fg_max+r6*4] + kmovq k2, r13 + vpbroadcastd m20, [base+scale_rnd+r9*4] + packssdw m4, m5, m5 + vpbroadcastd m21, [base+scale_shift+r9*8+4] +%if %2 + mova m12, [base+pb_0to63] ; pw_even + mov r13d, 0x0101 + vpbroadcastq m10, [base+pw_23_22+r9*8] + kmovw k3, r13d +%if %3 + pshufd m11, m10, q0000 +%else + vpbroadcastd ym16, [base+pw_27_17_17_27+r9*8+0] + vpbroadcastd m11, [base+pw_27_17_17_27+r9*8+4] + vmovdqu16 m11{k1}, m16 +%endif + psrlw m13, m12, 8 ; pw_odd +%else + vpbroadcastq m10, [base+pw_27_17_17_27+r9*8] + kshiftrb k3, k1, 7 ; 0x01 + kshiftrb k4, k1, 4 ; 0x0f + pshufd m11, m10, q0000 +%endif + mov lstrideq, r10mp + test sbyd, sbyd + setnz r7b + cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 + jne .csfl + +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + _, sby, see, lstride + +%if %1 + mov r6d, r11m + vpbroadcastd m0, [base+uv_offset_mul+r9*4] + vpbroadcastd m1, [base+pb_8_9_0_1] + vpbroadcastd m14, [fg_dataq+FGData.uv_offset+r6*4] + vbroadcasti32x4 m15, [fg_dataq+FGData.uv_mult+r6*4] + pmaddwd m14, m0 + pshufb m15, m1 ; { uv_luma_mult, uv_mult } +%endif + test r7b, [fg_dataq+FGData.overlap_flag] + jnz %%v_overlap + + imul seed, sbyd, (173 << 24) | 37 + add seed, (105 << 24) | 178 + rorx seed, seed, 24 + movzx seed, seew + xor seed, [fg_dataq+FGData.seed] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, lstride, luma + + mov lumaq, r9mp + lea r12, [srcq+wq*2] + lea r13, [dstq+wq*2] + lea r14, [lumaq+wq*(2<<%2)] + mov r9mp, r12 + mov r10mp, r13 + mov r11mp, r14 + neg wq + +%%loop_x: + rorx r6, seeq, 1 + or seed, 0xeff4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, lstride, luma + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y: +%if %2 + movu ym18, [grain_lutq+offxyq*2+82*0] + vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1 + movu ym19, [grain_lutq+offxyq*2+82*4] + vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1 +%else + movu m18, [grain_lutq+offxyq*2+82*0] + movu m19, [grain_lutq+offxyq*2+82*2] +%endif + call %%add_noise + sub hb, 2<<%2 + jg %%loop_y + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r10mp + mov lumaq, r11mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + cmp byte [fg_dataq+FGData.overlap_flag], 0 + je %%loop_x + cmp dword r8m, 0 ; sby + jne %%hv_overlap + + ; horizontal overlap (without vertical overlap) +%%loop_x_h_overlap: + rorx r6, seeq, 1 + or seed, 0xEFF4 + test seeb, seeh + lea seed, [r6+0x8000] + cmovp seed, r6d ; updated seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, lstride, luma, left_offxy + + lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx + rorx offyd, seed, 8 + rorx offxq, seeq, 12 + and offyd, 0xf + imul offyd, 164>>%3 + lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, lstride, luma, left_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm +%%loop_y_h_overlap: +%if %2 + movu ym18, [grain_lutq+offxyq*2+82*0] + vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1 + movu ym19, [grain_lutq+offxyq*2+82*4] + vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1 + movd xm16, [grain_lutq+left_offxyq*2+82*0] + vinserti32x4 m16, [grain_lutq+left_offxyq*2+82*2], 2 + movd xm17, [grain_lutq+left_offxyq*2+82*4] + vinserti32x4 m17, [grain_lutq+left_offxyq*2+82*6], 2 + punpckldq m16, m17 + punpckldq m17, m18, m19 + punpcklwd m16, m17 + mova m17, m20 + vpdpwssd m17, m16, m10 + psrad m17, 1 + packssdw m17, m17 + vpsravw m17, m21 +%else + movu m18, [grain_lutq+offxyq*2+82*0] + movu m19, [grain_lutq+offxyq*2+82*2] + movd xm16, [grain_lutq+left_offxyq*2+82*0] + pinsrd xm16, [grain_lutq+left_offxyq*2+82*2], 1 + punpckldq xm17, xm18, xm19 + punpcklwd xm16, xm17 + mova xm17, xm20 + vpdpwssd xm17, xm16, xm10 + psrad xm17, 1 + packssdw xm17, xm17 + vpsravw xm17, xm21 +%endif + vmovdqa32 m18{k3}, m17 + vpshufd m19{k3}, m17, q0321 + call %%add_noise + sub hb, 2<<%2 + jg %%loop_y_h_overlap + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r10mp + mov lumaq, r11mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + cmp dword r8m, 0 ; sby + jne %%hv_overlap + jmp %%loop_x_h_overlap + +%%v_overlap: + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + _, sby, see, lstride + + movzx sbyd, sbyb + imul seed, [fg_dataq+FGData.seed], 0x00010001 + imul r7d, sbyd, 173 * 0x00010001 + imul sbyd, 37 * 0x01000100 + add r7d, (105 << 16) | 188 + add sbyd, (178 << 24) | (141 << 8) + and r7d, 0x00ff00ff + and sbyd, 0xff00ff00 + xor seed, r7d + xor seed, sbyd ; (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, lstride, luma, _, top_offxy + + mov lumaq, r9mp + lea r12, [srcq+wq*2] + lea r13, [dstq+wq*2] + lea r14, [lumaq+wq*(2<<%2)] + mov r9mp, r12 + mov r10mp, r13 + mov r11mp, r14 + neg wq + + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, lstride, luma, _, top_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + +%if %3 + movu ym16, [grain_lutq+offxyq*2+82*0] + movu ym1, [grain_lutq+top_offxyq*2+82*0] + vbroadcasti32x8 m18, [grain_lutq+offxyq*2+82*2] + movu ym19, [grain_lutq+offxyq*2+82*4] + vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1 + punpcklwd ym17, ym1, ym16 + punpckhwd ym1, ym16 +%elif %2 + movu ym18, [grain_lutq+offxyq*2+82*0] + vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1 + movu ym17, [grain_lutq+top_offxyq*2+82*0] + vinserti32x8 m17, [grain_lutq+top_offxyq*2+82*2], 1 + movu ym19, [grain_lutq+offxyq*2+82*4] + vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1 + punpcklwd m16, m17, m18 + punpckhwd m17, m18 +%else + movu m18, [grain_lutq+offxyq*2+82*0] + movu m19, [grain_lutq+top_offxyq*2+82*0] + movu m2, [grain_lutq+offxyq*2+82*2] + movu m16, [grain_lutq+top_offxyq*2+82*2] + punpckhwd m1, m19, m18 + punpcklwd m19, m18 + punpckhwd m18, m2, m16 + punpcklwd m2, m16 +%endif + call %%add_noise_v + sub hb, 2<<%2 + jg %%loop_y + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r10mp + mov lumaq, r11mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + + ; since fg_dataq.overlap is guaranteed to be set, we never jump back + ; to %%v_overlap, and instead always fall-through to %%hv_overlap +%%hv_overlap: + ; we assume from the block above that bits 8-15 of r7d are zero'ed + mov r6d, seed + or seed, 0xeff4eff4 + test seeb, seeh + setp r7b ; parity of top_seed + shr seed, 16 + shl r7d, 16 + test seeb, seeh + setp r7b ; parity of cur_seed + or r6d, 0x00010001 + xor r7d, r6d + rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + offx, offy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy + + lea topleft_offxyq, [top_offxyq+(32>>%2)] + lea left_offxyq, [offyq+(32>>%2)] + rorx offyd, seed, 8 + rorx offxd, seed, 12 + and offyd, 0xf000f + and offxd, 0xf000f + imul offyd, 164>>%3 + ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy + lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] + + DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ + h, offxy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy + + mov grain_lutq, grain_lutmp + mov hd, hm + movzx top_offxyd, offxyw + shr offxyd, 16 + + ; grain = grain_lut[offy+y][offx+x] +%if %2 + movd xm16, [grain_lutq+left_offxyq*2+82*0] + vinserti32x4 m16, [grain_lutq+left_offxyq*2+82*2], 2 + movd xm17, [grain_lutq+left_offxyq*2+82*4] + vinserti32x4 m17, [grain_lutq+left_offxyq*2+82*6], 2 + movu ym18, [grain_lutq+offxyq*2+82*0] + vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1 + movu ym19, [grain_lutq+offxyq*2+82*4] + vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1 + punpckldq m16, m17 + punpckldq m17, m18, m19 + punpcklwd m16, m17 + movu ym1, [grain_lutq+top_offxyq*2+82*0] + movd xm17, [grain_lutq+topleft_offxyq*2+82*0] + mova m0, m20 + vpdpwssd m0, m16, m10 +%if %3 + punpcklwd xm17, xm1 + mova xm16, xm20 + vpdpwssd xm16, xm17, xm10 + psrad xm16, 1 +%else + vinserti32x8 m1, [grain_lutq+top_offxyq*2+82*2], 1 + vinserti32x4 m17, [grain_lutq+topleft_offxyq*2+82*2], 2 + punpcklwd m17, m1 + mova m16, m20 + vpdpwssd m16, m17, m10 + psrad m16, 1 +%endif + psrad m0, 1 + packssdw m0, m16 + vpsravw m0, m21 + vmovdqa32 m18{k3}, m0 + vpshufd m19{k3}, m0, q0321 +%if %3 + vpunpckhdq ym1{k3}, ym0, ym0 + punpcklwd ym17, ym1, ym18 + punpckhwd ym1, ym18 +%else + vpunpckhdq m1{k3}, m0, m0 + punpcklwd m16, m1, m18 + punpckhwd m17, m1, m18 +%endif +%else + movu m18, [grain_lutq+offxyq*2+82*0] + movu m19, [grain_lutq+top_offxyq*2+82*0] + movd xm17, [grain_lutq+left_offxyq*2+82*0] + pinsrd xm17, [grain_lutq+topleft_offxyq*2+82*0], 1 + punpckldq xm16, xm18, xm19 + punpcklwd xm17, xm16 + movu m2, [grain_lutq+offxyq*2+82*2] + movu m0, [grain_lutq+top_offxyq*2+82*2] + movd xm16, [grain_lutq+left_offxyq*2+82*2] + pinsrd xm16, [grain_lutq+topleft_offxyq*2+82*2], 1 + punpckldq xm1, xm2, xm0 + punpcklwd xm1, xm16, xm1 + mova xm16, xm20 + vpdpwssd xm16, xm17, xm10 + mova xm17, xm20 + vpdpwssd xm17, xm1, xm10 + punpckhwd m1, m19, m18 + punpcklwd m19, m18 + punpckhwd m18, m2, m0 + punpcklwd m2, m0 + psrad xm16, 1 + psrad xm17, 1 + packssdw xm16, xm17 + vpsravw xm16, xm21 + vpshuflw m19{k4}, m16, q1302 + punpckhqdq xm16, xm16 + vpshuflw m2{k4}, m16, q3120 +%endif + call %%add_noise_v + sub hb, 2<<%2 + jg %%loop_y_h_overlap + add wq, 32>>%2 + jge .end + mov srcq, r9mp + mov dstq, r10mp + mov lumaq, r11mp + lea srcq, [srcq+wq*2] + lea dstq, [dstq+wq*2] + lea lumaq, [lumaq+wq*(2<<%2)] + jmp %%hv_overlap + +ALIGN function_align +%%add_noise_v: +%if %3 + mova ym16, ym20 + vpdpwssd ym16, ym17, ym11 + mova ym17, ym20 + vpdpwssd ym17, ym1, ym11 + psrad ym16, 1 + psrad ym17, 1 + packssdw ym16, ym17 + vpsravw m18{k1}, m16, m21 +%elif %2 + mova m18, m20 + vpdpwssd m18, m16, m11 + mova m16, m20 + vpdpwssd m16, m17, m11 + psrad m18, 1 + psrad m16, 1 + packssdw m18, m16 + vpsravw m18, m21 +%else + mova m16, m20 + vpdpwssd m16, m1, m11 + mova m17, m20 + vpdpwssd m17, m18, m11 + mova m18, m20 + vpdpwssd m18, m19, m11 + mova m19, m20 + vpdpwssd m19, m2, m11 + REPX {psrad x, 1}, m16, m17, m18, m19 + packssdw m18, m16 + packssdw m19, m17 + vpsravw m18, m21 + vpsravw m19, m21 +%endif +%%add_noise: +%if %2 + mova m2, [lumaq+lstrideq*(0<<%3)] + mova m0, [lumaq+lstrideq*(1<<%3)] + lea lumaq, [lumaq+lstrideq*(2<<%3)] + mova m3, [lumaq+lstrideq*(0<<%3)] + mova m1, [lumaq+lstrideq*(1<<%3)] + mova m16, m12 + vpermi2w m16, m2, m0 + vpermt2w m2, m13, m0 + mova m17, m12 + vpermi2w m17, m3, m1 + vpermt2w m3, m13, m1 + pavgw m2, m16 + pavgw m3, m17 +%elif %1 + mova m2, [lumaq+lstrideq*0] + mova m3, [lumaq+lstrideq*1] +%endif +%if %2 + mova ym16, [srcq+strideq*0] + vinserti32x8 m16, [srcq+strideq*1], 1 + lea srcq, [srcq+strideq*2] +%else + mova m16, [srcq+strideq*0] +%endif +%if %1 + punpckhwd m17, m2, m16 + mova m0, m14 + vpdpwssd m0, m17, m15 + punpcklwd m17, m2, m16 + mova m2, m14 + vpdpwssd m2, m17, m15 +%endif +%if %2 + mova ym17, [srcq+strideq*0] + vinserti32x8 m17, [srcq+strideq*1], 1 +%else + mova m17, [srcq+strideq*1] +%endif +%if %1 + psrad m0, 6 + psrad m2, 6 + packusdw m2, m0 + punpckhwd m0, m3, m17 + mova m1, m14 + vpdpwssd m1, m15, m0 + punpcklwd m0, m3, m17 + mova m3, m14 + vpdpwssd m3, m15, m0 + psrad m1, 6 + psrad m3, 6 + packusdw m3, m1 + pminuw m2, m4 + pminuw m3, m4 + +.add_noise_main: + ; scaling[luma_src] + kmovw k5, k1 + pand m1, m5, m2 + vpgatherdd m0{k5}, [scalingq+m1] + kmovw k5, k1 + psrld m2, 16 + vpgatherdd m1{k5}, [scalingq+m2] + vpshufb m0{k2}, m1, m6 + kmovw k5, k1 + psrld m1, m3, 16 + vpgatherdd m2{k5}, [scalingq+m1] + kmovw k5, k1 + pand m3, m5 + vpgatherdd m1{k5}, [scalingq+m3] + vpshufb m1{k2}, m2, m6 + + ; noise = round2(scaling[luma_src] * grain, scaling_shift) + vpsllvw m0, m7 + vpsllvw m1, m7 + pmulhrsw m18, m0 + pmulhrsw m19, m1 + add grain_lutq, 82*(4<<%2) + lea lumaq, [lumaq+lstrideq*(2<<%3)] + lea srcq, [srcq+strideq*2] + paddw m16, m18 + paddw m17, m19 + pmaxsw m16, m8 + pmaxsw m17, m8 + pminsw m16, m9 + pminsw m17, m9 +%if %2 + mova [dstq+strideq*0], ym16 + vextracti32x8 [dstq+strideq*1], m16, 1 + lea dstq, [dstq+strideq*2] + mova [dstq+strideq*0], ym17 + vextracti32x8 [dstq+strideq*1], m17, 1 +%else + mova [dstq+strideq*0], m16 + mova [dstq+strideq*1], m17 +%endif + lea dstq, [dstq+strideq*2] + ret +%else +%if %2 + pand m2, m4 + pand m3, m4 +%else + pand m2, m4, [lumaq+lstrideq*0] + pand m3, m4, [lumaq+lstrideq*1] +%endif + jmp .add_noise_main +%endif +%endmacro + + %%FGUV_32x32xN_LOOP 1, %2, %3 +.csfl: + %%FGUV_32x32xN_LOOP 0, %2, %3 +.end: + RET +%endmacro + +FGUV_FN 420, 1, 1 +FGUV_FN 422, 1, 0 +FGUV_FN 444, 0, 0 + +%endif diff --git a/src/x86/filmgrain_avx2.asm b/src/x86/filmgrain_avx2.asm index 7da8105..55445cf 100644 --- a/src/x86/filmgrain_avx2.asm +++ b/src/x86/filmgrain_avx2.asm @@ -1488,7 +1488,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, jg %%loop_y add wq, 32>>%2 - jge %%end + jge .end mov srcq, r11mp mov dstq, r12mp lea lumaq, [r14+wq*(1+%2)] @@ -1647,7 +1647,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, jg %%loop_y_h_overlap add wq, 32>>%2 - jge %%end + jge .end mov srcq, r11mp mov dstq, r12mp lea lumaq, [r14+wq*(1+%2)] @@ -1852,7 +1852,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, %%end_y_v_overlap: add wq, 32>>%2 - jge %%end + jge .end mov srcq, r11mp mov dstq, r12mp lea lumaq, [r14+wq*(1+%2)] @@ -2081,20 +2081,20 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, %%end_y_hv_overlap: add wq, 32>>%2 - jge %%end + jge .end mov srcq, r11mp mov dstq, r12mp lea lumaq, [r14+wq*(1+%2)] add srcq, wq add dstq, wq jmp %%loop_x_hv_overlap -%%end: - RET %endmacro %%FGUV_32x32xN_LOOP 1, %2, %3 .csfl: %%FGUV_32x32xN_LOOP 0, %2, %3 +.end: + RET %endmacro GEN_GRAIN_UV_FN 420, 1, 1 diff --git a/src/x86/filmgrain_init_tmpl.c b/src/x86/filmgrain_init_tmpl.c index 0b783d1..1c91d2a 100644 --- a/src/x86/filmgrain_init_tmpl.c +++ b/src/x86/filmgrain_init_tmpl.c @@ -71,7 +71,6 @@ COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2); } -#if BITDEPTH == 8 if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx512icl); @@ -79,5 +78,4 @@ COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx512icl); c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx512icl); #endif -#endif } -- cgit v1.2.3