github.com/videolan/dav1d.git

author     Henrik Gramner <gramner@twoorioles.com>  2022-03-08 00:04:34 +0300
committer  Henrik Gramner <henrik@gramner.com>      2022-03-14 00:09:08 +0300
commit     949b8902b4cfd1505eb5885b57546b25f38c5bd5 (patch)
tree       2d15c49e76af4563e134d79492300eed3aab24b7
parent     28a9c46e1c36540d3276299f2e284ece1d2386be (diff)
x86: Add high bit-depth film grain AVX-512 (Ice Lake) asm
-rw-r--r--   src/meson.build                 |   1
-rw-r--r--   src/x86/filmgrain16_avx2.asm    |  41
-rw-r--r--   src/x86/filmgrain16_avx512.asm  | 932
-rw-r--r--   src/x86/filmgrain_avx2.asm      |  12
-rw-r--r--   src/x86/filmgrain_init_tmpl.c   |   2
5 files changed, 957 insertions(+), 31 deletions(-)
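
For orientation, the core operation every entry point in this patch vectorizes is the AV1 film grain application step: look up a scaling factor from the source pixel, multiply it by a grain value from the precomputed grain LUT, round by scaling_shift, and clamp the noisy pixel to the legal range. A minimal scalar sketch in the style of dav1d's C reference (helper names here are illustrative, not the actual template code):

    #include <stdint.h>

    static inline int round2(const int x, const int shift) {
        return (x + ((1 << shift) >> 1)) >> shift;
    }

    /* noise = round2(scaling[src] * grain, scaling_shift), then clamp to
     * the bit-depth (and, with clip_to_restricted_range, studio-swing)
     * limits -- the same fg_min/fg_max pair the asm below broadcasts. */
    static inline uint16_t apply_grain_px(const uint16_t src, const int grain,
                                          const uint8_t *const scaling,
                                          const int scaling_shift,
                                          const int mn, const int mx)
    {
        const int noise = round2(scaling[src] * grain, scaling_shift);
        const int px = src + noise;
        return (uint16_t)(px < mn ? mn : px > mx ? mx : px);
    }

The AVX-512 code performs this 32 pixels at a time, gathering scaling[] with vpgatherdd and clamping with pmaxsw/pminsw.
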
diff --git a/src/meson.build b/src/meson.build
index b06aee6..a9ce159 100644
--- a/src/meson.build
+++ b/src/meson.build
@@ -226,6 +226,7 @@ if is_asm_enabled
if dav1d_bitdepths.contains('16')
libdav1d_sources_asm += files(
+ 'x86/filmgrain16_avx512.asm',
'x86/ipred16_avx512.asm',
'x86/looprestoration16_avx512.asm',
'x86/mc16_avx512.asm',
diff --git a/src/x86/filmgrain16_avx2.asm b/src/x86/filmgrain16_avx2.asm
index 5c2e386..a1d4c41 100644
--- a/src/x86/filmgrain16_avx2.asm
+++ b/src/x86/filmgrain16_avx2.asm
@@ -29,7 +29,7 @@
%if ARCH_X86_64
-SECTION_RODATA 32
+SECTION_RODATA 16
pb_mask: db 0,128,128, 0,128, 0, 0,128,128, 0, 0,128, 0,128,128, 0
gen_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
gen_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
@@ -877,7 +877,7 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \
mov r6d, r9m ; bdmax
mov r9d, [fg_dataq+FGData.clip_to_restricted_range]
mov r7d, [fg_dataq+FGData.scaling_shift]
- movifnidn sbyd, sbym
+ mov sbyd, sbym
vpbroadcastd m8, r9m
shr r6d, 11 ; is_12bpc
vpbroadcastd m9, [base+grain_min+r6*4]
@@ -890,7 +890,7 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \
test sbyd, sbyd
setnz r7b
vpbroadcastd m14, [base+pd_16]
- test r7b, byte [fg_dataq+FGData.overlap_flag]
+ test r7b, [fg_dataq+FGData.overlap_flag]
jnz .vertical_overlap
imul seed, sbyd, (173 << 24) | 37
@@ -986,7 +986,7 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
offx, offy, see, src_bak, left_offxy
- lea left_offxyd, [offyd+32] ; previous column's offy*stride+offx
+ lea left_offxyd, [offyq+32] ; previous column's offy*stride+offx
rorx offyd, seed, 8
rorx offxq, seeq, 12
and offyd, 0xf
@@ -1059,8 +1059,6 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \
cmp dword r8m, 0 ; sby
jne .loop_x_hv_overlap
jmp .loop_x_h_overlap
-.end:
- RET
.vertical_overlap:
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
@@ -1194,7 +1192,7 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \
jmp .loop_y
.end_y_v_overlap:
add wq, 32
- jge .end_hv
+ jge .end
lea srcq, [src_bakq+wq*2]
; since fg_dataq.overlap is guaranteed to be set, we never jump
@@ -1220,8 +1218,8 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy
- lea topleft_offxyq, [top_offxyq+32]
- lea left_offxyq, [offyq+32]
+ lea topleft_offxyd, [top_offxyq+32]
+ lea left_offxyd, [offyq+32]
rorx offyd, seed, 8
rorx offxd, seed, 12
and offyd, 0xf000f
@@ -1271,7 +1269,7 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \
pmaddwd xm6, xm4
paddd xm6, xm14
psrad xm6, 5
- packssdw xm6, xm4
+ packssdw xm6, xm6
pmaxsw xm6, xm9
pminsw xm6, xm10
pshuflw xm4, xm6, q1032
@@ -1293,7 +1291,6 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \
punpckhwd m6, m7
pmaddwd m5, m15
pmaddwd m6, m15
- vpbroadcastd m7, r9m
paddd m5, m14
paddd m6, m14
psrad m5, 5
@@ -1337,7 +1334,7 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \
add wq, 32
lea srcq, [src_bakq+wq*2]
jl .loop_x_hv_overlap
-.end_hv:
+.end:
RET
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
@@ -1348,7 +1345,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling
mov r9d, r13m ; bdmax
mov r7d, [fg_dataq+FGData.scaling_shift]
mov r11d, is_idm
- movifnidn sbyd, sbym
+ mov sbyd, sbym
vpbroadcastw m11, [base+mul_bits+r7*2-12]
mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
shr r9d, 11 ; is_12bpc
@@ -1385,7 +1382,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling
%endif
vpbroadcastd m14, [base+pd_16]
%endif
- test r7b, byte [fg_dataq+FGData.overlap_flag]
+ test r7b, [fg_dataq+FGData.overlap_flag]
jnz %%vertical_overlap
imul seed, sbyd, (173 << 24) | 37
@@ -1538,7 +1535,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling
%endif
jg %%loop_y
add wq, 32>>%2
- jge %%end
+ jge .end
mov srcq, r9mp
mov dstq, r11mp
mov lumaq, r12mp
@@ -1561,7 +1558,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
offx, offy, see, left_offxy, unused1, unused2, luma, lstride
- lea left_offxyd, [offyd+(32>>%2)] ; previous column's offy*stride+offx
+ lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
rorx offyd, seed, 8
rorx offxq, seeq, 12
and offyd, 0xf
@@ -1717,7 +1714,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling
%endif
jg %%loop_y_h_overlap
add wq, 32>>%2
- jge %%end
+ jge .end
mov srcq, r9mp
mov dstq, r11mp
mov lumaq, r12mp
@@ -1727,8 +1724,6 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling
cmp dword r8m, 0 ; sby
jne %%loop_x_hv_overlap
jmp %%loop_x_h_overlap
-%%end:
- RET
%%vertical_overlap:
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \
@@ -1957,7 +1952,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling
%endif
%%end_y_v_overlap:
add wq, 32>>%2
- jge %%end_hv
+ jge .end
mov srcq, r9mp
mov dstq, r11mp
mov lumaq, r12mp
@@ -2226,7 +2221,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling
%endif
%%end_y_hv_overlap:
add wq, 32>>%2
- jge %%end_hv
+ jge .end
mov srcq, r9mp
mov dstq, r11mp
mov lumaq, r12mp
@@ -2234,13 +2229,13 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling
lea dstq, [dstq+wq*2]
lea lumaq, [lumaq+wq*(2<<%2)]
jmp %%loop_x_hv_overlap
-%%end_hv:
- RET
%endmacro
%%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
%%FGUV_32x32xN_LOOP 0, %2, %3
+.end:
+ RET
%endmacro
GEN_GRAIN_UV_FN 420, 1, 1
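
The seed setup kept as context above (imul seed, sbyd, (173 << 24) | 37, then add, rorx, movzx, and the xor against FGData.seed) packs dav1d's per-superblock-row re-seeding into a single multiply-add. A scalar sketch of what it computes, mirroring the C reference (function name illustrative):

    /* Each 32-row strip re-seeds the 16-bit grain PRNG from the frame
     * seed and its row number, so strips are independent. The vertical
     * overlap paths derive the current and previous strip's seeds at
     * once and keep them packed as (cur_seed << 16) | top_seed. */
    static unsigned row_seed(const unsigned frame_seed, const int row_num) {
        unsigned seed = frame_seed;
        seed ^= ((row_num * 37  + 178) & 0xff) << 8;
        seed ^= ((row_num * 173 + 105) & 0xff);
        return seed;
    }
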
diff --git a/src/x86/filmgrain16_avx512.asm b/src/x86/filmgrain16_avx512.asm
new file mode 100644
index 0000000..00dd6af
--- /dev/null
+++ b/src/x86/filmgrain16_avx512.asm
@@ -0,0 +1,932 @@
+; Copyright © 2022, VideoLAN and dav1d authors
+; Copyright © 2022, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+%include "x86/filmgrain_common.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+pb_0to63: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47
+ db 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63
+scale_mask: db -1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1
+scale_shift: dw 7, 7, 6, 6, 5, 5, 4, 4
+pw_27_17_17_27: dw 108, 68, 68, 108, 27, 17, 17, 27
+pw_23_22: dw 92, 88, 0, 128, 23, 22, 0, 32
+fg_min: times 2 dw 0
+ times 2 dw 64
+ times 2 dw 256
+fg_max: times 2 dw 1023
+ times 2 dw 4095
+ times 2 dw 960
+ times 2 dw 3840
+ times 2 dw 940
+ times 2 dw 3760
+scale_rnd: dd 64
+ dd 16
+uv_offset_mul: dd 256
+ dd 1024
+pb_8_9_0_1: db 8, 9, 0, 1
+
+SECTION .text
+
+INIT_ZMM avx512icl
+cglobal fgy_32x32xn_16bpc, 6, 15, 21, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, offx, sby, see, offy, src_bak
+%define base r11-fg_min
+ lea r11, [fg_min]
+ mov r6d, r9m ; bdmax
+ mov r9d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov r7d, [fg_dataq+FGData.scaling_shift]
+ mov sbyd, sbym
+ vpbroadcastd m6, r9m
+ shr r6d, 11 ; is_12bpc
+ vbroadcasti32x4 m7, [base+scale_mask]
+ shlx r10d, r9d, r6d
+ vpbroadcastd m10, [base+scale_shift+r7*4-32]
+ lea r9d, [r6+r9*4]
+ vpbroadcastd m8, [base+fg_min+r10*4]
+ kxnorw k1, k1, k1 ; 0xffff
+ vpbroadcastd m9, [base+fg_max+r9*4]
+ mov r12, 0xeeeeeeeeeeeeeeee
+ vpbroadcastd m19, [base+scale_rnd+r6*4]
+ kshiftrb k2, k1, 4 ; 0xf
+ vpbroadcastq xm20, [base+pw_27_17_17_27+r6*8]
+ kmovq k3, r12
+ vpbroadcastd m11, [base+scale_shift+r6*8+4]
+ test sbyd, sbyd
+ setnz r7b
+ vpbroadcastd m12, [base+pw_27_17_17_27+r6*8+0]
+ vpbroadcastd m13, [base+pw_27_17_17_27+r6*8+4]
+ test r7b, [fg_dataq+FGData.overlap_flag]
+ jnz .v_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+ lea src_bakq, [srcq+wq*2]
+ neg wq
+ sub dstq, srcq
+
+.loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyd, [offyq+offxq*2+747] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
+ sby, see, offxy, src_bak
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y:
+ movu m4, [grain_lutq+offxyq*2+82*0]
+ movu m5, [grain_lutq+offxyq*2+82*2]
+ call .add_noise
+ sub hb, 2
+ jg .loop_y
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq*2]
+ cmp byte [fg_dataq+FGData.overlap_flag], 0
+ je .loop_x
+ test sbyd, sbyd
+ jnz .hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+.loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
+ sby, see, offy, src_bak, left_offxy
+
+ lea left_offxyd, [offyq+73] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164
+ lea offyd, [offyq+offxq*2+747] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
+ sby, see, offxy, src_bak, left_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+.loop_y_h_overlap:
+ movu m4, [grain_lutq+offxyq*2+82*0]
+ movu m5, [grain_lutq+offxyq*2+82*2]
+ movd xm17, [grain_lutq+left_offxyq*2-82*1]
+ pinsrd xm17, [grain_lutq+left_offxyq*2+82*1], 1
+ punpckldq xm16, xm4, xm5
+ punpcklwd xm17, xm16
+ mova xm16, xm19
+ vpdpwssd xm16, xm20, xm17
+ psrad xm16, 1
+ packssdw xm16, xm16
+ vpsravw xm16, xm11
+ vmovdqu8 m4{k2}, m16
+ vpalignr m5{k2}, m16, m16, 4
+ call .add_noise
+ sub hb, 2
+ jg .loop_y_h_overlap
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq*2]
+ test sbyd, sbyd
+ jnz .hv_overlap
+ jmp .loop_x_h_overlap
+
+.v_overlap:
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+ lea src_bakq, [srcq+wq*2]
+ neg wq
+ sub dstq, srcq
+
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
+ sby, see, offy, src_bak, _, top_offxy
+
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*2+0x10001*747+32*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
+ sby, see, offxy, src_bak, _, top_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+ movu m16, [grain_lutq+offxyq*2+82*0]
+ movu m0, [grain_lutq+top_offxyq*2+82*0]
+ movu m17, [grain_lutq+offxyq*2+82*2]
+ movu m1, [grain_lutq+top_offxyq*2+82*2]
+ punpckhwd m4, m0, m16
+ punpcklwd m0, m16
+ punpckhwd m5, m1, m17
+ punpcklwd m1, m17
+ call .add_noise_v
+ sub hb, 2
+ jg .loop_y
+ add wq, 32
+ jge .end
+ lea srcq, [src_bakq+wq*2]
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump back
+ ; to .v_overlap, and instead always fall-through to .hv_overlap
+.hv_overlap:
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \
+ sby, see, offy, src_bak, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyd, [top_offxyq+73]
+ lea left_offxyd, [offyq+73]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*2+0x10001*747+32*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \
+ sby, see, offxy, src_bak, left_offxy, top_offxy, topleft_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+ movu m5, [grain_lutq+offxyq*2+82*0]
+ movu m0, [grain_lutq+top_offxyq*2+82*0]
+ movd xm17, [grain_lutq+left_offxyq*2-82*1]
+ pinsrd xm17, [grain_lutq+topleft_offxyq*2-82*1], 1
+ movu m2, [grain_lutq+offxyq*2+82*2]
+ movu m1, [grain_lutq+top_offxyq*2+82*2]
+ movd xm18, [grain_lutq+left_offxyq*2+82*1]
+ pinsrd xm18, [grain_lutq+topleft_offxyq*2+82*1], 1
+ punpckldq xm16, xm5, xm0
+ punpcklwd xm17, xm16
+ mova xm16, xm19
+ vpdpwssd xm16, xm20, xm17
+ punpckldq xm17, xm2, xm1
+ punpcklwd xm18, xm17
+ mova xm17, xm19
+ vpdpwssd xm17, xm20, xm18
+ punpckhwd m4, m0, m5
+ punpcklwd m0, m5
+ punpckhwd m5, m1, m2
+ punpcklwd m1, m2
+ psrad xm16, 1
+ psrad xm17, 1
+ packssdw xm16, xm17
+ vpsravw xm16, xm11
+ vpshuflw m0{k2}, m16, q1302
+ punpckhqdq xm16, xm16
+ vpshuflw m1{k2}, m16, q1302
+ call .add_noise_v
+ sub hb, 2
+ jg .loop_y_h_overlap
+ add wq, 32
+ lea srcq, [src_bakq+wq*2]
+ jl .hv_overlap
+.end:
+ RET
+ALIGN function_align
+.add_noise_v:
+ mova m2, m19
+ vpdpwssd m2, m12, m4
+ mova m3, m19
+ vpdpwssd m3, m13, m5
+ mova m4, m19
+ vpdpwssd m4, m12, m0
+ mova m5, m19
+ vpdpwssd m5, m13, m1
+ REPX {psrad x, 1}, m2, m3, m4, m5
+ packssdw m4, m2
+ packssdw m5, m3
+ vpsravw m4, m11
+ vpsravw m5, m11
+.add_noise:
+ mova m0, [srcq+strideq*0]
+ mova m1, [srcq+strideq*1]
+ kmovw k4, k1
+ pand m16, m6, m0
+ psrld m3, m0, 16
+ vpgatherdd m2{k4}, [scalingq+m16]
+ vpcmpud k4, m3, m6, 2 ; px <= bdmax
+ vpgatherdd m16{k4}, [scalingq+m3]
+ kmovw k4, k1
+ pand m17, m6, m1
+ vpgatherdd m3{k4}, [scalingq+m17]
+ vpshufb m2{k3}, m16, m7
+ psrld m16, m1, 16
+ vpcmpud k4, m16, m6, 2
+ vpgatherdd m17{k4}, [scalingq+m16]
+ vpshufb m3{k3}, m17, m7
+ vpsllvw m2, m10
+ vpsllvw m3, m10
+ pmulhrsw m4, m2
+ pmulhrsw m5, m3
+ add grain_lutq, 82*4
+ paddw m0, m4
+ paddw m1, m5
+ pmaxsw m0, m8
+ pmaxsw m1, m8
+ pminsw m0, m9
+ pminsw m1, m9
+ mova [dstq+srcq], m0
+ add srcq, strideq
+ mova [dstq+srcq], m1
+ add srcq, strideq
+ ret
+
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 22, dst, src, stride, fg_data, w, scaling, \
+ grain_lut, h, sby, luma, lstride, uv_pl, is_id
+%define base r12-fg_min
+ lea r12, [fg_min]
+ mov r9d, r13m ; bdmax
+ mov r7d, [fg_dataq+FGData.scaling_shift]
+ mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
+ mov r11d, is_idm
+ kxnorw k1, k1, k1 ; 0xffff
+ vpbroadcastd m5, r13m
+ mov r13, 0xeeeeeeeeeeeeeeee
+ vbroadcasti32x4 m6, [base+scale_mask]
+ shr r9d, 11 ; is_12bpc
+ vpbroadcastd m7, [base+scale_shift+r7*4-32]
+ shlx r10d, r6d, r9d
+ mov sbyd, sbym
+ shlx r6d, r6d, r11d
+ vpbroadcastd m8, [base+fg_min+r10*4]
+ lea r6d, [r9+r6*2]
+ vpbroadcastd m9, [base+fg_max+r6*4]
+ kmovq k2, r13
+ vpbroadcastd m20, [base+scale_rnd+r9*4]
+ packssdw m4, m5, m5
+ vpbroadcastd m21, [base+scale_shift+r9*8+4]
+%if %2
+ mova m12, [base+pb_0to63] ; pw_even
+ mov r13d, 0x0101
+ vpbroadcastq m10, [base+pw_23_22+r9*8]
+ kmovw k3, r13d
+%if %3
+ pshufd m11, m10, q0000
+%else
+ vpbroadcastd ym16, [base+pw_27_17_17_27+r9*8+0]
+ vpbroadcastd m11, [base+pw_27_17_17_27+r9*8+4]
+ vmovdqu16 m11{k1}, m16
+%endif
+ psrlw m13, m12, 8 ; pw_odd
+%else
+ vpbroadcastq m10, [base+pw_27_17_17_27+r9*8]
+ kshiftrb k3, k1, 7 ; 0x01
+ kshiftrb k4, k1, 4 ; 0x0f
+ pshufd m11, m10, q0000
+%endif
+ mov lstrideq, r10mp
+ test sbyd, sbyd
+ setnz r7b
+ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
+ jne .csfl
+
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ _, sby, see, lstride
+
+%if %1
+ mov r6d, r11m
+ vpbroadcastd m0, [base+uv_offset_mul+r9*4]
+ vpbroadcastd m1, [base+pb_8_9_0_1]
+ vpbroadcastd m14, [fg_dataq+FGData.uv_offset+r6*4]
+ vbroadcasti32x4 m15, [fg_dataq+FGData.uv_mult+r6*4]
+ pmaddwd m14, m0
+ pshufb m15, m1 ; { uv_luma_mult, uv_mult }
+%endif
+ test r7b, [fg_dataq+FGData.overlap_flag]
+ jnz %%v_overlap
+
+ imul seed, sbyd, (173 << 24) | 37
+ add seed, (105 << 24) | 178
+ rorx seed, seed, 24
+ movzx seed, seew
+ xor seed, [fg_dataq+FGData.seed]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, lstride, luma
+
+ mov lumaq, r9mp
+ lea r12, [srcq+wq*2]
+ lea r13, [dstq+wq*2]
+ lea r14, [lumaq+wq*(2<<%2)]
+ mov r9mp, r12
+ mov r10mp, r13
+ mov r11mp, r14
+ neg wq
+
+%%loop_x:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, lstride, luma
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y:
+%if %2
+ movu ym18, [grain_lutq+offxyq*2+82*0]
+ vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1
+ movu ym19, [grain_lutq+offxyq*2+82*4]
+ vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
+%else
+ movu m18, [grain_lutq+offxyq*2+82*0]
+ movu m19, [grain_lutq+offxyq*2+82*2]
+%endif
+ call %%add_noise
+ sub hb, 2<<%2
+ jg %%loop_y
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r10mp
+ mov lumaq, r11mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+ cmp byte [fg_dataq+FGData.overlap_flag], 0
+ je %%loop_x
+ cmp dword r8m, 0 ; sby
+ jne %%hv_overlap
+
+ ; horizontal overlap (without vertical overlap)
+%%loop_x_h_overlap:
+ rorx r6, seeq, 1
+ or seed, 0xeff4
+ test seeb, seeh
+ lea seed, [r6+0x8000]
+ cmovp seed, r6d ; updated seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, lstride, luma, left_offxy
+
+ lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx
+ rorx offyd, seed, 8
+ rorx offxq, seeq, 12
+ and offyd, 0xf
+ imul offyd, 164>>%3
+ lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, lstride, luma, left_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+%%loop_y_h_overlap:
+%if %2
+ movu ym18, [grain_lutq+offxyq*2+82*0]
+ vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1
+ movu ym19, [grain_lutq+offxyq*2+82*4]
+ vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
+ movd xm16, [grain_lutq+left_offxyq*2+82*0]
+ vinserti32x4 m16, [grain_lutq+left_offxyq*2+82*2], 2
+ movd xm17, [grain_lutq+left_offxyq*2+82*4]
+ vinserti32x4 m17, [grain_lutq+left_offxyq*2+82*6], 2
+ punpckldq m16, m17
+ punpckldq m17, m18, m19
+ punpcklwd m16, m17
+ mova m17, m20
+ vpdpwssd m17, m16, m10
+ psrad m17, 1
+ packssdw m17, m17
+ vpsravw m17, m21
+%else
+ movu m18, [grain_lutq+offxyq*2+82*0]
+ movu m19, [grain_lutq+offxyq*2+82*2]
+ movd xm16, [grain_lutq+left_offxyq*2+82*0]
+ pinsrd xm16, [grain_lutq+left_offxyq*2+82*2], 1
+ punpckldq xm17, xm18, xm19
+ punpcklwd xm16, xm17
+ mova xm17, xm20
+ vpdpwssd xm17, xm16, xm10
+ psrad xm17, 1
+ packssdw xm17, xm17
+ vpsravw xm17, xm21
+%endif
+ vmovdqa32 m18{k3}, m17
+ vpshufd m19{k3}, m17, q0321
+ call %%add_noise
+ sub hb, 2<<%2
+ jg %%loop_y_h_overlap
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r10mp
+ mov lumaq, r11mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+ cmp dword r8m, 0 ; sby
+ jne %%hv_overlap
+ jmp %%loop_x_h_overlap
+
+%%v_overlap:
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ _, sby, see, lstride
+
+ movzx sbyd, sbyb
+ imul seed, [fg_dataq+FGData.seed], 0x00010001
+ imul r7d, sbyd, 173 * 0x00010001
+ imul sbyd, 37 * 0x01000100
+ add r7d, (105 << 16) | 188
+ add sbyd, (178 << 24) | (141 << 8)
+ and r7d, 0x00ff00ff
+ and sbyd, 0xff00ff00
+ xor seed, r7d
+ xor seed, sbyd ; (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, lstride, luma, _, top_offxy
+
+ mov lumaq, r9mp
+ lea r12, [srcq+wq*2]
+ lea r13, [dstq+wq*2]
+ lea r14, [lumaq+wq*(2<<%2)]
+ mov r9mp, r12
+ mov r10mp, r13
+ mov r11mp, r14
+ neg wq
+
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, lstride, luma, _, top_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+%if %3
+ movu ym16, [grain_lutq+offxyq*2+82*0]
+ movu ym1, [grain_lutq+top_offxyq*2+82*0]
+ vbroadcasti32x8 m18, [grain_lutq+offxyq*2+82*2]
+ movu ym19, [grain_lutq+offxyq*2+82*4]
+ vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
+ punpcklwd ym17, ym1, ym16
+ punpckhwd ym1, ym16
+%elif %2
+ movu ym18, [grain_lutq+offxyq*2+82*0]
+ vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1
+ movu ym17, [grain_lutq+top_offxyq*2+82*0]
+ vinserti32x8 m17, [grain_lutq+top_offxyq*2+82*2], 1
+ movu ym19, [grain_lutq+offxyq*2+82*4]
+ vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
+ punpcklwd m16, m17, m18
+ punpckhwd m17, m18
+%else
+ movu m18, [grain_lutq+offxyq*2+82*0]
+ movu m19, [grain_lutq+top_offxyq*2+82*0]
+ movu m2, [grain_lutq+offxyq*2+82*2]
+ movu m16, [grain_lutq+top_offxyq*2+82*2]
+ punpckhwd m1, m19, m18
+ punpcklwd m19, m18
+ punpckhwd m18, m2, m16
+ punpcklwd m2, m16
+%endif
+ call %%add_noise_v
+ sub hb, 2<<%2
+ jg %%loop_y
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r10mp
+ mov lumaq, r11mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+
+ ; since fg_dataq.overlap is guaranteed to be set, we never jump back
+ ; to %%v_overlap, and instead always fall-through to %%hv_overlap
+%%hv_overlap:
+ ; we assume from the block above that bits 8-15 of r7d are zero'ed
+ mov r6d, seed
+ or seed, 0xeff4eff4
+ test seeb, seeh
+ setp r7b ; parity of top_seed
+ shr seed, 16
+ shl r7d, 16
+ test seeb, seeh
+ setp r7b ; parity of cur_seed
+ or r6d, 0x00010001
+ xor r7d, r6d
+ rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ offx, offy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy
+
+ lea topleft_offxyq, [top_offxyq+(32>>%2)]
+ lea left_offxyq, [offyq+(32>>%2)]
+ rorx offyd, seed, 8
+ rorx offxd, seed, 12
+ and offyd, 0xf000f
+ and offxd, 0xf000f
+ imul offyd, 164>>%3
+ ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
+ lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
+
+ DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \
+ h, offxy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy
+
+ mov grain_lutq, grain_lutmp
+ mov hd, hm
+ movzx top_offxyd, offxyw
+ shr offxyd, 16
+
+ ; grain = grain_lut[offy+y][offx+x]
+%if %2
+ movd xm16, [grain_lutq+left_offxyq*2+82*0]
+ vinserti32x4 m16, [grain_lutq+left_offxyq*2+82*2], 2
+ movd xm17, [grain_lutq+left_offxyq*2+82*4]
+ vinserti32x4 m17, [grain_lutq+left_offxyq*2+82*6], 2
+ movu ym18, [grain_lutq+offxyq*2+82*0]
+ vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1
+ movu ym19, [grain_lutq+offxyq*2+82*4]
+ vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1
+ punpckldq m16, m17
+ punpckldq m17, m18, m19
+ punpcklwd m16, m17
+ movu ym1, [grain_lutq+top_offxyq*2+82*0]
+ movd xm17, [grain_lutq+topleft_offxyq*2+82*0]
+ mova m0, m20
+ vpdpwssd m0, m16, m10
+%if %3
+ punpcklwd xm17, xm1
+ mova xm16, xm20
+ vpdpwssd xm16, xm17, xm10
+ psrad xm16, 1
+%else
+ vinserti32x8 m1, [grain_lutq+top_offxyq*2+82*2], 1
+ vinserti32x4 m17, [grain_lutq+topleft_offxyq*2+82*2], 2
+ punpcklwd m17, m1
+ mova m16, m20
+ vpdpwssd m16, m17, m10
+ psrad m16, 1
+%endif
+ psrad m0, 1
+ packssdw m0, m16
+ vpsravw m0, m21
+ vmovdqa32 m18{k3}, m0
+ vpshufd m19{k3}, m0, q0321
+%if %3
+ vpunpckhdq ym1{k3}, ym0, ym0
+ punpcklwd ym17, ym1, ym18
+ punpckhwd ym1, ym18
+%else
+ vpunpckhdq m1{k3}, m0, m0
+ punpcklwd m16, m1, m18
+ punpckhwd m17, m1, m18
+%endif
+%else
+ movu m18, [grain_lutq+offxyq*2+82*0]
+ movu m19, [grain_lutq+top_offxyq*2+82*0]
+ movd xm17, [grain_lutq+left_offxyq*2+82*0]
+ pinsrd xm17, [grain_lutq+topleft_offxyq*2+82*0], 1
+ punpckldq xm16, xm18, xm19
+ punpcklwd xm17, xm16
+ movu m2, [grain_lutq+offxyq*2+82*2]
+ movu m0, [grain_lutq+top_offxyq*2+82*2]
+ movd xm16, [grain_lutq+left_offxyq*2+82*2]
+ pinsrd xm16, [grain_lutq+topleft_offxyq*2+82*2], 1
+ punpckldq xm1, xm2, xm0
+ punpcklwd xm1, xm16, xm1
+ mova xm16, xm20
+ vpdpwssd xm16, xm17, xm10
+ mova xm17, xm20
+ vpdpwssd xm17, xm1, xm10
+ punpckhwd m1, m19, m18
+ punpcklwd m19, m18
+ punpckhwd m18, m2, m0
+ punpcklwd m2, m0
+ psrad xm16, 1
+ psrad xm17, 1
+ packssdw xm16, xm17
+ vpsravw xm16, xm21
+ vpshuflw m19{k4}, m16, q1302
+ punpckhqdq xm16, xm16
+ vpshuflw m2{k4}, m16, q3120
+%endif
+ call %%add_noise_v
+ sub hb, 2<<%2
+ jg %%loop_y_h_overlap
+ add wq, 32>>%2
+ jge .end
+ mov srcq, r9mp
+ mov dstq, r10mp
+ mov lumaq, r11mp
+ lea srcq, [srcq+wq*2]
+ lea dstq, [dstq+wq*2]
+ lea lumaq, [lumaq+wq*(2<<%2)]
+ jmp %%hv_overlap
+
+ALIGN function_align
+%%add_noise_v:
+%if %3
+ mova ym16, ym20
+ vpdpwssd ym16, ym17, ym11
+ mova ym17, ym20
+ vpdpwssd ym17, ym1, ym11
+ psrad ym16, 1
+ psrad ym17, 1
+ packssdw ym16, ym17
+ vpsravw m18{k1}, m16, m21
+%elif %2
+ mova m18, m20
+ vpdpwssd m18, m16, m11
+ mova m16, m20
+ vpdpwssd m16, m17, m11
+ psrad m18, 1
+ psrad m16, 1
+ packssdw m18, m16
+ vpsravw m18, m21
+%else
+ mova m16, m20
+ vpdpwssd m16, m1, m11
+ mova m17, m20
+ vpdpwssd m17, m18, m11
+ mova m18, m20
+ vpdpwssd m18, m19, m11
+ mova m19, m20
+ vpdpwssd m19, m2, m11
+ REPX {psrad x, 1}, m16, m17, m18, m19
+ packssdw m18, m16
+ packssdw m19, m17
+ vpsravw m18, m21
+ vpsravw m19, m21
+%endif
+%%add_noise:
+%if %2
+ mova m2, [lumaq+lstrideq*(0<<%3)]
+ mova m0, [lumaq+lstrideq*(1<<%3)]
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+ mova m3, [lumaq+lstrideq*(0<<%3)]
+ mova m1, [lumaq+lstrideq*(1<<%3)]
+ mova m16, m12
+ vpermi2w m16, m2, m0
+ vpermt2w m2, m13, m0
+ mova m17, m12
+ vpermi2w m17, m3, m1
+ vpermt2w m3, m13, m1
+ pavgw m2, m16
+ pavgw m3, m17
+%elif %1
+ mova m2, [lumaq+lstrideq*0]
+ mova m3, [lumaq+lstrideq*1]
+%endif
+%if %2
+ mova ym16, [srcq+strideq*0]
+ vinserti32x8 m16, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+%else
+ mova m16, [srcq+strideq*0]
+%endif
+%if %1
+ punpckhwd m17, m2, m16
+ mova m0, m14
+ vpdpwssd m0, m17, m15
+ punpcklwd m17, m2, m16
+ mova m2, m14
+ vpdpwssd m2, m17, m15
+%endif
+%if %2
+ mova ym17, [srcq+strideq*0]
+ vinserti32x8 m17, [srcq+strideq*1], 1
+%else
+ mova m17, [srcq+strideq*1]
+%endif
+%if %1
+ psrad m0, 6
+ psrad m2, 6
+ packusdw m2, m0
+ punpckhwd m0, m3, m17
+ mova m1, m14
+ vpdpwssd m1, m15, m0
+ punpcklwd m0, m3, m17
+ mova m3, m14
+ vpdpwssd m3, m15, m0
+ psrad m1, 6
+ psrad m3, 6
+ packusdw m3, m1
+ pminuw m2, m4
+ pminuw m3, m4
+
+.add_noise_main:
+ ; scaling[luma_src]
+ kmovw k5, k1
+ pand m1, m5, m2
+ vpgatherdd m0{k5}, [scalingq+m1]
+ kmovw k5, k1
+ psrld m2, 16
+ vpgatherdd m1{k5}, [scalingq+m2]
+ vpshufb m0{k2}, m1, m6
+ kmovw k5, k1
+ psrld m1, m3, 16
+ vpgatherdd m2{k5}, [scalingq+m1]
+ kmovw k5, k1
+ pand m3, m5
+ vpgatherdd m1{k5}, [scalingq+m3]
+ vpshufb m1{k2}, m2, m6
+
+ ; noise = round2(scaling[luma_src] * grain, scaling_shift)
+ vpsllvw m0, m7
+ vpsllvw m1, m7
+ pmulhrsw m18, m0
+ pmulhrsw m19, m1
+ add grain_lutq, 82*(4<<%2)
+ lea lumaq, [lumaq+lstrideq*(2<<%3)]
+ lea srcq, [srcq+strideq*2]
+ paddw m16, m18
+ paddw m17, m19
+ pmaxsw m16, m8
+ pmaxsw m17, m8
+ pminsw m16, m9
+ pminsw m17, m9
+%if %2
+ mova [dstq+strideq*0], ym16
+ vextracti32x8 [dstq+strideq*1], m16, 1
+ lea dstq, [dstq+strideq*2]
+ mova [dstq+strideq*0], ym17
+ vextracti32x8 [dstq+strideq*1], m17, 1
+%else
+ mova [dstq+strideq*0], m16
+ mova [dstq+strideq*1], m17
+%endif
+ lea dstq, [dstq+strideq*2]
+ ret
+%else
+%if %2
+ pand m2, m4
+ pand m3, m4
+%else
+ pand m2, m4, [lumaq+lstrideq*0]
+ pand m3, m4, [lumaq+lstrideq*1]
+%endif
+ jmp .add_noise_main
+%endif
+%endmacro
+
+ %%FGUV_32x32xN_LOOP 1, %2, %3
+.csfl:
+ %%FGUV_32x32xN_LOOP 0, %2, %3
+.end:
+ RET
+%endmacro
+
+FGUV_FN 420, 1, 1
+FGUV_FN 422, 1, 0
+FGUV_FN 444, 0, 0
+
+%endif
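
The h_overlap/v_overlap paths in the new file (the vpdpwssd blocks fed by pw_27_17_17_27 and pw_23_22) blend the current block's grain with grain from the previous column or row before it is scaled and added. In scalar terms the blend is the spec's weighted sum (reference-style sketch; the packed dword constants above additionally fold in the bit-depth-dependent rounding that the asm finishes with psrad/vpsravw):

    /* AV1 overlap weights: {27,17}/{17,27} across the two overlapping
     * luma rows or columns, {23,22} for a single subsampled chroma row.
     * 27^2 + 17^2 and 23^2 + 22^2 are both close to 32^2, so grain
     * variance is roughly preserved after the >>5 normalization. */
    static int blend_grain(const int old_g, const int new_g,
                           const int w_old, const int w_new,
                           const int grain_min, const int grain_max)
    {
        const int g = (old_g * w_old + new_g * w_new + 16) >> 5;
        return g < grain_min ? grain_min : g > grain_max ? grain_max : g;
    }
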
diff --git a/src/x86/filmgrain_avx2.asm b/src/x86/filmgrain_avx2.asm
index 7da8105..55445cf 100644
--- a/src/x86/filmgrain_avx2.asm
+++ b/src/x86/filmgrain_avx2.asm
@@ -1488,7 +1488,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
jg %%loop_y
add wq, 32>>%2
- jge %%end
+ jge .end
mov srcq, r11mp
mov dstq, r12mp
lea lumaq, [r14+wq*(1+%2)]
@@ -1647,7 +1647,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
jg %%loop_y_h_overlap
add wq, 32>>%2
- jge %%end
+ jge .end
mov srcq, r11mp
mov dstq, r12mp
lea lumaq, [r14+wq*(1+%2)]
@@ -1852,7 +1852,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
%%end_y_v_overlap:
add wq, 32>>%2
- jge %%end
+ jge .end
mov srcq, r11mp
mov dstq, r12mp
lea lumaq, [r14+wq*(1+%2)]
@@ -2081,20 +2081,20 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
%%end_y_hv_overlap:
add wq, 32>>%2
- jge %%end
+ jge .end
mov srcq, r11mp
mov dstq, r12mp
lea lumaq, [r14+wq*(1+%2)]
add srcq, wq
add dstq, wq
jmp %%loop_x_hv_overlap
-%%end:
- RET
%endmacro
%%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
%%FGUV_32x32xN_LOOP 0, %2, %3
+.end:
+ RET
%endmacro
GEN_GRAIN_UV_FN 420, 1, 1
diff --git a/src/x86/filmgrain_init_tmpl.c b/src/x86/filmgrain_init_tmpl.c
index 0b783d1..1c91d2a 100644
--- a/src/x86/filmgrain_init_tmpl.c
+++ b/src/x86/filmgrain_init_tmpl.c
@@ -71,7 +71,6 @@ COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx2);
}
-#if BITDEPTH == 8
if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
c->fgy_32x32xn = BF(dav1d_fgy_32x32xn, avx512icl);
@@ -79,5 +78,4 @@ COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_fguv_32x32xn_i422, avx512icl);
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_fguv_32x32xn_i444, avx512icl);
#endif
-#endif
}