github.com/videolan/dav1d.git
author    Henrik Gramner <gramner@twoorioles.com>  2022-06-02 21:29:38 +0300
committer Henrik Gramner <henrik@gramner.com>      2022-06-03 00:58:39 +0300
commit    0cfb03cd6eddbaf350b17e06af6fa956e3455a00 (patch)
tree      2f4f513ce033da2a3a96ee9134414ee9f351a629
parent    b4f9eac85803303760f90ebad49de068cc2663d7 (diff)
x86: Add a workaround for quirky AVX-512 hardware behavior

On Intel CPUs, certain AVX-512 shuffle instructions incorrectly flag the upper halves of YMM registers as in use when writing to XMM registers, which may cause AVX/SSE state transitions.

This behavior is not documented and only occurs on physical hardware, not under the Intel SDE, so it appears to be a hardware bug.

Work around the issue by using EVEX-only registers (xmm16-xmm31) as destinations. This avoids the problem at the cost of a slightly larger code size.
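For illustration only (not part of the patch): the quirk is triggered when the destination of one of these shuffles is a VEX-encodable register (xmm0-xmm15), which are the registers tracked by the legacy SSE/AVX transition state. Registers xmm16-xmm31 can only be addressed with an EVEX prefix and are outside that tracked range, so moving the destination there sidesteps the issue. As far as I can tell, this is also why the patch mechanically renames raw xmmN operands to the x86inc macro names xmN (or a spare higher register): inside INIT_ZMM avx512icl functions, x86inc's register permutation maps the low macro registers to zmm16+ where possible, whereas a literal xmm1 always means architectural register 1. A minimal sketch of the idea, in plain NASM syntax with illustrative register numbers:

    ; Both forms are AVX-512 instructions; only the destination register differs.
    vextracti32x4 xmm1,  zmm0, 2  ; dst xmm1 is in the VEX-encodable range; on
                                  ; affected CPUs this may spuriously mark ymm1's
                                  ; upper half as dirty, causing AVX/SSE
                                  ; transition penalties later
    vextracti32x4 xmm17, zmm0, 2  ; dst xmm17 is EVEX-only and outside the
                                  ; tracked xmm0-xmm15 range, so the spurious
                                  ; "dirty upper half" state cannot occur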
-rw-r--r--  src/x86/ipred16_avx512.asm |  66
-rw-r--r--  src/x86/ipred_avx512.asm   |  92
-rw-r--r--  src/x86/mc16_avx512.asm    | 144
-rw-r--r--  src/x86/mc_avx512.asm      | 166
4 files changed, 234 insertions(+), 234 deletions(-)
diff --git a/src/x86/ipred16_avx512.asm b/src/x86/ipred16_avx512.asm
index 4a1b060..1a307ad 100644
--- a/src/x86/ipred16_avx512.asm
+++ b/src/x86/ipred16_avx512.asm
@@ -114,20 +114,20 @@ cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, w, h
vbroadcasti32x4 m2, [tlq]
pshufb m2, m7 ; left
PAETH 4, 5, 6
- vextracti32x4 xmm1, m0, 2
- vextracti32x4 xmm2, ym0, 1
- vextracti32x4 xmm3, m0, 3
+ vextracti32x4 xm1, m0, 2
+ vextracti32x4 xm8, ym0, 1
+ vextracti32x4 xm9, m0, 3
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+r6 ], xmm3
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm8
+ movq [dstq+r6 ], xm9
sub hd, 8
jl .w4_end
lea dstq, [dstq+strideq*4]
movhps [dstq+strideq*0], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+r6 ], xmm3
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm8
+ movhps [dstq+r6 ], xm9
lea dstq, [dstq+strideq*4]
jg .w4_loop
.w4_end:
@@ -220,19 +220,19 @@ cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
pshufb m3, m4
pmulhrsw m3, m5
paddw m3, m6
- vextracti32x4 xmm0, m3, 3
- vextracti32x4 xmm1, ym3, 1
- vextracti32x4 xmm2, m3, 2
- movhps [dstq+strideq*0], xmm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
+ vextracti32x4 xm0, m3, 3
+ vextracti32x4 xm1, ym3, 1
+ vextracti32x4 xm2, m3, 2
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
movhps [dstq+stride3q ], xm3
add hq, 8
jg .end
lea dstq, [dstq+strideq*4]
- movq [dstq+strideq*0], xmm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
movq [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
jl .w4_loop
@@ -337,20 +337,20 @@ cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl, w, h, stride3
psubw m0, m6 ; left - right
pmulhrsw m0, m5
paddw m0, m6
- vextracti32x4 xmm1, m0, 2
- vextracti32x4 xmm2, ym0, 1
- vextracti32x4 xmm3, m0, 3
+ vextracti32x4 xm1, m0, 2
+ vextracti32x4 xm2, ym0, 1
+ vextracti32x4 xm3, m0, 3
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
sub hd, 8*2
jl .end
lea dstq, [dstq+strideq*4]
movhps [dstq+strideq*0], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
jg .w4_loop
.end:
@@ -472,11 +472,11 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3
vpdpwssd m0, m1, m6
vpermb m0, m14, m0
pavgw ym0, ym15
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
lea dstq, [dstq+strideq*4]
add v_weightsq, 4*4
sub hd, 4*2
@@ -624,11 +624,11 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
pmovzxbw ym0, [idxq]
add idxq, 16
vpermw ym0, ym0, ym3
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
- movq [dstq+strideq*2], xmm1
- movhps [dstq+stride3q ], xmm1
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w4
diff --git a/src/x86/ipred_avx512.asm b/src/x86/ipred_avx512.asm
index 050ec9b..38c86b5 100644
--- a/src/x86/ipred_avx512.asm
+++ b/src/x86/ipred_avx512.asm
@@ -242,9 +242,9 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
jmp wq
.w8:
movq xmm1, [tlq+1]
- vextracti32x4 xmm2, ym0, 1
+ vextracti32x4 xm2, ym0, 1
vpdpbusd xm0, xmm1, xm3
- paddd xmm2, xm0
+ paddd xmm2, xm2, xm0
punpckhqdq xmm0, xmm2, xmm2
paddd xmm0, xmm2
psrlq xmm1, xmm0, 32
@@ -275,9 +275,9 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
jmp wq
.w16:
movu xmm1, [tlq+1]
- vextracti32x4 xmm2, ym0, 1
+ vextracti32x4 xm2, ym0, 1
vpdpbusd xm0, xmm1, xm3
- paddd xmm2, xm0
+ paddd xmm2, xm2, xm0
punpckhqdq xmm0, xmm2, xmm2
paddd xmm0, xmm2
psrlq xmm1, xmm0, 32
@@ -309,8 +309,8 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
.w32:
movu ym1, [tlq+1]
vpdpbusd ym0, ym1, ym3
- vextracti32x4 xmm1, ym0, 1
- paddd xmm1, xm0
+ vextracti32x4 xm1, ym0, 1
+ paddd xmm1, xm1, xm0
punpckhqdq xmm0, xmm1, xmm1
paddd xmm0, xmm1
psrlq xmm1, xmm0, 32
@@ -345,8 +345,8 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3
movu ym2, [tlq+33]
vpdpbusd ym0, ym1, ym3
vpdpbusd ym0, ym2, ym3
- vextracti32x4 xmm1, ym0, 1
- paddd xmm1, xm0
+ vextracti32x4 xm1, ym0, 1
+ paddd xmm1, xm1, xm0
punpckhqdq xmm0, xmm1, xmm1
paddd xmm0, xmm1
psrlq xmm1, xmm0, 32
@@ -524,12 +524,12 @@ INIT_YMM avx512icl
pextrd [dstq+stride3q ], xm0, 3
sub hd, 8
jl .w4_ret
- vextracti32x4 xmm0, m0, 1
+ vextracti32x4 xm0, m0, 1
lea dstq, [dstq+strideq*4]
- movd [dstq+strideq*0], xmm0
- pextrd [dstq+strideq*1], xmm0, 1
- pextrd [dstq+strideq*2], xmm0, 2
- pextrd [dstq+stride3q ], xmm0, 3
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+stride3q ], xm0, 3
lea dstq, [dstq+strideq*4]
jg .w4_loop
.w4_ret:
@@ -545,20 +545,20 @@ INIT_ZMM avx512icl
vpbroadcastq m4, [tlq+hq-8]
pshufb m4, m9
PAETH
- vextracti32x4 xmm1, m0, 2
- vextracti32x4 xmm2, ym0, 1
- vextracti32x4 xmm3, m0, 3
+ vextracti32x4 xm1, m0, 2
+ vextracti32x4 xm2, ym0, 1
+ vextracti32x4 xm3, m0, 3
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
sub hd, 8
jl .w8_ret
lea dstq, [dstq+strideq*4]
movhps [dstq+strideq*0], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
jg .w8_loop
.w8_ret:
@@ -639,18 +639,18 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
pmaddubsw m0, m2, m0
paddw m0, m3
vpermb m0, m6, m0
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movd [dstq+strideq*0], xm0
- movd [dstq+strideq*1], xmm1
+ movd [dstq+strideq*1], xm1
pextrd [dstq+strideq*2], xm0, 2
- pextrd [dstq+stride3q ], xmm1, 2
+ pextrd [dstq+stride3q ], xm1, 2
add hq, 8
jg .ret
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 1
- pextrd [dstq+strideq*1], xmm1, 1
+ pextrd [dstq+strideq*1], xm1, 1
pextrd [dstq+strideq*2], xm0, 3
- pextrd [dstq+stride3q ], xmm1, 3
+ pextrd [dstq+stride3q ], xm1, 3
lea dstq, [dstq+strideq*4]
jl .w4_loop
.ret:
@@ -669,11 +669,11 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3
pmaddubsw m0, m2, m0
paddw m0, m3
vpermb m0, m6, m0
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
lea dstq, [dstq+strideq*4]
add hq, 4
jl .w8_loop
@@ -785,18 +785,18 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3
paddw m0, m2
paddw m0, m1
vpermb m0, m8, m0
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movd [dstq+strideq*0], xm0
- movd [dstq+strideq*1], xmm1
+ movd [dstq+strideq*1], xm1
pextrd [dstq+strideq*2], xm0, 2
- pextrd [dstq+stride3q ], xmm1, 2
+ pextrd [dstq+stride3q ], xm1, 2
sub hd, 8
jl .ret
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 1
- pextrd [dstq+strideq*1], xmm1, 1
+ pextrd [dstq+strideq*1], xm1, 1
pextrd [dstq+strideq*2], xm0, 3
- pextrd [dstq+stride3q ], xmm1, 3
+ pextrd [dstq+stride3q ], xm1, 3
lea dstq, [dstq+strideq*4]
jg .w4_loop
.ret:
@@ -815,11 +815,11 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3
paddw m0, m2
paddw m0, m1
vpermb m0, m8, m0
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8_loop
@@ -937,18 +937,18 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3
paddw m1, m2
pavgw m0, m1
vpermb m0, m11, m0
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movd [dstq+strideq*0], xm0
- movd [dstq+strideq*1], xmm1
+ movd [dstq+strideq*1], xm1
pextrd [dstq+strideq*2], xm0, 2
- pextrd [dstq+stride3q ], xmm1, 2
+ pextrd [dstq+stride3q ], xm1, 2
sub hd, 8
jl .ret
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 1
- pextrd [dstq+strideq*1], xmm1, 1
+ pextrd [dstq+strideq*1], xm1, 1
pextrd [dstq+strideq*2], xm0, 3
- pextrd [dstq+stride3q ], xmm1, 3
+ pextrd [dstq+stride3q ], xm1, 3
lea dstq, [dstq+strideq*4]
jg .w4_loop
.ret:
@@ -978,11 +978,11 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3
paddw m1, m2
pavgw m0, m1
vpermb m0, m11, m0
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8_loop
diff --git a/src/x86/mc16_avx512.asm b/src/x86/mc16_avx512.asm
index e83b18a..f8d1aaa 100644
--- a/src/x86/mc16_avx512.asm
+++ b/src/x86/mc16_avx512.asm
@@ -3615,32 +3615,32 @@ ALIGN function_align
.w4:
movq [dstq ], xm0
movhps [dstq+strideq*1], xm0
- vextracti32x4 xmm0, ym0, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
cmp hd, 8
jl .w4_end
- vextracti32x4 xmm0, m0, 2
+ vextracti32x4 xm2, m0, 2
lea dstq, [dstq+strideq*4]
- movq [dstq ], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m0, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq ], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
je .w4_end
lea dstq, [dstq+strideq*4]
movq [dstq ], xm1
movhps [dstq+strideq*1], xm1
- vextracti32x4 xmm0, ym1, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
- vextracti32x4 xmm0, m1, 2
+ vextracti32x4 xm0, ym1, 1
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
+ vextracti32x4 xm0, m1, 2
lea dstq, [dstq+strideq*4]
- movq [dstq ], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m1, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq ], xm0
+ movhps [dstq+strideq*1], xm0
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
.w4_end:
RET
.w8_loop:
@@ -3860,33 +3860,33 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
vpermb m3, m15, m3
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
- vextracti32x4 xmm0, ym0, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
mova [maskq], xm3
cmp hd, 8
jl .w4_end
- vextracti32x4 xmm0, m0, 2
+ vextracti32x4 xm2, m0, 2
lea dstq, [dstq+strideq*4]
- movq [dstq+strideq*0], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m0, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
je .w4_end
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm1
movhps [dstq+strideq*1], xm1
- vextracti32x4 xmm0, ym1, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
- vextracti32x4 xmm0, m1, 2
+ vextracti32x4 xm2, ym1, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ vextracti32x4 xm2, m1, 2
lea dstq, [dstq+strideq*4]
- movq [dstq+strideq*0], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m1, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
.w4_end:
RET
.w8:
@@ -4090,32 +4090,32 @@ cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
.w4:
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
- vextracti32x4 xmm0, ym0, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
cmp hd, 8
jl .w4_end
- vextracti32x4 xmm0, m0, 2
+ vextracti32x4 xm2, m0, 2
lea dstq, [dstq+strideq*4]
- movq [dstq+strideq*0], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m0, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
je .w4_end
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm1
movhps [dstq+strideq*1], xm1
- vextracti32x4 xmm0, ym1, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
- vextracti32x4 xmm0, m1, 2
+ vextracti32x4 xm2, ym1, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ vextracti32x4 xm2, m1, 2
lea dstq, [dstq+strideq*4]
- movq [dstq+strideq*0], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m1, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
.w4_end:
RET
.w8_loop:
@@ -4249,32 +4249,32 @@ cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3
.w4:
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
- vextracti32x4 xmm0, ym0, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ vextracti32x4 xm2, ym0, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
cmp hd, 8
jl .w4_end
- vextracti32x4 xmm0, m0, 2
+ vextracti32x4 xm2, m0, 2
lea dstq, [dstq+strideq*4]
- movq [dstq+strideq*0], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m0, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm0, m0, 3
+ movq [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm0
je .w4_end
lea dstq, [dstq+strideq*4]
movq [dstq+strideq*0], xm1
movhps [dstq+strideq*1], xm1
- vextracti32x4 xmm0, ym1, 1
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
- vextracti32x4 xmm0, m1, 2
+ vextracti32x4 xm2, ym1, 1
+ movq [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm2
+ vextracti32x4 xm2, m1, 2
lea dstq, [dstq+strideq*4]
- movq [dstq+strideq*0], xmm0
- movhps [dstq+strideq*1], xmm0
- vextracti32x4 xmm0, m1, 3
- movq [dstq+strideq*2], xmm0
- movhps [dstq+stride3q ], xmm0
+ movq [dstq+strideq*0], xm2
+ movhps [dstq+strideq*1], xm2
+ vextracti32x4 xm1, m1, 3
+ movq [dstq+strideq*2], xm1
+ movhps [dstq+stride3q ], xm1
.w4_end:
RET
.w8_loop:
diff --git a/src/x86/mc_avx512.asm b/src/x86/mc_avx512.asm
index eb3ca1c..7897f1d 100644
--- a/src/x86/mc_avx512.asm
+++ b/src/x86/mc_avx512.asm
@@ -449,9 +449,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
pshufb ym0, ym4
pmaddubsw ym0, ym5
pmulhrsw ym0, ym3
- vpmovuswb xmm0, ym0
- movq [dstq+dsq*0], xmm0
- movhps [dstq+dsq*1], xmm0
+ vpmovuswb xm0, ym0
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w8
@@ -755,9 +755,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy
pmulhw ym1, ym6
paddw ym1, ym2
pmulhrsw ym1, ym7
- vpmovuswb xmm1, ym1
- movq [dstq+dsq*0], xmm1
- movhps [dstq+dsq*1], xmm1
+ vpmovuswb xm1, ym1
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w8_loop
@@ -1588,13 +1588,13 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
jg .h_w4_loop
RET
.h_w8:
- movu xmm0, [srcq+ssq*0]
- vinserti32x4 ym0, ymm0, [srcq+ssq*1], 1
+ movu xm0, [srcq+ssq*0]
+ vinserti32x4 ym0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
WRAP_YMM PUT_8TAP_H 0, 1, 2, 3
- vpmovuswb xmm0, ym0
- movq [dstq+dsq*0], xmm0
- movhps [dstq+dsq*1], xmm0
+ vpmovuswb xm0, ym0
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w8
@@ -3308,17 +3308,17 @@ ALIGN function_align
cmp hd, 8
jg .w4_h16
WRAP_YMM %1 0
- vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movd [dstq ], xm0
pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xmm1
- pextrd [dstq+stride3q ], xmm1, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
jl .w4_ret
lea dstq, [dstq+strideq*4]
pextrd [dstq ], xm0, 2
pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xmm1, 2
- pextrd [dstq+stride3q ], xmm1, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
.w4_ret:
RET
.w4_h16:
@@ -3332,29 +3332,29 @@ ALIGN function_align
cmp hd, 4
jne .w8_h8
WRAP_YMM %1 0
- vextracti128 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq ], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
RET
.w8_loop:
%1_INC_PTR 2
lea dstq, [dstq+strideq*4]
.w8_h8:
%1 0
- vextracti32x4 xmm1, ym0, 1
- vextracti32x4 xmm2, m0, 2
- vextracti32x4 xmm3, m0, 3
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
movq [dstq ], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
movhps [dstq ], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
sub hd, 8
jg .w8_loop
RET
@@ -3415,8 +3415,8 @@ ALIGN function_align
paddw m0, [tmp2q+(%1+0)*mmsize]
mova m1, [tmp1q+(%1+1)*mmsize]
paddw m1, [tmp2q+(%1+1)*mmsize]
- pmulhrsw m0, m2
- pmulhrsw m1, m2
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
packuswb m0, m1
%endmacro
@@ -3425,13 +3425,13 @@ ALIGN function_align
add tmp2q, %1*mmsize
%endmacro
-cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+cglobal avg_8bpc, 4, 7, 5, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg_avx512icl_table
lea r6, [avg_avx512icl_table]
tzcnt wd, wm
movifnidn hd, hm
movsxd wq, dword [r6+wq*4]
- vpbroadcastd m2, [base+pw_1024]
+ vpbroadcastd m4, [base+pw_1024]
add wq, r6
BIDIR_FN AVG
@@ -3573,17 +3573,17 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
vinserti128 ym5, [wm_420_perm4+32], 1
vpermb ym4, ym5, ym4
vpdpbusd ym8, ym4, ym9
- vextracti128 xmm1, m0, 1
+ vextracti32x4 xm1, m0, 1
movd [dstq+strideq*0], xm0
pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xmm1
- pextrd [dstq+stride3q ], xmm1, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
jl .w4_end
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 2
pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xmm1, 2
- pextrd [dstq+stride3q ], xmm1, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
.w4_end:
vpermb ym8, ym10, ym8
movq [maskq], xm8
@@ -3609,11 +3609,11 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
vpdpbusd ym8, ym4, ym9
vpermb m8, m10, m8
mova [maskq], xm8
- vextracti128 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
RET
.w8_loop:
add tmp1q, 128
@@ -3627,18 +3627,18 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
vpdpbusd m1, m4, m9
vpermb m1, m10, m1
mova [maskq], xm1
- vextracti32x4 xmm1, ym0, 1
- vextracti32x4 xmm2, m0, 2
- vextracti32x4 xmm3, m0, 3
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
movhps [dstq+strideq*0], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
sub hd, 8
jg .w8_loop
RET
@@ -3766,17 +3766,17 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
movhps xm10, [wm_422_mask+16]
vpdpwssd ym8, ym4, ym9
vpermb ym8, ym10, ym8
- vextracti128 xmm1, m0, 1
+ vextracti32x4 xm1, m0, 1
movd [dstq+strideq*0], xm0
pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xmm1
- pextrd [dstq+stride3q ], xmm1, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
jl .w4_end
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 2
pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xmm1, 2
- pextrd [dstq+stride3q ], xmm1, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
.w4_end:
pand xm8, xm11
mova [maskq], xm8
@@ -3801,11 +3801,11 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
vpermb ym8, ym10, ym8
pand xm8, xm11
mova [maskq], xm8
- vextracti128 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
RET
.w8_loop:
add tmp1q, 128
@@ -3819,18 +3819,18 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
vpermb m1, m10, m1
pand ym1, ym11
mova [maskq], ym1
- vextracti32x4 xmm1, ym0, 1
- vextracti32x4 xmm2, m0, 2
- vextracti32x4 xmm3, m0, 3
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
movhps [dstq+strideq*0], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
sub hd, 8
jg .w8_loop
RET
@@ -3936,17 +3936,17 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
vinserti128 ym8, [wm_444_mask+32], 1
vpermb ym4, ym8, ym4
mova [maskq], ym4
- vextracti128 xmm1, m0, 1
+ vextracti32x4 xm1, m0, 1
movd [dstq+strideq*0], xm0
pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xmm1
- pextrd [dstq+stride3q ], xmm1, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
jl .w4_end
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 2
pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xmm1, 2
- pextrd [dstq+stride3q ], xmm1, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
.w4_end:
RET
.w4_h16:
@@ -3965,11 +3965,11 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
vinserti128 ym8, [wm_444_mask+32], 1
vpermb ym4, ym8, ym4
mova [maskq], ym4
- vextracti128 xmm1, ym0, 1
+ vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
+ movhps [dstq+stride3q ], xm1
RET
.w8_loop:
add tmp1q, 128
@@ -3980,18 +3980,18 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
W_MASK 0, 4, 0, 1, 1
vpermb m4, m8, m4
mova [maskq], m4
- vextracti32x4 xmm1, ym0, 1
- vextracti32x4 xmm2, m0, 2
- vextracti32x4 xmm3, m0, 3
+ vextracti32x4 xm1, ym0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
+ movq [dstq+strideq*1], xm1
+ movq [dstq+strideq*2], xm2
+ movq [dstq+stride3q ], xm3
lea dstq, [dstq+strideq*4]
movhps [dstq+strideq*0], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
+ movhps [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm2
+ movhps [dstq+stride3q ], xm3
sub hd, 8
jg .w8_loop
RET