commit 0cfb03cd6eddbaf350b17e06af6fa956e3455a00 (patch)
author:    Henrik Gramner <gramner@twoorioles.com>   2022-06-02 21:29:38 +0300
committer: Henrik Gramner <henrik@gramner.com>       2022-06-03 00:58:39 +0300
tree:      2f4f513ce033da2a3a96ee9134414ee9f351a629
parent:    b4f9eac85803303760f90ebad49de068cc2663d7 (diff)
x86: Add a workaround for quirky AVX-512 hardware behavior
On Intel CPUs certain AVX-512 shuffle instructions incorrectly
flag the upper halves of YMM registers as in use when writing
to XMM registers, which may cause AVX/SSE state transitions.
This behavior is not documented and only occurs on physical
hardware, not when using the Intel SDE, so as far as I can tell
it appears to be a hardware bug.
Work around the issue by using EVEX-only registers. This avoids
the problem at the cost of a slightly larger code size.
 src/x86/ipred16_avx512.asm |  66 ++++++++++++++++++++++----------------------
 src/x86/ipred_avx512.asm   |  92 +++++++++++++++++++++++++-------------------
 src/x86/mc16_avx512.asm    | 144 ++++++++++++++++++++++++++++----------------
 src/x86/mc_avx512.asm      | 166 +++++++++++++++++++++++++++++---------------
 4 files changed, 234 insertions(+), 234 deletions(-)
diff --git a/src/x86/ipred16_avx512.asm b/src/x86/ipred16_avx512.asm index 4a1b060..1a307ad 100644 --- a/src/x86/ipred16_avx512.asm +++ b/src/x86/ipred16_avx512.asm @@ -114,20 +114,20 @@ cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, w, h vbroadcasti32x4 m2, [tlq] pshufb m2, m7 ; left PAETH 4, 5, 6 - vextracti32x4 xmm1, m0, 2 - vextracti32x4 xmm2, ym0, 1 - vextracti32x4 xmm3, m0, 3 + vextracti32x4 xm1, m0, 2 + vextracti32x4 xm8, ym0, 1 + vextracti32x4 xm9, m0, 3 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+r6 ], xmm3 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm8 + movq [dstq+r6 ], xm9 sub hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+r6 ], xmm3 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm8 + movhps [dstq+r6 ], xm9 lea dstq, [dstq+strideq*4] jg .w4_loop .w4_end: @@ -220,19 +220,19 @@ cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 pshufb m3, m4 pmulhrsw m3, m5 paddw m3, m6 - vextracti32x4 xmm0, m3, 3 - vextracti32x4 xmm1, ym3, 1 - vextracti32x4 xmm2, m3, 2 - movhps [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 + vextracti32x4 xm0, m3, 3 + vextracti32x4 xm1, ym3, 1 + vextracti32x4 xm2, m3, 2 + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm3 add hq, 8 jg .end lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] jl .w4_loop @@ -337,20 +337,20 @@ cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl, w, h, stride3 psubw m0, m6 ; left - right pmulhrsw m0, m5 paddw m0, m6 - vextracti32x4 xmm1, m0, 2 - vextracti32x4 xmm2, ym0, 1 - 
vextracti32x4 xmm3, m0, 3 + vextracti32x4 xm1, m0, 2 + vextracti32x4 xm2, ym0, 1 + vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 sub hd, 8*2 jl .end lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], xmm3 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] jg .w4_loop .end: @@ -472,11 +472,11 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3 vpdpwssd m0, m1, m6 vpermb m0, m14, m0 pavgw ym0, ym15 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] add v_weightsq, 4*4 sub hd, 4*2 @@ -624,11 +624,11 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3 pmovzxbw ym0, [idxq] add idxq, 16 vpermw ym0, ym0, ym3 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 - movq [dstq+strideq*2], xmm1 - movhps [dstq+stride3q ], xmm1 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 diff --git a/src/x86/ipred_avx512.asm b/src/x86/ipred_avx512.asm index 050ec9b..38c86b5 100644 --- a/src/x86/ipred_avx512.asm +++ b/src/x86/ipred_avx512.asm @@ -242,9 +242,9 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 jmp wq .w8: movq xmm1, [tlq+1] - vextracti32x4 xmm2, ym0, 1 + vextracti32x4 xm2, ym0, 1 vpdpbusd xm0, xmm1, xm3 - paddd xmm2, xm0 + paddd xmm2, xm2, xm0 punpckhqdq xmm0, xmm2, xmm2 paddd xmm0, xmm2 psrlq xmm1, xmm0, 32 @@ -275,9 +275,9 @@ cglobal 
ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 jmp wq .w16: movu xmm1, [tlq+1] - vextracti32x4 xmm2, ym0, 1 + vextracti32x4 xm2, ym0, 1 vpdpbusd xm0, xmm1, xm3 - paddd xmm2, xm0 + paddd xmm2, xm2, xm0 punpckhqdq xmm0, xmm2, xmm2 paddd xmm0, xmm2 psrlq xmm1, xmm0, 32 @@ -309,8 +309,8 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 .w32: movu ym1, [tlq+1] vpdpbusd ym0, ym1, ym3 - vextracti32x4 xmm1, ym0, 1 - paddd xmm1, xm0 + vextracti32x4 xm1, ym0, 1 + paddd xmm1, xm1, xm0 punpckhqdq xmm0, xmm1, xmm1 paddd xmm0, xmm1 psrlq xmm1, xmm0, 32 @@ -345,8 +345,8 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 movu ym2, [tlq+33] vpdpbusd ym0, ym1, ym3 vpdpbusd ym0, ym2, ym3 - vextracti32x4 xmm1, ym0, 1 - paddd xmm1, xm0 + vextracti32x4 xm1, ym0, 1 + paddd xmm1, xm1, xm0 punpckhqdq xmm0, xmm1, xmm1 paddd xmm0, xmm1 psrlq xmm1, xmm0, 32 @@ -524,12 +524,12 @@ INIT_YMM avx512icl pextrd [dstq+stride3q ], xm0, 3 sub hd, 8 jl .w4_ret - vextracti32x4 xmm0, m0, 1 + vextracti32x4 xm0, m0, 1 lea dstq, [dstq+strideq*4] - movd [dstq+strideq*0], xmm0 - pextrd [dstq+strideq*1], xmm0, 1 - pextrd [dstq+strideq*2], xmm0, 2 - pextrd [dstq+stride3q ], xmm0, 3 + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+stride3q ], xm0, 3 lea dstq, [dstq+strideq*4] jg .w4_loop .w4_ret: @@ -545,20 +545,20 @@ INIT_ZMM avx512icl vpbroadcastq m4, [tlq+hq-8] pshufb m4, m9 PAETH - vextracti32x4 xmm1, m0, 2 - vextracti32x4 xmm2, ym0, 1 - vextracti32x4 xmm3, m0, 3 + vextracti32x4 xm1, m0, 2 + vextracti32x4 xm2, ym0, 1 + vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 sub hd, 8 jl .w8_ret lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], 
xmm3 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] jg .w8_loop .w8_ret: @@ -639,18 +639,18 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 pmaddubsw m0, m2, m0 paddw m0, m3 vpermb m0, m6, m0 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movd [dstq+strideq*0], xm0 - movd [dstq+strideq*1], xmm1 + movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 - pextrd [dstq+stride3q ], xmm1, 2 + pextrd [dstq+stride3q ], xm1, 2 add hq, 8 jg .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 - pextrd [dstq+strideq*1], xmm1, 1 + pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 - pextrd [dstq+stride3q ], xmm1, 3 + pextrd [dstq+stride3q ], xm1, 3 lea dstq, [dstq+strideq*4] jl .w4_loop .ret: @@ -669,11 +669,11 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 pmaddubsw m0, m2, m0 paddw m0, m3 vpermb m0, m6, m0 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] add hq, 4 jl .w8_loop @@ -785,18 +785,18 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3 paddw m0, m2 paddw m0, m1 vpermb m0, m8, m0 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movd [dstq+strideq*0], xm0 - movd [dstq+strideq*1], xmm1 + movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 - pextrd [dstq+stride3q ], xmm1, 2 + pextrd [dstq+stride3q ], xm1, 2 sub hd, 8 jl .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 - pextrd [dstq+strideq*1], xmm1, 1 + pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 - pextrd [dstq+stride3q ], xmm1, 3 + pextrd [dstq+stride3q ], xm1, 3 lea dstq, [dstq+strideq*4] jg .w4_loop .ret: @@ -815,11 +815,11 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 
11, dst, stride, tl, w, h, stride3 paddw m0, m2 paddw m0, m1 vpermb m0, m8, m0 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop @@ -937,18 +937,18 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3 paddw m1, m2 pavgw m0, m1 vpermb m0, m11, m0 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movd [dstq+strideq*0], xm0 - movd [dstq+strideq*1], xmm1 + movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 - pextrd [dstq+stride3q ], xmm1, 2 + pextrd [dstq+stride3q ], xm1, 2 sub hd, 8 jl .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 - pextrd [dstq+strideq*1], xmm1, 1 + pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 - pextrd [dstq+stride3q ], xmm1, 3 + pextrd [dstq+stride3q ], xm1, 3 lea dstq, [dstq+strideq*4] jg .w4_loop .ret: @@ -978,11 +978,11 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3 paddw m1, m2 pavgw m0, m1 vpermb m0, m11, m0 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop diff --git a/src/x86/mc16_avx512.asm b/src/x86/mc16_avx512.asm index e83b18a..f8d1aaa 100644 --- a/src/x86/mc16_avx512.asm +++ b/src/x86/mc16_avx512.asm @@ -3615,32 +3615,32 @@ ALIGN function_align .w4: movq [dstq ], xm0 movhps [dstq+strideq*1], xm0 - vextracti32x4 xmm0, ym0, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end - vextracti32x4 xmm0, m0, 2 + vextracti32x4 xm2, m0, 2 lea dstq, 
[dstq+strideq*4] - movq [dstq ], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m0, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq ], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq ], xm1 movhps [dstq+strideq*1], xm1 - vextracti32x4 xmm0, ym1, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 - vextracti32x4 xmm0, m1, 2 + vextracti32x4 xm0, ym1, 1 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 + vextracti32x4 xm0, m1, 2 lea dstq, [dstq+strideq*4] - movq [dstq ], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m1, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq ], xm0 + movhps [dstq+strideq*1], xm0 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: @@ -3860,33 +3860,33 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 vpermb m3, m15, m3 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 - vextracti32x4 xmm0, ym0, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 mova [maskq], xm3 cmp hd, 8 jl .w4_end - vextracti32x4 xmm0, m0, 2 + vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m0, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 - vextracti32x4 xmm0, ym1, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 - vextracti32x4 xmm0, m1, 2 + vextracti32x4 xm2, ym1, 1 + movq [dstq+strideq*2], xm2 + 
movhps [dstq+stride3q ], xm2 + vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m1, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 .w4_end: RET .w8: @@ -4090,32 +4090,32 @@ cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3 .w4: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 - vextracti32x4 xmm0, ym0, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end - vextracti32x4 xmm0, m0, 2 + vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m0, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 - vextracti32x4 xmm0, ym1, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 - vextracti32x4 xmm0, m1, 2 + vextracti32x4 xm2, ym1, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m1, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: @@ -4249,32 +4249,32 @@ cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3 .w4: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 - 
vextracti32x4 xmm0, ym0, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + vextracti32x4 xm2, ym0, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end - vextracti32x4 xmm0, m0, 2 + vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m0, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm0, m0, 3 + movq [dstq+strideq*2], xm0 + movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 - vextracti32x4 xmm0, ym1, 1 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 - vextracti32x4 xmm0, m1, 2 + vextracti32x4 xm2, ym1, 1 + movq [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm2 + vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] - movq [dstq+strideq*0], xmm0 - movhps [dstq+strideq*1], xmm0 - vextracti32x4 xmm0, m1, 3 - movq [dstq+strideq*2], xmm0 - movhps [dstq+stride3q ], xmm0 + movq [dstq+strideq*0], xm2 + movhps [dstq+strideq*1], xm2 + vextracti32x4 xm1, m1, 3 + movq [dstq+strideq*2], xm1 + movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: diff --git a/src/x86/mc_avx512.asm b/src/x86/mc_avx512.asm index eb3ca1c..7897f1d 100644 --- a/src/x86/mc_avx512.asm +++ b/src/x86/mc_avx512.asm @@ -449,9 +449,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy pshufb ym0, ym4 pmaddubsw ym0, ym5 pmulhrsw ym0, ym3 - vpmovuswb xmm0, ym0 - movq [dstq+dsq*0], xmm0 - movhps [dstq+dsq*1], xmm0 + vpmovuswb xm0, ym0 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 @@ -755,9 +755,9 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy pmulhw ym1, ym6 paddw ym1, ym2 pmulhrsw ym1, ym7 - vpmovuswb xmm1, ym1 - movq [dstq+dsq*0], xmm1 - movhps [dstq+dsq*1], xmm1 + vpmovuswb xm1, ym1 + movq [dstq+dsq*0], xm1 + movhps 
[dstq+dsq*1], xm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop @@ -1588,13 +1588,13 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jg .h_w4_loop RET .h_w8: - movu xmm0, [srcq+ssq*0] - vinserti32x4 ym0, ymm0, [srcq+ssq*1], 1 + movu xm0, [srcq+ssq*0] + vinserti32x4 ym0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] WRAP_YMM PUT_8TAP_H 0, 1, 2, 3 - vpmovuswb xmm0, ym0 - movq [dstq+dsq*0], xmm0 - movhps [dstq+dsq*1], xmm0 + vpmovuswb xm0, ym0 + movq [dstq+dsq*0], xm0 + movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 @@ -3308,17 +3308,17 @@ ALIGN function_align cmp hd, 8 jg .w4_h16 WRAP_YMM %1 0 - vextracti32x4 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movd [dstq ], xm0 pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xmm1 - pextrd [dstq+stride3q ], xmm1, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 jl .w4_ret lea dstq, [dstq+strideq*4] pextrd [dstq ], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xmm1, 2 - pextrd [dstq+stride3q ], xmm1, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 .w4_ret: RET .w4_h16: @@ -3332,29 +3332,29 @@ ALIGN function_align cmp hd, 4 jne .w8_h8 WRAP_YMM %1 0 - vextracti128 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq ], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 RET .w8_loop: %1_INC_PTR 2 lea dstq, [dstq+strideq*4] .w8_h8: %1 0 - vextracti32x4 xmm1, ym0, 1 - vextracti32x4 xmm2, m0, 2 - vextracti32x4 xmm3, m0, 3 + vextracti32x4 xm1, ym0, 1 + vextracti32x4 xm2, m0, 2 + vextracti32x4 xm3, m0, 3 movq [dstq ], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] movhps [dstq ], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], 
xmm3 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 sub hd, 8 jg .w8_loop RET @@ -3415,8 +3415,8 @@ ALIGN function_align paddw m0, [tmp2q+(%1+0)*mmsize] mova m1, [tmp1q+(%1+1)*mmsize] paddw m1, [tmp2q+(%1+1)*mmsize] - pmulhrsw m0, m2 - pmulhrsw m1, m2 + pmulhrsw m0, m4 + pmulhrsw m1, m4 packuswb m0, m1 %endmacro @@ -3425,13 +3425,13 @@ ALIGN function_align add tmp2q, %1*mmsize %endmacro -cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 +cglobal avg_8bpc, 4, 7, 5, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-avg_avx512icl_table lea r6, [avg_avx512icl_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, dword [r6+wq*4] - vpbroadcastd m2, [base+pw_1024] + vpbroadcastd m4, [base+pw_1024] add wq, r6 BIDIR_FN AVG @@ -3573,17 +3573,17 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 vinserti128 ym5, [wm_420_perm4+32], 1 vpermb ym4, ym5, ym4 vpdpbusd ym8, ym4, ym9 - vextracti128 xmm1, m0, 1 + vextracti32x4 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xmm1 - pextrd [dstq+stride3q ], xmm1, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xmm1, 2 - pextrd [dstq+stride3q ], xmm1, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 .w4_end: vpermb ym8, ym10, ym8 movq [maskq], xm8 @@ -3609,11 +3609,11 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 vpdpbusd ym8, ym4, ym9 vpermb m8, m10, m8 mova [maskq], xm8 - vextracti128 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 RET .w8_loop: add tmp1q, 128 @@ -3627,18 +3627,18 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, 
tmp1, tmp2, w, h, mask, stride3 vpdpbusd m1, m4, m9 vpermb m1, m10, m1 mova [maskq], xm1 - vextracti32x4 xmm1, ym0, 1 - vextracti32x4 xmm2, m0, 2 - vextracti32x4 xmm3, m0, 3 + vextracti32x4 xm1, ym0, 1 + vextracti32x4 xm2, m0, 2 + vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], xmm3 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 sub hd, 8 jg .w8_loop RET @@ -3766,17 +3766,17 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 movhps xm10, [wm_422_mask+16] vpdpwssd ym8, ym4, ym9 vpermb ym8, ym10, ym8 - vextracti128 xmm1, m0, 1 + vextracti32x4 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xmm1 - pextrd [dstq+stride3q ], xmm1, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xmm1, 2 - pextrd [dstq+stride3q ], xmm1, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 .w4_end: pand xm8, xm11 mova [maskq], xm8 @@ -3801,11 +3801,11 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 vpermb ym8, ym10, ym8 pand xm8, xm11 mova [maskq], xm8 - vextracti128 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 RET .w8_loop: add tmp1q, 128 @@ -3819,18 +3819,18 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 vpermb m1, m10, m1 pand ym1, ym11 mova 
[maskq], ym1 - vextracti32x4 xmm1, ym0, 1 - vextracti32x4 xmm2, m0, 2 - vextracti32x4 xmm3, m0, 3 + vextracti32x4 xm1, ym0, 1 + vextracti32x4 xm2, m0, 2 + vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], xmm3 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 sub hd, 8 jg .w8_loop RET @@ -3936,17 +3936,17 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3 vinserti128 ym8, [wm_444_mask+32], 1 vpermb ym4, ym8, ym4 mova [maskq], ym4 - vextracti128 xmm1, m0, 1 + vextracti32x4 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 - movd [dstq+strideq*2], xmm1 - pextrd [dstq+stride3q ], xmm1, 1 + movd [dstq+strideq*2], xm1 + pextrd [dstq+stride3q ], xm1, 1 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 - pextrd [dstq+strideq*2], xmm1, 2 - pextrd [dstq+stride3q ], xmm1, 3 + pextrd [dstq+strideq*2], xm1, 2 + pextrd [dstq+stride3q ], xm1, 3 .w4_end: RET .w4_h16: @@ -3965,11 +3965,11 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3 vinserti128 ym8, [wm_444_mask+32], 1 vpermb ym4, ym8, ym4 mova [maskq], ym4 - vextracti128 xmm1, ym0, 1 + vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 + movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 - movhps [dstq+stride3q ], xmm1 + movhps [dstq+stride3q ], xm1 RET .w8_loop: add tmp1q, 128 @@ -3980,18 +3980,18 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3 W_MASK 0, 4, 0, 1, 1 vpermb m4, m8, m4 mova [maskq], m4 - vextracti32x4 xmm1, ym0, 1 - vextracti32x4 xmm2, m0, 2 - 
vextracti32x4 xmm3, m0, 3 + vextracti32x4 xm1, ym0, 1 + vextracti32x4 xm2, m0, 2 + vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 - movq [dstq+strideq*1], xmm1 - movq [dstq+strideq*2], xmm2 - movq [dstq+stride3q ], xmm3 + movq [dstq+strideq*1], xm1 + movq [dstq+strideq*2], xm2 + movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 - movhps [dstq+strideq*1], xmm1 - movhps [dstq+strideq*2], xmm2 - movhps [dstq+stride3q ], xmm3 + movhps [dstq+strideq*1], xm1 + movhps [dstq+strideq*2], xm2 + movhps [dstq+stride3q ], xm3 sub hd, 8 jg .w8_loop RET |