From 4a2c65162029755a4717528014a456a400590e36 Mon Sep 17 00:00:00 2001
From: Lynne
Date: Sat, 16 Mar 2019 17:30:16 +0100
Subject: x86/opus_dsp: rename to celt_pvq

Its only used in the encoder and in CELT's PVQ.
---
 libavcodec/x86/Makefile            |   6 +-
 libavcodec/x86/celt_pvq_init.c     |  45 +++++
 libavcodec/x86/celt_pvq_search.asm | 385 +++++++++++++++++++++++++++++++++++++
 libavcodec/x86/opus_dsp_init.c     |  45 -----
 libavcodec/x86/opus_pvq_search.asm | 385 -------------------------------------
 5 files changed, 433 insertions(+), 433 deletions(-)
 create mode 100644 libavcodec/x86/celt_pvq_init.c
 create mode 100644 libavcodec/x86/celt_pvq_search.asm
 delete mode 100644 libavcodec/x86/opus_dsp_init.c
 delete mode 100644 libavcodec/x86/opus_pvq_search.asm
(limited to 'libavcodec/x86')

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 2350c8bbee..3bfba94ec2 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -53,8 +53,8 @@ OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
 OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o x86/synth_filter_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
 OBJS-$(CONFIG_EXR_DECODER)             += x86/exrdsp_init.o
-OBJS-$(CONFIG_OPUS_DECODER)            += x86/opus_dsp_init.o
-OBJS-$(CONFIG_OPUS_ENCODER)            += x86/opus_dsp_init.o
+OBJS-$(CONFIG_OPUS_DECODER)            += x86/celt_pvq_init.o
+OBJS-$(CONFIG_OPUS_ENCODER)            += x86/celt_pvq_init.o
 OBJS-$(CONFIG_HEVC_DECODER)            += x86/hevcdsp_init.o
 OBJS-$(CONFIG_JPEG2000_DECODER)        += x86/jpeg2000dsp_init.o
 OBJS-$(CONFIG_MLP_DECODER)             += x86/mlpdsp_init.o
@@ -127,7 +127,7 @@ X86ASM-OBJS-$(CONFIG_MDCT15)           += x86/mdct15.o
 X86ASM-OBJS-$(CONFIG_ME_CMP)           += x86/me_cmp.o
 X86ASM-OBJS-$(CONFIG_MPEGAUDIODSP)     += x86/imdct36.o
 X86ASM-OBJS-$(CONFIG_MPEGVIDEOENC)     += x86/mpegvideoencdsp.o
-X86ASM-OBJS-$(CONFIG_OPUS_ENCODER)     += x86/opus_pvq_search.o
+X86ASM-OBJS-$(CONFIG_OPUS_ENCODER)     += x86/celt_pvq_search.o
 X86ASM-OBJS-$(CONFIG_PIXBLOCKDSP)      += x86/pixblockdsp.o
 X86ASM-OBJS-$(CONFIG_QPELDSP)          += x86/qpeldsp.o              \
                                           x86/fpel.o                 \
diff --git a/libavcodec/x86/celt_pvq_init.c b/libavcodec/x86/celt_pvq_init.c
new file mode 100644
index 0000000000..3890a9cb9f
--- /dev/null
+++ b/libavcodec/x86/celt_pvq_init.c
@@ -0,0 +1,45 @@
+/*
+ * Opus encoder assembly optimizations
+ * Copyright (C) 2017 Ivan Kalvachev
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/opus_pvq.h"
+
+extern float ff_pvq_search_approx_sse2(float *X, int *y, int K, int N);
+extern float ff_pvq_search_approx_sse4(float *X, int *y, int K, int N);
+extern float ff_pvq_search_exact_avx (float *X, int *y, int K, int N);
+
+av_cold void ff_celt_pvq_init_x86(CeltPVQ *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#if CONFIG_OPUS_ENCODER
+    if (EXTERNAL_SSE2(cpu_flags))
+        s->pvq_search = ff_pvq_search_approx_sse2;
+
+    if (EXTERNAL_SSE4(cpu_flags))
+        s->pvq_search = ff_pvq_search_approx_sse4;
+
+    if (EXTERNAL_AVX_FAST(cpu_flags))
+        s->pvq_search = ff_pvq_search_exact_avx;
+#endif
+}
diff --git a/libavcodec/x86/celt_pvq_search.asm b/libavcodec/x86/celt_pvq_search.asm
new file mode 100644
index 0000000000..5c1e6d6174
--- /dev/null
+++ b/libavcodec/x86/celt_pvq_search.asm
@@ -0,0 +1,385 @@
+;******************************************************************************
+;* SIMD optimized Opus encoder DSP function
+;*
+;* Copyright (C) 2017 Ivan Kalvachev
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "config.asm"
+%include "libavutil/x86/x86util.asm"
+
+%ifdef __NASM_VER__
+%use "smartalign"
+ALIGNMODE p6
+%endif
+
+SECTION_RODATA 64
+
+const_float_abs_mask:   times 8 dd 0x7fffffff
+const_align_abs_edge:   times 8 dd 0
+
+const_float_0_5:        times 8 dd 0.5
+const_float_1:          times 8 dd 1.0
+const_float_sign_mask:  times 8 dd 0x80000000
+
+const_int32_offsets:
+        %rep 8
+                dd $-const_int32_offsets
+        %endrep
+SECTION .text
+
+;
+; Setup High Register to be used
+; for holding memory constants
+;
+; %1 - the register to be used, assmues it is >= mm8
+; %2 - name of the constant.
+;
+; Subsequent opcodes are going to use the constant in the form
+; "addps m0, mm_const_name" and it would be turned into:
+; "addps m0, [const_name]" on 32 bit arch or
+; "addps m0, m8" on 64 bit arch
+%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
+%if num_mmregs > 8
+    %define mm_%3 %2
+    %{1} %2, [%3]           ; movaps m8, [const_name]
+%else
+    %define mm_%3 [%3]
+%endif
+%endmacro
+
+;
+; Set Position Independent Code
+; Base address of a constant
+; %1 - the register to be used, if PIC is set
+; %2 - name of the constant.
+; +; Subsequent opcode are going to use the base address in the form +; "movaps m0, [pic_base_constant_name+r4]" and it would be turned into +; "movaps m0, [r5 + r4]" if PIC is enabled +; "movaps m0, [constant_name + r4]" if texrel are used +%macro SET_PIC_BASE 3; reg, const_label +%ifdef PIC + %{1} %2, [%3] ; lea r5, [rip+const] + %define pic_base_%3 %2 +%else + %define pic_base_%3 %3 +%endif +%endmacro + +%macro PULSES_SEARCH 1 +; m6 Syy_norm +; m7 Sxy_norm + addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2 + pxor m1, m1 ; max_idx + xorps m3, m3 ; p_max + xor r4d, r4d +align 16 +%%distortion_search: + movd xm2, dword r4d ; movd zero extends +%ifidn %1,add + movaps m4, [tmpY + r4] ; y[i] + movaps m5, [tmpX + r4] ; X[i] + + %if USE_APPROXIMATION == 1 + xorps m0, m0 + cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0) + %endif + + addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm + addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm + + %if USE_APPROXIMATION == 1 + andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding. + %endif + +%else + movaps m5, [tmpY + r4] ; m5 = y[i] + + xorps m0, m0 ; m0 = 0; + cmpps m0, m0, m5, 1 ; m0 = (0 p_max) + maxps m3, m5 ; m3=max(p_max,p) + ; maxps here is faster than blendvps, despite blend having lower latency. + + pand m2, m0 ; This version seems faster than sse41 pblendvb + pmaxsw m1, m2 ; SSE2 signed word, so it would work for N < 32768/4 + + add r4d, mmsize + cmp r4d, Nd + jb %%distortion_search + + por m1, mm_const_int32_offsets ; max_idx offsets per individual lane (skipped in the inner loop) + movdqa m4, m1 ; needed for the aligned y[max_idx]+=1; processing + +%if mmsize >= 32 +; Merge parallel maximums round 8 (4 vs 4) + + vextractf128 xm5, ym3, 1 ; xmm5 = ymm3[1x128] = ymm3[255..128b] + cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] ) + + vextracti128 xm2, ym1, 1 ; xmm2 = ymm1[1x128] = ymm1[255..128b] + BLENDVPS xm3, xm5, xm0 ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128] + PBLENDVB xm1, xm2, xm0 ; p = m0 ? p[1x128] : p[0x128] +%endif + +; Merge parallel maximums round 4 (2 vs 2) + ; m3=p[3210] + movhlps xm5, xm3 ; m5=p[xx32] + cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] ) + + pshufd xm2, xm1, q3232 + BLENDVPS xm3, xm5, xm0 ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0] + PBLENDVB xm1, xm2, xm0 ; p = m0 ? p[3,2] : p[1,0] + +; Merge parallel maximums final round (1 vs 1) + shufps xm0, xm3, xm3, q1111 ; m0 = m3[1] = p[1] + cmpss xm0, xm3, 5 ; m0 = !(m0 >= m3) = !( p[1] >= p[0] ) + + pshufd xm2, xm1, q1111 + PBLENDVB xm1, xm2, xm0 + + movd dword r4d, xm1 ; zero extends to the rest of r4q + + VBROADCASTSS m3, [tmpX + r4] + %{1}ps m7, m3 ; Sxy += X[max_idx] + + VBROADCASTSS m5, [tmpY + r4] + %{1}ps m6, m5 ; Syy += Y[max_idx] + + ; We have to update a single element in Y[i] + ; However writing 4 bytes and then doing 16 byte load in the inner loop + ; could cause a stall due to breaking write forwarding. + VPBROADCASTD m1, xm1 + pcmpeqd m1, m1, m4 ; exactly 1 element matches max_idx and this finds it + + and r4d, ~(mmsize-1) ; align address down, so the value pointed by max_idx is inside a mmsize load + movaps m5, [tmpY + r4] ; m5 = Y[y3...ym...y0] + andps m1, mm_const_float_1 ; m1 = [ 0...1.0...0] + %{1}ps m5, m1 ; m5 = Y[y3...ym...y0] +/- [0...1.0...0] + movaps [tmpY + r4], m5 ; Y[max_idx] +-= 1.0; +%endmacro + +; +; We need one more register for +; PIC relative addressing. 
Use this +; to count it in cglobal +; +%ifdef PIC + %define num_pic_regs 1 +%else + %define num_pic_regs 0 +%endif + +; +; Pyramid Vector Quantization Search implementation +; +; float * inX - Unaligned (SIMD) access, it will be overread, +; but extra data is masked away. +; int32 * outY - Should be aligned and padded buffer. +; It is used as temp buffer. +; uint32 K - Number of pulses to have after quantizations. +; uint32 N - Number of vector elements. Must be 0 < N < 256 +; +%macro PVQ_FAST_SEARCH 1 +cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N +%define tmpX rsp +%define tmpY outYq + + movaps m0, [const_float_abs_mask] + shl Nd, 2 ; N *= sizeof(float); also 32 bit operation zeroes the high 32 bits in 64 bit mode. + mov r4d, Nd + + neg r4d + and r4d, mmsize-1 + + SET_PIC_BASE lea, r5, const_align_abs_edge ; rip+const + movups m2, [pic_base_const_align_abs_edge + r4 - mmsize] + + add Nd, r4d ; N = align(N, mmsize) + + lea r4d, [Nd - mmsize] ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0. + movups m1, [inXq + r4] + andps m1, m2 + movaps [tmpX + r4], m1 ; Sx = abs( X[N-1] ) + +align 16 +%%loop_abs_sum: + sub r4d, mmsize + jc %%end_loop_abs_sum + + movups m2, [inXq + r4] + andps m2, m0 + + movaps [tmpX + r4], m2 ; tmpX[i]=abs(X[i]) + addps m1, m2 ; Sx += abs(X[i]) + jmp %%loop_abs_sum + +align 16 +%%end_loop_abs_sum: + + HSUMPS m1, m2 ; m1 = Sx + + xorps m0, m0 + comiss xm0, xm1 ; + jz %%zero_input ; if (Sx==0) goto zero_input + + cvtsi2ss xm0, dword Kd ; m0 = K +%if USE_APPROXIMATION == 1 + rcpss xm1, xm1 ; m1 = approx(1/Sx) + mulss xm0, xm1 ; m0 = K*(1/Sx) +%else + divss xm0, xm1 ; b = K/Sx + ; b = K/max_x +%endif + + VBROADCASTSS m0, xm0 + + lea r4d, [Nd - mmsize] + pxor m5, m5 ; Sy ( Sum of abs( y[i]) ) + xorps m6, m6 ; Syy ( Sum of y[i]*y[i] ) + xorps m7, m7 ; Sxy ( Sum of X[i]*y[i] ) +align 16 +%%loop_guess: + movaps m1, [tmpX + r4] ; m1 = X[i] + mulps m2, m0, m1 ; m2 = res*X[i] + cvtps2dq m2, m2 ; yt = (int)lrintf( res*X[i] ) + paddd m5, m2 ; Sy += yt + cvtdq2ps m2, m2 ; yt = (float)yt + mulps m1, m2 ; m1 = X[i]*yt + movaps [tmpY + r4], m2 ; y[i] = m2 + addps m7, m1 ; Sxy += m1; + mulps m2, m2 ; m2 = yt*yt + addps m6, m2 ; Syy += m2 + + sub r4d, mmsize + jnc %%loop_guess + + HSUMPS m6, m1 ; Syy_norm + HADDD m5, m4 ; pulses + + movd dword r4d, xm5 ; zero extends to the rest of r4q + + sub Kd, r4d ; K -= pulses , also 32 bit operation zeroes high 32 bit in 64 bit mode. + jz %%finish ; K - pulses == 0 + + SET_HI_REG_MM_CONSTANT movaps, m8, const_float_0_5 + SET_HI_REG_MM_CONSTANT movaps, m9, const_float_1 + SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets + ; Use Syy/2 in distortion parameter calculations. + ; Saves pre and post-caclulation to correct Y[] values. + ; Same precision, since float mantisa is normalized. + ; The SQRT approximation does differ. 
+ HSUMPS m7, m0 ; Sxy_norm + mulps m6, mm_const_float_0_5 + + jc %%remove_pulses_loop ; K - pulses < 0 + +align 16 ; K - pulses > 0 +%%add_pulses_loop: + + PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm + + sub Kd, 1 + jnz %%add_pulses_loop + + addps m6, m6 ; Syy*=2 + + jmp %%finish + +align 16 +%%remove_pulses_loop: + + PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm + + add Kd, 1 + jnz %%remove_pulses_loop + + addps m6, m6 ; Syy*=2 + +align 16 +%%finish: + lea r4d, [Nd - mmsize] + movaps m2, [const_float_sign_mask] + +align 16 +%%restore_sign_loop: + movaps m0, [tmpY + r4] ; m0 = Y[i] + movups m1, [inXq + r4] ; m1 = X[i] + andps m1, m2 ; m1 = sign(X[i]) + orps m0, m1 ; m0 = Y[i]*sign + cvtps2dq m3, m0 ; m3 = (int)m0 + movaps [outYq + r4], m3 + + sub r4d, mmsize + jnc %%restore_sign_loop +%%return: + +%if ARCH_X86_64 == 0 ; sbrdsp + movss r0m, xm6 ; return (float)Syy_norm + fld dword r0m +%else + movaps m0, m6 ; return (float)Syy_norm +%endif + + RET + +align 16 +%%zero_input: + lea r4d, [Nd - mmsize] + xorps m0, m0 +%%zero_loop: + movaps [outYq + r4], m0 + sub r4d, mmsize + jnc %%zero_loop + + movaps m6, [const_float_1] + jmp %%return +%endmacro + +; if 1, use a float op that give half precision but execute for around 3 cycles. +; On Skylake & Ryzen the division is much faster (around 11c/3), +; that makes the full precision code about 2% slower. +; Opus also does use rsqrt approximation in their intrinsics code. +%define USE_APPROXIMATION 1 + +INIT_XMM sse2 +PVQ_FAST_SEARCH _approx + +INIT_XMM sse4 +PVQ_FAST_SEARCH _approx + +%define USE_APPROXIMATION 0 + +INIT_XMM avx +PVQ_FAST_SEARCH _exact diff --git a/libavcodec/x86/opus_dsp_init.c b/libavcodec/x86/opus_dsp_init.c deleted file mode 100644 index a9f8a96159..0000000000 --- a/libavcodec/x86/opus_dsp_init.c +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Opus encoder assembly optimizations - * Copyright (C) 2017 Ivan Kalvachev - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" - -#include "libavutil/x86/cpu.h" -#include "libavcodec/opus_pvq.h" - -extern float ff_pvq_search_approx_sse2(float *X, int *y, int K, int N); -extern float ff_pvq_search_approx_sse4(float *X, int *y, int K, int N); -extern float ff_pvq_search_exact_avx (float *X, int *y, int K, int N); - -av_cold void ff_opus_dsp_init_x86(CeltPVQ *s) -{ - int cpu_flags = av_get_cpu_flags(); - -#if CONFIG_OPUS_ENCODER - if (EXTERNAL_SSE2(cpu_flags)) - s->pvq_search = ff_pvq_search_approx_sse2; - - if (EXTERNAL_SSE4(cpu_flags)) - s->pvq_search = ff_pvq_search_approx_sse4; - - if (EXTERNAL_AVX_FAST(cpu_flags)) - s->pvq_search = ff_pvq_search_exact_avx; -#endif -} diff --git a/libavcodec/x86/opus_pvq_search.asm b/libavcodec/x86/opus_pvq_search.asm deleted file mode 100644 index 5c1e6d6174..0000000000 --- a/libavcodec/x86/opus_pvq_search.asm +++ /dev/null @@ -1,385 +0,0 @@ -;****************************************************************************** -;* SIMD optimized Opus encoder DSP function -;* -;* Copyright (C) 2017 Ivan Kalvachev -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "config.asm" -%include "libavutil/x86/x86util.asm" - -%ifdef __NASM_VER__ -%use "smartalign" -ALIGNMODE p6 -%endif - -SECTION_RODATA 64 - -const_float_abs_mask: times 8 dd 0x7fffffff -const_align_abs_edge: times 8 dd 0 - -const_float_0_5: times 8 dd 0.5 -const_float_1: times 8 dd 1.0 -const_float_sign_mask: times 8 dd 0x80000000 - -const_int32_offsets: - %rep 8 - dd $-const_int32_offsets - %endrep -SECTION .text - -; -; Setup High Register to be used -; for holding memory constants -; -; %1 - the register to be used, assmues it is >= mm8 -; %2 - name of the constant. -; -; Subsequent opcodes are going to use the constant in the form -; "addps m0, mm_const_name" and it would be turned into: -; "addps m0, [const_name]" on 32 bit arch or -; "addps m0, m8" on 64 bit arch -%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name -%if num_mmregs > 8 - %define mm_%3 %2 - %{1} %2, [%3] ; movaps m8, [const_name] -%else - %define mm_%3 [%3] -%endif -%endmacro - -; -; Set Position Independent Code -; Base address of a constant -; %1 - the register to be used, if PIC is set -; %2 - name of the constant. 
-; -; Subsequent opcode are going to use the base address in the form -; "movaps m0, [pic_base_constant_name+r4]" and it would be turned into -; "movaps m0, [r5 + r4]" if PIC is enabled -; "movaps m0, [constant_name + r4]" if texrel are used -%macro SET_PIC_BASE 3; reg, const_label -%ifdef PIC - %{1} %2, [%3] ; lea r5, [rip+const] - %define pic_base_%3 %2 -%else - %define pic_base_%3 %3 -%endif -%endmacro - -%macro PULSES_SEARCH 1 -; m6 Syy_norm -; m7 Sxy_norm - addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2 - pxor m1, m1 ; max_idx - xorps m3, m3 ; p_max - xor r4d, r4d -align 16 -%%distortion_search: - movd xm2, dword r4d ; movd zero extends -%ifidn %1,add - movaps m4, [tmpY + r4] ; y[i] - movaps m5, [tmpX + r4] ; X[i] - - %if USE_APPROXIMATION == 1 - xorps m0, m0 - cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0) - %endif - - addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm - addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm - - %if USE_APPROXIMATION == 1 - andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding. - %endif - -%else - movaps m5, [tmpY + r4] ; m5 = y[i] - - xorps m0, m0 ; m0 = 0; - cmpps m0, m0, m5, 1 ; m0 = (0 p_max) - maxps m3, m5 ; m3=max(p_max,p) - ; maxps here is faster than blendvps, despite blend having lower latency. - - pand m2, m0 ; This version seems faster than sse41 pblendvb - pmaxsw m1, m2 ; SSE2 signed word, so it would work for N < 32768/4 - - add r4d, mmsize - cmp r4d, Nd - jb %%distortion_search - - por m1, mm_const_int32_offsets ; max_idx offsets per individual lane (skipped in the inner loop) - movdqa m4, m1 ; needed for the aligned y[max_idx]+=1; processing - -%if mmsize >= 32 -; Merge parallel maximums round 8 (4 vs 4) - - vextractf128 xm5, ym3, 1 ; xmm5 = ymm3[1x128] = ymm3[255..128b] - cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] ) - - vextracti128 xm2, ym1, 1 ; xmm2 = ymm1[1x128] = ymm1[255..128b] - BLENDVPS xm3, xm5, xm0 ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128] - PBLENDVB xm1, xm2, xm0 ; p = m0 ? p[1x128] : p[0x128] -%endif - -; Merge parallel maximums round 4 (2 vs 2) - ; m3=p[3210] - movhlps xm5, xm3 ; m5=p[xx32] - cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] ) - - pshufd xm2, xm1, q3232 - BLENDVPS xm3, xm5, xm0 ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0] - PBLENDVB xm1, xm2, xm0 ; p = m0 ? p[3,2] : p[1,0] - -; Merge parallel maximums final round (1 vs 1) - shufps xm0, xm3, xm3, q1111 ; m0 = m3[1] = p[1] - cmpss xm0, xm3, 5 ; m0 = !(m0 >= m3) = !( p[1] >= p[0] ) - - pshufd xm2, xm1, q1111 - PBLENDVB xm1, xm2, xm0 - - movd dword r4d, xm1 ; zero extends to the rest of r4q - - VBROADCASTSS m3, [tmpX + r4] - %{1}ps m7, m3 ; Sxy += X[max_idx] - - VBROADCASTSS m5, [tmpY + r4] - %{1}ps m6, m5 ; Syy += Y[max_idx] - - ; We have to update a single element in Y[i] - ; However writing 4 bytes and then doing 16 byte load in the inner loop - ; could cause a stall due to breaking write forwarding. - VPBROADCASTD m1, xm1 - pcmpeqd m1, m1, m4 ; exactly 1 element matches max_idx and this finds it - - and r4d, ~(mmsize-1) ; align address down, so the value pointed by max_idx is inside a mmsize load - movaps m5, [tmpY + r4] ; m5 = Y[y3...ym...y0] - andps m1, mm_const_float_1 ; m1 = [ 0...1.0...0] - %{1}ps m5, m1 ; m5 = Y[y3...ym...y0] +/- [0...1.0...0] - movaps [tmpY + r4], m5 ; Y[max_idx] +-= 1.0; -%endmacro - -; -; We need one more register for -; PIC relative addressing. 
Use this -; to count it in cglobal -; -%ifdef PIC - %define num_pic_regs 1 -%else - %define num_pic_regs 0 -%endif - -; -; Pyramid Vector Quantization Search implementation -; -; float * inX - Unaligned (SIMD) access, it will be overread, -; but extra data is masked away. -; int32 * outY - Should be aligned and padded buffer. -; It is used as temp buffer. -; uint32 K - Number of pulses to have after quantizations. -; uint32 N - Number of vector elements. Must be 0 < N < 256 -; -%macro PVQ_FAST_SEARCH 1 -cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N -%define tmpX rsp -%define tmpY outYq - - movaps m0, [const_float_abs_mask] - shl Nd, 2 ; N *= sizeof(float); also 32 bit operation zeroes the high 32 bits in 64 bit mode. - mov r4d, Nd - - neg r4d - and r4d, mmsize-1 - - SET_PIC_BASE lea, r5, const_align_abs_edge ; rip+const - movups m2, [pic_base_const_align_abs_edge + r4 - mmsize] - - add Nd, r4d ; N = align(N, mmsize) - - lea r4d, [Nd - mmsize] ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0. - movups m1, [inXq + r4] - andps m1, m2 - movaps [tmpX + r4], m1 ; Sx = abs( X[N-1] ) - -align 16 -%%loop_abs_sum: - sub r4d, mmsize - jc %%end_loop_abs_sum - - movups m2, [inXq + r4] - andps m2, m0 - - movaps [tmpX + r4], m2 ; tmpX[i]=abs(X[i]) - addps m1, m2 ; Sx += abs(X[i]) - jmp %%loop_abs_sum - -align 16 -%%end_loop_abs_sum: - - HSUMPS m1, m2 ; m1 = Sx - - xorps m0, m0 - comiss xm0, xm1 ; - jz %%zero_input ; if (Sx==0) goto zero_input - - cvtsi2ss xm0, dword Kd ; m0 = K -%if USE_APPROXIMATION == 1 - rcpss xm1, xm1 ; m1 = approx(1/Sx) - mulss xm0, xm1 ; m0 = K*(1/Sx) -%else - divss xm0, xm1 ; b = K/Sx - ; b = K/max_x -%endif - - VBROADCASTSS m0, xm0 - - lea r4d, [Nd - mmsize] - pxor m5, m5 ; Sy ( Sum of abs( y[i]) ) - xorps m6, m6 ; Syy ( Sum of y[i]*y[i] ) - xorps m7, m7 ; Sxy ( Sum of X[i]*y[i] ) -align 16 -%%loop_guess: - movaps m1, [tmpX + r4] ; m1 = X[i] - mulps m2, m0, m1 ; m2 = res*X[i] - cvtps2dq m2, m2 ; yt = (int)lrintf( res*X[i] ) - paddd m5, m2 ; Sy += yt - cvtdq2ps m2, m2 ; yt = (float)yt - mulps m1, m2 ; m1 = X[i]*yt - movaps [tmpY + r4], m2 ; y[i] = m2 - addps m7, m1 ; Sxy += m1; - mulps m2, m2 ; m2 = yt*yt - addps m6, m2 ; Syy += m2 - - sub r4d, mmsize - jnc %%loop_guess - - HSUMPS m6, m1 ; Syy_norm - HADDD m5, m4 ; pulses - - movd dword r4d, xm5 ; zero extends to the rest of r4q - - sub Kd, r4d ; K -= pulses , also 32 bit operation zeroes high 32 bit in 64 bit mode. - jz %%finish ; K - pulses == 0 - - SET_HI_REG_MM_CONSTANT movaps, m8, const_float_0_5 - SET_HI_REG_MM_CONSTANT movaps, m9, const_float_1 - SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets - ; Use Syy/2 in distortion parameter calculations. - ; Saves pre and post-caclulation to correct Y[] values. - ; Same precision, since float mantisa is normalized. - ; The SQRT approximation does differ. 
- HSUMPS m7, m0 ; Sxy_norm - mulps m6, mm_const_float_0_5 - - jc %%remove_pulses_loop ; K - pulses < 0 - -align 16 ; K - pulses > 0 -%%add_pulses_loop: - - PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm - - sub Kd, 1 - jnz %%add_pulses_loop - - addps m6, m6 ; Syy*=2 - - jmp %%finish - -align 16 -%%remove_pulses_loop: - - PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm - - add Kd, 1 - jnz %%remove_pulses_loop - - addps m6, m6 ; Syy*=2 - -align 16 -%%finish: - lea r4d, [Nd - mmsize] - movaps m2, [const_float_sign_mask] - -align 16 -%%restore_sign_loop: - movaps m0, [tmpY + r4] ; m0 = Y[i] - movups m1, [inXq + r4] ; m1 = X[i] - andps m1, m2 ; m1 = sign(X[i]) - orps m0, m1 ; m0 = Y[i]*sign - cvtps2dq m3, m0 ; m3 = (int)m0 - movaps [outYq + r4], m3 - - sub r4d, mmsize - jnc %%restore_sign_loop -%%return: - -%if ARCH_X86_64 == 0 ; sbrdsp - movss r0m, xm6 ; return (float)Syy_norm - fld dword r0m -%else - movaps m0, m6 ; return (float)Syy_norm -%endif - - RET - -align 16 -%%zero_input: - lea r4d, [Nd - mmsize] - xorps m0, m0 -%%zero_loop: - movaps [outYq + r4], m0 - sub r4d, mmsize - jnc %%zero_loop - - movaps m6, [const_float_1] - jmp %%return -%endmacro - -; if 1, use a float op that give half precision but execute for around 3 cycles. -; On Skylake & Ryzen the division is much faster (around 11c/3), -; that makes the full precision code about 2% slower. -; Opus also does use rsqrt approximation in their intrinsics code. -%define USE_APPROXIMATION 1 - -INIT_XMM sse2 -PVQ_FAST_SEARCH _approx - -INIT_XMM sse4 -PVQ_FAST_SEARCH _approx - -%define USE_APPROXIMATION 0 - -INIT_XMM avx -PVQ_FAST_SEARCH _exact -- cgit v1.2.3
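
For readers who want to follow the data flow in celt_pvq_search.asm without decoding the SIMD, below is a rough scalar model of what the pvq_search kernels compute, pieced together from the comments in the patch above. It is only a sketch: the name pvq_search_scalar is made up for illustration, the padding/masking tricks and the rcpps approximation used by the _approx variants are omitted, and this is not FFmpeg's reference C implementation.

#include <math.h>

/* Scalar sketch of PVQ search: find an integer vector y with
 * sum(|y[i]|) == K that best matches the direction of X[0..N-1];
 * returns the final sum of y[i]*y[i] (what the asm calls Syy_norm).
 * Illustrative only. */
static float pvq_search_scalar(const float *X, int *y, int K, int N)
{
    float Sx = 0.0f, Sxy = 0.0f, Syy = 0.0f;
    int   pulses = 0;

    for (int i = 0; i < N; i++)
        Sx += fabsf(X[i]);

    if (Sx == 0.0f) {                       /* all-zero input: zero the output */
        for (int i = 0; i < N; i++)
            y[i] = 0;
        return 1.0f;
    }

    /* Initial projection guess: y[i] ~= round(K * |X[i]| / Sx). */
    float res = (float)K / Sx;
    for (int i = 0; i < N; i++) {
        int yt  = (int)lrintf(res * fabsf(X[i]));
        y[i]    = yt;
        pulses += yt;
        Sxy    += fabsf(X[i]) * yt;
        Syy    += (float)yt * yt;
    }

    /* Add (or remove) one pulse at a time, each time picking the element that
     * maximizes the distortion parameter (Sxy +/- |X[i]|)^2 / (Syy +/- 2*y[i] + 1),
     * i.e. the correlation gained per unit of added energy. */
    while (pulses != K) {
        int   dir  = (pulses < K) ? 1 : -1;
        int   best = -1;
        float best_num = 0.0f, best_den = 1.0f;

        for (int i = 0; i < N; i++) {
            if (dir < 0 && y[i] == 0)       /* no pulse to remove here */
                continue;
            float num = Sxy + dir * fabsf(X[i]);
            float den = Syy + dir * 2.0f * y[i] + 1.0f;
            num *= num;
            /* compare num/den against best_num/best_den without dividing */
            if (best < 0 || num * best_den > best_num * den) {
                best     = i;
                best_num = num;
                best_den = den;
            }
        }

        Sxy     += dir * fabsf(X[best]);
        Syy     += dir * 2.0f * y[best] + 1.0f;
        y[best] += dir;
        pulses  += dir;
    }

    /* Restore the sign of each element from X. */
    for (int i = 0; i < N; i++)
        if (X[i] < 0.0f)
            y[i] = -y[i];

    return Syy;
}

The _approx (SSE2/SSE4) builds of the asm replace the division in that metric with rcpps, trading a little precision for speed, while the _exact AVX build uses a real divide; that is what the USE_APPROXIMATION define at the bottom of the file toggles.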