From f30ff542000490d6044df0403dd8f0aee78b0106 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Sat, 21 Jul 2012 21:17:30 +0000 Subject: doc: Clarify licensing issues arising from external libraries --- LICENSE | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/LICENSE b/LICENSE index e6d25f299e..1266627f39 100644 --- a/LICENSE +++ b/LICENSE @@ -1,5 +1,5 @@ Libav: ------- +====== Most files in Libav are under the GNU Lesser General Public License version 2.1 or later (LGPL v2.1+). Read the file COPYING.LGPLv2.1 for details. Some other @@ -38,18 +38,29 @@ for you. Read the file COPYING.LGPLv3 or, if you have enabled GPL parts, COPYING.GPLv3 to learn the exact legal terms that apply in this case. -external libraries: -------------------- +external libraries +================== -Some external libraries, e.g. libx264, are under GPL and can be used in -conjunction with Libav. They require --enable-gpl to be passed to configure -as well. +Libav can be combined with a number of external libraries, which sometimes +affect the licensing of binaries resulting from the combination. -The OpenCORE external libraries are under the Apache License 2.0. That license -is incompatible with the LGPL v2.1 and the GPL v2, but not with version 3 of -those licenses. So to combine the OpenCORE libraries with Libav, the license -version needs to be upgraded by passing --enable-version3 to configure. +compatible libraries +-------------------- -The nonfree external library libfaac can be hooked up in Libav. You need to -pass --enable-nonfree to configure to enable it. Employ this option with care -as Libav then becomes nonfree and unredistributable. +The libcdio, libx264, libxavs and libxvid libraries are under GPL. When +combining them with Libav, Libav needs to be licensed as GPL as well by +passing --enable-gpl to configure. + +The OpenCORE and VisualOn libraries are under the Apache License 2.0. That +license is incompatible with the LGPL v2.1 and the GPL v2, but not with +version 3 of those licenses. So to combine these libraries with Libav, the +license version needs to be upgraded by passing --enable-version3 to configure. + +incompatible libraries +---------------------- + +The Fraunhofer AAC library, FAAC and OpenSSL are under licenses incompatible +with all (L)GPL versions. Thus, unfortunately, since both licenses cannot be +satisfied simultaneously, binaries resulting from the combination of Libav +with these libraries are nonfree and unredistributable. If you wish to enable +any of these libraries nonetheless, pass --enable-nonfree to configure. 
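To illustrate, the three cases above correspond to configure invocations along these lines. This is a sketch only: --enable-gpl, --enable-version3 and --enable-nonfree are the switches named in the text, while the per-library options (--enable-libx264, --enable-libopencore-amrnb, --enable-libfdk-aac) are examples whose exact names should be checked against ./configure --help for the tree in question.

  ./configure --enable-gpl      --enable-libx264             # GPL library: the resulting binaries are GPL
  ./configure --enable-version3 --enable-libopencore-amrnb   # Apache 2.0 library: upgrade to (L)GPL version 3
  ./configure --enable-nonfree  --enable-libfdk-aac          # incompatible library: binaries are nonfree and unredistributable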
-- cgit v1.2.3 From 3b9e832e17c3956082e589433fb0e08092cb291e Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Wed, 8 Aug 2012 00:35:43 +0200 Subject: x86: Drop silly "_yasm" suffixes from filenames --- libavcodec/x86/Makefile | 6 +- libavcodec/x86/dsputil.asm | 1373 ++++++++++++++++++++++++++++++++++++ libavcodec/x86/dsputil_yasm.asm | 1373 ------------------------------------ libavcodec/x86/dsputilenc.asm | 342 +++++++++ libavcodec/x86/dsputilenc_yasm.asm | 342 --------- libavcodec/x86/vc1dsp.asm | 320 +++++++++ libavcodec/x86/vc1dsp_yasm.asm | 320 --------- 7 files changed, 2038 insertions(+), 2038 deletions(-) create mode 100644 libavcodec/x86/dsputil.asm delete mode 100644 libavcodec/x86/dsputil_yasm.asm create mode 100644 libavcodec/x86/dsputilenc.asm delete mode 100644 libavcodec/x86/dsputilenc_yasm.asm create mode 100644 libavcodec/x86/vc1dsp.asm delete mode 100644 libavcodec/x86/vc1dsp_yasm.asm diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index eb82ef572e..57e73d8b2f 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -36,7 +36,7 @@ MMX-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp-init.o YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o YASM-OBJS-$(CONFIG_DCT) += x86/dct32_sse.o -YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o +YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc.o YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \ $(YASM-OBJS-FFT-yes) YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \ @@ -56,11 +56,11 @@ YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o YASM-OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp.o YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp.o \ x86/rv40dsp.o -YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_yasm.o +YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp.o YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o -YASM-OBJS += x86/dsputil_yasm.o \ +YASM-OBJS += x86/dsputil.o \ x86/deinterlace.o \ x86/fmtconvert.o \ diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm new file mode 100644 index 0000000000..d6cf824ecc --- /dev/null +++ b/libavcodec/x86/dsputil.asm @@ -0,0 +1,1373 @@ +;****************************************************************************** +;* MMX optimized DSP utils +;* Copyright (c) 2008 Loren Merritt +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA +pb_f: times 16 db 15 +pb_zzzzzzzz77777777: times 8 db -1 +pb_7: times 8 db 7 +pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11 +pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 +pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0 +pd_16384: times 4 dd 16384 +pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 + +SECTION_TEXT + +%macro SCALARPRODUCT 1 +; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order) +cglobal scalarproduct_int16_%1, 3,3,3, v1, v2, order + shl orderq, 1 + add v1q, orderq + add v2q, orderq + neg orderq + pxor m2, m2 +.loop: + movu m0, [v1q + orderq] + movu m1, [v1q + orderq + mmsize] + pmaddwd m0, [v2q + orderq] + pmaddwd m1, [v2q + orderq + mmsize] + paddd m2, m0 + paddd m2, m1 + add orderq, mmsize*2 + jl .loop +%if mmsize == 16 + movhlps m0, m2 + paddd m2, m0 + pshuflw m0, m2, 0x4e +%else + pshufw m0, m2, 0x4e +%endif + paddd m2, m0 + movd eax, m2 + RET + +; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul) +cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul + shl orderq, 1 + movd m7, mulm +%if mmsize == 16 + pshuflw m7, m7, 0 + punpcklqdq m7, m7 +%else + pshufw m7, m7, 0 +%endif + pxor m6, m6 + add v1q, orderq + add v2q, orderq + add v3q, orderq + neg orderq +.loop: + movu m0, [v2q + orderq] + movu m1, [v2q + orderq + mmsize] + mova m4, [v1q + orderq] + mova m5, [v1q + orderq + mmsize] + movu m2, [v3q + orderq] + movu m3, [v3q + orderq + mmsize] + pmaddwd m0, m4 + pmaddwd m1, m5 + pmullw m2, m7 + pmullw m3, m7 + paddd m6, m0 + paddd m6, m1 + paddw m2, m4 + paddw m3, m5 + mova [v1q + orderq], m2 + mova [v1q + orderq + mmsize], m3 + add orderq, mmsize*2 + jl .loop +%if mmsize == 16 + movhlps m0, m6 + paddd m6, m0 + pshuflw m0, m6, 0x4e +%else + pshufw m0, m6, 0x4e +%endif + paddd m6, m0 + movd eax, m6 + RET +%endmacro + +INIT_MMX +SCALARPRODUCT mmx2 +INIT_XMM +SCALARPRODUCT sse2 + +%macro SCALARPRODUCT_LOOP 1 +align 16 +.loop%1: + sub orderq, mmsize*2 +%if %1 + mova m1, m4 + mova m4, [v2q + orderq] + mova m0, [v2q + orderq + mmsize] + palignr m1, m0, %1 + palignr m0, m4, %1 + mova m3, m5 + mova m5, [v3q + orderq] + mova m2, [v3q + orderq + mmsize] + palignr m3, m2, %1 + palignr m2, m5, %1 +%else + mova m0, [v2q + orderq] + mova m1, [v2q + orderq + mmsize] + mova m2, [v3q + orderq] + mova m3, [v3q + orderq + mmsize] +%endif + %define t0 [v1q + orderq] + %define t1 [v1q + orderq + mmsize] +%if ARCH_X86_64 + mova m8, t0 + mova m9, t1 + %define t0 m8 + %define t1 m9 +%endif + pmaddwd m0, t0 + pmaddwd m1, t1 + pmullw m2, m7 + pmullw m3, m7 + paddw m2, t0 + paddw m3, t1 + paddd m6, m0 + paddd m6, m1 + mova [v1q + orderq], m2 + mova [v1q + orderq + mmsize], m3 + jg .loop%1 +%if %1 + jmp .end +%endif +%endmacro + +; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul) +cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul + shl orderq, 1 + movd m7, mulm + pshuflw m7, m7, 0 + punpcklqdq m7, m7 + pxor m6, m6 + mov r4d, v2d + and r4d, 15 + and v2q, ~15 + and v3q, ~15 + mova m4, [v2q + orderq] + mova m5, [v3q + orderq] + ; linear is faster 
than branch tree or jump table, because the branches taken are cyclic (i.e. predictable) + cmp r4d, 0 + je .loop0 + cmp r4d, 2 + je .loop2 + cmp r4d, 4 + je .loop4 + cmp r4d, 6 + je .loop6 + cmp r4d, 8 + je .loop8 + cmp r4d, 10 + je .loop10 + cmp r4d, 12 + je .loop12 +SCALARPRODUCT_LOOP 14 +SCALARPRODUCT_LOOP 12 +SCALARPRODUCT_LOOP 10 +SCALARPRODUCT_LOOP 8 +SCALARPRODUCT_LOOP 6 +SCALARPRODUCT_LOOP 4 +SCALARPRODUCT_LOOP 2 +SCALARPRODUCT_LOOP 0 +.end: + movhlps m0, m6 + paddd m6, m0 + pshuflw m0, m6, 0x4e + paddd m6, m0 + movd eax, m6 + RET + + +;----------------------------------------------------------------------------- +; void ff_apply_window_int16(int16_t *output, const int16_t *input, +; const int16_t *window, unsigned int len) +;----------------------------------------------------------------------------- + +%macro REVERSE_WORDS_MMXEXT 1-2 + pshufw %1, %1, 0x1B +%endmacro + +%macro REVERSE_WORDS_SSE2 1-2 + pshuflw %1, %1, 0x1B + pshufhw %1, %1, 0x1B + pshufd %1, %1, 0x4E +%endmacro + +%macro REVERSE_WORDS_SSSE3 2 + pshufb %1, %2 +%endmacro + +; dst = (dst * src) >> 15 +; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back +; in from the pmullw result. +%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp + mova %3, %1 + pmulhw %1, %2 + pmullw %3, %2 + psrlw %3, 15 + psllw %1, 1 + por %1, %3 +%endmacro + +; dst = ((dst * src) + (1<<14)) >> 15 +%macro MUL16FIXED_SSSE3 3 ; dst, src, unused + pmulhrsw %1, %2 +%endmacro + +%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3 +cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2 + lea offset2q, [offsetq-mmsize] +%if %2 + mova m5, [pd_16384] +%elifidn %1, ssse3 + mova m5, [pb_revwords] + ALIGN 16 +%endif +.loop: +%if %2 + ; This version expands 16-bit to 32-bit, multiplies by the window, + ; adds 16384 for rounding, right shifts 15, then repacks back to words to + ; save to the output. The window is reversed for the second half. + mova m3, [windowq+offset2q] + mova m4, [ inputq+offset2q] + pxor m0, m0 + punpcklwd m0, m3 + punpcklwd m1, m4 + pmaddwd m0, m1 + paddd m0, m5 + psrad m0, 15 + pxor m2, m2 + punpckhwd m2, m3 + punpckhwd m1, m4 + pmaddwd m2, m1 + paddd m2, m5 + psrad m2, 15 + packssdw m0, m2 + mova [outputq+offset2q], m0 + REVERSE_WORDS m3 + mova m4, [ inputq+offsetq] + pxor m0, m0 + punpcklwd m0, m3 + punpcklwd m1, m4 + pmaddwd m0, m1 + paddd m0, m5 + psrad m0, 15 + pxor m2, m2 + punpckhwd m2, m3 + punpckhwd m1, m4 + pmaddwd m2, m1 + paddd m2, m5 + psrad m2, 15 + packssdw m0, m2 + mova [outputq+offsetq], m0 +%elif %3 + ; This version does the 16x16->16 multiplication in-place without expanding + ; to 32-bit. The ssse3 version is bit-identical. + mova m0, [windowq+offset2q] + mova m1, [ inputq+offset2q] + pmulhrsw m1, m0 + REVERSE_WORDS m0, m5 + pmulhrsw m0, [ inputq+offsetq ] + mova [outputq+offset2q], m1 + mova [outputq+offsetq ], m0 +%else + ; This version does the 16x16->16 multiplication in-place without expanding + ; to 32-bit. The mmxext and sse2 versions do not use rounding, and + ; therefore are not bit-identical to the C version. 
+ mova m0, [windowq+offset2q] + mova m1, [ inputq+offset2q] + mova m2, [ inputq+offsetq ] + MUL16FIXED m1, m0, m3 + REVERSE_WORDS m0 + MUL16FIXED m2, m0, m3 + mova [outputq+offset2q], m1 + mova [outputq+offsetq ], m2 +%endif + add offsetd, mmsize + sub offset2d, mmsize + jae .loop + REP_RET +%endmacro + +INIT_MMX +%define REVERSE_WORDS REVERSE_WORDS_MMXEXT +%define MUL16FIXED MUL16FIXED_MMXEXT +APPLY_WINDOW_INT16 mmxext, 0, 0 +APPLY_WINDOW_INT16 mmxext_ba, 1, 0 +INIT_XMM +%define REVERSE_WORDS REVERSE_WORDS_SSE2 +APPLY_WINDOW_INT16 sse2, 0, 0 +APPLY_WINDOW_INT16 sse2_ba, 1, 0 +APPLY_WINDOW_INT16 ssse3_atom, 0, 1 +%define REVERSE_WORDS REVERSE_WORDS_SSSE3 +APPLY_WINDOW_INT16 ssse3, 0, 1 + + +; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) +cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top + movq mm0, [topq] + movq mm2, mm0 + movd mm4, [left_topq] + psllq mm2, 8 + movq mm1, mm0 + por mm4, mm2 + movd mm3, [leftq] + psubb mm0, mm4 ; t-tl + add dstq, wq + add topq, wq + add diffq, wq + neg wq + jmp .skip +.loop: + movq mm4, [topq+wq] + movq mm0, mm4 + psllq mm4, 8 + por mm4, mm1 + movq mm1, mm0 ; t + psubb mm0, mm4 ; t-tl +.skip: + movq mm2, [diffq+wq] +%assign i 0 +%rep 8 + movq mm4, mm0 + paddb mm4, mm3 ; t-tl+l + movq mm5, mm3 + pmaxub mm3, mm1 + pminub mm5, mm1 + pminub mm3, mm4 + pmaxub mm3, mm5 ; median + paddb mm3, mm2 ; +residual +%if i==0 + movq mm7, mm3 + psllq mm7, 56 +%else + movq mm6, mm3 + psrlq mm7, 8 + psllq mm6, 56 + por mm7, mm6 +%endif +%if i<7 + psrlq mm0, 8 + psrlq mm1, 8 + psrlq mm2, 8 +%endif +%assign i i+1 +%endrep + movq [dstq+wq], mm7 + add wq, 8 + jl .loop + movzx r2d, byte [dstq-1] + mov [leftq], r2d + movzx r2d, byte [topq-1] + mov [left_topq], r2d + RET + + +%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned + add srcq, wq + add dstq, wq + neg wq +%%.loop: +%if %2 + mova m1, [srcq+wq] +%else + movu m1, [srcq+wq] +%endif + mova m2, m1 + psllw m1, 8 + paddb m1, m2 + mova m2, m1 + pshufb m1, m3 + paddb m1, m2 + pshufb m0, m5 + mova m2, m1 + pshufb m1, m4 + paddb m1, m2 +%if mmsize == 16 + mova m2, m1 + pshufb m1, m6 + paddb m1, m2 +%endif + paddb m0, m1 +%if %1 + mova [dstq+wq], m0 +%else + movq [dstq+wq], m0 + movhps [dstq+wq+8], m0 +%endif + add wq, mmsize + jl %%.loop + mov eax, mmsize-1 + sub eax, wd + movd m1, eax + pshufb m0, m1 + movd eax, m0 + RET +%endmacro + +; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left) +INIT_MMX +cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left +.skip_prologue: + mova m5, [pb_7] + mova m4, [pb_zzzz3333zzzzbbbb] + mova m3, [pb_zz11zz55zz99zzdd] + movd m0, leftm + psllq m0, 56 + ADD_HFYU_LEFT_LOOP 1, 1 + +INIT_XMM +cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left + mova m5, [pb_f] + mova m6, [pb_zzzzzzzz77777777] + mova m4, [pb_zzzz3333zzzzbbbb] + mova m3, [pb_zz11zz55zz99zzdd] + movd m0, leftm + pslldq m0, 15 + test srcq, 15 + jnz .src_unaligned + test dstq, 15 + jnz .dst_unaligned + ADD_HFYU_LEFT_LOOP 1, 1 +.dst_unaligned: + ADD_HFYU_LEFT_LOOP 0, 1 +.src_unaligned: + ADD_HFYU_LEFT_LOOP 0, 0 + + +; float scalarproduct_float_sse(const float *v1, const float *v2, int len) +cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset + neg offsetq + shl offsetq, 2 + sub v1q, offsetq + sub v2q, offsetq + xorps xmm0, xmm0 + .loop: + movaps xmm1, [v1q+offsetq] + mulps xmm1, [v2q+offsetq] + addps xmm0, xmm1 + add offsetq, 16 + js .loop + movhlps xmm1, xmm0 + 
addps xmm0, xmm1 + movss xmm1, xmm0 + shufps xmm0, xmm0, 1 + addss xmm0, xmm1 +%if ARCH_X86_64 == 0 + movss r0m, xmm0 + fld dword r0m +%endif + RET + +; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize, +; x86_reg start_y, x86_reg end_y, x86_reg block_h, +; x86_reg start_x, x86_reg end_x, x86_reg block_w); +; +; The actual function itself is below. It basically wraps a very simple +; w = end_x - start_x +; if (w) { +; if (w > 22) { +; jump to the slow loop functions +; } else { +; jump to the fast loop functions +; } +; } +; +; ... and then the same for left/right extend also. See below for loop +; function implementations. Fast are fixed-width, slow is variable-width + +%macro EMU_EDGE_FUNC 0 +%if ARCH_X86_64 +%define w_reg r7 +cglobal emu_edge_core, 6, 9, 1 + mov r8, r5 ; save block_h +%else +%define w_reg r6 +cglobal emu_edge_core, 2, 7, 0 + mov r4, r4m ; end_y + mov r5, r5m ; block_h +%endif + + ; start with vertical extend (top/bottom) and body pixel copy + mov w_reg, r7m + sub w_reg, r6m ; w = start_x - end_x + sub r5, r4 +%if ARCH_X86_64 + sub r4, r3 +%else + sub r4, dword r3m +%endif + cmp w_reg, 22 + jg .slow_v_extend_loop +%if ARCH_X86_32 + mov r2, r2m ; linesize +%endif + sal w_reg, 7 ; w * 128 +%ifdef PIC + lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)] + add w_reg, rax +%else + lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg] +%endif + call w_reg ; fast top extend, body copy and bottom extend +.v_extend_end: + + ; horizontal extend (left/right) + mov w_reg, r6m ; start_x + sub r0, w_reg +%if ARCH_X86_64 + mov r3, r0 ; backup of buf+block_h*linesize + mov r5, r8 +%else + mov r0m, r0 ; backup of buf+block_h*linesize + mov r5, r5m +%endif + test w_reg, w_reg + jz .right_extend + cmp w_reg, 22 + jg .slow_left_extend_loop + mov r1, w_reg + dec w_reg + ; FIXME we can do a if size == 1 here if that makes any speed difference, test me + sar w_reg, 1 + sal w_reg, 6 + ; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs + ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h +%ifdef PIC + lea rax, [.emuedge_extend_left_2] + add w_reg, rax +%else + lea w_reg, [.emuedge_extend_left_2+w_reg] +%endif + call w_reg + + ; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w +.right_extend: +%if ARCH_X86_32 + mov r0, r0m + mov r5, r5m +%endif + mov w_reg, r7m ; end_x + mov r1, r8m ; block_w + mov r4, r1 + sub r1, w_reg + jz .h_extend_end ; if (end_x == block_w) goto h_extend_end + cmp r1, 22 + jg .slow_right_extend_loop + dec r1 + ; FIXME we can do a if size == 1 here if that makes any speed difference, test me + sar r1, 1 + sal r1, 6 +%ifdef PIC + lea rax, [.emuedge_extend_right_2] + add r1, rax +%else + lea r1, [.emuedge_extend_right_2+r1] +%endif + call r1 +.h_extend_end: + RET + +%if ARCH_X86_64 +%define vall al +%define valh ah +%define valw ax +%define valw2 r7w +%define valw3 r3w +%if WIN64 +%define valw4 r7w +%else ; unix64 +%define valw4 r3w +%endif +%define vald eax +%else +%define vall bl +%define valh bh +%define valw bx +%define valw2 r6w +%define valw3 valw2 +%define valw4 valw3 +%define vald ebx +%define stack_offset 0x14 +%endif + +%endmacro + +; macro to read/write a horizontal number of pixels (%2) to/from registers +; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels +; - if (%2 & 15 == 8) fills the last 8 bytes into rax +; - else if (%2 & 8) fills 8 bytes into mm0 +; - if (%2 & 7 == 4) fills the last 4 bytes into rax +; - 
else if (%2 & 4) fills 4 bytes into mm0-1 +; - if (%2 & 3 == 3) fills 2 bytes into r7/r3, and 1 into eax +; (note that we're using r3 for body/bottom because it's a shorter +; opcode, and then the loop fits in 128 bytes) +; - else fills remaining bytes into rax +; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels +; - if (%2 & 7 == 4) fills 4 bytes into ebx +; - else if (%2 & 4) fills 4 bytes into mm0-7 +; - if (%2 & 3 == 3) fills 2 bytes into r6, and 1 into ebx +; - else fills remaining bytes into ebx +; writing data out is in the same way +%macro READ_NUM_BYTES 2 +%assign %%src_off 0 ; offset in source buffer +%assign %%smidx 0 ; mmx register idx +%assign %%sxidx 0 ; xmm register idx + +%if cpuflag(sse) +%rep %2/16 + movups xmm %+ %%sxidx, [r1+%%src_off] +%assign %%src_off %%src_off+16 +%assign %%sxidx %%sxidx+1 +%endrep ; %2/16 +%endif + +%if ARCH_X86_64 +%if (%2-%%src_off) == 8 + mov rax, [r1+%%src_off] +%assign %%src_off %%src_off+8 +%endif ; (%2-%%src_off) == 8 +%endif ; x86-64 + +%rep (%2-%%src_off)/8 + movq mm %+ %%smidx, [r1+%%src_off] +%assign %%src_off %%src_off+8 +%assign %%smidx %%smidx+1 +%endrep ; (%2-%%dst_off)/8 + +%if (%2-%%src_off) == 4 + mov vald, [r1+%%src_off] +%elif (%2-%%src_off) & 4 + movd mm %+ %%smidx, [r1+%%src_off] +%assign %%src_off %%src_off+4 +%endif ; (%2-%%src_off) ==/& 4 + +%if (%2-%%src_off) == 1 + mov vall, [r1+%%src_off] +%elif (%2-%%src_off) == 2 + mov valw, [r1+%%src_off] +%elif (%2-%%src_off) == 3 +%ifidn %1, top + mov valw2, [r1+%%src_off] +%elifidn %1, body + mov valw3, [r1+%%src_off] +%elifidn %1, bottom + mov valw4, [r1+%%src_off] +%endif ; %1 ==/!= top + mov vall, [r1+%%src_off+2] +%endif ; (%2-%%src_off) == 1/2/3 +%endmacro ; READ_NUM_BYTES + +%macro WRITE_NUM_BYTES 2 +%assign %%dst_off 0 ; offset in destination buffer +%assign %%dmidx 0 ; mmx register idx +%assign %%dxidx 0 ; xmm register idx + +%if cpuflag(sse) +%rep %2/16 + movups [r0+%%dst_off], xmm %+ %%dxidx +%assign %%dst_off %%dst_off+16 +%assign %%dxidx %%dxidx+1 +%endrep ; %2/16 +%endif + +%if ARCH_X86_64 +%if (%2-%%dst_off) == 8 + mov [r0+%%dst_off], rax +%assign %%dst_off %%dst_off+8 +%endif ; (%2-%%dst_off) == 8 +%endif ; x86-64 + +%rep (%2-%%dst_off)/8 + movq [r0+%%dst_off], mm %+ %%dmidx +%assign %%dst_off %%dst_off+8 +%assign %%dmidx %%dmidx+1 +%endrep ; (%2-%%dst_off)/8 + +%if (%2-%%dst_off) == 4 + mov [r0+%%dst_off], vald +%elif (%2-%%dst_off) & 4 + movd [r0+%%dst_off], mm %+ %%dmidx +%assign %%dst_off %%dst_off+4 +%endif ; (%2-%%dst_off) ==/& 4 + +%if (%2-%%dst_off) == 1 + mov [r0+%%dst_off], vall +%elif (%2-%%dst_off) == 2 + mov [r0+%%dst_off], valw +%elif (%2-%%dst_off) == 3 +%ifidn %1, top + mov [r0+%%dst_off], valw2 +%elifidn %1, body + mov [r0+%%dst_off], valw3 +%elifidn %1, bottom + mov [r0+%%dst_off], valw4 +%endif ; %1 ==/!= top + mov [r0+%%dst_off+2], vall +%endif ; (%2-%%dst_off) == 1/2/3 +%endmacro ; WRITE_NUM_BYTES + +; vertical top/bottom extend and body copy fast loops +; these are function pointers to set-width line copy functions, i.e. 
+; they read a fixed number of pixels into set registers, and write +; those out into the destination buffer +; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h +; r6(eax/64)/r3(ebx/32)=val_reg +%macro VERTICAL_EXTEND 0 +%assign %%n 1 +%rep 22 +ALIGN 128 +.emuedge_v_extend_ %+ %%n: + ; extend pixels above body +%if ARCH_X86_64 + test r3 , r3 ; if (!start_y) + jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body +%else ; ARCH_X86_32 + cmp dword r3m, 0 + je .emuedge_copy_body_ %+ %%n %+ _loop +%endif ; ARCH_X86_64/32 + READ_NUM_BYTES top, %%n ; read bytes +.emuedge_extend_top_ %+ %%n %+ _loop: ; do { + WRITE_NUM_BYTES top, %%n ; write bytes + add r0 , r2 ; dst += linesize +%if ARCH_X86_64 + dec r3d +%else ; ARCH_X86_32 + dec dword r3m +%endif ; ARCH_X86_64/32 + jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y) + + ; copy body pixels +.emuedge_copy_body_ %+ %%n %+ _loop: ; do { + READ_NUM_BYTES body, %%n ; read bytes + WRITE_NUM_BYTES body, %%n ; write bytes + add r0 , r2 ; dst += linesize + add r1 , r2 ; src += linesize + dec r4d + jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y) + + ; copy bottom pixels + test r5 , r5 ; if (!block_h) + jz .emuedge_v_extend_end_ %+ %%n ; goto end + sub r1 , r2 ; src -= linesize + READ_NUM_BYTES bottom, %%n ; read bytes +.emuedge_extend_bottom_ %+ %%n %+ _loop: ; do { + WRITE_NUM_BYTES bottom, %%n ; write bytes + add r0 , r2 ; dst += linesize + dec r5d + jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h) + +.emuedge_v_extend_end_ %+ %%n: +%if ARCH_X86_64 + ret +%else ; ARCH_X86_32 + rep ret +%endif ; ARCH_X86_64/32 +%assign %%n %%n+1 +%endrep +%endmacro VERTICAL_EXTEND + +; left/right (horizontal) fast extend functions +; these are essentially identical to the vertical extend ones above, +; just left/right separated because number of pixels to extend is +; obviously not the same on both sides. 
+; for reading, pixels are placed in eax (x86-64) or ebx (x86-64) in the +; lowest two bytes of the register (so val*0x0101), and are splatted +; into each byte of mm0 as well if n_pixels >= 8 + +%macro READ_V_PIXEL 2 + mov vall, %2 + mov valh, vall +%if %1 >= 8 + movd mm0, vald +%if cpuflag(mmx2) + pshufw mm0, mm0, 0 +%else ; mmx + punpcklwd mm0, mm0 + punpckldq mm0, mm0 +%endif ; sse +%endif ; %1 >= 8 +%endmacro + +%macro WRITE_V_PIXEL 2 +%assign %%dst_off 0 +%rep %1/8 + movq [%2+%%dst_off], mm0 +%assign %%dst_off %%dst_off+8 +%endrep +%if %1 & 4 +%if %1 >= 8 + movd [%2+%%dst_off], mm0 +%else ; %1 < 8 + mov [%2+%%dst_off] , valw + mov [%2+%%dst_off+2], valw +%endif ; %1 >=/< 8 +%assign %%dst_off %%dst_off+4 +%endif ; %1 & 4 +%if %1&2 + mov [%2+%%dst_off], valw +%endif ; %1 & 2 +%endmacro + +; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val +%macro LEFT_EXTEND 0 +%assign %%n 2 +%rep 11 +ALIGN 64 +.emuedge_extend_left_ %+ %%n: ; do { + sub r0, r2 ; dst -= linesize + READ_V_PIXEL %%n, [r0+r1] ; read pixels + WRITE_V_PIXEL %%n, r0 ; write pixels + dec r5 + jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h) +%if ARCH_X86_64 + ret +%else ; ARCH_X86_32 + rep ret +%endif ; ARCH_X86_64/32 +%assign %%n %%n+2 +%endrep +%endmacro ; LEFT_EXTEND + +; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val +%macro RIGHT_EXTEND 0 +%assign %%n 2 +%rep 11 +ALIGN 64 +.emuedge_extend_right_ %+ %%n: ; do { +%if ARCH_X86_64 + sub r3, r2 ; dst -= linesize + READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels + WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels + dec r8 +%else ; ARCH_X86_32 + sub r0, r2 ; dst -= linesize + READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels + WRITE_V_PIXEL %%n, r0+r4-%%n ; write pixels + dec r5 +%endif ; ARCH_X86_64/32 + jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h) +%if ARCH_X86_64 + ret +%else ; ARCH_X86_32 + rep ret +%endif ; ARCH_X86_64/32 +%assign %%n %%n+2 +%endrep + +%if ARCH_X86_32 +%define stack_offset 0x10 +%endif +%endmacro ; RIGHT_EXTEND + +; below follow the "slow" copy/extend functions, these act on a non-fixed +; width specified in a register, and run a loop to copy the full amount +; of bytes. They are optimized for copying of large amounts of pixels per +; line, so they unconditionally splat data into mm registers to copy 8 +; bytes per loop iteration. It could be considered to use xmm for x86-64 +; also, but I haven't optimized this as much (i.e. 
FIXME) +%macro V_COPY_NPX 4-5 +%if %0 == 4 + test w_reg, %4 + jz .%1_skip_%4_px +%else ; %0 == 5 +.%1_%4_px_loop: +%endif + %3 %2, [r1+cnt_reg] + %3 [r0+cnt_reg], %2 + add cnt_reg, %4 +%if %0 == 5 + sub w_reg, %4 + test w_reg, %5 + jnz .%1_%4_px_loop +%endif +.%1_skip_%4_px: +%endmacro + +%macro V_COPY_ROW 2 +%ifidn %1, bottom + sub r1, linesize +%endif +.%1_copy_loop: + xor cnt_reg, cnt_reg +%if notcpuflag(sse) +%define linesize r2m + V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8 +%else ; sse + V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0 +%if ARCH_X86_64 +%define linesize r2 + V_COPY_NPX %1, rax , mov, 8 +%else ; ARCH_X86_32 +%define linesize r2m + V_COPY_NPX %1, mm0, movq, 8 +%endif ; ARCH_X86_64/32 +%endif ; sse + V_COPY_NPX %1, vald, mov, 4 + V_COPY_NPX %1, valw, mov, 2 + V_COPY_NPX %1, vall, mov, 1 + mov w_reg, cnt_reg +%ifidn %1, body + add r1, linesize +%endif + add r0, linesize + dec %2 + jnz .%1_copy_loop +%endmacro + +%macro SLOW_V_EXTEND 0 +.slow_v_extend_loop: +; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h +; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x +%if ARCH_X86_64 + push r8 ; save old value of block_h + test r3, r3 +%define cnt_reg r8 + jz .do_body_copy ; if (!start_y) goto do_body_copy + V_COPY_ROW top, r3 +%else + cmp dword r3m, 0 +%define cnt_reg r2 + je .do_body_copy ; if (!start_y) goto do_body_copy + V_COPY_ROW top, dword r3m +%endif + +.do_body_copy: + V_COPY_ROW body, r4 + +%if ARCH_X86_64 + pop r8 ; restore old value of block_h +%define cnt_reg r3 +%endif + test r5, r5 +%if ARCH_X86_64 + jz .v_extend_end +%else + jz .skip_bottom_extend +%endif + V_COPY_ROW bottom, r5 +%if ARCH_X86_32 +.skip_bottom_extend: + mov r2, r2m +%endif + jmp .v_extend_end +%endmacro + +%macro SLOW_LEFT_EXTEND 0 +.slow_left_extend_loop: +; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x + mov r4, 8 + sub r0, linesize + READ_V_PIXEL 8, [r0+w_reg] +.left_extend_8px_loop: + movq [r0+r4-8], mm0 + add r4, 8 + cmp r4, w_reg + jle .left_extend_8px_loop + sub r4, 8 + cmp r4, w_reg + jge .left_extend_loop_end +.left_extend_2px_loop: + mov [r0+r4], valw + add r4, 2 + cmp r4, w_reg + jl .left_extend_2px_loop +.left_extend_loop_end: + dec r5 + jnz .slow_left_extend_loop +%if ARCH_X86_32 + mov r2, r2m +%endif + jmp .right_extend +%endmacro + +%macro SLOW_RIGHT_EXTEND 0 +.slow_right_extend_loop: +; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h, +; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr +%if ARCH_X86_64 +%define buf_reg r3 +%define bh_reg r8 +%else +%define buf_reg r0 +%define bh_reg r5 +%endif + lea r1, [r4-8] + sub buf_reg, linesize + READ_V_PIXEL 8, [buf_reg+w_reg-1] +.right_extend_8px_loop: + movq [buf_reg+r1], mm0 + sub r1, 8 + cmp r1, w_reg + jge .right_extend_8px_loop + add r1, 8 + cmp r1, w_reg + je .right_extend_loop_end +.right_extend_2px_loop: + sub r1, 2 + mov [buf_reg+r1], valw + cmp r1, w_reg + jg .right_extend_2px_loop +.right_extend_loop_end: + dec bh_reg + jnz .slow_right_extend_loop + jmp .h_extend_end +%endmacro + +%macro emu_edge 1 +INIT_XMM %1 +EMU_EDGE_FUNC +VERTICAL_EXTEND +LEFT_EXTEND +RIGHT_EXTEND +SLOW_V_EXTEND +SLOW_LEFT_EXTEND +SLOW_RIGHT_EXTEND +%endmacro + +emu_edge sse +%if ARCH_X86_32 +emu_edge mmx +%endif + +;----------------------------------------------------------------------------- +; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min, +; int32_t max, unsigned int len) 
+;----------------------------------------------------------------------------- + +; %1 = number of xmm registers used +; %2 = number of inline load/process/store loops per asm loop +; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop +; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2) +; %5 = suffix +%macro VECTOR_CLIP_INT32 4-5 +cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len +%if %4 + cvtsi2ss m4, minm + cvtsi2ss m5, maxm +%else + movd m4, minm + movd m5, maxm +%endif + SPLATD m4 + SPLATD m5 +.loop: +%assign %%i 1 +%rep %2 + mova m0, [srcq+mmsize*0*%%i] + mova m1, [srcq+mmsize*1*%%i] + mova m2, [srcq+mmsize*2*%%i] + mova m3, [srcq+mmsize*3*%%i] +%if %3 + mova m7, [srcq+mmsize*4*%%i] + mova m8, [srcq+mmsize*5*%%i] + mova m9, [srcq+mmsize*6*%%i] + mova m10, [srcq+mmsize*7*%%i] +%endif + CLIPD m0, m4, m5, m6 + CLIPD m1, m4, m5, m6 + CLIPD m2, m4, m5, m6 + CLIPD m3, m4, m5, m6 +%if %3 + CLIPD m7, m4, m5, m6 + CLIPD m8, m4, m5, m6 + CLIPD m9, m4, m5, m6 + CLIPD m10, m4, m5, m6 +%endif + mova [dstq+mmsize*0*%%i], m0 + mova [dstq+mmsize*1*%%i], m1 + mova [dstq+mmsize*2*%%i], m2 + mova [dstq+mmsize*3*%%i], m3 +%if %3 + mova [dstq+mmsize*4*%%i], m7 + mova [dstq+mmsize*5*%%i], m8 + mova [dstq+mmsize*6*%%i], m9 + mova [dstq+mmsize*7*%%i], m10 +%endif +%assign %%i %%i+1 +%endrep + add srcq, mmsize*4*(%2+%3) + add dstq, mmsize*4*(%2+%3) + sub lend, mmsize*(%2+%3) + jg .loop + REP_RET +%endmacro + +INIT_MMX mmx +%define SPLATD SPLATD_MMX +%define CLIPD CLIPD_MMX +VECTOR_CLIP_INT32 0, 1, 0, 0 +INIT_XMM sse2 +%define SPLATD SPLATD_SSE2 +VECTOR_CLIP_INT32 6, 1, 0, 0, _int +%define CLIPD CLIPD_SSE2 +VECTOR_CLIP_INT32 6, 2, 0, 1 +INIT_XMM sse4 +%define CLIPD CLIPD_SSE41 +%ifdef m8 +VECTOR_CLIP_INT32 11, 1, 1, 0 +%else +VECTOR_CLIP_INT32 6, 1, 0, 0 +%endif + +;----------------------------------------------------------------------------- +; void vector_fmul_reverse(float *dst, const float *src0, const float *src1, +; int len) +;----------------------------------------------------------------------------- +%macro VECTOR_FMUL_REVERSE 0 +cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len + lea lenq, [lend*4 - 2*mmsize] +ALIGN 16 +.loop: +%if cpuflag(avx) + vmovaps xmm0, [src1q + 16] + vinsertf128 m0, m0, [src1q], 1 + vshufps m0, m0, m0, q0123 + vmovaps xmm1, [src1q + mmsize + 16] + vinsertf128 m1, m1, [src1q + mmsize], 1 + vshufps m1, m1, m1, q0123 +%else + mova m0, [src1q] + mova m1, [src1q + mmsize] + shufps m0, m0, q0123 + shufps m1, m1, q0123 +%endif + mulps m0, m0, [src0q + lenq + mmsize] + mulps m1, m1, [src0q + lenq] + mova [dstq + lenq + mmsize], m0 + mova [dstq + lenq], m1 + add src1q, 2*mmsize + sub lenq, 2*mmsize + jge .loop + REP_RET +%endmacro + +INIT_XMM sse +VECTOR_FMUL_REVERSE +%if HAVE_AVX +INIT_YMM avx +VECTOR_FMUL_REVERSE +%endif + +;----------------------------------------------------------------------------- +; vector_fmul_add(float *dst, const float *src0, const float *src1, +; const float *src2, int len) +;----------------------------------------------------------------------------- +%macro VECTOR_FMUL_ADD 0 +cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len + lea lenq, [lend*4 - 2*mmsize] +ALIGN 16 +.loop: + mova m0, [src0q + lenq] + mova m1, [src0q + lenq + mmsize] + mulps m0, m0, [src1q + lenq] + mulps m1, m1, [src1q + lenq + mmsize] + addps m0, m0, [src2q + lenq] + addps m1, m1, [src2q + lenq + mmsize] + mova [dstq + lenq], m0 + mova [dstq + lenq + mmsize], m1 + + sub lenq, 2*mmsize + jge .loop + REP_RET +%endmacro + 
+INIT_XMM sse +VECTOR_FMUL_ADD +%if HAVE_AVX +INIT_YMM avx +VECTOR_FMUL_ADD +%endif + +;----------------------------------------------------------------------------- +; void ff_butterflies_float_interleave(float *dst, const float *src0, +; const float *src1, int len); +;----------------------------------------------------------------------------- + +%macro BUTTERFLIES_FLOAT_INTERLEAVE 0 +cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len +%if ARCH_X86_64 + movsxd lenq, lend +%endif + test lenq, lenq + jz .end + shl lenq, 2 + lea src0q, [src0q + lenq] + lea src1q, [src1q + lenq] + lea dstq, [ dstq + 2*lenq] + neg lenq +.loop: + mova m0, [src0q + lenq] + mova m1, [src1q + lenq] + subps m2, m0, m1 + addps m0, m0, m1 + unpcklps m1, m0, m2 + unpckhps m0, m0, m2 +%if cpuflag(avx) + vextractf128 [dstq + 2*lenq ], m1, 0 + vextractf128 [dstq + 2*lenq + 16], m0, 0 + vextractf128 [dstq + 2*lenq + 32], m1, 1 + vextractf128 [dstq + 2*lenq + 48], m0, 1 +%else + mova [dstq + 2*lenq ], m1 + mova [dstq + 2*lenq + mmsize], m0 +%endif + add lenq, mmsize + jl .loop +.end: + REP_RET +%endmacro + +INIT_XMM sse +BUTTERFLIES_FLOAT_INTERLEAVE +%if HAVE_AVX +INIT_YMM avx +BUTTERFLIES_FLOAT_INTERLEAVE +%endif + +INIT_XMM sse2 +; %1 = aligned/unaligned +%macro BSWAP_LOOPS_SSE2 1 + mov r3, r2 + sar r2, 3 + jz .left4_%1 +.loop8_%1: + mov%1 m0, [r1 + 0] + mov%1 m1, [r1 + 16] + pshuflw m0, m0, 10110001b + pshuflw m1, m1, 10110001b + pshufhw m0, m0, 10110001b + pshufhw m1, m1, 10110001b + mova m2, m0 + mova m3, m1 + psllw m0, 8 + psllw m1, 8 + psrlw m2, 8 + psrlw m3, 8 + por m2, m0 + por m3, m1 + mova [r0 + 0], m2 + mova [r0 + 16], m3 + add r1, 32 + add r0, 32 + dec r2 + jnz .loop8_%1 +.left4_%1: + mov r2, r3 + and r3, 4 + jz .left + mov%1 m0, [r1] + pshuflw m0, m0, 10110001b + pshufhw m0, m0, 10110001b + mova m2, m0 + psllw m0, 8 + psrlw m2, 8 + por m2, m0 + mova [r0], m2 + add r1, 16 + add r0, 16 +%endmacro + +; void bswap_buf(uint32_t *dst, const uint32_t *src, int w); +cglobal bswap32_buf, 3,4,5 + mov r3, r1 + and r3, 15 + jz .start_align + BSWAP_LOOPS_SSE2 u + jmp .left +.start_align: + BSWAP_LOOPS_SSE2 a +.left: + and r2, 3 + jz .end +.loop2: + mov r3d, [r1] + bswap r3d + mov [r0], r3d + add r1, 4 + add r0, 4 + dec r2 + jnz .loop2 +.end: + RET + +; %1 = aligned/unaligned +%macro BSWAP_LOOPS_SSSE3 1 + mov r3, r2 + sar r2, 3 + jz .left4_%1 +.loop8_%1: + mov%1 m0, [r1 + 0] + mov%1 m1, [r1 + 16] + pshufb m0, m2 + pshufb m1, m2 + mova [r0 + 0], m0 + mova [r0 + 16], m1 + add r0, 32 + add r1, 32 + dec r2 + jnz .loop8_%1 +.left4_%1: + mov r2, r3 + and r3, 4 + jz .left2 + mov%1 m0, [r1] + pshufb m0, m2 + mova [r0], m0 + add r1, 16 + add r0, 16 +%endmacro + +INIT_XMM ssse3 +; void bswap_buf(uint32_t *dst, const uint32_t *src, int w); +cglobal bswap32_buf, 3,4,3 + mov r3, r1 + mova m2, [pb_bswap32] + and r3, 15 + jz .start_align + BSWAP_LOOPS_SSSE3 u + jmp .left2 +.start_align: + BSWAP_LOOPS_SSSE3 a +.left2: + mov r3, r2 + and r2, 2 + jz .left1 + movq m0, [r1] + pshufb m0, m2 + movq [r0], m0 + add r1, 8 + add r0, 8 +.left1: + and r3, 1 + jz .end + mov r2d, [r1] + bswap r2d + mov [r0], r2d +.end: + RET diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm deleted file mode 100644 index d6cf824ecc..0000000000 --- a/libavcodec/x86/dsputil_yasm.asm +++ /dev/null @@ -1,1373 +0,0 @@ -;****************************************************************************** -;* MMX optimized DSP utils -;* Copyright (c) 2008 Loren Merritt -;* -;* This file is part of Libav. 
-;* -;* Libav is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* Libav is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "x86inc.asm" -%include "x86util.asm" - -SECTION_RODATA -pb_f: times 16 db 15 -pb_zzzzzzzz77777777: times 8 db -1 -pb_7: times 8 db 7 -pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11 -pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 -pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0 -pd_16384: times 4 dd 16384 -pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 - -SECTION_TEXT - -%macro SCALARPRODUCT 1 -; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order) -cglobal scalarproduct_int16_%1, 3,3,3, v1, v2, order - shl orderq, 1 - add v1q, orderq - add v2q, orderq - neg orderq - pxor m2, m2 -.loop: - movu m0, [v1q + orderq] - movu m1, [v1q + orderq + mmsize] - pmaddwd m0, [v2q + orderq] - pmaddwd m1, [v2q + orderq + mmsize] - paddd m2, m0 - paddd m2, m1 - add orderq, mmsize*2 - jl .loop -%if mmsize == 16 - movhlps m0, m2 - paddd m2, m0 - pshuflw m0, m2, 0x4e -%else - pshufw m0, m2, 0x4e -%endif - paddd m2, m0 - movd eax, m2 - RET - -; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul) -cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul - shl orderq, 1 - movd m7, mulm -%if mmsize == 16 - pshuflw m7, m7, 0 - punpcklqdq m7, m7 -%else - pshufw m7, m7, 0 -%endif - pxor m6, m6 - add v1q, orderq - add v2q, orderq - add v3q, orderq - neg orderq -.loop: - movu m0, [v2q + orderq] - movu m1, [v2q + orderq + mmsize] - mova m4, [v1q + orderq] - mova m5, [v1q + orderq + mmsize] - movu m2, [v3q + orderq] - movu m3, [v3q + orderq + mmsize] - pmaddwd m0, m4 - pmaddwd m1, m5 - pmullw m2, m7 - pmullw m3, m7 - paddd m6, m0 - paddd m6, m1 - paddw m2, m4 - paddw m3, m5 - mova [v1q + orderq], m2 - mova [v1q + orderq + mmsize], m3 - add orderq, mmsize*2 - jl .loop -%if mmsize == 16 - movhlps m0, m6 - paddd m6, m0 - pshuflw m0, m6, 0x4e -%else - pshufw m0, m6, 0x4e -%endif - paddd m6, m0 - movd eax, m6 - RET -%endmacro - -INIT_MMX -SCALARPRODUCT mmx2 -INIT_XMM -SCALARPRODUCT sse2 - -%macro SCALARPRODUCT_LOOP 1 -align 16 -.loop%1: - sub orderq, mmsize*2 -%if %1 - mova m1, m4 - mova m4, [v2q + orderq] - mova m0, [v2q + orderq + mmsize] - palignr m1, m0, %1 - palignr m0, m4, %1 - mova m3, m5 - mova m5, [v3q + orderq] - mova m2, [v3q + orderq + mmsize] - palignr m3, m2, %1 - palignr m2, m5, %1 -%else - mova m0, [v2q + orderq] - mova m1, [v2q + orderq + mmsize] - mova m2, [v3q + orderq] - mova m3, [v3q + orderq + mmsize] -%endif - %define t0 [v1q + orderq] - %define t1 [v1q + orderq + mmsize] -%if ARCH_X86_64 - mova m8, t0 - mova m9, t1 - %define t0 m8 - %define t1 m9 -%endif - pmaddwd m0, t0 - pmaddwd m1, t1 - pmullw m2, m7 - pmullw m3, m7 - paddw m2, t0 - paddw m3, t1 - paddd m6, m0 - 
paddd m6, m1 - mova [v1q + orderq], m2 - mova [v1q + orderq + mmsize], m3 - jg .loop%1 -%if %1 - jmp .end -%endif -%endmacro - -; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul) -cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul - shl orderq, 1 - movd m7, mulm - pshuflw m7, m7, 0 - punpcklqdq m7, m7 - pxor m6, m6 - mov r4d, v2d - and r4d, 15 - and v2q, ~15 - and v3q, ~15 - mova m4, [v2q + orderq] - mova m5, [v3q + orderq] - ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable) - cmp r4d, 0 - je .loop0 - cmp r4d, 2 - je .loop2 - cmp r4d, 4 - je .loop4 - cmp r4d, 6 - je .loop6 - cmp r4d, 8 - je .loop8 - cmp r4d, 10 - je .loop10 - cmp r4d, 12 - je .loop12 -SCALARPRODUCT_LOOP 14 -SCALARPRODUCT_LOOP 12 -SCALARPRODUCT_LOOP 10 -SCALARPRODUCT_LOOP 8 -SCALARPRODUCT_LOOP 6 -SCALARPRODUCT_LOOP 4 -SCALARPRODUCT_LOOP 2 -SCALARPRODUCT_LOOP 0 -.end: - movhlps m0, m6 - paddd m6, m0 - pshuflw m0, m6, 0x4e - paddd m6, m0 - movd eax, m6 - RET - - -;----------------------------------------------------------------------------- -; void ff_apply_window_int16(int16_t *output, const int16_t *input, -; const int16_t *window, unsigned int len) -;----------------------------------------------------------------------------- - -%macro REVERSE_WORDS_MMXEXT 1-2 - pshufw %1, %1, 0x1B -%endmacro - -%macro REVERSE_WORDS_SSE2 1-2 - pshuflw %1, %1, 0x1B - pshufhw %1, %1, 0x1B - pshufd %1, %1, 0x4E -%endmacro - -%macro REVERSE_WORDS_SSSE3 2 - pshufb %1, %2 -%endmacro - -; dst = (dst * src) >> 15 -; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back -; in from the pmullw result. -%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp - mova %3, %1 - pmulhw %1, %2 - pmullw %3, %2 - psrlw %3, 15 - psllw %1, 1 - por %1, %3 -%endmacro - -; dst = ((dst * src) + (1<<14)) >> 15 -%macro MUL16FIXED_SSSE3 3 ; dst, src, unused - pmulhrsw %1, %2 -%endmacro - -%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3 -cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2 - lea offset2q, [offsetq-mmsize] -%if %2 - mova m5, [pd_16384] -%elifidn %1, ssse3 - mova m5, [pb_revwords] - ALIGN 16 -%endif -.loop: -%if %2 - ; This version expands 16-bit to 32-bit, multiplies by the window, - ; adds 16384 for rounding, right shifts 15, then repacks back to words to - ; save to the output. The window is reversed for the second half. - mova m3, [windowq+offset2q] - mova m4, [ inputq+offset2q] - pxor m0, m0 - punpcklwd m0, m3 - punpcklwd m1, m4 - pmaddwd m0, m1 - paddd m0, m5 - psrad m0, 15 - pxor m2, m2 - punpckhwd m2, m3 - punpckhwd m1, m4 - pmaddwd m2, m1 - paddd m2, m5 - psrad m2, 15 - packssdw m0, m2 - mova [outputq+offset2q], m0 - REVERSE_WORDS m3 - mova m4, [ inputq+offsetq] - pxor m0, m0 - punpcklwd m0, m3 - punpcklwd m1, m4 - pmaddwd m0, m1 - paddd m0, m5 - psrad m0, 15 - pxor m2, m2 - punpckhwd m2, m3 - punpckhwd m1, m4 - pmaddwd m2, m1 - paddd m2, m5 - psrad m2, 15 - packssdw m0, m2 - mova [outputq+offsetq], m0 -%elif %3 - ; This version does the 16x16->16 multiplication in-place without expanding - ; to 32-bit. The ssse3 version is bit-identical. - mova m0, [windowq+offset2q] - mova m1, [ inputq+offset2q] - pmulhrsw m1, m0 - REVERSE_WORDS m0, m5 - pmulhrsw m0, [ inputq+offsetq ] - mova [outputq+offset2q], m1 - mova [outputq+offsetq ], m0 -%else - ; This version does the 16x16->16 multiplication in-place without expanding - ; to 32-bit. 
The mmxext and sse2 versions do not use rounding, and - ; therefore are not bit-identical to the C version. - mova m0, [windowq+offset2q] - mova m1, [ inputq+offset2q] - mova m2, [ inputq+offsetq ] - MUL16FIXED m1, m0, m3 - REVERSE_WORDS m0 - MUL16FIXED m2, m0, m3 - mova [outputq+offset2q], m1 - mova [outputq+offsetq ], m2 -%endif - add offsetd, mmsize - sub offset2d, mmsize - jae .loop - REP_RET -%endmacro - -INIT_MMX -%define REVERSE_WORDS REVERSE_WORDS_MMXEXT -%define MUL16FIXED MUL16FIXED_MMXEXT -APPLY_WINDOW_INT16 mmxext, 0, 0 -APPLY_WINDOW_INT16 mmxext_ba, 1, 0 -INIT_XMM -%define REVERSE_WORDS REVERSE_WORDS_SSE2 -APPLY_WINDOW_INT16 sse2, 0, 0 -APPLY_WINDOW_INT16 sse2_ba, 1, 0 -APPLY_WINDOW_INT16 ssse3_atom, 0, 1 -%define REVERSE_WORDS REVERSE_WORDS_SSSE3 -APPLY_WINDOW_INT16 ssse3, 0, 1 - - -; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) -cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top - movq mm0, [topq] - movq mm2, mm0 - movd mm4, [left_topq] - psllq mm2, 8 - movq mm1, mm0 - por mm4, mm2 - movd mm3, [leftq] - psubb mm0, mm4 ; t-tl - add dstq, wq - add topq, wq - add diffq, wq - neg wq - jmp .skip -.loop: - movq mm4, [topq+wq] - movq mm0, mm4 - psllq mm4, 8 - por mm4, mm1 - movq mm1, mm0 ; t - psubb mm0, mm4 ; t-tl -.skip: - movq mm2, [diffq+wq] -%assign i 0 -%rep 8 - movq mm4, mm0 - paddb mm4, mm3 ; t-tl+l - movq mm5, mm3 - pmaxub mm3, mm1 - pminub mm5, mm1 - pminub mm3, mm4 - pmaxub mm3, mm5 ; median - paddb mm3, mm2 ; +residual -%if i==0 - movq mm7, mm3 - psllq mm7, 56 -%else - movq mm6, mm3 - psrlq mm7, 8 - psllq mm6, 56 - por mm7, mm6 -%endif -%if i<7 - psrlq mm0, 8 - psrlq mm1, 8 - psrlq mm2, 8 -%endif -%assign i i+1 -%endrep - movq [dstq+wq], mm7 - add wq, 8 - jl .loop - movzx r2d, byte [dstq-1] - mov [leftq], r2d - movzx r2d, byte [topq-1] - mov [left_topq], r2d - RET - - -%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned - add srcq, wq - add dstq, wq - neg wq -%%.loop: -%if %2 - mova m1, [srcq+wq] -%else - movu m1, [srcq+wq] -%endif - mova m2, m1 - psllw m1, 8 - paddb m1, m2 - mova m2, m1 - pshufb m1, m3 - paddb m1, m2 - pshufb m0, m5 - mova m2, m1 - pshufb m1, m4 - paddb m1, m2 -%if mmsize == 16 - mova m2, m1 - pshufb m1, m6 - paddb m1, m2 -%endif - paddb m0, m1 -%if %1 - mova [dstq+wq], m0 -%else - movq [dstq+wq], m0 - movhps [dstq+wq+8], m0 -%endif - add wq, mmsize - jl %%.loop - mov eax, mmsize-1 - sub eax, wd - movd m1, eax - pshufb m0, m1 - movd eax, m0 - RET -%endmacro - -; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left) -INIT_MMX -cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left -.skip_prologue: - mova m5, [pb_7] - mova m4, [pb_zzzz3333zzzzbbbb] - mova m3, [pb_zz11zz55zz99zzdd] - movd m0, leftm - psllq m0, 56 - ADD_HFYU_LEFT_LOOP 1, 1 - -INIT_XMM -cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left - mova m5, [pb_f] - mova m6, [pb_zzzzzzzz77777777] - mova m4, [pb_zzzz3333zzzzbbbb] - mova m3, [pb_zz11zz55zz99zzdd] - movd m0, leftm - pslldq m0, 15 - test srcq, 15 - jnz .src_unaligned - test dstq, 15 - jnz .dst_unaligned - ADD_HFYU_LEFT_LOOP 1, 1 -.dst_unaligned: - ADD_HFYU_LEFT_LOOP 0, 1 -.src_unaligned: - ADD_HFYU_LEFT_LOOP 0, 0 - - -; float scalarproduct_float_sse(const float *v1, const float *v2, int len) -cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset - neg offsetq - shl offsetq, 2 - sub v1q, offsetq - sub v2q, offsetq - xorps xmm0, xmm0 - .loop: - movaps xmm1, 
[v1q+offsetq] - mulps xmm1, [v2q+offsetq] - addps xmm0, xmm1 - add offsetq, 16 - js .loop - movhlps xmm1, xmm0 - addps xmm0, xmm1 - movss xmm1, xmm0 - shufps xmm0, xmm0, 1 - addss xmm0, xmm1 -%if ARCH_X86_64 == 0 - movss r0m, xmm0 - fld dword r0m -%endif - RET - -; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize, -; x86_reg start_y, x86_reg end_y, x86_reg block_h, -; x86_reg start_x, x86_reg end_x, x86_reg block_w); -; -; The actual function itself is below. It basically wraps a very simple -; w = end_x - start_x -; if (w) { -; if (w > 22) { -; jump to the slow loop functions -; } else { -; jump to the fast loop functions -; } -; } -; -; ... and then the same for left/right extend also. See below for loop -; function implementations. Fast are fixed-width, slow is variable-width - -%macro EMU_EDGE_FUNC 0 -%if ARCH_X86_64 -%define w_reg r7 -cglobal emu_edge_core, 6, 9, 1 - mov r8, r5 ; save block_h -%else -%define w_reg r6 -cglobal emu_edge_core, 2, 7, 0 - mov r4, r4m ; end_y - mov r5, r5m ; block_h -%endif - - ; start with vertical extend (top/bottom) and body pixel copy - mov w_reg, r7m - sub w_reg, r6m ; w = start_x - end_x - sub r5, r4 -%if ARCH_X86_64 - sub r4, r3 -%else - sub r4, dword r3m -%endif - cmp w_reg, 22 - jg .slow_v_extend_loop -%if ARCH_X86_32 - mov r2, r2m ; linesize -%endif - sal w_reg, 7 ; w * 128 -%ifdef PIC - lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)] - add w_reg, rax -%else - lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg] -%endif - call w_reg ; fast top extend, body copy and bottom extend -.v_extend_end: - - ; horizontal extend (left/right) - mov w_reg, r6m ; start_x - sub r0, w_reg -%if ARCH_X86_64 - mov r3, r0 ; backup of buf+block_h*linesize - mov r5, r8 -%else - mov r0m, r0 ; backup of buf+block_h*linesize - mov r5, r5m -%endif - test w_reg, w_reg - jz .right_extend - cmp w_reg, 22 - jg .slow_left_extend_loop - mov r1, w_reg - dec w_reg - ; FIXME we can do a if size == 1 here if that makes any speed difference, test me - sar w_reg, 1 - sal w_reg, 6 - ; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs - ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h -%ifdef PIC - lea rax, [.emuedge_extend_left_2] - add w_reg, rax -%else - lea w_reg, [.emuedge_extend_left_2+w_reg] -%endif - call w_reg - - ; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w -.right_extend: -%if ARCH_X86_32 - mov r0, r0m - mov r5, r5m -%endif - mov w_reg, r7m ; end_x - mov r1, r8m ; block_w - mov r4, r1 - sub r1, w_reg - jz .h_extend_end ; if (end_x == block_w) goto h_extend_end - cmp r1, 22 - jg .slow_right_extend_loop - dec r1 - ; FIXME we can do a if size == 1 here if that makes any speed difference, test me - sar r1, 1 - sal r1, 6 -%ifdef PIC - lea rax, [.emuedge_extend_right_2] - add r1, rax -%else - lea r1, [.emuedge_extend_right_2+r1] -%endif - call r1 -.h_extend_end: - RET - -%if ARCH_X86_64 -%define vall al -%define valh ah -%define valw ax -%define valw2 r7w -%define valw3 r3w -%if WIN64 -%define valw4 r7w -%else ; unix64 -%define valw4 r3w -%endif -%define vald eax -%else -%define vall bl -%define valh bh -%define valw bx -%define valw2 r6w -%define valw3 valw2 -%define valw4 valw3 -%define vald ebx -%define stack_offset 0x14 -%endif - -%endmacro - -; macro to read/write a horizontal number of pixels (%2) to/from registers -; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels -; - if (%2 & 15 == 8) fills the last 8 bytes 
into rax -; - else if (%2 & 8) fills 8 bytes into mm0 -; - if (%2 & 7 == 4) fills the last 4 bytes into rax -; - else if (%2 & 4) fills 4 bytes into mm0-1 -; - if (%2 & 3 == 3) fills 2 bytes into r7/r3, and 1 into eax -; (note that we're using r3 for body/bottom because it's a shorter -; opcode, and then the loop fits in 128 bytes) -; - else fills remaining bytes into rax -; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels -; - if (%2 & 7 == 4) fills 4 bytes into ebx -; - else if (%2 & 4) fills 4 bytes into mm0-7 -; - if (%2 & 3 == 3) fills 2 bytes into r6, and 1 into ebx -; - else fills remaining bytes into ebx -; writing data out is in the same way -%macro READ_NUM_BYTES 2 -%assign %%src_off 0 ; offset in source buffer -%assign %%smidx 0 ; mmx register idx -%assign %%sxidx 0 ; xmm register idx - -%if cpuflag(sse) -%rep %2/16 - movups xmm %+ %%sxidx, [r1+%%src_off] -%assign %%src_off %%src_off+16 -%assign %%sxidx %%sxidx+1 -%endrep ; %2/16 -%endif - -%if ARCH_X86_64 -%if (%2-%%src_off) == 8 - mov rax, [r1+%%src_off] -%assign %%src_off %%src_off+8 -%endif ; (%2-%%src_off) == 8 -%endif ; x86-64 - -%rep (%2-%%src_off)/8 - movq mm %+ %%smidx, [r1+%%src_off] -%assign %%src_off %%src_off+8 -%assign %%smidx %%smidx+1 -%endrep ; (%2-%%dst_off)/8 - -%if (%2-%%src_off) == 4 - mov vald, [r1+%%src_off] -%elif (%2-%%src_off) & 4 - movd mm %+ %%smidx, [r1+%%src_off] -%assign %%src_off %%src_off+4 -%endif ; (%2-%%src_off) ==/& 4 - -%if (%2-%%src_off) == 1 - mov vall, [r1+%%src_off] -%elif (%2-%%src_off) == 2 - mov valw, [r1+%%src_off] -%elif (%2-%%src_off) == 3 -%ifidn %1, top - mov valw2, [r1+%%src_off] -%elifidn %1, body - mov valw3, [r1+%%src_off] -%elifidn %1, bottom - mov valw4, [r1+%%src_off] -%endif ; %1 ==/!= top - mov vall, [r1+%%src_off+2] -%endif ; (%2-%%src_off) == 1/2/3 -%endmacro ; READ_NUM_BYTES - -%macro WRITE_NUM_BYTES 2 -%assign %%dst_off 0 ; offset in destination buffer -%assign %%dmidx 0 ; mmx register idx -%assign %%dxidx 0 ; xmm register idx - -%if cpuflag(sse) -%rep %2/16 - movups [r0+%%dst_off], xmm %+ %%dxidx -%assign %%dst_off %%dst_off+16 -%assign %%dxidx %%dxidx+1 -%endrep ; %2/16 -%endif - -%if ARCH_X86_64 -%if (%2-%%dst_off) == 8 - mov [r0+%%dst_off], rax -%assign %%dst_off %%dst_off+8 -%endif ; (%2-%%dst_off) == 8 -%endif ; x86-64 - -%rep (%2-%%dst_off)/8 - movq [r0+%%dst_off], mm %+ %%dmidx -%assign %%dst_off %%dst_off+8 -%assign %%dmidx %%dmidx+1 -%endrep ; (%2-%%dst_off)/8 - -%if (%2-%%dst_off) == 4 - mov [r0+%%dst_off], vald -%elif (%2-%%dst_off) & 4 - movd [r0+%%dst_off], mm %+ %%dmidx -%assign %%dst_off %%dst_off+4 -%endif ; (%2-%%dst_off) ==/& 4 - -%if (%2-%%dst_off) == 1 - mov [r0+%%dst_off], vall -%elif (%2-%%dst_off) == 2 - mov [r0+%%dst_off], valw -%elif (%2-%%dst_off) == 3 -%ifidn %1, top - mov [r0+%%dst_off], valw2 -%elifidn %1, body - mov [r0+%%dst_off], valw3 -%elifidn %1, bottom - mov [r0+%%dst_off], valw4 -%endif ; %1 ==/!= top - mov [r0+%%dst_off+2], vall -%endif ; (%2-%%dst_off) == 1/2/3 -%endmacro ; WRITE_NUM_BYTES - -; vertical top/bottom extend and body copy fast loops -; these are function pointers to set-width line copy functions, i.e. 
-; they read a fixed number of pixels into set registers, and write -; those out into the destination buffer -; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h -; r6(eax/64)/r3(ebx/32)=val_reg -%macro VERTICAL_EXTEND 0 -%assign %%n 1 -%rep 22 -ALIGN 128 -.emuedge_v_extend_ %+ %%n: - ; extend pixels above body -%if ARCH_X86_64 - test r3 , r3 ; if (!start_y) - jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body -%else ; ARCH_X86_32 - cmp dword r3m, 0 - je .emuedge_copy_body_ %+ %%n %+ _loop -%endif ; ARCH_X86_64/32 - READ_NUM_BYTES top, %%n ; read bytes -.emuedge_extend_top_ %+ %%n %+ _loop: ; do { - WRITE_NUM_BYTES top, %%n ; write bytes - add r0 , r2 ; dst += linesize -%if ARCH_X86_64 - dec r3d -%else ; ARCH_X86_32 - dec dword r3m -%endif ; ARCH_X86_64/32 - jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y) - - ; copy body pixels -.emuedge_copy_body_ %+ %%n %+ _loop: ; do { - READ_NUM_BYTES body, %%n ; read bytes - WRITE_NUM_BYTES body, %%n ; write bytes - add r0 , r2 ; dst += linesize - add r1 , r2 ; src += linesize - dec r4d - jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y) - - ; copy bottom pixels - test r5 , r5 ; if (!block_h) - jz .emuedge_v_extend_end_ %+ %%n ; goto end - sub r1 , r2 ; src -= linesize - READ_NUM_BYTES bottom, %%n ; read bytes -.emuedge_extend_bottom_ %+ %%n %+ _loop: ; do { - WRITE_NUM_BYTES bottom, %%n ; write bytes - add r0 , r2 ; dst += linesize - dec r5d - jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h) - -.emuedge_v_extend_end_ %+ %%n: -%if ARCH_X86_64 - ret -%else ; ARCH_X86_32 - rep ret -%endif ; ARCH_X86_64/32 -%assign %%n %%n+1 -%endrep -%endmacro VERTICAL_EXTEND - -; left/right (horizontal) fast extend functions -; these are essentially identical to the vertical extend ones above, -; just left/right separated because number of pixels to extend is -; obviously not the same on both sides. 
-; for reading, pixels are placed in eax (x86-64) or ebx (x86-64) in the -; lowest two bytes of the register (so val*0x0101), and are splatted -; into each byte of mm0 as well if n_pixels >= 8 - -%macro READ_V_PIXEL 2 - mov vall, %2 - mov valh, vall -%if %1 >= 8 - movd mm0, vald -%if cpuflag(mmx2) - pshufw mm0, mm0, 0 -%else ; mmx - punpcklwd mm0, mm0 - punpckldq mm0, mm0 -%endif ; sse -%endif ; %1 >= 8 -%endmacro - -%macro WRITE_V_PIXEL 2 -%assign %%dst_off 0 -%rep %1/8 - movq [%2+%%dst_off], mm0 -%assign %%dst_off %%dst_off+8 -%endrep -%if %1 & 4 -%if %1 >= 8 - movd [%2+%%dst_off], mm0 -%else ; %1 < 8 - mov [%2+%%dst_off] , valw - mov [%2+%%dst_off+2], valw -%endif ; %1 >=/< 8 -%assign %%dst_off %%dst_off+4 -%endif ; %1 & 4 -%if %1&2 - mov [%2+%%dst_off], valw -%endif ; %1 & 2 -%endmacro - -; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val -%macro LEFT_EXTEND 0 -%assign %%n 2 -%rep 11 -ALIGN 64 -.emuedge_extend_left_ %+ %%n: ; do { - sub r0, r2 ; dst -= linesize - READ_V_PIXEL %%n, [r0+r1] ; read pixels - WRITE_V_PIXEL %%n, r0 ; write pixels - dec r5 - jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h) -%if ARCH_X86_64 - ret -%else ; ARCH_X86_32 - rep ret -%endif ; ARCH_X86_64/32 -%assign %%n %%n+2 -%endrep -%endmacro ; LEFT_EXTEND - -; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val -%macro RIGHT_EXTEND 0 -%assign %%n 2 -%rep 11 -ALIGN 64 -.emuedge_extend_right_ %+ %%n: ; do { -%if ARCH_X86_64 - sub r3, r2 ; dst -= linesize - READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels - WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels - dec r8 -%else ; ARCH_X86_32 - sub r0, r2 ; dst -= linesize - READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels - WRITE_V_PIXEL %%n, r0+r4-%%n ; write pixels - dec r5 -%endif ; ARCH_X86_64/32 - jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h) -%if ARCH_X86_64 - ret -%else ; ARCH_X86_32 - rep ret -%endif ; ARCH_X86_64/32 -%assign %%n %%n+2 -%endrep - -%if ARCH_X86_32 -%define stack_offset 0x10 -%endif -%endmacro ; RIGHT_EXTEND - -; below follow the "slow" copy/extend functions, these act on a non-fixed -; width specified in a register, and run a loop to copy the full amount -; of bytes. They are optimized for copying of large amounts of pixels per -; line, so they unconditionally splat data into mm registers to copy 8 -; bytes per loop iteration. It could be considered to use xmm for x86-64 -; also, but I haven't optimized this as much (i.e. 
FIXME) -%macro V_COPY_NPX 4-5 -%if %0 == 4 - test w_reg, %4 - jz .%1_skip_%4_px -%else ; %0 == 5 -.%1_%4_px_loop: -%endif - %3 %2, [r1+cnt_reg] - %3 [r0+cnt_reg], %2 - add cnt_reg, %4 -%if %0 == 5 - sub w_reg, %4 - test w_reg, %5 - jnz .%1_%4_px_loop -%endif -.%1_skip_%4_px: -%endmacro - -%macro V_COPY_ROW 2 -%ifidn %1, bottom - sub r1, linesize -%endif -.%1_copy_loop: - xor cnt_reg, cnt_reg -%if notcpuflag(sse) -%define linesize r2m - V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8 -%else ; sse - V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0 -%if ARCH_X86_64 -%define linesize r2 - V_COPY_NPX %1, rax , mov, 8 -%else ; ARCH_X86_32 -%define linesize r2m - V_COPY_NPX %1, mm0, movq, 8 -%endif ; ARCH_X86_64/32 -%endif ; sse - V_COPY_NPX %1, vald, mov, 4 - V_COPY_NPX %1, valw, mov, 2 - V_COPY_NPX %1, vall, mov, 1 - mov w_reg, cnt_reg -%ifidn %1, body - add r1, linesize -%endif - add r0, linesize - dec %2 - jnz .%1_copy_loop -%endmacro - -%macro SLOW_V_EXTEND 0 -.slow_v_extend_loop: -; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h -; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x -%if ARCH_X86_64 - push r8 ; save old value of block_h - test r3, r3 -%define cnt_reg r8 - jz .do_body_copy ; if (!start_y) goto do_body_copy - V_COPY_ROW top, r3 -%else - cmp dword r3m, 0 -%define cnt_reg r2 - je .do_body_copy ; if (!start_y) goto do_body_copy - V_COPY_ROW top, dword r3m -%endif - -.do_body_copy: - V_COPY_ROW body, r4 - -%if ARCH_X86_64 - pop r8 ; restore old value of block_h -%define cnt_reg r3 -%endif - test r5, r5 -%if ARCH_X86_64 - jz .v_extend_end -%else - jz .skip_bottom_extend -%endif - V_COPY_ROW bottom, r5 -%if ARCH_X86_32 -.skip_bottom_extend: - mov r2, r2m -%endif - jmp .v_extend_end -%endmacro - -%macro SLOW_LEFT_EXTEND 0 -.slow_left_extend_loop: -; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x - mov r4, 8 - sub r0, linesize - READ_V_PIXEL 8, [r0+w_reg] -.left_extend_8px_loop: - movq [r0+r4-8], mm0 - add r4, 8 - cmp r4, w_reg - jle .left_extend_8px_loop - sub r4, 8 - cmp r4, w_reg - jge .left_extend_loop_end -.left_extend_2px_loop: - mov [r0+r4], valw - add r4, 2 - cmp r4, w_reg - jl .left_extend_2px_loop -.left_extend_loop_end: - dec r5 - jnz .slow_left_extend_loop -%if ARCH_X86_32 - mov r2, r2m -%endif - jmp .right_extend -%endmacro - -%macro SLOW_RIGHT_EXTEND 0 -.slow_right_extend_loop: -; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h, -; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr -%if ARCH_X86_64 -%define buf_reg r3 -%define bh_reg r8 -%else -%define buf_reg r0 -%define bh_reg r5 -%endif - lea r1, [r4-8] - sub buf_reg, linesize - READ_V_PIXEL 8, [buf_reg+w_reg-1] -.right_extend_8px_loop: - movq [buf_reg+r1], mm0 - sub r1, 8 - cmp r1, w_reg - jge .right_extend_8px_loop - add r1, 8 - cmp r1, w_reg - je .right_extend_loop_end -.right_extend_2px_loop: - sub r1, 2 - mov [buf_reg+r1], valw - cmp r1, w_reg - jg .right_extend_2px_loop -.right_extend_loop_end: - dec bh_reg - jnz .slow_right_extend_loop - jmp .h_extend_end -%endmacro - -%macro emu_edge 1 -INIT_XMM %1 -EMU_EDGE_FUNC -VERTICAL_EXTEND -LEFT_EXTEND -RIGHT_EXTEND -SLOW_V_EXTEND -SLOW_LEFT_EXTEND -SLOW_RIGHT_EXTEND -%endmacro - -emu_edge sse -%if ARCH_X86_32 -emu_edge mmx -%endif - -;----------------------------------------------------------------------------- -; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min, -; int32_t max, unsigned int len) 
-;----------------------------------------------------------------------------- - -; %1 = number of xmm registers used -; %2 = number of inline load/process/store loops per asm loop -; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop -; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2) -; %5 = suffix -%macro VECTOR_CLIP_INT32 4-5 -cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len -%if %4 - cvtsi2ss m4, minm - cvtsi2ss m5, maxm -%else - movd m4, minm - movd m5, maxm -%endif - SPLATD m4 - SPLATD m5 -.loop: -%assign %%i 1 -%rep %2 - mova m0, [srcq+mmsize*0*%%i] - mova m1, [srcq+mmsize*1*%%i] - mova m2, [srcq+mmsize*2*%%i] - mova m3, [srcq+mmsize*3*%%i] -%if %3 - mova m7, [srcq+mmsize*4*%%i] - mova m8, [srcq+mmsize*5*%%i] - mova m9, [srcq+mmsize*6*%%i] - mova m10, [srcq+mmsize*7*%%i] -%endif - CLIPD m0, m4, m5, m6 - CLIPD m1, m4, m5, m6 - CLIPD m2, m4, m5, m6 - CLIPD m3, m4, m5, m6 -%if %3 - CLIPD m7, m4, m5, m6 - CLIPD m8, m4, m5, m6 - CLIPD m9, m4, m5, m6 - CLIPD m10, m4, m5, m6 -%endif - mova [dstq+mmsize*0*%%i], m0 - mova [dstq+mmsize*1*%%i], m1 - mova [dstq+mmsize*2*%%i], m2 - mova [dstq+mmsize*3*%%i], m3 -%if %3 - mova [dstq+mmsize*4*%%i], m7 - mova [dstq+mmsize*5*%%i], m8 - mova [dstq+mmsize*6*%%i], m9 - mova [dstq+mmsize*7*%%i], m10 -%endif -%assign %%i %%i+1 -%endrep - add srcq, mmsize*4*(%2+%3) - add dstq, mmsize*4*(%2+%3) - sub lend, mmsize*(%2+%3) - jg .loop - REP_RET -%endmacro - -INIT_MMX mmx -%define SPLATD SPLATD_MMX -%define CLIPD CLIPD_MMX -VECTOR_CLIP_INT32 0, 1, 0, 0 -INIT_XMM sse2 -%define SPLATD SPLATD_SSE2 -VECTOR_CLIP_INT32 6, 1, 0, 0, _int -%define CLIPD CLIPD_SSE2 -VECTOR_CLIP_INT32 6, 2, 0, 1 -INIT_XMM sse4 -%define CLIPD CLIPD_SSE41 -%ifdef m8 -VECTOR_CLIP_INT32 11, 1, 1, 0 -%else -VECTOR_CLIP_INT32 6, 1, 0, 0 -%endif - -;----------------------------------------------------------------------------- -; void vector_fmul_reverse(float *dst, const float *src0, const float *src1, -; int len) -;----------------------------------------------------------------------------- -%macro VECTOR_FMUL_REVERSE 0 -cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len - lea lenq, [lend*4 - 2*mmsize] -ALIGN 16 -.loop: -%if cpuflag(avx) - vmovaps xmm0, [src1q + 16] - vinsertf128 m0, m0, [src1q], 1 - vshufps m0, m0, m0, q0123 - vmovaps xmm1, [src1q + mmsize + 16] - vinsertf128 m1, m1, [src1q + mmsize], 1 - vshufps m1, m1, m1, q0123 -%else - mova m0, [src1q] - mova m1, [src1q + mmsize] - shufps m0, m0, q0123 - shufps m1, m1, q0123 -%endif - mulps m0, m0, [src0q + lenq + mmsize] - mulps m1, m1, [src0q + lenq] - mova [dstq + lenq + mmsize], m0 - mova [dstq + lenq], m1 - add src1q, 2*mmsize - sub lenq, 2*mmsize - jge .loop - REP_RET -%endmacro - -INIT_XMM sse -VECTOR_FMUL_REVERSE -%if HAVE_AVX -INIT_YMM avx -VECTOR_FMUL_REVERSE -%endif - -;----------------------------------------------------------------------------- -; vector_fmul_add(float *dst, const float *src0, const float *src1, -; const float *src2, int len) -;----------------------------------------------------------------------------- -%macro VECTOR_FMUL_ADD 0 -cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len - lea lenq, [lend*4 - 2*mmsize] -ALIGN 16 -.loop: - mova m0, [src0q + lenq] - mova m1, [src0q + lenq + mmsize] - mulps m0, m0, [src1q + lenq] - mulps m1, m1, [src1q + lenq + mmsize] - addps m0, m0, [src2q + lenq] - addps m1, m1, [src2q + lenq + mmsize] - mova [dstq + lenq], m0 - mova [dstq + lenq + mmsize], m1 - - sub lenq, 2*mmsize - jge .loop - REP_RET -%endmacro - 
-INIT_XMM sse -VECTOR_FMUL_ADD -%if HAVE_AVX -INIT_YMM avx -VECTOR_FMUL_ADD -%endif - -;----------------------------------------------------------------------------- -; void ff_butterflies_float_interleave(float *dst, const float *src0, -; const float *src1, int len); -;----------------------------------------------------------------------------- - -%macro BUTTERFLIES_FLOAT_INTERLEAVE 0 -cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len -%if ARCH_X86_64 - movsxd lenq, lend -%endif - test lenq, lenq - jz .end - shl lenq, 2 - lea src0q, [src0q + lenq] - lea src1q, [src1q + lenq] - lea dstq, [ dstq + 2*lenq] - neg lenq -.loop: - mova m0, [src0q + lenq] - mova m1, [src1q + lenq] - subps m2, m0, m1 - addps m0, m0, m1 - unpcklps m1, m0, m2 - unpckhps m0, m0, m2 -%if cpuflag(avx) - vextractf128 [dstq + 2*lenq ], m1, 0 - vextractf128 [dstq + 2*lenq + 16], m0, 0 - vextractf128 [dstq + 2*lenq + 32], m1, 1 - vextractf128 [dstq + 2*lenq + 48], m0, 1 -%else - mova [dstq + 2*lenq ], m1 - mova [dstq + 2*lenq + mmsize], m0 -%endif - add lenq, mmsize - jl .loop -.end: - REP_RET -%endmacro - -INIT_XMM sse -BUTTERFLIES_FLOAT_INTERLEAVE -%if HAVE_AVX -INIT_YMM avx -BUTTERFLIES_FLOAT_INTERLEAVE -%endif - -INIT_XMM sse2 -; %1 = aligned/unaligned -%macro BSWAP_LOOPS_SSE2 1 - mov r3, r2 - sar r2, 3 - jz .left4_%1 -.loop8_%1: - mov%1 m0, [r1 + 0] - mov%1 m1, [r1 + 16] - pshuflw m0, m0, 10110001b - pshuflw m1, m1, 10110001b - pshufhw m0, m0, 10110001b - pshufhw m1, m1, 10110001b - mova m2, m0 - mova m3, m1 - psllw m0, 8 - psllw m1, 8 - psrlw m2, 8 - psrlw m3, 8 - por m2, m0 - por m3, m1 - mova [r0 + 0], m2 - mova [r0 + 16], m3 - add r1, 32 - add r0, 32 - dec r2 - jnz .loop8_%1 -.left4_%1: - mov r2, r3 - and r3, 4 - jz .left - mov%1 m0, [r1] - pshuflw m0, m0, 10110001b - pshufhw m0, m0, 10110001b - mova m2, m0 - psllw m0, 8 - psrlw m2, 8 - por m2, m0 - mova [r0], m2 - add r1, 16 - add r0, 16 -%endmacro - -; void bswap_buf(uint32_t *dst, const uint32_t *src, int w); -cglobal bswap32_buf, 3,4,5 - mov r3, r1 - and r3, 15 - jz .start_align - BSWAP_LOOPS_SSE2 u - jmp .left -.start_align: - BSWAP_LOOPS_SSE2 a -.left: - and r2, 3 - jz .end -.loop2: - mov r3d, [r1] - bswap r3d - mov [r0], r3d - add r1, 4 - add r0, 4 - dec r2 - jnz .loop2 -.end: - RET - -; %1 = aligned/unaligned -%macro BSWAP_LOOPS_SSSE3 1 - mov r3, r2 - sar r2, 3 - jz .left4_%1 -.loop8_%1: - mov%1 m0, [r1 + 0] - mov%1 m1, [r1 + 16] - pshufb m0, m2 - pshufb m1, m2 - mova [r0 + 0], m0 - mova [r0 + 16], m1 - add r0, 32 - add r1, 32 - dec r2 - jnz .loop8_%1 -.left4_%1: - mov r2, r3 - and r3, 4 - jz .left2 - mov%1 m0, [r1] - pshufb m0, m2 - mova [r0], m0 - add r1, 16 - add r0, 16 -%endmacro - -INIT_XMM ssse3 -; void bswap_buf(uint32_t *dst, const uint32_t *src, int w); -cglobal bswap32_buf, 3,4,3 - mov r3, r1 - mova m2, [pb_bswap32] - and r3, 15 - jz .start_align - BSWAP_LOOPS_SSSE3 u - jmp .left2 -.start_align: - BSWAP_LOOPS_SSSE3 a -.left2: - mov r3, r2 - and r2, 2 - jz .left1 - movq m0, [r1] - pshufb m0, m2 - movq [r0], m0 - add r1, 8 - add r0, 8 -.left1: - and r3, 1 - jz .end - mov r2d, [r1] - bswap r2d - mov [r0], r2d -.end: - RET diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm new file mode 100644 index 0000000000..b7078f10a9 --- /dev/null +++ b/libavcodec/x86/dsputilenc.asm @@ -0,0 +1,342 @@ +;***************************************************************************** +;* MMX optimized DSP utils +;***************************************************************************** +;* Copyright (c) 2000, 2001 Fabrice 
Bellard +;* Copyright (c) 2002-2004 Michael Niedermayer +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;***************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION .text + +%macro DIFF_PIXELS_1 4 + movh %1, %3 + movh %2, %4 + punpcklbw %2, %1 + punpcklbw %1, %1 + psubw %1, %2 +%endmacro + +; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3 +; %6=temporary storage location +; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64) +%macro DIFF_PIXELS_8 6 + DIFF_PIXELS_1 m0, m7, [%1 +%3], [%2 +%3] + DIFF_PIXELS_1 m1, m7, [%1+%4 +%3], [%2+%4 +%3] + DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3] + add %1, %5 + add %2, %5 + DIFF_PIXELS_1 m3, m7, [%1 +%3], [%2 +%3] + DIFF_PIXELS_1 m4, m7, [%1+%4 +%3], [%2+%4 +%3] + DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3] + DIFF_PIXELS_1 m6, m7, [%1+%5 +%3], [%2+%5 +%3] +%ifdef m8 + DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3] +%else + mova [%6], m0 + DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3] + mova m0, [%6] +%endif + sub %1, %5 + sub %2, %5 +%endmacro + +%macro HADAMARD8 0 + SUMSUB_BADC w, 0, 1, 2, 3 + SUMSUB_BADC w, 4, 5, 6, 7 + SUMSUB_BADC w, 0, 2, 1, 3 + SUMSUB_BADC w, 4, 6, 5, 7 + SUMSUB_BADC w, 0, 4, 1, 5 + SUMSUB_BADC w, 2, 6, 3, 7 +%endmacro + +%macro ABS1_SUM 3 + ABS1 %1, %2 + paddusw %3, %1 +%endmacro + +%macro ABS2_SUM 6 + ABS2 %1, %2, %3, %4 + paddusw %5, %1 + paddusw %6, %2 +%endmacro + +%macro ABS_SUM_8x8_64 1 + ABS2 m0, m1, m8, m9 + ABS2_SUM m2, m3, m8, m9, m0, m1 + ABS2_SUM m4, m5, m8, m9, m0, m1 + ABS2_SUM m6, m7, m8, m9, m0, m1 + paddusw m0, m1 +%endmacro + +%macro ABS_SUM_8x8_32 1 + mova [%1], m7 + ABS1 m0, m7 + ABS1 m1, m7 + ABS1_SUM m2, m7, m0 + ABS1_SUM m3, m7, m1 + ABS1_SUM m4, m7, m0 + ABS1_SUM m5, m7, m1 + ABS1_SUM m6, m7, m0 + mova m2, [%1] + ABS1_SUM m2, m7, m1 + paddusw m0, m1 +%endmacro + +; FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to +; about 100k on extreme inputs. But that's very unlikely to occur in natural video, +; and it's even more unlikely to not have any alternative mvs/modes with lower cost. 
+%macro HSUM_MMX 3 + mova %2, %1 + psrlq %1, 32 + paddusw %1, %2 + mova %2, %1 + psrlq %1, 16 + paddusw %1, %2 + movd %3, %1 +%endmacro + +%macro HSUM_MMX2 3 + pshufw %2, %1, 0xE + paddusw %1, %2 + pshufw %2, %1, 0x1 + paddusw %1, %2 + movd %3, %1 +%endmacro + +%macro HSUM_SSE2 3 + movhlps %2, %1 + paddusw %1, %2 + pshuflw %2, %1, 0xE + paddusw %1, %2 + pshuflw %2, %1, 0x1 + paddusw %1, %2 + movd %3, %1 +%endmacro + +%macro STORE4 5 + mova [%1+mmsize*0], %2 + mova [%1+mmsize*1], %3 + mova [%1+mmsize*2], %4 + mova [%1+mmsize*3], %5 +%endmacro + +%macro LOAD4 5 + mova %2, [%1+mmsize*0] + mova %3, [%1+mmsize*1] + mova %4, [%1+mmsize*2] + mova %5, [%1+mmsize*3] +%endmacro + +%macro hadamard8_16_wrapper 3 +cglobal hadamard8_diff_%1, 4, 4, %2 +%ifndef m8 + %assign pad %3*mmsize-(4+stack_offset&(mmsize-1)) + SUB rsp, pad +%endif + call hadamard8x8_diff_%1 +%ifndef m8 + ADD rsp, pad +%endif + RET + +cglobal hadamard8_diff16_%1, 5, 6, %2 +%ifndef m8 + %assign pad %3*mmsize-(4+stack_offset&(mmsize-1)) + SUB rsp, pad +%endif + + call hadamard8x8_diff_%1 + mov r5d, eax + + add r1, 8 + add r2, 8 + call hadamard8x8_diff_%1 + add r5d, eax + + cmp r4d, 16 + jne .done + + lea r1, [r1+r3*8-8] + lea r2, [r2+r3*8-8] + call hadamard8x8_diff_%1 + add r5d, eax + + add r1, 8 + add r2, 8 + call hadamard8x8_diff_%1 + add r5d, eax + +.done: + mov eax, r5d +%ifndef m8 + ADD rsp, pad +%endif + RET +%endmacro + +%macro HADAMARD8_DIFF_MMX 1 +ALIGN 16 +; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, +; int stride, int h) +; r0 = void *s = unused, int h = unused (always 8) +; note how r1, r2 and r3 are not clobbered in this function, so 16x16 +; can simply call this 2x2x (and that's why we access rsp+gprsize +; everywhere, which is rsp of calling func +hadamard8x8_diff_%1: + lea r0, [r3*3] + + ; first 4x8 pixels + DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60 + HADAMARD8 + mova [rsp+gprsize+0x60], m7 + TRANSPOSE4x4W 0, 1, 2, 3, 7 + STORE4 rsp+gprsize, m0, m1, m2, m3 + mova m7, [rsp+gprsize+0x60] + TRANSPOSE4x4W 4, 5, 6, 7, 0 + STORE4 rsp+gprsize+0x40, m4, m5, m6, m7 + + ; second 4x8 pixels + DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60 + HADAMARD8 + mova [rsp+gprsize+0x60], m7 + TRANSPOSE4x4W 0, 1, 2, 3, 7 + STORE4 rsp+gprsize+0x20, m0, m1, m2, m3 + mova m7, [rsp+gprsize+0x60] + TRANSPOSE4x4W 4, 5, 6, 7, 0 + + LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3 + HADAMARD8 + ABS_SUM_8x8_32 rsp+gprsize+0x60 + mova [rsp+gprsize+0x60], m0 + + LOAD4 rsp+gprsize , m0, m1, m2, m3 + LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7 + HADAMARD8 + ABS_SUM_8x8_32 rsp+gprsize + paddusw m0, [rsp+gprsize+0x60] + + HSUM m0, m1, eax + and rax, 0xFFFF + ret + +hadamard8_16_wrapper %1, 0, 14 +%endmacro + +%macro HADAMARD8_DIFF_SSE2 2 +hadamard8x8_diff_%1: + lea r0, [r3*3] + DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize + HADAMARD8 +%if ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize] +%endif + HADAMARD8 + ABS_SUM_8x8 rsp+gprsize + HSUM_SSE2 m0, m1, eax + and eax, 0xFFFF + ret + +hadamard8_16_wrapper %1, %2, 3 +%endmacro + +INIT_MMX +%define ABS1 ABS1_MMX +%define HSUM HSUM_MMX +HADAMARD8_DIFF_MMX mmx + +%define ABS1 ABS1_MMX2 +%define HSUM HSUM_MMX2 +HADAMARD8_DIFF_MMX mmx2 + +INIT_XMM +%define ABS2 ABS2_MMX2 +%if ARCH_X86_64 +%define ABS_SUM_8x8 ABS_SUM_8x8_64 +%else +%define ABS_SUM_8x8 ABS_SUM_8x8_32 +%endif +HADAMARD8_DIFF_SSE2 sse2, 10 + +%define ABS2 ABS2_SSSE3 +%define ABS_SUM_8x8 ABS_SUM_8x8_64 +HADAMARD8_DIFF_SSE2 ssse3, 9 + +INIT_XMM +; 
sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) +cglobal sse16_sse2, 5, 5, 8 + shr r4d, 1 + pxor m0, m0 ; mm0 = 0 + pxor m7, m7 ; mm7 holds the sum + +.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned + movu m1, [r1 ] ; mm1 = pix1[0][0-15] + movu m2, [r2 ] ; mm2 = pix2[0][0-15] + movu m3, [r1+r3] ; mm3 = pix1[1][0-15] + movu m4, [r2+r3] ; mm4 = pix2[1][0-15] + + ; todo: mm1-mm2, mm3-mm4 + ; algo: subtract mm1 from mm2 with saturation and vice versa + ; OR the result to get the absolute difference + mova m5, m1 + mova m6, m3 + psubusb m1, m2 + psubusb m3, m4 + psubusb m2, m5 + psubusb m4, m6 + + por m2, m1 + por m4, m3 + + ; now convert to 16-bit vectors so we can square them + mova m1, m2 + mova m3, m4 + + punpckhbw m2, m0 + punpckhbw m4, m0 + punpcklbw m1, m0 ; mm1 not spread over (mm1,mm2) + punpcklbw m3, m0 ; mm4 not spread over (mm3,mm4) + + pmaddwd m2, m2 + pmaddwd m4, m4 + pmaddwd m1, m1 + pmaddwd m3, m3 + + lea r1, [r1+r3*2] ; pix1 += 2*line_size + lea r2, [r2+r3*2] ; pix2 += 2*line_size + + paddd m1, m2 + paddd m3, m4 + paddd m7, m1 + paddd m7, m3 + + dec r4 + jnz .next2lines + + mova m1, m7 + psrldq m7, 8 ; shift hi qword to lo + paddd m7, m1 + mova m1, m7 + psrldq m7, 4 ; shift hi dword to lo + paddd m7, m1 + movd eax, m7 ; return value + RET diff --git a/libavcodec/x86/dsputilenc_yasm.asm b/libavcodec/x86/dsputilenc_yasm.asm deleted file mode 100644 index b7078f10a9..0000000000 --- a/libavcodec/x86/dsputilenc_yasm.asm +++ /dev/null @@ -1,342 +0,0 @@ -;***************************************************************************** -;* MMX optimized DSP utils -;***************************************************************************** -;* Copyright (c) 2000, 2001 Fabrice Bellard -;* Copyright (c) 2002-2004 Michael Niedermayer -;* -;* This file is part of Libav. -;* -;* Libav is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* Libav is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;***************************************************************************** - -%include "x86inc.asm" -%include "x86util.asm" - -SECTION .text - -%macro DIFF_PIXELS_1 4 - movh %1, %3 - movh %2, %4 - punpcklbw %2, %1 - punpcklbw %1, %1 - psubw %1, %2 -%endmacro - -; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3 -; %6=temporary storage location -; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64) -%macro DIFF_PIXELS_8 6 - DIFF_PIXELS_1 m0, m7, [%1 +%3], [%2 +%3] - DIFF_PIXELS_1 m1, m7, [%1+%4 +%3], [%2+%4 +%3] - DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3] - add %1, %5 - add %2, %5 - DIFF_PIXELS_1 m3, m7, [%1 +%3], [%2 +%3] - DIFF_PIXELS_1 m4, m7, [%1+%4 +%3], [%2+%4 +%3] - DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3] - DIFF_PIXELS_1 m6, m7, [%1+%5 +%3], [%2+%5 +%3] -%ifdef m8 - DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3] -%else - mova [%6], m0 - DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3] - mova m0, [%6] -%endif - sub %1, %5 - sub %2, %5 -%endmacro - -%macro HADAMARD8 0 - SUMSUB_BADC w, 0, 1, 2, 3 - SUMSUB_BADC w, 4, 5, 6, 7 - SUMSUB_BADC w, 0, 2, 1, 3 - SUMSUB_BADC w, 4, 6, 5, 7 - SUMSUB_BADC w, 0, 4, 1, 5 - SUMSUB_BADC w, 2, 6, 3, 7 -%endmacro - -%macro ABS1_SUM 3 - ABS1 %1, %2 - paddusw %3, %1 -%endmacro - -%macro ABS2_SUM 6 - ABS2 %1, %2, %3, %4 - paddusw %5, %1 - paddusw %6, %2 -%endmacro - -%macro ABS_SUM_8x8_64 1 - ABS2 m0, m1, m8, m9 - ABS2_SUM m2, m3, m8, m9, m0, m1 - ABS2_SUM m4, m5, m8, m9, m0, m1 - ABS2_SUM m6, m7, m8, m9, m0, m1 - paddusw m0, m1 -%endmacro - -%macro ABS_SUM_8x8_32 1 - mova [%1], m7 - ABS1 m0, m7 - ABS1 m1, m7 - ABS1_SUM m2, m7, m0 - ABS1_SUM m3, m7, m1 - ABS1_SUM m4, m7, m0 - ABS1_SUM m5, m7, m1 - ABS1_SUM m6, m7, m0 - mova m2, [%1] - ABS1_SUM m2, m7, m1 - paddusw m0, m1 -%endmacro - -; FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to -; about 100k on extreme inputs. But that's very unlikely to occur in natural video, -; and it's even more unlikely to not have any alternative mvs/modes with lower cost. 
-%macro HSUM_MMX 3 - mova %2, %1 - psrlq %1, 32 - paddusw %1, %2 - mova %2, %1 - psrlq %1, 16 - paddusw %1, %2 - movd %3, %1 -%endmacro - -%macro HSUM_MMX2 3 - pshufw %2, %1, 0xE - paddusw %1, %2 - pshufw %2, %1, 0x1 - paddusw %1, %2 - movd %3, %1 -%endmacro - -%macro HSUM_SSE2 3 - movhlps %2, %1 - paddusw %1, %2 - pshuflw %2, %1, 0xE - paddusw %1, %2 - pshuflw %2, %1, 0x1 - paddusw %1, %2 - movd %3, %1 -%endmacro - -%macro STORE4 5 - mova [%1+mmsize*0], %2 - mova [%1+mmsize*1], %3 - mova [%1+mmsize*2], %4 - mova [%1+mmsize*3], %5 -%endmacro - -%macro LOAD4 5 - mova %2, [%1+mmsize*0] - mova %3, [%1+mmsize*1] - mova %4, [%1+mmsize*2] - mova %5, [%1+mmsize*3] -%endmacro - -%macro hadamard8_16_wrapper 3 -cglobal hadamard8_diff_%1, 4, 4, %2 -%ifndef m8 - %assign pad %3*mmsize-(4+stack_offset&(mmsize-1)) - SUB rsp, pad -%endif - call hadamard8x8_diff_%1 -%ifndef m8 - ADD rsp, pad -%endif - RET - -cglobal hadamard8_diff16_%1, 5, 6, %2 -%ifndef m8 - %assign pad %3*mmsize-(4+stack_offset&(mmsize-1)) - SUB rsp, pad -%endif - - call hadamard8x8_diff_%1 - mov r5d, eax - - add r1, 8 - add r2, 8 - call hadamard8x8_diff_%1 - add r5d, eax - - cmp r4d, 16 - jne .done - - lea r1, [r1+r3*8-8] - lea r2, [r2+r3*8-8] - call hadamard8x8_diff_%1 - add r5d, eax - - add r1, 8 - add r2, 8 - call hadamard8x8_diff_%1 - add r5d, eax - -.done: - mov eax, r5d -%ifndef m8 - ADD rsp, pad -%endif - RET -%endmacro - -%macro HADAMARD8_DIFF_MMX 1 -ALIGN 16 -; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, -; int stride, int h) -; r0 = void *s = unused, int h = unused (always 8) -; note how r1, r2 and r3 are not clobbered in this function, so 16x16 -; can simply call this 2x2x (and that's why we access rsp+gprsize -; everywhere, which is rsp of calling func -hadamard8x8_diff_%1: - lea r0, [r3*3] - - ; first 4x8 pixels - DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60 - HADAMARD8 - mova [rsp+gprsize+0x60], m7 - TRANSPOSE4x4W 0, 1, 2, 3, 7 - STORE4 rsp+gprsize, m0, m1, m2, m3 - mova m7, [rsp+gprsize+0x60] - TRANSPOSE4x4W 4, 5, 6, 7, 0 - STORE4 rsp+gprsize+0x40, m4, m5, m6, m7 - - ; second 4x8 pixels - DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60 - HADAMARD8 - mova [rsp+gprsize+0x60], m7 - TRANSPOSE4x4W 0, 1, 2, 3, 7 - STORE4 rsp+gprsize+0x20, m0, m1, m2, m3 - mova m7, [rsp+gprsize+0x60] - TRANSPOSE4x4W 4, 5, 6, 7, 0 - - LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3 - HADAMARD8 - ABS_SUM_8x8_32 rsp+gprsize+0x60 - mova [rsp+gprsize+0x60], m0 - - LOAD4 rsp+gprsize , m0, m1, m2, m3 - LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7 - HADAMARD8 - ABS_SUM_8x8_32 rsp+gprsize - paddusw m0, [rsp+gprsize+0x60] - - HSUM m0, m1, eax - and rax, 0xFFFF - ret - -hadamard8_16_wrapper %1, 0, 14 -%endmacro - -%macro HADAMARD8_DIFF_SSE2 2 -hadamard8x8_diff_%1: - lea r0, [r3*3] - DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize - HADAMARD8 -%if ARCH_X86_64 - TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 -%else - TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize] -%endif - HADAMARD8 - ABS_SUM_8x8 rsp+gprsize - HSUM_SSE2 m0, m1, eax - and eax, 0xFFFF - ret - -hadamard8_16_wrapper %1, %2, 3 -%endmacro - -INIT_MMX -%define ABS1 ABS1_MMX -%define HSUM HSUM_MMX -HADAMARD8_DIFF_MMX mmx - -%define ABS1 ABS1_MMX2 -%define HSUM HSUM_MMX2 -HADAMARD8_DIFF_MMX mmx2 - -INIT_XMM -%define ABS2 ABS2_MMX2 -%if ARCH_X86_64 -%define ABS_SUM_8x8 ABS_SUM_8x8_64 -%else -%define ABS_SUM_8x8 ABS_SUM_8x8_32 -%endif -HADAMARD8_DIFF_SSE2 sse2, 10 - -%define ABS2 ABS2_SSSE3 -%define ABS_SUM_8x8 ABS_SUM_8x8_64 -HADAMARD8_DIFF_SSE2 ssse3, 9 - -INIT_XMM -; 
sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) -cglobal sse16_sse2, 5, 5, 8 - shr r4d, 1 - pxor m0, m0 ; mm0 = 0 - pxor m7, m7 ; mm7 holds the sum - -.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned - movu m1, [r1 ] ; mm1 = pix1[0][0-15] - movu m2, [r2 ] ; mm2 = pix2[0][0-15] - movu m3, [r1+r3] ; mm3 = pix1[1][0-15] - movu m4, [r2+r3] ; mm4 = pix2[1][0-15] - - ; todo: mm1-mm2, mm3-mm4 - ; algo: subtract mm1 from mm2 with saturation and vice versa - ; OR the result to get the absolute difference - mova m5, m1 - mova m6, m3 - psubusb m1, m2 - psubusb m3, m4 - psubusb m2, m5 - psubusb m4, m6 - - por m2, m1 - por m4, m3 - - ; now convert to 16-bit vectors so we can square them - mova m1, m2 - mova m3, m4 - - punpckhbw m2, m0 - punpckhbw m4, m0 - punpcklbw m1, m0 ; mm1 not spread over (mm1,mm2) - punpcklbw m3, m0 ; mm4 not spread over (mm3,mm4) - - pmaddwd m2, m2 - pmaddwd m4, m4 - pmaddwd m1, m1 - pmaddwd m3, m3 - - lea r1, [r1+r3*2] ; pix1 += 2*line_size - lea r2, [r2+r3*2] ; pix2 += 2*line_size - - paddd m1, m2 - paddd m3, m4 - paddd m7, m1 - paddd m7, m3 - - dec r4 - jnz .next2lines - - mova m1, m7 - psrldq m7, 8 ; shift hi qword to lo - paddd m7, m1 - mova m1, m7 - psrldq m7, 4 ; shift hi dword to lo - paddd m7, m1 - movd eax, m7 ; return value - RET diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp.asm new file mode 100644 index 0000000000..ced2b5ba88 --- /dev/null +++ b/libavcodec/x86/vc1dsp.asm @@ -0,0 +1,320 @@ +;****************************************************************************** +;* VC1 deblocking optimizations +;* Copyright (c) 2009 David Conrad +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +cextern pw_4 +cextern pw_5 + +section .text + +; dst_low, dst_high (src), zero +; zero-extends one vector from 8 to 16 bits +%macro UNPACK_8TO16 4 + mova m%2, m%3 + punpckh%1 m%3, m%4 + punpckl%1 m%2, m%4 +%endmacro + +%macro STORE_4_WORDS_MMX 6 + movd %6d, %5 +%if mmsize==16 + psrldq %5, 4 +%else + psrlq %5, 32 +%endif + mov %1, %6w + shr %6, 16 + mov %2, %6w + movd %6d, %5 + mov %3, %6w + shr %6, 16 + mov %4, %6w +%endmacro + +%macro STORE_4_WORDS_SSE4 6 + pextrw %1, %5, %6+0 + pextrw %2, %5, %6+1 + pextrw %3, %5, %6+2 + pextrw %4, %5, %6+3 +%endmacro + +; in: p1 p0 q0 q1, clobbers p0 +; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3 +%macro VC1_LOOP_FILTER_A0 4 + psubw %1, %4 + psubw %2, %3 + paddw %1, %1 + pmullw %2, [pw_5] + psubw %1, %2 + paddw %1, [pw_4] + psraw %1, 3 +%endmacro + +; in: p0 q0 a0 a1 a2 +; m0 m1 m7 m6 m5 +; %1: size +; out: m0=p0' m1=q0' +%macro VC1_FILTER 1 + PABSW m4, m7 + PABSW m3, m6 + PABSW m2, m5 + mova m6, m4 + pminsw m3, m2 + pcmpgtw m6, m3 ; if (a2 < a0 || a1 < a0) + psubw m3, m4 + pmullw m3, [pw_5] ; 5*(a3 - a0) + PABSW m2, m3 + psraw m2, 3 ; abs(d/8) + pxor m7, m3 ; d_sign ^= a0_sign + + pxor m5, m5 + movd m3, r2d +%if %1 > 4 + punpcklbw m3, m3 +%endif + punpcklbw m3, m5 + pcmpgtw m3, m4 ; if (a0 < pq) + pand m6, m3 + + mova m3, m0 + psubw m3, m1 + PABSW m4, m3 + psraw m4, 1 + pxor m3, m7 ; d_sign ^ clip_sign + psraw m3, 15 + pminsw m2, m4 ; min(d, clip) + pcmpgtw m4, m5 + pand m6, m4 ; filt3 (C return value) + +; each set of 4 pixels is not filtered if the 3rd is not +%if mmsize==16 + pshuflw m4, m6, 0xaa +%if %1 > 4 + pshufhw m4, m4, 0xaa +%endif +%else + pshufw m4, m6, 0xaa +%endif + pandn m3, m4 + pand m2, m6 + pand m3, m2 ; d final + + psraw m7, 15 + pxor m3, m7 + psubw m3, m7 + psubw m0, m3 + paddw m1, m3 + packuswb m0, m0 + packuswb m1, m1 +%endmacro + +; 1st param: size of filter +; 2nd param: mov suffix equivalent to the filter size +%macro VC1_V_LOOP_FILTER 2 + pxor m5, m5 + mov%2 m6, [r4] + mov%2 m4, [r4+r1] + mov%2 m7, [r4+2*r1] + mov%2 m0, [r4+r3] + punpcklbw m6, m5 + punpcklbw m4, m5 + punpcklbw m7, m5 + punpcklbw m0, m5 + + VC1_LOOP_FILTER_A0 m6, m4, m7, m0 + mov%2 m1, [r0] + mov%2 m2, [r0+r1] + punpcklbw m1, m5 + punpcklbw m2, m5 + mova m4, m0 + VC1_LOOP_FILTER_A0 m7, m4, m1, m2 + mov%2 m3, [r0+2*r1] + mov%2 m4, [r0+r3] + punpcklbw m3, m5 + punpcklbw m4, m5 + mova m5, m1 + VC1_LOOP_FILTER_A0 m5, m2, m3, m4 + + VC1_FILTER %1 + mov%2 [r4+r3], m0 + mov%2 [r0], m1 +%endmacro + +; 1st param: size of filter +; NOTE: UNPACK_8TO16 this number of 8 bit numbers are in half a register +; 2nd (optional) param: temp register to use for storing words +%macro VC1_H_LOOP_FILTER 1-2 +%if %1 == 4 + movq m0, [r0 -4] + movq m1, [r0+ r1-4] + movq m2, [r0+2*r1-4] + movq m3, [r0+ r3-4] + TRANSPOSE4x4B 0, 1, 2, 3, 4 +%else + movq m0, [r0 -4] + movq m4, [r0+ r1-4] + movq m1, [r0+2*r1-4] + movq m5, [r0+ r3-4] + movq m2, [r4 -4] + movq m6, [r4+ r1-4] + movq m3, [r4+2*r1-4] + movq m7, [r4+ r3-4] + punpcklbw m0, m4 + punpcklbw m1, m5 + punpcklbw m2, m6 + punpcklbw m3, m7 + TRANSPOSE4x4W 0, 1, 2, 3, 4 +%endif + pxor m5, m5 + + UNPACK_8TO16 bw, 6, 0, 5 + UNPACK_8TO16 bw, 7, 1, 5 + VC1_LOOP_FILTER_A0 m6, m0, m7, m1 + 
UNPACK_8TO16 bw, 4, 2, 5 + mova m0, m1 ; m0 = p0 + VC1_LOOP_FILTER_A0 m7, m1, m4, m2 + UNPACK_8TO16 bw, 1, 3, 5 + mova m5, m4 + VC1_LOOP_FILTER_A0 m5, m2, m1, m3 + SWAP 1, 4 ; m1 = q0 + + VC1_FILTER %1 + punpcklbw m0, m1 +%if %0 > 1 + STORE_4_WORDS_MMX [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2 +%if %1 > 4 + psrldq m0, 4 + STORE_4_WORDS_MMX [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2 +%endif +%else + STORE_4_WORDS_SSE4 [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0 + STORE_4_WORDS_SSE4 [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4 +%endif +%endmacro + + +%macro START_V_FILTER 0 + mov r4, r0 + lea r3, [4*r1] + sub r4, r3 + lea r3, [r1+2*r1] + imul r2, 0x01010101 +%endmacro + +%macro START_H_FILTER 1 + lea r3, [r1+2*r1] +%if %1 > 4 + lea r4, [r0+4*r1] +%endif + imul r2, 0x01010101 +%endmacro + +%macro VC1_LF_MMX 1 +INIT_MMX +cglobal vc1_v_loop_filter_internal_%1 + VC1_V_LOOP_FILTER 4, d + ret + +cglobal vc1_h_loop_filter_internal_%1 + VC1_H_LOOP_FILTER 4, r4 + ret + +; void ff_vc1_v_loop_filter4_mmx2(uint8_t *src, int stride, int pq) +cglobal vc1_v_loop_filter4_%1, 3,5,0 + START_V_FILTER + call vc1_v_loop_filter_internal_%1 + RET + +; void ff_vc1_h_loop_filter4_mmx2(uint8_t *src, int stride, int pq) +cglobal vc1_h_loop_filter4_%1, 3,5,0 + START_H_FILTER 4 + call vc1_h_loop_filter_internal_%1 + RET + +; void ff_vc1_v_loop_filter8_mmx2(uint8_t *src, int stride, int pq) +cglobal vc1_v_loop_filter8_%1, 3,5,0 + START_V_FILTER + call vc1_v_loop_filter_internal_%1 + add r4, 4 + add r0, 4 + call vc1_v_loop_filter_internal_%1 + RET + +; void ff_vc1_h_loop_filter8_mmx2(uint8_t *src, int stride, int pq) +cglobal vc1_h_loop_filter8_%1, 3,5,0 + START_H_FILTER 4 + call vc1_h_loop_filter_internal_%1 + lea r0, [r0+4*r1] + call vc1_h_loop_filter_internal_%1 + RET +%endmacro + +%define PABSW PABSW_MMX2 +VC1_LF_MMX mmx2 + +INIT_XMM +; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq) +cglobal vc1_v_loop_filter8_sse2, 3,5,8 + START_V_FILTER + VC1_V_LOOP_FILTER 8, q + RET + +; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq) +cglobal vc1_h_loop_filter8_sse2, 3,6,8 + START_H_FILTER 8 + VC1_H_LOOP_FILTER 8, r5 + RET + +%define PABSW PABSW_SSSE3 + +INIT_MMX +; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq) +cglobal vc1_v_loop_filter4_ssse3, 3,5,0 + START_V_FILTER + VC1_V_LOOP_FILTER 4, d + RET + +; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq) +cglobal vc1_h_loop_filter4_ssse3, 3,5,0 + START_H_FILTER 4 + VC1_H_LOOP_FILTER 4, r4 + RET + +INIT_XMM +; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq) +cglobal vc1_v_loop_filter8_ssse3, 3,5,8 + START_V_FILTER + VC1_V_LOOP_FILTER 8, q + RET + +; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq) +cglobal vc1_h_loop_filter8_ssse3, 3,6,8 + START_H_FILTER 8 + VC1_H_LOOP_FILTER 8, r5 + RET + +; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq) +cglobal vc1_h_loop_filter8_sse4, 3,5,8 + START_H_FILTER 8 + VC1_H_LOOP_FILTER 8 + RET diff --git a/libavcodec/x86/vc1dsp_yasm.asm b/libavcodec/x86/vc1dsp_yasm.asm deleted file mode 100644 index ced2b5ba88..0000000000 --- a/libavcodec/x86/vc1dsp_yasm.asm +++ /dev/null @@ -1,320 +0,0 @@ -;****************************************************************************** -;* VC1 deblocking optimizations -;* Copyright (c) 2009 David Conrad -;* -;* This file is part of Libav. 
-;* -;* Libav is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* Libav is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "x86inc.asm" -%include "x86util.asm" - -cextern pw_4 -cextern pw_5 - -section .text - -; dst_low, dst_high (src), zero -; zero-extends one vector from 8 to 16 bits -%macro UNPACK_8TO16 4 - mova m%2, m%3 - punpckh%1 m%3, m%4 - punpckl%1 m%2, m%4 -%endmacro - -%macro STORE_4_WORDS_MMX 6 - movd %6d, %5 -%if mmsize==16 - psrldq %5, 4 -%else - psrlq %5, 32 -%endif - mov %1, %6w - shr %6, 16 - mov %2, %6w - movd %6d, %5 - mov %3, %6w - shr %6, 16 - mov %4, %6w -%endmacro - -%macro STORE_4_WORDS_SSE4 6 - pextrw %1, %5, %6+0 - pextrw %2, %5, %6+1 - pextrw %3, %5, %6+2 - pextrw %4, %5, %6+3 -%endmacro - -; in: p1 p0 q0 q1, clobbers p0 -; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3 -%macro VC1_LOOP_FILTER_A0 4 - psubw %1, %4 - psubw %2, %3 - paddw %1, %1 - pmullw %2, [pw_5] - psubw %1, %2 - paddw %1, [pw_4] - psraw %1, 3 -%endmacro - -; in: p0 q0 a0 a1 a2 -; m0 m1 m7 m6 m5 -; %1: size -; out: m0=p0' m1=q0' -%macro VC1_FILTER 1 - PABSW m4, m7 - PABSW m3, m6 - PABSW m2, m5 - mova m6, m4 - pminsw m3, m2 - pcmpgtw m6, m3 ; if (a2 < a0 || a1 < a0) - psubw m3, m4 - pmullw m3, [pw_5] ; 5*(a3 - a0) - PABSW m2, m3 - psraw m2, 3 ; abs(d/8) - pxor m7, m3 ; d_sign ^= a0_sign - - pxor m5, m5 - movd m3, r2d -%if %1 > 4 - punpcklbw m3, m3 -%endif - punpcklbw m3, m5 - pcmpgtw m3, m4 ; if (a0 < pq) - pand m6, m3 - - mova m3, m0 - psubw m3, m1 - PABSW m4, m3 - psraw m4, 1 - pxor m3, m7 ; d_sign ^ clip_sign - psraw m3, 15 - pminsw m2, m4 ; min(d, clip) - pcmpgtw m4, m5 - pand m6, m4 ; filt3 (C return value) - -; each set of 4 pixels is not filtered if the 3rd is not -%if mmsize==16 - pshuflw m4, m6, 0xaa -%if %1 > 4 - pshufhw m4, m4, 0xaa -%endif -%else - pshufw m4, m6, 0xaa -%endif - pandn m3, m4 - pand m2, m6 - pand m3, m2 ; d final - - psraw m7, 15 - pxor m3, m7 - psubw m3, m7 - psubw m0, m3 - paddw m1, m3 - packuswb m0, m0 - packuswb m1, m1 -%endmacro - -; 1st param: size of filter -; 2nd param: mov suffix equivalent to the filter size -%macro VC1_V_LOOP_FILTER 2 - pxor m5, m5 - mov%2 m6, [r4] - mov%2 m4, [r4+r1] - mov%2 m7, [r4+2*r1] - mov%2 m0, [r4+r3] - punpcklbw m6, m5 - punpcklbw m4, m5 - punpcklbw m7, m5 - punpcklbw m0, m5 - - VC1_LOOP_FILTER_A0 m6, m4, m7, m0 - mov%2 m1, [r0] - mov%2 m2, [r0+r1] - punpcklbw m1, m5 - punpcklbw m2, m5 - mova m4, m0 - VC1_LOOP_FILTER_A0 m7, m4, m1, m2 - mov%2 m3, [r0+2*r1] - mov%2 m4, [r0+r3] - punpcklbw m3, m5 - punpcklbw m4, m5 - mova m5, m1 - VC1_LOOP_FILTER_A0 m5, m2, m3, m4 - - VC1_FILTER %1 - mov%2 [r4+r3], m0 - mov%2 [r0], m1 -%endmacro - -; 1st param: size of filter -; NOTE: UNPACK_8TO16 this number of 8 bit numbers are in half a register -; 2nd (optional) param: temp register to use for storing words -%macro VC1_H_LOOP_FILTER 1-2 -%if %1 == 4 - movq m0, 
[r0 -4] - movq m1, [r0+ r1-4] - movq m2, [r0+2*r1-4] - movq m3, [r0+ r3-4] - TRANSPOSE4x4B 0, 1, 2, 3, 4 -%else - movq m0, [r0 -4] - movq m4, [r0+ r1-4] - movq m1, [r0+2*r1-4] - movq m5, [r0+ r3-4] - movq m2, [r4 -4] - movq m6, [r4+ r1-4] - movq m3, [r4+2*r1-4] - movq m7, [r4+ r3-4] - punpcklbw m0, m4 - punpcklbw m1, m5 - punpcklbw m2, m6 - punpcklbw m3, m7 - TRANSPOSE4x4W 0, 1, 2, 3, 4 -%endif - pxor m5, m5 - - UNPACK_8TO16 bw, 6, 0, 5 - UNPACK_8TO16 bw, 7, 1, 5 - VC1_LOOP_FILTER_A0 m6, m0, m7, m1 - UNPACK_8TO16 bw, 4, 2, 5 - mova m0, m1 ; m0 = p0 - VC1_LOOP_FILTER_A0 m7, m1, m4, m2 - UNPACK_8TO16 bw, 1, 3, 5 - mova m5, m4 - VC1_LOOP_FILTER_A0 m5, m2, m1, m3 - SWAP 1, 4 ; m1 = q0 - - VC1_FILTER %1 - punpcklbw m0, m1 -%if %0 > 1 - STORE_4_WORDS_MMX [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2 -%if %1 > 4 - psrldq m0, 4 - STORE_4_WORDS_MMX [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2 -%endif -%else - STORE_4_WORDS_SSE4 [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0 - STORE_4_WORDS_SSE4 [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4 -%endif -%endmacro - - -%macro START_V_FILTER 0 - mov r4, r0 - lea r3, [4*r1] - sub r4, r3 - lea r3, [r1+2*r1] - imul r2, 0x01010101 -%endmacro - -%macro START_H_FILTER 1 - lea r3, [r1+2*r1] -%if %1 > 4 - lea r4, [r0+4*r1] -%endif - imul r2, 0x01010101 -%endmacro - -%macro VC1_LF_MMX 1 -INIT_MMX -cglobal vc1_v_loop_filter_internal_%1 - VC1_V_LOOP_FILTER 4, d - ret - -cglobal vc1_h_loop_filter_internal_%1 - VC1_H_LOOP_FILTER 4, r4 - ret - -; void ff_vc1_v_loop_filter4_mmx2(uint8_t *src, int stride, int pq) -cglobal vc1_v_loop_filter4_%1, 3,5,0 - START_V_FILTER - call vc1_v_loop_filter_internal_%1 - RET - -; void ff_vc1_h_loop_filter4_mmx2(uint8_t *src, int stride, int pq) -cglobal vc1_h_loop_filter4_%1, 3,5,0 - START_H_FILTER 4 - call vc1_h_loop_filter_internal_%1 - RET - -; void ff_vc1_v_loop_filter8_mmx2(uint8_t *src, int stride, int pq) -cglobal vc1_v_loop_filter8_%1, 3,5,0 - START_V_FILTER - call vc1_v_loop_filter_internal_%1 - add r4, 4 - add r0, 4 - call vc1_v_loop_filter_internal_%1 - RET - -; void ff_vc1_h_loop_filter8_mmx2(uint8_t *src, int stride, int pq) -cglobal vc1_h_loop_filter8_%1, 3,5,0 - START_H_FILTER 4 - call vc1_h_loop_filter_internal_%1 - lea r0, [r0+4*r1] - call vc1_h_loop_filter_internal_%1 - RET -%endmacro - -%define PABSW PABSW_MMX2 -VC1_LF_MMX mmx2 - -INIT_XMM -; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq) -cglobal vc1_v_loop_filter8_sse2, 3,5,8 - START_V_FILTER - VC1_V_LOOP_FILTER 8, q - RET - -; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq) -cglobal vc1_h_loop_filter8_sse2, 3,6,8 - START_H_FILTER 8 - VC1_H_LOOP_FILTER 8, r5 - RET - -%define PABSW PABSW_SSSE3 - -INIT_MMX -; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq) -cglobal vc1_v_loop_filter4_ssse3, 3,5,0 - START_V_FILTER - VC1_V_LOOP_FILTER 4, d - RET - -; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq) -cglobal vc1_h_loop_filter4_ssse3, 3,5,0 - START_H_FILTER 4 - VC1_H_LOOP_FILTER 4, r4 - RET - -INIT_XMM -; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq) -cglobal vc1_v_loop_filter8_ssse3, 3,5,8 - START_V_FILTER - VC1_V_LOOP_FILTER 8, q - RET - -; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq) -cglobal vc1_h_loop_filter8_ssse3, 3,6,8 - START_H_FILTER 8 - VC1_H_LOOP_FILTER 8, r5 - RET - -; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq) -cglobal vc1_h_loop_filter8_sse4, 3,5,8 - START_H_FILTER 8 - VC1_H_LOOP_FILTER 8 - RET -- cgit 
v1.2.3 From f4bb38cc26a4c91707194ea0b2290a028c546f7d Mon Sep 17 00:00:00 2001 From: Derek Buitenhuis Date: Fri, 10 Aug 2012 16:05:47 +0000 Subject: cllc: Rename some funcs to represent what they actually do This is in preparation for adding support for other colorspaces and coding types. Signed-off-by: Derek Buitenhuis --- libavcodec/cllc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/libavcodec/cllc.c b/libavcodec/cllc.c index d23da181ba..703f546ea9 100644 --- a/libavcodec/cllc.c +++ b/libavcodec/cllc.c @@ -74,8 +74,8 @@ static int read_code_table(CLLCContext *ctx, GetBitContext *gb, VLC *vlc) codes, 2, 2, symbols, 1, 1, 0); } -static int read_line(CLLCContext *ctx, GetBitContext *gb, int *top_left, - VLC *vlc, uint8_t *outbuf) +static int read_rgb24_component_line(CLLCContext *ctx, GetBitContext *gb, + int *top_left, VLC *vlc, uint8_t *outbuf) { uint8_t *dst; int pred, code; @@ -104,7 +104,7 @@ static int read_line(CLLCContext *ctx, GetBitContext *gb, int *top_left, return 0; } -static int decode_bgr24_frame(CLLCContext *ctx, GetBitContext *gb, AVFrame *pic) +static int decode_rgb24_frame(CLLCContext *ctx, GetBitContext *gb, AVFrame *pic) { AVCodecContext *avctx = ctx->avctx; uint8_t *dst; @@ -137,7 +137,7 @@ static int decode_bgr24_frame(CLLCContext *ctx, GetBitContext *gb, AVFrame *pic) /* Read in and restore every line */ for (i = 0; i < avctx->height; i++) { for (j = 0; j < 3; j++) - read_line(ctx, gb, &pred[j], &vlc[j], &dst[j]); + read_rgb24_component_line(ctx, gb, &pred[j], &vlc[j], &dst[j]); dst += pic->linesize[0]; } @@ -219,7 +219,7 @@ static int cllc_decode_frame(AVCodecContext *avctx, void *data, return ret; } - ret = decode_bgr24_frame(ctx, &gb, pic); + ret = decode_rgb24_frame(ctx, &gb, pic); if (ret < 0) return ret; -- cgit v1.2.3 From 7fda47d53b5489d6140b9f490bfc9c531521d46b Mon Sep 17 00:00:00 2001 From: Derek Buitenhuis Date: Fri, 10 Aug 2012 17:19:03 +0000 Subject: cllc: Add support for QRGB Signed-off-by: Derek Buitenhuis --- libavcodec/cllc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/libavcodec/cllc.c b/libavcodec/cllc.c index 703f546ea9..68668ef39d 100644 --- a/libavcodec/cllc.c +++ b/libavcodec/cllc.c @@ -210,6 +210,7 @@ static int cllc_decode_frame(AVCodecContext *avctx, void *data, switch (coding_type) { case 1: + case 2: avctx->pix_fmt = PIX_FMT_RGB24; avctx->bits_per_raw_sample = 8; -- cgit v1.2.3 From 17c11cef9f99d31e31dd9176ee2f26bdf6e5d351 Mon Sep 17 00:00:00 2001 From: Derek Buitenhuis Date: Fri, 10 Aug 2012 16:05:49 +0000 Subject: cllc: Implement ARGB support Signed-off-by: Derek Buitenhuis --- libavcodec/cllc.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) diff --git a/libavcodec/cllc.c b/libavcodec/cllc.c index 68668ef39d..a1514179b7 100644 --- a/libavcodec/cllc.c +++ b/libavcodec/cllc.c @@ -74,6 +74,80 @@ static int read_code_table(CLLCContext *ctx, GetBitContext *gb, VLC *vlc) codes, 2, 2, symbols, 1, 1, 0); } +/* + * Unlike the RGB24 read/restore, which reads in a component at a time, + * ARGB read/restore reads in ARGB quads. 
+ */ +static int read_argb_line(CLLCContext *ctx, GetBitContext *gb, int *top_left, + VLC *vlc, uint8_t *outbuf) +{ + uint8_t *dst; + int pred[4]; + int code; + int i; + + OPEN_READER(bits, gb); + + dst = outbuf; + pred[0] = top_left[0]; + pred[1] = top_left[1]; + pred[2] = top_left[2]; + pred[3] = top_left[3]; + + for (i = 0; i < ctx->avctx->width; i++) { + /* Always get the alpha component */ + UPDATE_CACHE(bits, gb); + GET_VLC(code, bits, gb, vlc[0].table, 7, 2); + + pred[0] += code; + dst[0] = pred[0]; + + /* Skip the components if they are entirely transparent */ + if (dst[0]) { + /* Red */ + UPDATE_CACHE(bits, gb); + GET_VLC(code, bits, gb, vlc[1].table, 7, 2); + + pred[1] += code; + dst[1] = pred[1]; + + /* Green */ + UPDATE_CACHE(bits, gb); + GET_VLC(code, bits, gb, vlc[2].table, 7, 2); + + pred[2] += code; + dst[2] = pred[2]; + + /* Blue */ + UPDATE_CACHE(bits, gb); + GET_VLC(code, bits, gb, vlc[3].table, 7, 2); + + pred[3] += code; + dst[3] = pred[3]; + } else { + dst[1] = 0; + dst[2] = 0; + dst[3] = 0; + } + + dst += 4; + } + + CLOSE_READER(bits, gb); + + dst -= 4 * ctx->avctx->width; + top_left[0] = dst[0]; + + /* Only stash components if they are not transparent */ + if (top_left[0]) { + top_left[1] = dst[1]; + top_left[2] = dst[2]; + top_left[3] = dst[3]; + } + + return 0; +} + static int read_rgb24_component_line(CLLCContext *ctx, GetBitContext *gb, int *top_left, VLC *vlc, uint8_t *outbuf) { @@ -104,6 +178,50 @@ static int read_rgb24_component_line(CLLCContext *ctx, GetBitContext *gb, return 0; } +static int decode_argb_frame(CLLCContext *ctx, GetBitContext *gb, AVFrame *pic) +{ + AVCodecContext *avctx = ctx->avctx; + uint8_t *dst; + int pred[4]; + int ret; + int i, j; + VLC vlc[4]; + + pred[0] = 0; + pred[1] = 0x80; + pred[2] = 0x80; + pred[3] = 0x80; + + dst = pic->data[0]; + + skip_bits(gb, 16); + + /* Read in code table for each plane */ + for (i = 0; i < 4; i++) { + ret = read_code_table(ctx, gb, &vlc[i]); + if (ret < 0) { + for (j = 0; j <= i; j++) + ff_free_vlc(&vlc[j]); + + av_log(ctx->avctx, AV_LOG_ERROR, + "Could not read code table %d.\n", i); + return ret; + } + } + + /* Read in and restore every line */ + for (i = 0; i < avctx->height; i++) { + read_argb_line(ctx, gb, pred, vlc, dst); + + dst += pic->linesize[0]; + } + + for (i = 0; i < 4; i++) + ff_free_vlc(&vlc[i]); + + return 0; +} + static int decode_rgb24_frame(CLLCContext *ctx, GetBitContext *gb, AVFrame *pic) { AVCodecContext *avctx = ctx->avctx; @@ -224,6 +342,21 @@ static int cllc_decode_frame(AVCodecContext *avctx, void *data, if (ret < 0) return ret; + break; + case 3: + avctx->pix_fmt = PIX_FMT_ARGB; + avctx->bits_per_raw_sample = 8; + + ret = avctx->get_buffer(avctx, pic); + if (ret < 0) { + av_log(avctx, AV_LOG_ERROR, "Could not allocate buffer.\n"); + return ret; + } + + ret = decode_argb_frame(ctx, &gb, pic); + if (ret < 0) + return ret; + break; default: av_log(avctx, AV_LOG_ERROR, "Unknown coding type: %d.\n", coding_type); -- cgit v1.2.3 From 6c4975eaafd7f8f91e81ad8d6be744a434241fd3 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Sat, 11 Aug 2012 01:15:19 +0100 Subject: libavutil: add saturating addition functions Fixed-point audio codecs often use saturating arithmetic, and special instructions for these operations are common. 
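For illustration only (not part of the patch): a minimal standalone sketch of the semantics the new helpers provide, with placeholder names. The widen-and-clamp form below matches the generic C fallbacks added here; on ARMv6 the same operations map onto the qadd/qdadd instructions used in the arm/intmath.h part of the diff.

    #include <stdint.h>
    #include <stdio.h>

    /* Saturating 32-bit add: clamp to the int32 range instead of wrapping. */
    static int sat_add32(int a, int b)
    {
        int64_t s = (int64_t)a + b;          /* widen so the sum cannot overflow */
        if (s > INT32_MAX) return INT32_MAX;
        if (s < INT32_MIN) return INT32_MIN;
        return (int)s;
    }

    /* a + 2*b with saturation at both steps, mirroring the dadd variant. */
    static int sat_dadd32(int a, int b)
    {
        return sat_add32(a, sat_add32(b, b));
    }

    int main(void)
    {
        printf("%d\n", sat_add32(INT32_MAX, 1));      /* 2147483647, no wraparound */
        printf("%d\n", sat_dadd32(1 << 30, 1 << 30)); /* 2147483647, both stages clamp */
        return 0;
    }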
Signed-off-by: Mans Rullgard --- libavutil/arm/intmath.h | 15 +++++++++++++++ libavutil/common.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/libavutil/arm/intmath.h b/libavutil/arm/intmath.h index ce73404b37..d5a343c95f 100644 --- a/libavutil/arm/intmath.h +++ b/libavutil/arm/intmath.h @@ -83,6 +83,21 @@ static av_always_inline av_const unsigned av_clip_uintp2_arm(int a, int p) return x; } +#define av_sat_add32 av_sat_add32_arm +static av_always_inline int av_sat_add32_arm(int a, int b) +{ + int r; + __asm__ ("qadd %0, %1, %2" : "=r"(r) : "r"(a), "r"(b)); + return r; +} + +#define av_sat_dadd32 av_sat_dadd32_arm +static av_always_inline int av_sat_dadd32_arm(int a, int b) +{ + int r; + __asm__ ("qdadd %0, %1, %2" : "=r"(r) : "r"(a), "r"(b)); + return r; +} #else /* HAVE_ARMV6 */ diff --git a/libavutil/common.h b/libavutil/common.h index c99d858472..433e8e490a 100644 --- a/libavutil/common.h +++ b/libavutil/common.h @@ -181,6 +181,30 @@ static av_always_inline av_const unsigned av_clip_uintp2_c(int a, int p) else return a; } +/** + * Add two signed 32-bit values with saturation. + * + * @param a one value + * @param b another value + * @return sum with signed saturation + */ +static av_always_inline int av_sat_add32_c(int a, int b) +{ + return av_clipl_int32((int64_t)a + b); +} + +/** + * Add a doubled value to another value with saturation at both stages. + * + * @param a first value + * @param b value doubled and added to a + * @return sum with signed saturation + */ +static av_always_inline int av_sat_dadd32_c(int a, int b) +{ + return av_sat_add32(a, av_sat_add32(b, b)); +} + /** * Clip a float value into the amin-amax range. * @param a value to clip @@ -387,6 +411,12 @@ static av_always_inline av_const int av_popcount64_c(uint64_t x) #ifndef av_clip_uintp2 # define av_clip_uintp2 av_clip_uintp2_c #endif +#ifndef av_sat_add32 +# define av_sat_add32 av_sat_add32_c +#endif +#ifndef av_sat_dadd32 +# define av_sat_dadd32 av_sat_dadd32_c +#endif #ifndef av_clipf # define av_clipf av_clipf_c #endif -- cgit v1.2.3 From fddc5b9bea39968ed1f45c667869428865de7626 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Sat, 11 Aug 2012 04:18:53 +0100 Subject: celp: optimise ff_celp_lp_synthesis_filter() Adding instead of subtracting the products in the loop allows the compiler to generate more efficient multiply-accumulate instructions when 16-bit multiply-subtract is not available. ARM has only multiply-accumulate for 16-bit operands. In general, if only one variant exists, it is usually accumulate rather than subtract. In the same spirit, using the dedicated saturation function enables use of any special optimised versions of this. 
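A toy model of the sign flip described above (illustration only, not the codec loop): accumulating the products into a negated running sum and negating once at the end is equivalent to subtracting each product, but leaves only multiply-adds inside the loop.

    #include <stdio.h>

    /* r - sum(c[i]*x[i]) computed with a multiply-subtract per iteration ... */
    static int filter_sub(const short *c, const short *x, int n, int r)
    {
        int sum = r;
        for (int i = 0; i < n; i++)
            sum -= c[i] * x[i];
        return sum;
    }

    /* ... and the same value computed with only multiply-accumulates. */
    static int filter_acc(const short *c, const short *x, int n, int r)
    {
        int sum = -r;
        for (int i = 0; i < n; i++)
            sum += c[i] * x[i];
        return -sum;                 /* single negation outside the loop */
    }

    int main(void)
    {
        const short c[4] = { 3, -2, 7, 1 }, x[4] = { 10, 4, -5, 6 };
        printf("%d %d\n", filter_sub(c, x, 4, 100), filter_acc(c, x, 4, 100));
        return 0;                    /* prints "107 107" */
    }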
Signed-off-by: Mans Rullgard --- libavcodec/celp_filters.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/libavcodec/celp_filters.c b/libavcodec/celp_filters.c index 4e5bcda79a..d764d19219 100644 --- a/libavcodec/celp_filters.c +++ b/libavcodec/celp_filters.c @@ -63,17 +63,16 @@ int ff_celp_lp_synthesis_filter(int16_t *out, const int16_t *filter_coeffs, int i,n; for (n = 0; n < buffer_length; n++) { - int sum = rounder; + int sum = -rounder, sum1; for (i = 1; i <= filter_length; i++) - sum -= filter_coeffs[i-1] * out[n-i]; + sum += filter_coeffs[i-1] * out[n-i]; - sum = ((sum >> 12) + in[n]) >> shift; + sum1 = ((-sum >> 12) + in[n]) >> shift; + sum = av_clip_int16(sum1); + + if (stop_on_overflow && sum != sum1) + return 1; - if (sum + 0x8000 > 0xFFFFU) { - if (stop_on_overflow) - return 1; - sum = (sum >> 31) ^ 32767; - } out[n] = sum; } -- cgit v1.2.3 From 8b0de73464fcb110dce2f5601e4e27b2cbd33d20 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Sat, 11 Aug 2012 01:54:15 +0100 Subject: g723.1: deobfuscate "(x << 4) - x" to "15 * x" The compiler performs this optimisation. Signed-off-by: Mans Rullgard --- libavcodec/g723_1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c index 7d8a48e18a..d4158ffef7 100644 --- a/libavcodec/g723_1.c +++ b/libavcodec/g723_1.c @@ -914,7 +914,7 @@ static void gain_scale(G723_1_Context *p, int16_t * buf, int energy) } for (i = 0; i < SUBFRAME_LEN; i++) { - p->pf_gain = ((p->pf_gain << 4) - p->pf_gain + gain + (1 << 3)) >> 4; + p->pf_gain = (15 * p->pf_gain + gain + (1 << 3)) >> 4; buf[i] = av_clip_int16((buf[i] * (p->pf_gain + (p->pf_gain >> 4)) + (1 << 10)) >> 11); } -- cgit v1.2.3 From 5a43eba956d095157359e1abf639984c8ca508e4 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Fri, 10 Aug 2012 15:41:47 +0100 Subject: g723.1: remove unnecessary argument 'shift' from dot_product() The 'shift' argument is always 1 so there is no need to pass it explicitly in every call. 
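One way to read the fixed shift (an assumption on my part, not stated in the commit): the helper is the usual Q15 fractional multiply-accumulate, where doubling the 32-bit product turns a Q30 value into Q31. A worked example of that doubling:

    #include <stdio.h>

    int main(void)
    {
        short a = 0x4000, b = 0x4000;      /* both represent 0.5 in Q15 */
        int   p = a * b;                   /* 0x10000000 = 0.25 in Q30 */
        int   q = p << 1;                  /* 0x20000000 = 0.25 in Q31 */
        printf("%x %x\n", (unsigned)p, (unsigned)q);
        return 0;
    }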
Signed-off-by: Mans Rullgard --- libavcodec/g723_1.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c index d4158ffef7..45808001b9 100644 --- a/libavcodec/g723_1.c +++ b/libavcodec/g723_1.c @@ -569,13 +569,12 @@ static void get_residual(int16_t *residual, int16_t *prev_excitation, int lag) residual[i] = prev_excitation[offset + (i - 2) % lag]; } -static int dot_product(const int16_t *a, const int16_t *b, int length, - int shift) +static int dot_product(const int16_t *a, const int16_t *b, int length) { int i, sum = 0; for (i = 0; i < length; i++) { - int64_t prod = av_clipl_int32(MUL64(a[i], b[i]) << shift); + int64_t prod = av_clipl_int32(MUL64(a[i], b[i]) << 1); sum = av_clipl_int32(sum + prod); } return sum; @@ -606,7 +605,7 @@ static void gen_acb_excitation(int16_t *vector, int16_t *prev_excitation, /* Calculate adaptive vector */ cb_ptr += subfrm.ad_cb_gain * 20; for (i = 0; i < SUBFRAME_LEN; i++) { - sum = dot_product(residual + i, cb_ptr, PITCH_ORDER, 1); + sum = dot_product(residual + i, cb_ptr, PITCH_ORDER); vector[i] = av_clipl_int32((sum << 1) + (1 << 15)) >> 16; } } @@ -635,7 +634,7 @@ static int autocorr_max(G723_1_Context *p, int offset, int *ccr_max, limit = pitch_lag + 3; for (i = pitch_lag - 3; i <= limit; i++) { - ccr = dot_product(buf, buf + dir * i, length, 1); + ccr = dot_product(buf, buf + dir * i, length); if (ccr > *ccr_max) { *ccr_max = ccr; @@ -734,17 +733,15 @@ static void comp_ppf_coeff(G723_1_Context *p, int offset, int pitch_lag, return; /* Compute target energy */ - energy[0] = dot_product(buf, buf, SUBFRAME_LEN, 1); + energy[0] = dot_product(buf, buf, SUBFRAME_LEN); /* Compute forward residual energy */ if (fwd_lag) - energy[2] = dot_product(buf + fwd_lag, buf + fwd_lag, - SUBFRAME_LEN, 1); + energy[2] = dot_product(buf + fwd_lag, buf + fwd_lag, SUBFRAME_LEN); /* Compute backward residual energy */ if (back_lag) - energy[4] = dot_product(buf - back_lag, buf - back_lag, - SUBFRAME_LEN, 1); + energy[4] = dot_product(buf - back_lag, buf - back_lag, SUBFRAME_LEN); /* Normalize and shorten */ temp1 = 0; @@ -805,15 +802,14 @@ static int comp_interp_index(G723_1_Context *p, int pitch_lag, ccr = av_clipl_int32((int64_t)ccr + (1 << 15)) >> 16; /* Compute target energy */ - tgt_eng = dot_product(buf, buf, SUBFRAME_LEN * 2, 1); + tgt_eng = dot_product(buf, buf, SUBFRAME_LEN * 2); *exc_eng = av_clipl_int32((int64_t)tgt_eng + (1 << 15)) >> 16; if (ccr <= 0) return 0; /* Compute best energy */ - best_eng = dot_product(buf - index, buf - index, - SUBFRAME_LEN * 2, 1); + best_eng = dot_product(buf - index, buf - index, SUBFRAME_LEN * 2); best_eng = av_clipl_int32((int64_t)best_eng + (1 << 15)) >> 16; temp = best_eng * *exc_eng >> 3; @@ -966,8 +962,8 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf) /* Compute auto correlation coefficients */ auto_corr[0] = dot_product(temp_vector, temp_vector + 1, - SUBFRAME_LEN - 1, 1); - auto_corr[1] = dot_product(temp_vector, temp_vector, SUBFRAME_LEN, 1); + SUBFRAME_LEN - 1); + auto_corr[1] = dot_product(temp_vector, temp_vector, SUBFRAME_LEN); /* Compute reflection coefficient */ temp = auto_corr[1] >> 16; -- cgit v1.2.3 From 1eb1f6f281eb6036d363e0317c1500be4a2708f2 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Fri, 10 Aug 2012 16:42:54 +0100 Subject: g723.1: remove useless uses of MUL64() The operands in both cases are 16-bit so cannot overflow a 32-bit destination. 
In gain_scale() the inputs are reduced to 14-bit, so even the shift cannot overflow. Signed-off-by: Mans Rullgard --- libavcodec/g723_1.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c index 45808001b9..f91f629311 100644 --- a/libavcodec/g723_1.c +++ b/libavcodec/g723_1.c @@ -574,7 +574,7 @@ static int dot_product(const int16_t *a, const int16_t *b, int length) int i, sum = 0; for (i = 0; i < length; i++) { - int64_t prod = av_clipl_int32(MUL64(a[i], b[i]) << 1); + int64_t prod = av_clipl_int32((int64_t)(a[i] * b[i]) << 1); sum = av_clipl_int32(sum + prod); } return sum; @@ -889,9 +889,9 @@ static void gain_scale(G723_1_Context *p, int16_t * buf, int energy) num = energy; denom = 0; for (i = 0; i < SUBFRAME_LEN; i++) { - int64_t temp = buf[i] >> 2; - temp = av_clipl_int32(MUL64(temp, temp) << 1); - denom = av_clipl_int32(denom + temp); + int temp = buf[i] >> 2; + temp *= temp; + denom = av_clipl_int32((int64_t)denom + (temp << 1)); } if (num && denom) { -- cgit v1.2.3 From 4aca716a531b0bc1f05c96209cf30577d6e48baa Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Fri, 10 Aug 2012 18:15:41 +0100 Subject: g723.1: optimise scale_vector() Firstly, nothing in this function can overflow 32 bits so the use of a 64-bit type is completely unnecessary. Secondly, the scale is either a power of two or 0x7fff. Doing separate loops for these cases avoids using multiplications. Finally, since only the number of bits, not the actual value, of the maximum value is needed, the bitwise or of all the values serves the purpose while being faster. It is worth noting that even if overflow could happen, it was not handled correctly anyway. Signed-off-by: Mans Rullgard --- libavcodec/g723_1.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c index f91f629311..5be4fe046e 100644 --- a/libavcodec/g723_1.c +++ b/libavcodec/g723_1.c @@ -281,19 +281,21 @@ static int normalize_bits(int num, int width) static int scale_vector(int16_t *vector, int length) { int bits, max = 0; - int64_t scale; int i; for (i = 0; i < length; i++) - max = FFMAX(max, FFABS(vector[i])); + max |= FFABS(vector[i]); max = FFMIN(max, 0x7FFF); bits = normalize_bits(max, 15); - scale = (bits == 15) ? 0x7FFF : (1 << bits); - for (i = 0; i < length; i++) - vector[i] = av_clipl_int32(vector[i] * scale << 1) >> 4; + if (bits == 15) + for (i = 0; i < length; i++) + vector[i] = vector[i] * 0x7fff >> 3; + else + for (i = 0; i < length; i++) + vector[i] = vector[i] << bits >> 3; return bits - 3; } -- cgit v1.2.3 From 47c73a73b0967d54dfcbc6250d861406da7b84b6 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Sat, 11 Aug 2012 01:52:10 +0100 Subject: g723.1: use saturating addition functions Use saturating addition functions instead of 64-bit intermediates and separate clipping. This is much faster when dedicated instructions are available. 
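The pattern being replaced throughout is "widen to 64 bits, add a rounding constant, clip, shift"; a saturating add computes the same clamped sum in one step and can map to a single instruction where one exists. A sketch of the equivalence with placeholder helpers (not the libavutil ones):

    #include <stdint.h>
    #include <stdio.h>

    static int clip32(int64_t v)
    {
        return v > INT32_MAX ? INT32_MAX : v < INT32_MIN ? INT32_MIN : (int)v;
    }

    static int sat_add32(int a, int b)     /* same clamp, packaged as one add */
    {
        return clip32((int64_t)a + b);
    }

    int main(void)
    {
        int x = 0x7fffbfff;                                  /* near INT32_MAX */
        int old_way = clip32((int64_t)x + (1 << 15)) >> 16;  /* widen, clip, shift */
        int new_way = sat_add32(x, 1 << 15) >> 16;           /* saturate, shift */
        printf("%d %d\n", old_way, new_way);                 /* 32767 32767 */
        return 0;
    }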
Signed-off-by: Mans Rullgard --- libavcodec/g723_1.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c index 5be4fe046e..8a394f9340 100644 --- a/libavcodec/g723_1.c +++ b/libavcodec/g723_1.c @@ -394,11 +394,11 @@ static void lsp2lpc(int16_t *lpc) for (j = 0; j < LPC_ORDER; j++) { int index = lpc[j] >> 7; int offset = lpc[j] & 0x7f; - int64_t temp1 = cos_tab[index] << 16; + int temp1 = cos_tab[index] << 16; int temp2 = (cos_tab[index + 1] - cos_tab[index]) * ((offset << 8) + 0x80) << 1; - lpc[j] = -(av_clipl_int32(((temp1 + temp2) << 1) + (1 << 15)) >> 16); + lpc[j] = -(av_sat_dadd32(1 << 15, temp1 + temp2) >> 16); } /* @@ -576,8 +576,8 @@ static int dot_product(const int16_t *a, const int16_t *b, int length) int i, sum = 0; for (i = 0; i < length; i++) { - int64_t prod = av_clipl_int32((int64_t)(a[i] * b[i]) << 1); - sum = av_clipl_int32(sum + prod); + int prod = a[i] * b[i]; + sum = av_sat_dadd32(sum, prod); } return sum; } @@ -594,7 +594,7 @@ static void gen_acb_excitation(int16_t *vector, int16_t *prev_excitation, int lag = pitch_lag + subfrm.ad_cb_lag - 1; int i; - int64_t sum; + int sum; get_residual(residual, prev_excitation, lag); @@ -608,7 +608,7 @@ static void gen_acb_excitation(int16_t *vector, int16_t *prev_excitation, cb_ptr += subfrm.ad_cb_gain * 20; for (i = 0; i < SUBFRAME_LEN; i++) { sum = dot_product(residual + i, cb_ptr, PITCH_ORDER); - vector[i] = av_clipl_int32((sum << 1) + (1 << 15)) >> 16; + vector[i] = av_sat_dadd32(1 << 15, sum) >> 16; } } @@ -660,7 +660,7 @@ static void comp_ppf_gains(int lag, PPFParam *ppf, enum Rate cur_rate, int tgt_eng, int ccr, int res_eng) { int pf_residual; /* square of postfiltered residual */ - int64_t temp1, temp2; + int temp1, temp2; ppf->index = lag; @@ -677,7 +677,7 @@ static void comp_ppf_gains(int lag, PPFParam *ppf, enum Rate cur_rate, /* pf_res^2 = tgt_eng + 2*ccr*gain + res_eng*gain^2 */ temp1 = (tgt_eng << 15) + (ccr * ppf->opt_gain << 1); temp2 = (ppf->opt_gain * ppf->opt_gain >> 15) * res_eng; - pf_residual = av_clipl_int32(temp1 + temp2 + (1 << 15)) >> 16; + pf_residual = av_sat_add32(temp1, temp2 + (1 << 15)) >> 16; if (tgt_eng >= pf_residual << 1) { temp1 = 0x7fff; @@ -801,18 +801,18 @@ static int comp_interp_index(G723_1_Context *p, int pitch_lag, /* Compute maximum backward cross-correlation */ ccr = 0; index = autocorr_max(p, offset, &ccr, pitch_lag, SUBFRAME_LEN * 2, -1); - ccr = av_clipl_int32((int64_t)ccr + (1 << 15)) >> 16; + ccr = av_sat_add32(ccr, 1 << 15) >> 16; /* Compute target energy */ tgt_eng = dot_product(buf, buf, SUBFRAME_LEN * 2); - *exc_eng = av_clipl_int32((int64_t)tgt_eng + (1 << 15)) >> 16; + *exc_eng = av_sat_add32(tgt_eng, 1 << 15) >> 16; if (ccr <= 0) return 0; /* Compute best energy */ best_eng = dot_product(buf - index, buf - index, SUBFRAME_LEN * 2); - best_eng = av_clipl_int32((int64_t)best_eng + (1 << 15)) >> 16; + best_eng = av_sat_add32(best_eng, 1 << 15) >> 16; temp = best_eng * *exc_eng >> 3; @@ -893,7 +893,7 @@ static void gain_scale(G723_1_Context *p, int16_t * buf, int energy) for (i = 0; i < SUBFRAME_LEN; i++) { int temp = buf[i] >> 2; temp *= temp; - denom = av_clipl_int32((int64_t)denom + (temp << 1)); + denom = av_sat_dadd32(denom, temp); } if (num && denom) { @@ -977,9 +977,8 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf) /* Compensation filter */ for (j = 0; j < SUBFRAME_LEN; j++) { - buf_ptr[j] = av_clipl_int32((int64_t)signal_ptr[j] + - ((signal_ptr[j - 1] 
>> 16) * - temp << 1)) >> 16; + buf_ptr[j] = av_sat_dadd32(signal_ptr[j], + (signal_ptr[j - 1] >> 16) * temp) >> 16; } /* Compute normalized signal energy */ -- cgit v1.2.3 From 371610510361f89948b87f57799ee56deab02503 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Sat, 11 Aug 2012 02:43:14 +0100 Subject: g723.1: do not needlessly use int64_t Signed-off-by: Mans Rullgard --- libavcodec/g723_1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c index 8a394f9340..8d5ac71ccd 100644 --- a/libavcodec/g723_1.c +++ b/libavcodec/g723_1.c @@ -710,7 +710,7 @@ static void comp_ppf_coeff(G723_1_Context *p, int offset, int pitch_lag, int16_t scale; int i; - int64_t temp1, temp2; + int temp1, temp2; /* * 0 - target energy -- cgit v1.2.3 From 783da0d6961c6de4ecc3f56fb291738f460782da Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Sat, 11 Aug 2012 12:16:53 +0100 Subject: g723.1: make autocorr_max() work on an arbitrary buffer Signed-off-by: Mans Rullgard --- libavcodec/g723_1.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c index 8d5ac71ccd..e1d8591920 100644 --- a/libavcodec/g723_1.c +++ b/libavcodec/g723_1.c @@ -615,18 +615,17 @@ static void gen_acb_excitation(int16_t *vector, int16_t *prev_excitation, /** * Estimate maximum auto-correlation around pitch lag. * - * @param p the context + * @param buf buffer with offset applied * @param offset offset of the excitation vector * @param ccr_max pointer to the maximum auto-correlation * @param pitch_lag decoded pitch lag * @param length length of autocorrelation * @param dir forward lag(1) / backward lag(-1) */ -static int autocorr_max(G723_1_Context *p, int offset, int *ccr_max, +static int autocorr_max(const int16_t *buf, int offset, int *ccr_max, int pitch_lag, int length, int dir) { int limit, ccr, lag = 0; - int16_t *buf = p->excitation + offset; int i; pitch_lag = FFMIN(PITCH_MAX - 3, pitch_lag); @@ -721,9 +720,9 @@ static void comp_ppf_coeff(G723_1_Context *p, int offset, int pitch_lag, */ int energy[5] = {0, 0, 0, 0, 0}; int16_t *buf = p->excitation + offset; - int fwd_lag = autocorr_max(p, offset, &energy[1], pitch_lag, + int fwd_lag = autocorr_max(buf, offset, &energy[1], pitch_lag, SUBFRAME_LEN, 1); - int back_lag = autocorr_max(p, offset, &energy[3], pitch_lag, + int back_lag = autocorr_max(buf, offset, &energy[3], pitch_lag, SUBFRAME_LEN, -1); ppf->index = 0; @@ -800,7 +799,7 @@ static int comp_interp_index(G723_1_Context *p, int pitch_lag, /* Compute maximum backward cross-correlation */ ccr = 0; - index = autocorr_max(p, offset, &ccr, pitch_lag, SUBFRAME_LEN * 2, -1); + index = autocorr_max(buf, offset, &ccr, pitch_lag, SUBFRAME_LEN * 2, -1); ccr = av_sat_add32(ccr, 1 << 15) >> 16; /* Compute target energy */ -- cgit v1.2.3 From b2af2c4bee7eb9f256a9f4143ca90b020d2c9569 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Sat, 11 Aug 2012 12:21:41 +0100 Subject: g723.1: make scale_vector() output to a separate buffer Signed-off-by: Mans Rullgard --- libavcodec/g723_1.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c index e1d8591920..6f4898cfb9 100644 --- a/libavcodec/g723_1.c +++ b/libavcodec/g723_1.c @@ -278,7 +278,7 @@ static int normalize_bits(int num, int width) /** * Scale vector contents based on the largest of their absolutes. 
*/ -static int scale_vector(int16_t *vector, int length) +static int scale_vector(int16_t *dst, const int16_t *vector, int length) { int bits, max = 0; int i; @@ -292,10 +292,10 @@ static int scale_vector(int16_t *vector, int length) if (bits == 15) for (i = 0; i < length; i++) - vector[i] = vector[i] * 0x7fff >> 3; + dst[i] = vector[i] * 0x7fff >> 3; else for (i = 0; i < length; i++) - vector[i] = vector[i] << bits >> 3; + dst[i] = vector[i] << bits >> 3; return bits - 3; } @@ -791,11 +791,11 @@ static int comp_interp_index(G723_1_Context *p, int pitch_lag, int *exc_eng, int *scale) { int offset = PITCH_MAX + 2 * SUBFRAME_LEN; - int16_t *buf = p->excitation + offset; + const int16_t *buf = p->excitation + offset; int index, ccr, tgt_eng, best_eng, temp; - *scale = scale_vector(p->excitation, FRAME_LEN + PITCH_MAX); + *scale = scale_vector(p->excitation, p->excitation, FRAME_LEN + PITCH_MAX); /* Compute maximum backward cross-correlation */ ccr = 0; @@ -958,8 +958,7 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf) int scale, energy; /* Normalize */ - memcpy(temp_vector, buf_ptr, SUBFRAME_LEN * sizeof(*temp_vector)); - scale = scale_vector(temp_vector, SUBFRAME_LEN); + scale = scale_vector(temp_vector, buf_ptr, SUBFRAME_LEN); /* Compute auto correlation coefficients */ auto_corr[0] = dot_product(temp_vector, temp_vector + 1, -- cgit v1.2.3 From 1953264331ec653bff2e3d85391b91c0ea5299ae Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Sat, 11 Aug 2012 03:39:30 +0100 Subject: g723.1: drop unnecessary variable buf_ptr in formant_postfilter() Signed-off-by: Mans Rullgard --- libavcodec/g723_1.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c index 6f4898cfb9..a2f7dee4f4 100644 --- a/libavcodec/g723_1.c +++ b/libavcodec/g723_1.c @@ -926,7 +926,7 @@ static void gain_scale(G723_1_Context *p, int16_t * buf, int energy) */ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf) { - int16_t filter_coef[2][LPC_ORDER], *buf_ptr; + int16_t filter_coef[2][LPC_ORDER]; int filter_signal[LPC_ORDER + FRAME_LEN], *signal_ptr; int i, j, k; @@ -949,7 +949,7 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf) memcpy(p->iir_mem, filter_signal + FRAME_LEN, LPC_ORDER * sizeof(*p->iir_mem)); - buf_ptr = buf + LPC_ORDER; + buf += LPC_ORDER; signal_ptr = filter_signal + LPC_ORDER; for (i = 0; i < SUBFRAMES; i++) { int16_t temp_vector[SUBFRAME_LEN]; @@ -958,7 +958,7 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf) int scale, energy; /* Normalize */ - scale = scale_vector(temp_vector, buf_ptr, SUBFRAME_LEN); + scale = scale_vector(temp_vector, buf, SUBFRAME_LEN); /* Compute auto correlation coefficients */ auto_corr[0] = dot_product(temp_vector, temp_vector + 1, @@ -975,8 +975,8 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf) /* Compensation filter */ for (j = 0; j < SUBFRAME_LEN; j++) { - buf_ptr[j] = av_sat_dadd32(signal_ptr[j], - (signal_ptr[j - 1] >> 16) * temp) >> 16; + buf[j] = av_sat_dadd32(signal_ptr[j], + (signal_ptr[j - 1] >> 16) * temp) >> 16; } /* Compute normalized signal energy */ @@ -986,9 +986,9 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf) } else energy = auto_corr[1] >> temp; - gain_scale(p, buf_ptr, energy); + gain_scale(p, buf, energy); - buf_ptr += SUBFRAME_LEN; + buf += SUBFRAME_LEN; signal_ptr += SUBFRAME_LEN; } } -- cgit v1.2.3 From 
f645710cf31f6268fbf279f4515e6012dcd11ac2 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Sat, 11 Aug 2012 22:26:38 +0100 Subject: g723.1: make postfilter write directly to output buffer Signed-off-by: Mans Rullgard --- libavcodec/g723_1.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c index a2f7dee4f4..c980ec2f50 100644 --- a/libavcodec/g723_1.c +++ b/libavcodec/g723_1.c @@ -922,9 +922,11 @@ static void gain_scale(G723_1_Context *p, int16_t * buf, int energy) * * @param p the context * @param lpc quantized lpc coefficients - * @param buf output buffer + * @param buf input buffer + * @param dst output buffer */ -static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf) +static void formant_postfilter(G723_1_Context *p, int16_t *lpc, + int16_t *buf, int16_t *dst) { int16_t filter_coef[2][LPC_ORDER]; int filter_signal[LPC_ORDER + FRAME_LEN], *signal_ptr; @@ -952,18 +954,16 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf) buf += LPC_ORDER; signal_ptr = filter_signal + LPC_ORDER; for (i = 0; i < SUBFRAMES; i++) { - int16_t temp_vector[SUBFRAME_LEN]; int temp; int auto_corr[2]; int scale, energy; /* Normalize */ - scale = scale_vector(temp_vector, buf, SUBFRAME_LEN); + scale = scale_vector(dst, buf, SUBFRAME_LEN); /* Compute auto correlation coefficients */ - auto_corr[0] = dot_product(temp_vector, temp_vector + 1, - SUBFRAME_LEN - 1); - auto_corr[1] = dot_product(temp_vector, temp_vector, SUBFRAME_LEN); + auto_corr[0] = dot_product(dst, dst + 1, SUBFRAME_LEN - 1); + auto_corr[1] = dot_product(dst, dst, SUBFRAME_LEN); /* Compute reflection coefficient */ temp = auto_corr[1] >> 16; @@ -975,7 +975,7 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf) /* Compensation filter */ for (j = 0; j < SUBFRAME_LEN; j++) { - buf[j] = av_sat_dadd32(signal_ptr[j], + dst[j] = av_sat_dadd32(signal_ptr[j], (signal_ptr[j - 1] >> 16) * temp) >> 16; } @@ -986,10 +986,11 @@ static void formant_postfilter(G723_1_Context *p, int16_t *lpc, int16_t *buf) } else energy = auto_corr[1] >> temp; - gain_scale(p, buf, energy); + gain_scale(p, dst, energy); buf += SUBFRAME_LEN; signal_ptr += SUBFRAME_LEN; + dst += SUBFRAME_LEN; } } @@ -1136,8 +1137,7 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data, memcpy(p->synth_mem, p->audio + FRAME_LEN, LPC_ORDER * sizeof(*p->audio)); if (p->postfilter) { - formant_postfilter(p, lpc, p->audio); - memcpy(p->frame.data[0], p->audio + LPC_ORDER, FRAME_LEN * 2); + formant_postfilter(p, lpc, p->audio, out); } else { // if output is not postfiltered it should be scaled by 2 for (i = 0; i < FRAME_LEN; i++) out[i] = av_clip_int16(p->audio[LPC_ORDER + i] << 1); -- cgit v1.2.3 From 4b728b4712403058ac4dc45daa8b5c03a688fadf Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Sat, 11 Aug 2012 05:23:59 +0100 Subject: g723.1: avoid unnecessary memcpy() in residual_interp() Signed-off-by: Mans Rullgard --- libavcodec/g723_1.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c index c980ec2f50..050278d16c 100644 --- a/libavcodec/g723_1.c +++ b/libavcodec/g723_1.c @@ -838,10 +838,9 @@ static void residual_interp(int16_t *buf, int16_t *out, int lag, int16_t *vector_ptr = buf + PITCH_MAX; /* Attenuate */ for (i = 0; i < lag; i++) - vector_ptr[i - lag] = vector_ptr[i - lag] * 3 >> 2; - av_memcpy_backptr((uint8_t*)vector_ptr, lag * 
sizeof(*vector_ptr), - FRAME_LEN * sizeof(*vector_ptr)); - memcpy(out, vector_ptr, FRAME_LEN * sizeof(*vector_ptr)); + out[i] = vector_ptr[i - lag] * 3 >> 2; + av_memcpy_backptr((uint8_t*)(out + lag), lag * sizeof(*out), + (FRAME_LEN - lag) * sizeof(*out)); } else { /* Unvoiced */ for (i = 0; i < FRAME_LEN; i++) { *rseed = *rseed * 521 + 259; @@ -1100,23 +1099,31 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data, ppf[j].opt_gain, 1 << 14, 15, SUBFRAME_LEN); + /* Save the excitation for the next frame */ + memcpy(p->prev_excitation, p->excitation + FRAME_LEN, + PITCH_MAX * sizeof(*p->excitation)); } else { p->interp_gain = (p->interp_gain * 3 + 2) >> 2; if (p->erased_frames == 3) { /* Mute output */ memset(p->excitation, 0, (FRAME_LEN + PITCH_MAX) * sizeof(*p->excitation)); + memset(p->prev_excitation, 0, + PITCH_MAX * sizeof(*p->excitation)); memset(p->frame.data[0], 0, (FRAME_LEN + LPC_ORDER) * sizeof(int16_t)); } else { + int16_t *buf = p->audio + LPC_ORDER; + /* Regenerate frame */ - residual_interp(p->excitation, p->audio + LPC_ORDER, p->interp_index, + residual_interp(p->excitation, buf, p->interp_index, p->interp_gain, &p->random_seed); + + /* Save the excitation for the next frame */ + memcpy(p->prev_excitation, buf + (FRAME_LEN - PITCH_MAX), + PITCH_MAX * sizeof(*p->excitation)); } } - /* Save the excitation for the next frame */ - memcpy(p->prev_excitation, p->excitation + FRAME_LEN, - PITCH_MAX * sizeof(*p->excitation)); } else { memset(out, 0, FRAME_LEN * 2); av_log(avctx, AV_LOG_WARNING, -- cgit v1.2.3 From 35b533e4dede0b1abe3ddbe927c893819006ee75 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Sat, 11 Aug 2012 19:59:08 +0100 Subject: g723.1: avoid saving/restoring excitation Writing the scaled excitation to a scratch buffer (borrowing the 'audio' array) instead of modifying it in place avoids the need to save and restore the unscaled values. 
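The buffer-handling idea is simple enough to sketch with toy code; every name below is a hypothetical stand-in rather than one of the decoder's real fields or helpers:

    #include <stdint.h>
    #include <string.h>

    #define LEN 16

    /* Stand-in for the real scaling: write a scaled copy of src into dst. */
    static void scale_into(int16_t *dst, const int16_t *src, int n)
    {
        int i;
        for (i = 0; i < n; i++)
            dst[i] = src[i] >> 2;
    }

    /* Before: scale in place, so the original must be saved and restored. */
    static void in_place(int16_t *exc)
    {
        int16_t saved[LEN];
        memcpy(saved, exc, sizeof(saved));   /* save                    */
        scale_into(exc, exc, LEN);           /* clobber the excitation  */
        /* ... use the scaled values ... */
        memcpy(exc, saved, sizeof(saved));   /* restore                 */
    }

    /* After: scale into scratch space; the excitation is never modified. */
    static void via_scratch(const int16_t *exc, int16_t *scratch)
    {
        scale_into(scratch, exc, LEN);
        /* ... use scratch; no save/restore needed ... */
    }

In the patch the scratch space is the existing 'audio' array, enlarged by PITCH_MAX entries so it can also hold the scaled excitation.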
Signed-off-by: Mans Rullgard --- libavcodec/g723_1.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c index 050278d16c..c0cea192c2 100644 --- a/libavcodec/g723_1.c +++ b/libavcodec/g723_1.c @@ -99,7 +99,7 @@ typedef struct g723_1_context { int pf_gain; int postfilter; - int16_t audio[FRAME_LEN + LPC_ORDER]; + int16_t audio[FRAME_LEN + LPC_ORDER + PITCH_MAX]; } G723_1_Context; static av_cold int g723_1_decode_init(AVCodecContext *avctx) @@ -719,7 +719,7 @@ static void comp_ppf_coeff(G723_1_Context *p, int offset, int pitch_lag, * 4 - backward residual energy */ int energy[5] = {0, 0, 0, 0, 0}; - int16_t *buf = p->excitation + offset; + int16_t *buf = p->audio + LPC_ORDER + offset; int fwd_lag = autocorr_max(buf, offset, &energy[1], pitch_lag, SUBFRAME_LEN, 1); int back_lag = autocorr_max(buf, offset, &energy[3], pitch_lag, @@ -791,11 +791,12 @@ static int comp_interp_index(G723_1_Context *p, int pitch_lag, int *exc_eng, int *scale) { int offset = PITCH_MAX + 2 * SUBFRAME_LEN; - const int16_t *buf = p->excitation + offset; + int16_t *buf = p->audio + LPC_ORDER; int index, ccr, tgt_eng, best_eng, temp; - *scale = scale_vector(p->excitation, p->excitation, FRAME_LEN + PITCH_MAX); + *scale = scale_vector(buf, p->excitation, FRAME_LEN + PITCH_MAX); + buf += offset; /* Compute maximum backward cross-correlation */ ccr = 0; @@ -1008,6 +1009,7 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data, int16_t *vector_ptr; int16_t *out; int bad_frame = 0, i, j, ret; + int16_t *audio = p->audio; if (buf_size < frame_size[dec_mode]) { if (buf_size) @@ -1071,26 +1073,16 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data, vector_ptr = p->excitation + PITCH_MAX; - /* Save the excitation */ - memcpy(p->audio + LPC_ORDER, vector_ptr, FRAME_LEN * sizeof(*p->audio)); - p->interp_index = comp_interp_index(p, p->pitch_lag[1], &p->sid_gain, &p->cur_gain); + /* Peform pitch postfiltering */ if (p->postfilter) { i = PITCH_MAX; for (j = 0; j < SUBFRAMES; i += SUBFRAME_LEN, j++) comp_ppf_coeff(p, i, p->pitch_lag[j >> 1], ppf + j, p->cur_rate); - } - - /* Restore the original excitation */ - memcpy(p->excitation, p->prev_excitation, - PITCH_MAX * sizeof(*p->excitation)); - memcpy(vector_ptr, p->audio + LPC_ORDER, FRAME_LEN * sizeof(*vector_ptr)); - /* Peform pitch postfiltering */ - if (p->postfilter) for (i = 0, j = 0; j < SUBFRAMES; i += SUBFRAME_LEN, j++) ff_acelp_weighted_vector_sum(p->audio + LPC_ORDER + i, vector_ptr + i, @@ -1098,6 +1090,9 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data, ppf[j].sc_gain, ppf[j].opt_gain, 1 << 14, 15, SUBFRAME_LEN); + } else { + audio = vector_ptr - LPC_ORDER; + } /* Save the excitation for the next frame */ memcpy(p->prev_excitation, p->excitation + FRAME_LEN, @@ -1139,7 +1134,7 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data, memcpy(p->audio, p->synth_mem, LPC_ORDER * sizeof(*p->audio)); for (i = LPC_ORDER, j = 0; j < SUBFRAMES; i += SUBFRAME_LEN, j++) ff_celp_lp_synthesis_filter(p->audio + i, &lpc[j * LPC_ORDER], - p->audio + i, SUBFRAME_LEN, LPC_ORDER, + audio + i, SUBFRAME_LEN, LPC_ORDER, 0, 1, 1 << 12); memcpy(p->synth_mem, p->audio + FRAME_LEN, LPC_ORDER * sizeof(*p->audio)); -- cgit v1.2.3 From cbcf1b411fac6aabf731e14e4f48bca3d956f868 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Sat, 11 Aug 2012 20:19:39 +0100 Subject: g723.1: declare a variable in the block it is used Signed-off-by: Mans Rullgard --- 
libavcodec/g723_1.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c index c0cea192c2..657c144895 100644 --- a/libavcodec/g723_1.c +++ b/libavcodec/g723_1.c @@ -1006,7 +1006,6 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data, int16_t cur_lsp[LPC_ORDER]; int16_t lpc[SUBFRAMES * LPC_ORDER]; int16_t acb_vector[SUBFRAME_LEN]; - int16_t *vector_ptr; int16_t *out; int bad_frame = 0, i, j, ret; int16_t *audio = p->audio; @@ -1051,8 +1050,9 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data, /* Generate the excitation for the frame */ memcpy(p->excitation, p->prev_excitation, PITCH_MAX * sizeof(*p->excitation)); - vector_ptr = p->excitation + PITCH_MAX; if (!p->erased_frames) { + int16_t *vector_ptr = p->excitation + PITCH_MAX; + /* Update interpolation gain memory */ p->interp_gain = fixed_cb_gain[(p->subframe[2].amp_index + p->subframe[3].amp_index) >> 1]; -- cgit v1.2.3 From 138914dcd83132f6edc6f1799c5a17e0b6b559bb Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Sat, 11 Aug 2012 21:00:21 +0100 Subject: g723.1: do not bounce intermediate values via memory Although a reasonable compiler will probably optimise out the actual store and load, this operation still implies a truncation to 16 bits which the compiler will probably not realise is not necessary here. Signed-off-by: Mans Rullgard --- libavcodec/g723_1.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c index 657c144895..4c1c4dad66 100644 --- a/libavcodec/g723_1.c +++ b/libavcodec/g723_1.c @@ -1064,9 +1064,8 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data, p->cur_rate); /* Get the total excitation */ for (j = 0; j < SUBFRAME_LEN; j++) { - vector_ptr[j] = av_clip_int16(vector_ptr[j] << 1); - vector_ptr[j] = av_clip_int16(vector_ptr[j] + - acb_vector[j]); + int v = av_clip_int16(vector_ptr[j] << 1); + vector_ptr[j] = av_clip_int16(v + acb_vector[j]); } vector_ptr += SUBFRAME_LEN; } -- cgit v1.2.3 From 69665bd6f40f02ecf822f80c05dd2765da2dfa7b Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Sat, 11 Aug 2012 21:04:14 +0100 Subject: g723.1: do not pass large structs by value Signed-off-by: Mans Rullgard --- libavcodec/g723_1.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/libavcodec/g723_1.c b/libavcodec/g723_1.c index 4c1c4dad66..37b7ff5f09 100644 --- a/libavcodec/g723_1.c +++ b/libavcodec/g723_1.c @@ -498,7 +498,7 @@ static void gen_dirac_train(int16_t *buf, int pitch_lag) * @param pitch_lag closed loop pitch lag * @param index current subframe index */ -static void gen_fcb_excitation(int16_t *vector, G723_1_Subframe subfrm, +static void gen_fcb_excitation(int16_t *vector, G723_1_Subframe *subfrm, enum Rate cur_rate, int pitch_lag, int index) { int temp, i, j; @@ -506,34 +506,34 @@ static void gen_fcb_excitation(int16_t *vector, G723_1_Subframe subfrm, memset(vector, 0, SUBFRAME_LEN * sizeof(*vector)); if (cur_rate == RATE_6300) { - if (subfrm.pulse_pos >= max_pos[index]) + if (subfrm->pulse_pos >= max_pos[index]) return; /* Decode amplitudes and positions */ j = PULSE_MAX - pulses[index]; - temp = subfrm.pulse_pos; + temp = subfrm->pulse_pos; for (i = 0; i < SUBFRAME_LEN / GRID_SIZE; i++) { temp -= combinatorial_table[j][i]; if (temp >= 0) continue; temp += combinatorial_table[j++][i]; - if (subfrm.pulse_sign & (1 << (PULSE_MAX - j))) { - vector[subfrm.grid_index + GRID_SIZE * i] = - 
-fixed_cb_gain[subfrm.amp_index]; + if (subfrm->pulse_sign & (1 << (PULSE_MAX - j))) { + vector[subfrm->grid_index + GRID_SIZE * i] = + -fixed_cb_gain[subfrm->amp_index]; } else { - vector[subfrm.grid_index + GRID_SIZE * i] = - fixed_cb_gain[subfrm.amp_index]; + vector[subfrm->grid_index + GRID_SIZE * i] = + fixed_cb_gain[subfrm->amp_index]; } if (j == PULSE_MAX) break; } - if (subfrm.dirac_train == 1) + if (subfrm->dirac_train == 1) gen_dirac_train(vector, pitch_lag); } else { /* 5300 bps */ - int cb_gain = fixed_cb_gain[subfrm.amp_index]; - int cb_shift = subfrm.grid_index; - int cb_sign = subfrm.pulse_sign; - int cb_pos = subfrm.pulse_pos; + int cb_gain = fixed_cb_gain[subfrm->amp_index]; + int cb_shift = subfrm->grid_index; + int cb_sign = subfrm->pulse_sign; + int cb_pos = subfrm->pulse_pos; int offset, beta, lag; for (i = 0; i < 8; i += 2) { @@ -544,9 +544,9 @@ static void gen_fcb_excitation(int16_t *vector, G723_1_Subframe subfrm, } /* Enhance harmonic components */ - lag = pitch_contrib[subfrm.ad_cb_gain << 1] + pitch_lag + - subfrm.ad_cb_lag - 1; - beta = pitch_contrib[(subfrm.ad_cb_gain << 1) + 1]; + lag = pitch_contrib[subfrm->ad_cb_gain << 1] + pitch_lag + + subfrm->ad_cb_lag - 1; + beta = pitch_contrib[(subfrm->ad_cb_gain << 1) + 1]; if (lag < SUBFRAME_LEN - 2) { for (i = lag; i < SUBFRAME_LEN; i++) @@ -586,12 +586,12 @@ static int dot_product(const int16_t *a, const int16_t *b, int length) * Generate adaptive codebook excitation. */ static void gen_acb_excitation(int16_t *vector, int16_t *prev_excitation, - int pitch_lag, G723_1_Subframe subfrm, + int pitch_lag, G723_1_Subframe *subfrm, enum Rate cur_rate) { int16_t residual[SUBFRAME_LEN + PITCH_ORDER - 1]; const int16_t *cb_ptr; - int lag = pitch_lag + subfrm.ad_cb_lag - 1; + int lag = pitch_lag + subfrm->ad_cb_lag - 1; int i; int sum; @@ -605,7 +605,7 @@ static void gen_acb_excitation(int16_t *vector, int16_t *prev_excitation, cb_ptr = adaptive_cb_gain170; /* Calculate adaptive vector */ - cb_ptr += subfrm.ad_cb_gain * 20; + cb_ptr += subfrm->ad_cb_gain * 20; for (i = 0; i < SUBFRAME_LEN; i++) { sum = dot_product(residual + i, cb_ptr, PITCH_ORDER); vector[i] = av_sat_dadd32(1 << 15, sum) >> 16; @@ -1057,10 +1057,10 @@ static int g723_1_decode_frame(AVCodecContext *avctx, void *data, p->interp_gain = fixed_cb_gain[(p->subframe[2].amp_index + p->subframe[3].amp_index) >> 1]; for (i = 0; i < SUBFRAMES; i++) { - gen_fcb_excitation(vector_ptr, p->subframe[i], p->cur_rate, + gen_fcb_excitation(vector_ptr, &p->subframe[i], p->cur_rate, p->pitch_lag[i >> 1], i); gen_acb_excitation(acb_vector, &p->excitation[SUBFRAME_LEN * i], - p->pitch_lag[i >> 1], p->subframe[i], + p->pitch_lag[i >> 1], &p->subframe[i], p->cur_rate); /* Get the total excitation */ for (j = 0; j < SUBFRAME_LEN; j++) { -- cgit v1.2.3
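To make the last change above concrete, here is a minimal sketch of by-value versus by-pointer parameter passing; the struct is invented for the example and only loosely mirrors the G723_1_Subframe fields visible in the diffs:

    #include <stdint.h>

    typedef struct {
        int ad_cb_lag, ad_cb_gain, dirac_train;
        int pulse_sign, grid_index, amp_index, pulse_pos;
    } Subframe;

    /* By value: the whole struct is copied onto the stack at each call. */
    static int acb_offset_by_value(Subframe s)
    {
        return s.ad_cb_gain * 20;
    }

    /* By pointer, as in the patch: only an address is passed and the
     * callee reads the caller's copy directly. */
    static int acb_offset_by_ptr(const Subframe *s)
    {
        return s->ad_cb_gain * 20;
    }

Passing a pointer costs one machine word per call regardless of how the struct grows.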