From e66240f22e240b0f0d970d1b138db80ceb517097 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Fri, 1 Feb 2013 13:14:31 +0100 Subject: avfilter: x86: consistent filenames for filter optimizations --- libavfilter/x86/Makefile | 8 +- libavfilter/x86/gradfun.c | 190 ------------------------------ libavfilter/x86/hqdn3d.asm | 106 ----------------- libavfilter/x86/vf_gradfun.c | 190 ++++++++++++++++++++++++++++++ libavfilter/x86/vf_hqdn3d.asm | 106 +++++++++++++++++ libavfilter/x86/vf_yadif.asm | 254 ++++++++++++++++++++++++++++++++++++++++ libavfilter/x86/vf_yadif_init.c | 54 +++++++++ libavfilter/x86/yadif.asm | 254 ---------------------------------------- libavfilter/x86/yadif_init.c | 54 --------- 9 files changed, 608 insertions(+), 608 deletions(-) delete mode 100644 libavfilter/x86/gradfun.c delete mode 100644 libavfilter/x86/hqdn3d.asm create mode 100644 libavfilter/x86/vf_gradfun.c create mode 100644 libavfilter/x86/vf_hqdn3d.asm create mode 100644 libavfilter/x86/vf_yadif.asm create mode 100644 libavfilter/x86/vf_yadif_init.c delete mode 100644 libavfilter/x86/yadif.asm delete mode 100644 libavfilter/x86/yadif_init.c (limited to 'libavfilter/x86') diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index af5a9998b5..59cefe8988 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -1,8 +1,8 @@ -OBJS-$(CONFIG_GRADFUN_FILTER) += x86/gradfun.o +OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o -OBJS-$(CONFIG_YADIF_FILTER) += x86/yadif_init.o +OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o -YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/hqdn3d.o +YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o YASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o -YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/yadif.o +YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o diff --git a/libavfilter/x86/gradfun.c b/libavfilter/x86/gradfun.c deleted file mode 100644 index b4ca86c617..0000000000 --- a/libavfilter/x86/gradfun.c +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright (C) 2009 Loren Merritt - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/mem.h" -#include "libavutil/x86/asm.h" -#include "libavfilter/gradfun.h" - -#if HAVE_INLINE_ASM - -DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = {0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F}; -DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF}; - -#if HAVE_MMXEXT_INLINE -static void gradfun_filter_line_mmxext(uint8_t *dst, uint8_t *src, uint16_t *dc, - int width, int thresh, - const uint16_t *dithers) -{ - intptr_t x; - if (width & 3) { - x = width & ~3; - ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers); - width = x; - } - x = -width; - __asm__ volatile( - "movd %4, %%mm5 \n" - "pxor %%mm7, %%mm7 \n" - "pshufw $0, %%mm5, %%mm5 \n" - "movq %6, %%mm6 \n" - "movq %5, %%mm4 \n" - "1: \n" - "movd (%2,%0), %%mm0 \n" - "movd (%3,%0), %%mm1 \n" - "punpcklbw %%mm7, %%mm0 \n" - "punpcklwd %%mm1, %%mm1 \n" - "psllw $7, %%mm0 \n" - "pxor %%mm2, %%mm2 \n" - "psubw %%mm0, %%mm1 \n" // delta = dc - pix - "psubw %%mm1, %%mm2 \n" - "pmaxsw %%mm1, %%mm2 \n" - "pmulhuw %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16 - "psubw %%mm6, %%mm2 \n" - "pminsw %%mm7, %%mm2 \n" // m = -max(0, 127-m) - "pmullw %%mm2, %%mm2 \n" - "paddw %%mm4, %%mm0 \n" // pix += dither - "pmulhw %%mm2, %%mm1 \n" - "psllw $2, %%mm1 \n" // m = m*m*delta >> 14 - "paddw %%mm1, %%mm0 \n" // pix += m - "psraw $7, %%mm0 \n" - "packuswb %%mm0, %%mm0 \n" - "movd %%mm0, (%1,%0) \n" // dst = clip(pix>>7) - "add $4, %0 \n" - "jl 1b \n" - "emms \n" - :"+r"(x) - :"r"(dst+width), "r"(src+width), "r"(dc+width/2), - "rm"(thresh), "m"(*dithers), "m"(*pw_7f) - :"memory" - ); -} -#endif - -#if HAVE_SSSE3_INLINE -static void gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers) -{ - intptr_t x; - if (width & 7) { - // could be 10% faster if I somehow eliminated this - x = width & ~7; - ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers); - width = x; - } - x = -width; - __asm__ volatile( - "movd %4, %%xmm5 \n" - "pxor %%xmm7, %%xmm7 \n" - "pshuflw $0,%%xmm5, %%xmm5 \n" - "movdqa %6, %%xmm6 \n" - "punpcklqdq %%xmm5, %%xmm5 \n" - "movdqa %5, %%xmm4 \n" - "1: \n" - "movq (%2,%0), %%xmm0 \n" - "movq (%3,%0), %%xmm1 \n" - "punpcklbw %%xmm7, %%xmm0 \n" - "punpcklwd %%xmm1, %%xmm1 \n" - "psllw $7, %%xmm0 \n" - "psubw %%xmm0, %%xmm1 \n" // delta = dc - pix - "pabsw %%xmm1, %%xmm2 \n" - "pmulhuw %%xmm5, %%xmm2 \n" // m = abs(delta) * thresh >> 16 - "psubw %%xmm6, %%xmm2 \n" - "pminsw %%xmm7, %%xmm2 \n" // m = -max(0, 127-m) - "pmullw %%xmm2, %%xmm2 \n" - "psllw $1, %%xmm2 \n" - "paddw %%xmm4, %%xmm0 \n" // pix += dither - "pmulhrsw %%xmm2, %%xmm1 \n" // m = m*m*delta >> 14 - "paddw %%xmm1, %%xmm0 \n" // pix += m - "psraw $7, %%xmm0 \n" - "packuswb %%xmm0, %%xmm0 \n" - "movq %%xmm0, (%1,%0) \n" // dst = clip(pix>>7) - "add $8, %0 \n" - "jl 1b \n" - :"+&r"(x) - :"r"(dst+width), "r"(src+width), "r"(dc+width/2), - "rm"(thresh), "m"(*dithers), "m"(*pw_7f) - :"memory" - ); -} -#endif /* HAVE_SSSE3_INLINE */ - -#if HAVE_SSE2_INLINE -static void gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width) -{ -#define BLURV(load)\ - intptr_t x = -2*width;\ - __asm__ volatile(\ - "movdqa %6, %%xmm7 \n"\ - "1: \n"\ - load" (%4,%0), %%xmm0 \n"\ - load" (%5,%0), %%xmm1 \n"\ - "movdqa %%xmm0, %%xmm2 \n"\ - "movdqa %%xmm1, %%xmm3 \n"\ - "psrlw $8, %%xmm0 \n"\ - "psrlw $8, %%xmm1 \n"\ - "pand %%xmm7, %%xmm2 \n"\ - "pand %%xmm7, %%xmm3 \n"\ - "paddw %%xmm1, %%xmm0 \n"\ - "paddw %%xmm3, %%xmm2 \n"\ - "paddw %%xmm2, %%xmm0 \n"\ - "paddw (%2,%0), %%xmm0 \n"\ - "movdqa (%1,%0), %%xmm1 \n"\ - "movdqa %%xmm0, (%1,%0) \n"\ - "psubw %%xmm1, %%xmm0 \n"\ - "movdqa %%xmm0, (%3,%0) \n"\ - "add $16, %0 \n"\ - "jl 1b \n"\ - :"+&r"(x)\ - :"r"(buf+width),\ - "r"(buf1+width),\ - "r"(dc+width),\ - "r"(src+width*2),\ - "r"(src+width*2+src_linesize),\ - "m"(*pw_ff)\ - :"memory"\ - ); - if (((intptr_t) src | src_linesize) & 15) { - BLURV("movdqu"); - } else { - BLURV("movdqa"); - } -} -#endif /* HAVE_SSE2_INLINE */ - -#endif /* HAVE_INLINE_ASM */ - -av_cold void ff_gradfun_init_x86(GradFunContext *gf) -{ - int cpu_flags = av_get_cpu_flags(); - -#if HAVE_MMXEXT_INLINE - if (cpu_flags & AV_CPU_FLAG_MMXEXT) - gf->filter_line = gradfun_filter_line_mmxext; -#endif -#if HAVE_SSSE3_INLINE - if (cpu_flags & AV_CPU_FLAG_SSSE3) - gf->filter_line = gradfun_filter_line_ssse3; -#endif -#if HAVE_SSE2_INLINE - if (cpu_flags & AV_CPU_FLAG_SSE2) - gf->blur_line = gradfun_blur_line_sse2; -#endif -} diff --git a/libavfilter/x86/hqdn3d.asm b/libavfilter/x86/hqdn3d.asm deleted file mode 100644 index dee2c96131..0000000000 --- a/libavfilter/x86/hqdn3d.asm +++ /dev/null @@ -1,106 +0,0 @@ -;****************************************************************************** -;* Copyright (c) 2012 Loren Merritt -;* -;* This file is part of Libav. -;* -;* Libav is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* Libav is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION .text - -%macro LOWPASS 3 ; prevsample, cursample, lut - sub %1q, %2q -%if lut_bits != 8 - sar %1q, 8-lut_bits -%endif - movsx %1d, word [%3q+%1q*2] - add %1d, %2d -%endmacro - -%macro LOAD 3 ; dstreg, x, bitdepth -%if %3 == 8 - movzx %1, byte [srcq+%2] -%else - movzx %1, word [srcq+(%2)*2] -%endif -%if %3 != 16 - shl %1, 16-%3 -%endif -%endmacro - -%macro HQDN3D_ROW 1 ; bitdepth -%if ARCH_X86_64 -cglobal hqdn3d_row_%1_x86, 7,10,0, src, dst, lineant, frameant, width, spatial, temporal, pixelant, t0, t1 -%else -cglobal hqdn3d_row_%1_x86, 7,7,0, src, dst, lineant, frameant, width, spatial, temporal -%endif - %assign bytedepth (%1+7)>>3 - %assign lut_bits 4+4*(%1/16) - dec widthq - lea srcq, [srcq+widthq*bytedepth] - lea dstq, [dstq+widthq*bytedepth] - lea frameantq, [frameantq+widthq*2] - lea lineantq, [lineantq+widthq*2] - neg widthq - %define xq widthq -%if ARCH_X86_32 - mov dstmp, dstq - mov srcmp, srcq - mov frameantmp, frameantq - mov lineantmp, lineantq - %define dstq r0 - %define frameantq r0 - %define lineantq r0 - %define pixelantq r1 - %define pixelantd r1d - DECLARE_REG_TMP 2,3 -%endif - LOAD pixelantd, xq, %1 -ALIGN 16 -.loop: - movifnidn srcq, srcmp - LOAD t0d, xq+1, %1 ; skip on the last iteration to avoid overread -.loop2: - movifnidn lineantq, lineantmp - movzx t1d, word [lineantq+xq*2] - LOWPASS t1, pixelant, spatial - mov [lineantq+xq*2], t1w - LOWPASS pixelant, t0, spatial - movifnidn frameantq, frameantmp - movzx t0d, word [frameantq+xq*2] - LOWPASS t0, t1, temporal - mov [frameantq+xq*2], t0w - movifnidn dstq, dstmp -%if %1 != 16 - add t0d, (1<<(15-%1))-1 - shr t0d, 16-%1 ; could eliminate this by storing from t0h, but only with some contraints on register allocation -%endif -%if %1 == 8 - mov [dstq+xq], t0b -%else - mov [dstq+xq*2], t0w -%endif - inc xq - jl .loop - je .loop2 - REP_RET -%endmacro ; HQDN3D_ROW - -HQDN3D_ROW 8 -HQDN3D_ROW 9 -HQDN3D_ROW 10 -HQDN3D_ROW 16 diff --git a/libavfilter/x86/vf_gradfun.c b/libavfilter/x86/vf_gradfun.c new file mode 100644 index 0000000000..b4ca86c617 --- /dev/null +++ b/libavfilter/x86/vf_gradfun.c @@ -0,0 +1,190 @@ +/* + * Copyright (C) 2009 Loren Merritt + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/mem.h" +#include "libavutil/x86/asm.h" +#include "libavfilter/gradfun.h" + +#if HAVE_INLINE_ASM + +DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = {0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F}; +DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF}; + +#if HAVE_MMXEXT_INLINE +static void gradfun_filter_line_mmxext(uint8_t *dst, uint8_t *src, uint16_t *dc, + int width, int thresh, + const uint16_t *dithers) +{ + intptr_t x; + if (width & 3) { + x = width & ~3; + ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers); + width = x; + } + x = -width; + __asm__ volatile( + "movd %4, %%mm5 \n" + "pxor %%mm7, %%mm7 \n" + "pshufw $0, %%mm5, %%mm5 \n" + "movq %6, %%mm6 \n" + "movq %5, %%mm4 \n" + "1: \n" + "movd (%2,%0), %%mm0 \n" + "movd (%3,%0), %%mm1 \n" + "punpcklbw %%mm7, %%mm0 \n" + "punpcklwd %%mm1, %%mm1 \n" + "psllw $7, %%mm0 \n" + "pxor %%mm2, %%mm2 \n" + "psubw %%mm0, %%mm1 \n" // delta = dc - pix + "psubw %%mm1, %%mm2 \n" + "pmaxsw %%mm1, %%mm2 \n" + "pmulhuw %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16 + "psubw %%mm6, %%mm2 \n" + "pminsw %%mm7, %%mm2 \n" // m = -max(0, 127-m) + "pmullw %%mm2, %%mm2 \n" + "paddw %%mm4, %%mm0 \n" // pix += dither + "pmulhw %%mm2, %%mm1 \n" + "psllw $2, %%mm1 \n" // m = m*m*delta >> 14 + "paddw %%mm1, %%mm0 \n" // pix += m + "psraw $7, %%mm0 \n" + "packuswb %%mm0, %%mm0 \n" + "movd %%mm0, (%1,%0) \n" // dst = clip(pix>>7) + "add $4, %0 \n" + "jl 1b \n" + "emms \n" + :"+r"(x) + :"r"(dst+width), "r"(src+width), "r"(dc+width/2), + "rm"(thresh), "m"(*dithers), "m"(*pw_7f) + :"memory" + ); +} +#endif + +#if HAVE_SSSE3_INLINE +static void gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers) +{ + intptr_t x; + if (width & 7) { + // could be 10% faster if I somehow eliminated this + x = width & ~7; + ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers); + width = x; + } + x = -width; + __asm__ volatile( + "movd %4, %%xmm5 \n" + "pxor %%xmm7, %%xmm7 \n" + "pshuflw $0,%%xmm5, %%xmm5 \n" + "movdqa %6, %%xmm6 \n" + "punpcklqdq %%xmm5, %%xmm5 \n" + "movdqa %5, %%xmm4 \n" + "1: \n" + "movq (%2,%0), %%xmm0 \n" + "movq (%3,%0), %%xmm1 \n" + "punpcklbw %%xmm7, %%xmm0 \n" + "punpcklwd %%xmm1, %%xmm1 \n" + "psllw $7, %%xmm0 \n" + "psubw %%xmm0, %%xmm1 \n" // delta = dc - pix + "pabsw %%xmm1, %%xmm2 \n" + "pmulhuw %%xmm5, %%xmm2 \n" // m = abs(delta) * thresh >> 16 + "psubw %%xmm6, %%xmm2 \n" + "pminsw %%xmm7, %%xmm2 \n" // m = -max(0, 127-m) + "pmullw %%xmm2, %%xmm2 \n" + "psllw $1, %%xmm2 \n" + "paddw %%xmm4, %%xmm0 \n" // pix += dither + "pmulhrsw %%xmm2, %%xmm1 \n" // m = m*m*delta >> 14 + "paddw %%xmm1, %%xmm0 \n" // pix += m + "psraw $7, %%xmm0 \n" + "packuswb %%xmm0, %%xmm0 \n" + "movq %%xmm0, (%1,%0) \n" // dst = clip(pix>>7) + "add $8, %0 \n" + "jl 1b \n" + :"+&r"(x) + :"r"(dst+width), "r"(src+width), "r"(dc+width/2), + "rm"(thresh), "m"(*dithers), "m"(*pw_7f) + :"memory" + ); +} +#endif /* HAVE_SSSE3_INLINE */ + +#if HAVE_SSE2_INLINE +static void gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width) +{ +#define BLURV(load)\ + intptr_t x = -2*width;\ + __asm__ volatile(\ + "movdqa %6, %%xmm7 \n"\ + "1: \n"\ + load" (%4,%0), %%xmm0 \n"\ + load" (%5,%0), %%xmm1 \n"\ + "movdqa %%xmm0, %%xmm2 \n"\ + "movdqa %%xmm1, %%xmm3 \n"\ + "psrlw $8, %%xmm0 \n"\ + "psrlw $8, %%xmm1 \n"\ + "pand %%xmm7, %%xmm2 \n"\ + "pand %%xmm7, %%xmm3 \n"\ + "paddw %%xmm1, %%xmm0 \n"\ + "paddw %%xmm3, %%xmm2 \n"\ + "paddw %%xmm2, %%xmm0 \n"\ + "paddw (%2,%0), %%xmm0 \n"\ + "movdqa (%1,%0), %%xmm1 \n"\ + "movdqa %%xmm0, (%1,%0) \n"\ + "psubw %%xmm1, %%xmm0 \n"\ + "movdqa %%xmm0, (%3,%0) \n"\ + "add $16, %0 \n"\ + "jl 1b \n"\ + :"+&r"(x)\ + :"r"(buf+width),\ + "r"(buf1+width),\ + "r"(dc+width),\ + "r"(src+width*2),\ + "r"(src+width*2+src_linesize),\ + "m"(*pw_ff)\ + :"memory"\ + ); + if (((intptr_t) src | src_linesize) & 15) { + BLURV("movdqu"); + } else { + BLURV("movdqa"); + } +} +#endif /* HAVE_SSE2_INLINE */ + +#endif /* HAVE_INLINE_ASM */ + +av_cold void ff_gradfun_init_x86(GradFunContext *gf) +{ + int cpu_flags = av_get_cpu_flags(); + +#if HAVE_MMXEXT_INLINE + if (cpu_flags & AV_CPU_FLAG_MMXEXT) + gf->filter_line = gradfun_filter_line_mmxext; +#endif +#if HAVE_SSSE3_INLINE + if (cpu_flags & AV_CPU_FLAG_SSSE3) + gf->filter_line = gradfun_filter_line_ssse3; +#endif +#if HAVE_SSE2_INLINE + if (cpu_flags & AV_CPU_FLAG_SSE2) + gf->blur_line = gradfun_blur_line_sse2; +#endif +} diff --git a/libavfilter/x86/vf_hqdn3d.asm b/libavfilter/x86/vf_hqdn3d.asm new file mode 100644 index 0000000000..dee2c96131 --- /dev/null +++ b/libavfilter/x86/vf_hqdn3d.asm @@ -0,0 +1,106 @@ +;****************************************************************************** +;* Copyright (c) 2012 Loren Merritt +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +%macro LOWPASS 3 ; prevsample, cursample, lut + sub %1q, %2q +%if lut_bits != 8 + sar %1q, 8-lut_bits +%endif + movsx %1d, word [%3q+%1q*2] + add %1d, %2d +%endmacro + +%macro LOAD 3 ; dstreg, x, bitdepth +%if %3 == 8 + movzx %1, byte [srcq+%2] +%else + movzx %1, word [srcq+(%2)*2] +%endif +%if %3 != 16 + shl %1, 16-%3 +%endif +%endmacro + +%macro HQDN3D_ROW 1 ; bitdepth +%if ARCH_X86_64 +cglobal hqdn3d_row_%1_x86, 7,10,0, src, dst, lineant, frameant, width, spatial, temporal, pixelant, t0, t1 +%else +cglobal hqdn3d_row_%1_x86, 7,7,0, src, dst, lineant, frameant, width, spatial, temporal +%endif + %assign bytedepth (%1+7)>>3 + %assign lut_bits 4+4*(%1/16) + dec widthq + lea srcq, [srcq+widthq*bytedepth] + lea dstq, [dstq+widthq*bytedepth] + lea frameantq, [frameantq+widthq*2] + lea lineantq, [lineantq+widthq*2] + neg widthq + %define xq widthq +%if ARCH_X86_32 + mov dstmp, dstq + mov srcmp, srcq + mov frameantmp, frameantq + mov lineantmp, lineantq + %define dstq r0 + %define frameantq r0 + %define lineantq r0 + %define pixelantq r1 + %define pixelantd r1d + DECLARE_REG_TMP 2,3 +%endif + LOAD pixelantd, xq, %1 +ALIGN 16 +.loop: + movifnidn srcq, srcmp + LOAD t0d, xq+1, %1 ; skip on the last iteration to avoid overread +.loop2: + movifnidn lineantq, lineantmp + movzx t1d, word [lineantq+xq*2] + LOWPASS t1, pixelant, spatial + mov [lineantq+xq*2], t1w + LOWPASS pixelant, t0, spatial + movifnidn frameantq, frameantmp + movzx t0d, word [frameantq+xq*2] + LOWPASS t0, t1, temporal + mov [frameantq+xq*2], t0w + movifnidn dstq, dstmp +%if %1 != 16 + add t0d, (1<<(15-%1))-1 + shr t0d, 16-%1 ; could eliminate this by storing from t0h, but only with some contraints on register allocation +%endif +%if %1 == 8 + mov [dstq+xq], t0b +%else + mov [dstq+xq*2], t0w +%endif + inc xq + jl .loop + je .loop2 + REP_RET +%endmacro ; HQDN3D_ROW + +HQDN3D_ROW 8 +HQDN3D_ROW 9 +HQDN3D_ROW 10 +HQDN3D_ROW 16 diff --git a/libavfilter/x86/vf_yadif.asm b/libavfilter/x86/vf_yadif.asm new file mode 100644 index 0000000000..bc4b3ce030 --- /dev/null +++ b/libavfilter/x86/vf_yadif.asm @@ -0,0 +1,254 @@ +;***************************************************************************** +;* x86-optimized functions for yadif filter +;* +;* Copyright (C) 2006 Michael Niedermayer +;* Copyright (c) 2013 Daniel Kang +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License along +;* with Libav; if not, write to the Free Software Foundation, Inc., +;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pb_1: times 16 db 1 +pw_1: times 8 dw 1 + +SECTION .text + +%macro CHECK 2 + movu m2, [curq+t1+%1] + movu m3, [curq+t0+%2] + mova m4, m2 + mova m5, m2 + pxor m4, m3 + pavgb m5, m3 + pand m4, [pb_1] + psubusb m5, m4 +%if mmsize == 16 + psrldq m5, 1 +%else + psrlq m5, 8 +%endif + punpcklbw m5, m7 + mova m4, m2 + psubusb m2, m3 + psubusb m3, m4 + pmaxub m2, m3 + mova m3, m2 + mova m4, m2 +%if mmsize == 16 + psrldq m3, 1 + psrldq m4, 2 +%else + psrlq m3, 8 + psrlq m4, 16 +%endif + punpcklbw m2, m7 + punpcklbw m3, m7 + punpcklbw m4, m7 + paddw m2, m3 + paddw m2, m4 +%endmacro + +%macro CHECK1 0 + mova m3, m0 + pcmpgtw m3, m2 + pminsw m0, m2 + mova m6, m3 + pand m5, m3 + pandn m3, m1 + por m3, m5 + mova m1, m3 +%endmacro + +%macro CHECK2 0 + paddw m6, [pw_1] + psllw m6, 14 + paddsw m2, m6 + mova m3, m0 + pcmpgtw m3, m2 + pminsw m0, m2 + pand m5, m3 + pandn m3, m1 + por m3, m5 + mova m1, m3 +%endmacro + +%macro LOAD 2 + movh m%1, %2 + punpcklbw m%1, m7 +%endmacro + +%macro FILTER 3 +.loop%1: + pxor m7, m7 + LOAD 0, [curq+t1] + LOAD 1, [curq+t0] + LOAD 2, [%2] + LOAD 3, [%3] + mova m4, m3 + paddw m3, m2 + psraw m3, 1 + mova [rsp+ 0], m0 + mova [rsp+16], m3 + mova [rsp+32], m1 + psubw m2, m4 + ABS1 m2, m4 + LOAD 3, [prevq+t1] + LOAD 4, [prevq+t0] + psubw m3, m0 + psubw m4, m1 + ABS1 m3, m5 + ABS1 m4, m5 + paddw m3, m4 + psrlw m2, 1 + psrlw m3, 1 + pmaxsw m2, m3 + LOAD 3, [nextq+t1] + LOAD 4, [nextq+t0] + psubw m3, m0 + psubw m4, m1 + ABS1 m3, m5 + ABS1 m4, m5 + paddw m3, m4 + psrlw m3, 1 + pmaxsw m2, m3 + mova [rsp+48], m2 + + paddw m1, m0 + paddw m0, m0 + psubw m0, m1 + psrlw m1, 1 + ABS1 m0, m2 + + movu m2, [curq+t1-1] + movu m3, [curq+t0-1] + mova m4, m2 + psubusb m2, m3 + psubusb m3, m4 + pmaxub m2, m3 +%if mmsize == 16 + mova m3, m2 + psrldq m3, 2 +%else + pshufw m3, m2, q0021 +%endif + punpcklbw m2, m7 + punpcklbw m3, m7 + paddw m0, m2 + paddw m0, m3 + psubw m0, [pw_1] + + CHECK -2, 0 + CHECK1 + CHECK -3, 1 + CHECK2 + CHECK 0, -2 + CHECK1 + CHECK 1, -3 + CHECK2 + + mova m6, [rsp+48] + cmp DWORD r8m, 2 + jge .end%1 + LOAD 2, [%2+t1*2] + LOAD 4, [%3+t1*2] + LOAD 3, [%2+t0*2] + LOAD 5, [%3+t0*2] + paddw m2, m4 + paddw m3, m5 + psrlw m2, 1 + psrlw m3, 1 + mova m4, [rsp+ 0] + mova m5, [rsp+16] + mova m7, [rsp+32] + psubw m2, m4 + psubw m3, m7 + mova m0, m5 + psubw m5, m4 + psubw m0, m7 + mova m4, m2 + pminsw m2, m3 + pmaxsw m3, m4 + pmaxsw m2, m5 + pminsw m3, m5 + pmaxsw m2, m0 + pminsw m3, m0 + pxor m4, m4 + pmaxsw m6, m3 + psubw m4, m2 + pmaxsw m6, m4 + +.end%1: + mova m2, [rsp+16] + mova m3, m2 + psubw m2, m6 + paddw m3, m6 + pmaxsw m1, m2 + pminsw m1, m3 + packuswb m1, m1 + + movh [dstq], m1 + add dstq, mmsize/2 + add prevq, mmsize/2 + add curq, mmsize/2 + add nextq, mmsize/2 + sub DWORD r4m, mmsize/2 + jg .loop%1 +%endmacro + +%macro YADIF 0 +%if ARCH_X86_32 +cglobal yadif_filter_line, 4, 6, 8, 80, dst, prev, cur, next, w, prefs, \ + mrefs, parity, mode +%else +cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \ + mrefs, parity, mode +%endif + cmp DWORD wm, 0 + jle .ret +%if ARCH_X86_32 + mov r4, r5mp + mov r5, r6mp + DECLARE_REG_TMP 4,5 +%else + movsxd r5, DWORD r5m + movsxd r6, DWORD r6m + DECLARE_REG_TMP 5,6 +%endif + + cmp DWORD paritym, 0 + je .parity0 + FILTER 1, prevq, curq + jmp .ret + +.parity0: + FILTER 0, curq, nextq + +.ret: + RET +%endmacro + +INIT_XMM ssse3 +YADIF +INIT_XMM sse2 +YADIF +%if ARCH_X86_32 +INIT_MMX mmxext +YADIF +%endif diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c new file mode 100644 index 0000000000..0cee8e56b4 --- /dev/null +++ b/libavfilter/x86/vf_yadif_init.c @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2006 Michael Niedermayer + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with Libav; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/internal.h" +#include "libavutil/mem.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/x86/dsputil_mmx.h" +#include "libavfilter/yadif.h" + +void ff_yadif_filter_line_mmxext(uint8_t *dst, uint8_t *prev, uint8_t *cur, + uint8_t *next, int w, int prefs, + int mrefs, int parity, int mode); +void ff_yadif_filter_line_sse2(uint8_t *dst, uint8_t *prev, uint8_t *cur, + uint8_t *next, int w, int prefs, + int mrefs, int parity, int mode); +void ff_yadif_filter_line_ssse3(uint8_t *dst, uint8_t *prev, uint8_t *cur, + uint8_t *next, int w, int prefs, + int mrefs, int parity, int mode); + +av_cold void ff_yadif_init_x86(YADIFContext *yadif) +{ + int cpu_flags = av_get_cpu_flags(); + +#if HAVE_YASM +#if ARCH_X86_32 + if (EXTERNAL_MMXEXT(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_mmxext; +#endif /* ARCH_X86_32 */ + if (EXTERNAL_SSE2(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_sse2; + if (EXTERNAL_SSSE3(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_ssse3; +#endif /* HAVE_YASM */ +} diff --git a/libavfilter/x86/yadif.asm b/libavfilter/x86/yadif.asm deleted file mode 100644 index bc4b3ce030..0000000000 --- a/libavfilter/x86/yadif.asm +++ /dev/null @@ -1,254 +0,0 @@ -;***************************************************************************** -;* x86-optimized functions for yadif filter -;* -;* Copyright (C) 2006 Michael Niedermayer -;* Copyright (c) 2013 Daniel Kang -;* -;* This file is part of Libav. -;* -;* Libav is free software; you can redistribute it and/or modify -;* it under the terms of the GNU General Public License as published by -;* the Free Software Foundation; either version 2 of the License, or -;* (at your option) any later version. -;* -;* Libav is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -;* GNU General Public License for more details. -;* -;* You should have received a copy of the GNU General Public License along -;* with Libav; if not, write to the Free Software Foundation, Inc., -;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -;****************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION_RODATA - -pb_1: times 16 db 1 -pw_1: times 8 dw 1 - -SECTION .text - -%macro CHECK 2 - movu m2, [curq+t1+%1] - movu m3, [curq+t0+%2] - mova m4, m2 - mova m5, m2 - pxor m4, m3 - pavgb m5, m3 - pand m4, [pb_1] - psubusb m5, m4 -%if mmsize == 16 - psrldq m5, 1 -%else - psrlq m5, 8 -%endif - punpcklbw m5, m7 - mova m4, m2 - psubusb m2, m3 - psubusb m3, m4 - pmaxub m2, m3 - mova m3, m2 - mova m4, m2 -%if mmsize == 16 - psrldq m3, 1 - psrldq m4, 2 -%else - psrlq m3, 8 - psrlq m4, 16 -%endif - punpcklbw m2, m7 - punpcklbw m3, m7 - punpcklbw m4, m7 - paddw m2, m3 - paddw m2, m4 -%endmacro - -%macro CHECK1 0 - mova m3, m0 - pcmpgtw m3, m2 - pminsw m0, m2 - mova m6, m3 - pand m5, m3 - pandn m3, m1 - por m3, m5 - mova m1, m3 -%endmacro - -%macro CHECK2 0 - paddw m6, [pw_1] - psllw m6, 14 - paddsw m2, m6 - mova m3, m0 - pcmpgtw m3, m2 - pminsw m0, m2 - pand m5, m3 - pandn m3, m1 - por m3, m5 - mova m1, m3 -%endmacro - -%macro LOAD 2 - movh m%1, %2 - punpcklbw m%1, m7 -%endmacro - -%macro FILTER 3 -.loop%1: - pxor m7, m7 - LOAD 0, [curq+t1] - LOAD 1, [curq+t0] - LOAD 2, [%2] - LOAD 3, [%3] - mova m4, m3 - paddw m3, m2 - psraw m3, 1 - mova [rsp+ 0], m0 - mova [rsp+16], m3 - mova [rsp+32], m1 - psubw m2, m4 - ABS1 m2, m4 - LOAD 3, [prevq+t1] - LOAD 4, [prevq+t0] - psubw m3, m0 - psubw m4, m1 - ABS1 m3, m5 - ABS1 m4, m5 - paddw m3, m4 - psrlw m2, 1 - psrlw m3, 1 - pmaxsw m2, m3 - LOAD 3, [nextq+t1] - LOAD 4, [nextq+t0] - psubw m3, m0 - psubw m4, m1 - ABS1 m3, m5 - ABS1 m4, m5 - paddw m3, m4 - psrlw m3, 1 - pmaxsw m2, m3 - mova [rsp+48], m2 - - paddw m1, m0 - paddw m0, m0 - psubw m0, m1 - psrlw m1, 1 - ABS1 m0, m2 - - movu m2, [curq+t1-1] - movu m3, [curq+t0-1] - mova m4, m2 - psubusb m2, m3 - psubusb m3, m4 - pmaxub m2, m3 -%if mmsize == 16 - mova m3, m2 - psrldq m3, 2 -%else - pshufw m3, m2, q0021 -%endif - punpcklbw m2, m7 - punpcklbw m3, m7 - paddw m0, m2 - paddw m0, m3 - psubw m0, [pw_1] - - CHECK -2, 0 - CHECK1 - CHECK -3, 1 - CHECK2 - CHECK 0, -2 - CHECK1 - CHECK 1, -3 - CHECK2 - - mova m6, [rsp+48] - cmp DWORD r8m, 2 - jge .end%1 - LOAD 2, [%2+t1*2] - LOAD 4, [%3+t1*2] - LOAD 3, [%2+t0*2] - LOAD 5, [%3+t0*2] - paddw m2, m4 - paddw m3, m5 - psrlw m2, 1 - psrlw m3, 1 - mova m4, [rsp+ 0] - mova m5, [rsp+16] - mova m7, [rsp+32] - psubw m2, m4 - psubw m3, m7 - mova m0, m5 - psubw m5, m4 - psubw m0, m7 - mova m4, m2 - pminsw m2, m3 - pmaxsw m3, m4 - pmaxsw m2, m5 - pminsw m3, m5 - pmaxsw m2, m0 - pminsw m3, m0 - pxor m4, m4 - pmaxsw m6, m3 - psubw m4, m2 - pmaxsw m6, m4 - -.end%1: - mova m2, [rsp+16] - mova m3, m2 - psubw m2, m6 - paddw m3, m6 - pmaxsw m1, m2 - pminsw m1, m3 - packuswb m1, m1 - - movh [dstq], m1 - add dstq, mmsize/2 - add prevq, mmsize/2 - add curq, mmsize/2 - add nextq, mmsize/2 - sub DWORD r4m, mmsize/2 - jg .loop%1 -%endmacro - -%macro YADIF 0 -%if ARCH_X86_32 -cglobal yadif_filter_line, 4, 6, 8, 80, dst, prev, cur, next, w, prefs, \ - mrefs, parity, mode -%else -cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \ - mrefs, parity, mode -%endif - cmp DWORD wm, 0 - jle .ret -%if ARCH_X86_32 - mov r4, r5mp - mov r5, r6mp - DECLARE_REG_TMP 4,5 -%else - movsxd r5, DWORD r5m - movsxd r6, DWORD r6m - DECLARE_REG_TMP 5,6 -%endif - - cmp DWORD paritym, 0 - je .parity0 - FILTER 1, prevq, curq - jmp .ret - -.parity0: - FILTER 0, curq, nextq - -.ret: - RET -%endmacro - -INIT_XMM ssse3 -YADIF -INIT_XMM sse2 -YADIF -%if ARCH_X86_32 -INIT_MMX mmxext -YADIF -%endif diff --git a/libavfilter/x86/yadif_init.c b/libavfilter/x86/yadif_init.c deleted file mode 100644 index 0cee8e56b4..0000000000 --- a/libavfilter/x86/yadif_init.c +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (C) 2006 Michael Niedermayer - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with Libav; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - */ - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/internal.h" -#include "libavutil/mem.h" -#include "libavutil/x86/asm.h" -#include "libavutil/x86/cpu.h" -#include "libavcodec/x86/dsputil_mmx.h" -#include "libavfilter/yadif.h" - -void ff_yadif_filter_line_mmxext(uint8_t *dst, uint8_t *prev, uint8_t *cur, - uint8_t *next, int w, int prefs, - int mrefs, int parity, int mode); -void ff_yadif_filter_line_sse2(uint8_t *dst, uint8_t *prev, uint8_t *cur, - uint8_t *next, int w, int prefs, - int mrefs, int parity, int mode); -void ff_yadif_filter_line_ssse3(uint8_t *dst, uint8_t *prev, uint8_t *cur, - uint8_t *next, int w, int prefs, - int mrefs, int parity, int mode); - -av_cold void ff_yadif_init_x86(YADIFContext *yadif) -{ - int cpu_flags = av_get_cpu_flags(); - -#if HAVE_YASM -#if ARCH_X86_32 - if (EXTERNAL_MMXEXT(cpu_flags)) - yadif->filter_line = ff_yadif_filter_line_mmxext; -#endif /* ARCH_X86_32 */ - if (EXTERNAL_SSE2(cpu_flags)) - yadif->filter_line = ff_yadif_filter_line_sse2; - if (EXTERNAL_SSSE3(cpu_flags)) - yadif->filter_line = ff_yadif_filter_line_ssse3; -#endif /* HAVE_YASM */ -} -- cgit v1.2.3