From 29cfdd37674e3444557c385eaffef06c1b325414 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Sat, 11 Aug 2012 22:45:53 +0200 Subject: x86: avcodec: Appropriately name files containing only init functions --- libavcodec/x86/Makefile | 8 +- libavcodec/x86/ac3dsp_init.c | 93 ++++++++++ libavcodec/x86/ac3dsp_mmx.c | 93 ---------- libavcodec/x86/fft.c | 72 -------- libavcodec/x86/fft_init.c | 72 ++++++++ libavcodec/x86/fmtconvert_init.c | 147 +++++++++++++++ libavcodec/x86/fmtconvert_mmx.c | 147 --------------- libavcodec/x86/h264dsp_init.c | 385 +++++++++++++++++++++++++++++++++++++++ libavcodec/x86/h264dsp_mmx.c | 385 --------------------------------------- 9 files changed, 701 insertions(+), 701 deletions(-) create mode 100644 libavcodec/x86/ac3dsp_init.c delete mode 100644 libavcodec/x86/ac3dsp_mmx.c delete mode 100644 libavcodec/x86/fft.c create mode 100644 libavcodec/x86/fft_init.c create mode 100644 libavcodec/x86/fmtconvert_init.c delete mode 100644 libavcodec/x86/fmtconvert_mmx.c create mode 100644 libavcodec/x86/h264dsp_init.c delete mode 100644 libavcodec/x86/h264dsp_mmx.c diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 57e73d8b2f..4d06685975 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -5,7 +5,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o MMX-OBJS += x86/dsputil_mmx.o \ x86/fdct_mmx.o \ - x86/fmtconvert_mmx.o \ + x86/fmtconvert_init.o \ x86/idct_mmx_xvid.o \ x86/idct_sse2_xvid.o \ x86/motion_est_mmx.o \ @@ -13,13 +13,13 @@ MMX-OBJS += x86/dsputil_mmx.o \ x86/simple_idct_mmx.o \ MMX-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o -MMX-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_mmx.o +MMX-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o MMX-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp_mmx.o MMX-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhd_mmx.o MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o MMX-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o -MMX-OBJS-$(CONFIG_FFT) += x86/fft.o -MMX-OBJS-$(CONFIG_H264DSP) += x86/h264dsp_mmx.o +MMX-OBJS-$(CONFIG_FFT) += x86/fft_init.o +MMX-OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o MMX-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o MMX-OBJS-$(CONFIG_LPC) += x86/lpc_mmx.o MMX-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodec_mmx.o diff --git a/libavcodec/x86/ac3dsp_init.c b/libavcodec/x86/ac3dsp_init.c new file mode 100644 index 0000000000..f3db67a84f --- /dev/null +++ b/libavcodec/x86/ac3dsp_init.c @@ -0,0 +1,93 @@ +/* + * x86-optimized AC-3 DSP utils + * Copyright (c) 2011 Justin Ruggles + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86/asm.h" +#include "dsputil_mmx.h" +#include "libavcodec/ac3dsp.h" + +extern void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int nb_coefs); +extern void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs); +extern void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs); + +extern int ff_ac3_max_msb_abs_int16_mmx (const int16_t *src, int len); +extern int ff_ac3_max_msb_abs_int16_mmx2 (const int16_t *src, int len); +extern int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len); +extern int ff_ac3_max_msb_abs_int16_ssse3(const int16_t *src, int len); + +extern void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift); +extern void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift); + +extern void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift); +extern void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift); + +extern void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len); +extern void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len); +extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len); + +extern int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]); + +extern void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs); +extern void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs); +extern void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs); + +av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact) +{ +#if HAVE_YASM + int mm_flags = av_get_cpu_flags(); + + if (mm_flags & AV_CPU_FLAG_MMX) { + c->ac3_exponent_min = ff_ac3_exponent_min_mmx; + c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx; + c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx; + c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx; + } + if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) { + c->extract_exponents = ff_ac3_extract_exponents_3dnow; + if (!bit_exact) { + c->float_to_fixed24 = ff_float_to_fixed24_3dnow; + } + } + if (mm_flags & AV_CPU_FLAG_MMXEXT && HAVE_MMXEXT) { + c->ac3_exponent_min = ff_ac3_exponent_min_mmxext; + c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx2; + } + if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) { + c->float_to_fixed24 = ff_float_to_fixed24_sse; + } + if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) { + c->ac3_exponent_min = ff_ac3_exponent_min_sse2; + c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2; + c->float_to_fixed24 = ff_float_to_fixed24_sse2; + c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2; + c->extract_exponents = ff_ac3_extract_exponents_sse2; + if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) { + c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2; + c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2; + } + } + if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSSE3) { + c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3; + if (!(mm_flags & AV_CPU_FLAG_ATOM)) { + c->extract_exponents = ff_ac3_extract_exponents_ssse3; + } + } +#endif +} diff --git a/libavcodec/x86/ac3dsp_mmx.c b/libavcodec/x86/ac3dsp_mmx.c deleted file mode 100644 index f3db67a84f..0000000000 --- a/libavcodec/x86/ac3dsp_mmx.c +++ /dev/null @@ -1,93 +0,0 @@ -/* - * x86-optimized AC-3 DSP utils - * Copyright (c) 2011 Justin Ruggles - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/x86/asm.h" -#include "dsputil_mmx.h" -#include "libavcodec/ac3dsp.h" - -extern void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int nb_coefs); -extern void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs); -extern void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs); - -extern int ff_ac3_max_msb_abs_int16_mmx (const int16_t *src, int len); -extern int ff_ac3_max_msb_abs_int16_mmx2 (const int16_t *src, int len); -extern int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len); -extern int ff_ac3_max_msb_abs_int16_ssse3(const int16_t *src, int len); - -extern void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift); -extern void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift); - -extern void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift); -extern void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift); - -extern void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len); -extern void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len); -extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len); - -extern int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]); - -extern void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs); -extern void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs); -extern void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs); - -av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact) -{ -#if HAVE_YASM - int mm_flags = av_get_cpu_flags(); - - if (mm_flags & AV_CPU_FLAG_MMX) { - c->ac3_exponent_min = ff_ac3_exponent_min_mmx; - c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx; - c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx; - c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx; - } - if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) { - c->extract_exponents = ff_ac3_extract_exponents_3dnow; - if (!bit_exact) { - c->float_to_fixed24 = ff_float_to_fixed24_3dnow; - } - } - if (mm_flags & AV_CPU_FLAG_MMXEXT && HAVE_MMXEXT) { - c->ac3_exponent_min = ff_ac3_exponent_min_mmxext; - c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx2; - } - if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) { - c->float_to_fixed24 = ff_float_to_fixed24_sse; - } - if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) { - c->ac3_exponent_min = ff_ac3_exponent_min_sse2; - c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2; - c->float_to_fixed24 = ff_float_to_fixed24_sse2; - c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2; - c->extract_exponents = ff_ac3_extract_exponents_sse2; - if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) { - c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2; - c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2; - } - } - if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSSE3) { - c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3; - if (!(mm_flags & AV_CPU_FLAG_ATOM)) { - c->extract_exponents = ff_ac3_extract_exponents_ssse3; - } - } -#endif -} diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c deleted file mode 100644 index fcde3fa797..0000000000 --- a/libavcodec/x86/fft.c +++ /dev/null @@ -1,72 +0,0 @@ -/* - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/cpu.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/dct.h" -#include "fft.h" - -av_cold void ff_fft_init_mmx(FFTContext *s) -{ -#if HAVE_YASM - int has_vectors = av_get_cpu_flags(); -#if ARCH_X86_32 - if (has_vectors & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) { - /* 3DNow! for K6-2/3 */ - s->imdct_calc = ff_imdct_calc_3dnow; - s->imdct_half = ff_imdct_half_3dnow; - s->fft_calc = ff_fft_calc_3dnow; - } - if (has_vectors & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) { - /* 3DNowEx for K7 */ - s->imdct_calc = ff_imdct_calc_3dnowext; - s->imdct_half = ff_imdct_half_3dnowext; - s->fft_calc = ff_fft_calc_3dnowext; - } -#endif - if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) { - /* SSE for P3/P4/K8 */ - s->imdct_calc = ff_imdct_calc_sse; - s->imdct_half = ff_imdct_half_sse; - s->fft_permute = ff_fft_permute_sse; - s->fft_calc = ff_fft_calc_sse; - s->fft_permutation = FF_FFT_PERM_SWAP_LSBS; - } - if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX && s->nbits >= 5) { - /* AVX for SB */ - s->imdct_half = ff_imdct_half_avx; - s->fft_calc = ff_fft_calc_avx; - s->fft_permutation = FF_FFT_PERM_AVX; - } -#endif -} - -#if CONFIG_DCT -av_cold void ff_dct_init_mmx(DCTContext *s) -{ -#if HAVE_YASM - int has_vectors = av_get_cpu_flags(); - if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) - s->dct32 = ff_dct32_float_sse; - if (has_vectors & AV_CPU_FLAG_SSE2 && HAVE_SSE) - s->dct32 = ff_dct32_float_sse2; - if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX) - s->dct32 = ff_dct32_float_avx; -#endif -} -#endif diff --git a/libavcodec/x86/fft_init.c b/libavcodec/x86/fft_init.c new file mode 100644 index 0000000000..fcde3fa797 --- /dev/null +++ b/libavcodec/x86/fft_init.c @@ -0,0 +1,72 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/cpu.h" +#include "libavcodec/dsputil.h" +#include "libavcodec/dct.h" +#include "fft.h" + +av_cold void ff_fft_init_mmx(FFTContext *s) +{ +#if HAVE_YASM + int has_vectors = av_get_cpu_flags(); +#if ARCH_X86_32 + if (has_vectors & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) { + /* 3DNow! for K6-2/3 */ + s->imdct_calc = ff_imdct_calc_3dnow; + s->imdct_half = ff_imdct_half_3dnow; + s->fft_calc = ff_fft_calc_3dnow; + } + if (has_vectors & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) { + /* 3DNowEx for K7 */ + s->imdct_calc = ff_imdct_calc_3dnowext; + s->imdct_half = ff_imdct_half_3dnowext; + s->fft_calc = ff_fft_calc_3dnowext; + } +#endif + if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) { + /* SSE for P3/P4/K8 */ + s->imdct_calc = ff_imdct_calc_sse; + s->imdct_half = ff_imdct_half_sse; + s->fft_permute = ff_fft_permute_sse; + s->fft_calc = ff_fft_calc_sse; + s->fft_permutation = FF_FFT_PERM_SWAP_LSBS; + } + if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX && s->nbits >= 5) { + /* AVX for SB */ + s->imdct_half = ff_imdct_half_avx; + s->fft_calc = ff_fft_calc_avx; + s->fft_permutation = FF_FFT_PERM_AVX; + } +#endif +} + +#if CONFIG_DCT +av_cold void ff_dct_init_mmx(DCTContext *s) +{ +#if HAVE_YASM + int has_vectors = av_get_cpu_flags(); + if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) + s->dct32 = ff_dct32_float_sse; + if (has_vectors & AV_CPU_FLAG_SSE2 && HAVE_SSE) + s->dct32 = ff_dct32_float_sse2; + if (has_vectors & AV_CPU_FLAG_AVX && HAVE_AVX) + s->dct32 = ff_dct32_float_avx; +#endif +} +#endif diff --git a/libavcodec/x86/fmtconvert_init.c b/libavcodec/x86/fmtconvert_init.c new file mode 100644 index 0000000000..6f3d14aedc --- /dev/null +++ b/libavcodec/x86/fmtconvert_init.c @@ -0,0 +1,147 @@ +/* + * Format Conversion Utils + * Copyright (c) 2000, 2001 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * MMX optimization by Nick Kurshev + */ + +#include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavcodec/fmtconvert.h" +#include "libavcodec/dsputil.h" + +#if HAVE_YASM + +void ff_int32_to_float_fmul_scalar_sse (float *dst, const int *src, float mul, int len); +void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len); + +void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len); +void ff_float_to_int16_sse (int16_t *dst, const float *src, long len); +void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len); + +void ff_float_to_int16_step_3dnow(int16_t *dst, const float *src, long len, long step); +void ff_float_to_int16_step_sse (int16_t *dst, const float *src, long len, long step); +void ff_float_to_int16_step_sse2 (int16_t *dst, const float *src, long len, long step); + +void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len); +void ff_float_to_int16_interleave2_sse (int16_t *dst, const float **src, long len); +void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len); + +void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); +void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); +void ff_float_to_int16_interleave6_3dnowext(int16_t *dst, const float **src, int len); + +#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse + +#define FLOAT_TO_INT16_INTERLEAVE(cpu) \ +/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ +static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\ + int c;\ + for(c=0; cfloat_interleave = float_interleave_mmx; + + if (HAVE_AMD3DNOW && mm_flags & AV_CPU_FLAG_3DNOW) { + if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ + c->float_to_int16 = ff_float_to_int16_3dnow; + c->float_to_int16_interleave = float_to_int16_interleave_3dnow; + } + } + if (HAVE_AMD3DNOWEXT && mm_flags & AV_CPU_FLAG_3DNOWEXT) { + if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ + c->float_to_int16_interleave = float_to_int16_interleave_3dnowext; + } + } + if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE) { + c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse; + c->float_to_int16 = ff_float_to_int16_sse; + c->float_to_int16_interleave = float_to_int16_interleave_sse; + c->float_interleave = float_interleave_sse; + } + if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE2) { + c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2; + c->float_to_int16 = ff_float_to_int16_sse2; + c->float_to_int16_interleave = float_to_int16_interleave_sse2; + } + } +#endif +} diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c deleted file mode 100644 index 6f3d14aedc..0000000000 --- a/libavcodec/x86/fmtconvert_mmx.c +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Format Conversion Utils - * Copyright (c) 2000, 2001 Fabrice Bellard - * Copyright (c) 2002-2004 Michael Niedermayer - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - * MMX optimization by Nick Kurshev - */ - -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavcodec/fmtconvert.h" -#include "libavcodec/dsputil.h" - -#if HAVE_YASM - -void ff_int32_to_float_fmul_scalar_sse (float *dst, const int *src, float mul, int len); -void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len); - -void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len); -void ff_float_to_int16_sse (int16_t *dst, const float *src, long len); -void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len); - -void ff_float_to_int16_step_3dnow(int16_t *dst, const float *src, long len, long step); -void ff_float_to_int16_step_sse (int16_t *dst, const float *src, long len, long step); -void ff_float_to_int16_step_sse2 (int16_t *dst, const float *src, long len, long step); - -void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len); -void ff_float_to_int16_interleave2_sse (int16_t *dst, const float **src, long len); -void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len); - -void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); -void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); -void ff_float_to_int16_interleave6_3dnowext(int16_t *dst, const float **src, int len); - -#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse - -#define FLOAT_TO_INT16_INTERLEAVE(cpu) \ -/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ -static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\ - int c;\ - for(c=0; cfloat_interleave = float_interleave_mmx; - - if (HAVE_AMD3DNOW && mm_flags & AV_CPU_FLAG_3DNOW) { - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->float_to_int16 = ff_float_to_int16_3dnow; - c->float_to_int16_interleave = float_to_int16_interleave_3dnow; - } - } - if (HAVE_AMD3DNOWEXT && mm_flags & AV_CPU_FLAG_3DNOWEXT) { - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->float_to_int16_interleave = float_to_int16_interleave_3dnowext; - } - } - if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE) { - c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse; - c->float_to_int16 = ff_float_to_int16_sse; - c->float_to_int16_interleave = float_to_int16_interleave_sse; - c->float_interleave = float_interleave_sse; - } - if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE2) { - c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2; - c->float_to_int16 = ff_float_to_int16_sse2; - c->float_to_int16_interleave = float_to_int16_interleave_sse2; - } - } -#endif -} diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c new file mode 100644 index 0000000000..f24f751fb3 --- /dev/null +++ b/libavcodec/x86/h264dsp_init.c @@ -0,0 +1,385 @@ +/* + * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavcodec/h264dsp.h" +#include "dsputil_mmx.h" + +/***********************************/ +/* IDCT */ +#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \ +void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \ + int16_t *block, \ + int stride); + +IDCT_ADD_FUNC(, 8, mmx) +IDCT_ADD_FUNC(, 10, sse2) +IDCT_ADD_FUNC(_dc, 8, mmx2) +IDCT_ADD_FUNC(_dc, 10, mmx2) +IDCT_ADD_FUNC(8_dc, 8, mmx2) +IDCT_ADD_FUNC(8_dc, 10, sse2) +IDCT_ADD_FUNC(8, 8, mmx) +IDCT_ADD_FUNC(8, 8, sse2) +IDCT_ADD_FUNC(8, 10, sse2) +#if HAVE_AVX +IDCT_ADD_FUNC(, 10, avx) +IDCT_ADD_FUNC(8_dc, 10, avx) +IDCT_ADD_FUNC(8, 10, avx) +#endif + + +#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \ +void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \ + (uint8_t *dst, const int *block_offset, \ + DCTELEM *block, int stride, const uint8_t nnzc[6 * 8]); + +IDCT_ADD_REP_FUNC(8, 4, 8, mmx) +IDCT_ADD_REP_FUNC(8, 4, 8, mmx2) +IDCT_ADD_REP_FUNC(8, 4, 8, sse2) +IDCT_ADD_REP_FUNC(8, 4, 10, sse2) +IDCT_ADD_REP_FUNC(8, 4, 10, avx) +IDCT_ADD_REP_FUNC(, 16, 8, mmx) +IDCT_ADD_REP_FUNC(, 16, 8, mmx2) +IDCT_ADD_REP_FUNC(, 16, 8, sse2) +IDCT_ADD_REP_FUNC(, 16, 10, sse2) +IDCT_ADD_REP_FUNC(, 16intra, 8, mmx) +IDCT_ADD_REP_FUNC(, 16intra, 8, mmx2) +IDCT_ADD_REP_FUNC(, 16intra, 8, sse2) +IDCT_ADD_REP_FUNC(, 16intra, 10, sse2) +#if HAVE_AVX +IDCT_ADD_REP_FUNC(, 16, 10, avx) +IDCT_ADD_REP_FUNC(, 16intra, 10, avx) +#endif + + +#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \ +void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \ + (uint8_t **dst, const int *block_offset, \ + DCTELEM *block, int stride, const uint8_t nnzc[6 * 8]); + +IDCT_ADD_REP_FUNC2(, 8, 8, mmx) +IDCT_ADD_REP_FUNC2(, 8, 8, mmx2) +IDCT_ADD_REP_FUNC2(, 8, 8, sse2) +IDCT_ADD_REP_FUNC2(, 8, 10, sse2) +#if HAVE_AVX +IDCT_ADD_REP_FUNC2(, 8, 10, avx) +#endif + +void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul); +void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul); + +/***********************************/ +/* deblocking */ + +void ff_h264_loop_filter_strength_mmx2(int16_t bS[2][4][4], uint8_t nnz[40], + int8_t ref[2][40], int16_t mv[2][40][2], + int bidir, int edges, int step, + int mask_mv0, int mask_mv1, int field); + +#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \ +void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \ + int stride, \ + int alpha, \ + int beta, \ + int8_t *tc0); +#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \ +void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \ + int stride, \ + int alpha, \ + int beta); + +#define LF_FUNCS(type, depth) \ +LF_FUNC(h, chroma, depth, mmx2) \ +LF_IFUNC(h, chroma_intra, depth, mmx2) \ +LF_FUNC(v, chroma, depth, mmx2) \ +LF_IFUNC(v, chroma_intra, depth, mmx2) \ +LF_FUNC(h, luma, depth, mmx2) \ +LF_IFUNC(h, luma_intra, depth, mmx2) \ +LF_FUNC(h, luma, depth, sse2) \ +LF_IFUNC(h, luma_intra, depth, sse2) \ +LF_FUNC(v, luma, depth, sse2) \ +LF_IFUNC(v, luma_intra, depth, sse2) \ +LF_FUNC(h, chroma, depth, sse2) \ +LF_IFUNC(h, chroma_intra, depth, sse2) \ +LF_FUNC(v, chroma, depth, sse2) \ +LF_IFUNC(v, chroma_intra, depth, sse2) \ +LF_FUNC(h, luma, depth, avx) \ +LF_IFUNC(h, luma_intra, depth, avx) \ +LF_FUNC(v, luma, depth, avx) \ +LF_IFUNC(v, luma_intra, depth, avx) \ +LF_FUNC(h, chroma, depth, avx) \ +LF_IFUNC(h, chroma_intra, depth, avx) \ +LF_FUNC(v, chroma, depth, avx) \ +LF_IFUNC(v, chroma_intra, depth, avx) + +LF_FUNCS(uint8_t, 8) +LF_FUNCS(uint16_t, 10) + +#if ARCH_X86_32 +LF_FUNC(v8, luma, 8, mmx2) +static void ff_deblock_v_luma_8_mmx2(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0) +{ + if ((tc0[0] & tc0[1]) >= 0) + ff_deblock_v8_luma_8_mmx2(pix + 0, stride, alpha, beta, tc0); + if ((tc0[2] & tc0[3]) >= 0) + ff_deblock_v8_luma_8_mmx2(pix + 8, stride, alpha, beta, tc0 + 2); +} + +LF_IFUNC(v8, luma_intra, 8, mmx2) +static void ff_deblock_v_luma_intra_8_mmx2(uint8_t *pix, int stride, + int alpha, int beta) +{ + ff_deblock_v8_luma_intra_8_mmx2(pix + 0, stride, alpha, beta); + ff_deblock_v8_luma_intra_8_mmx2(pix + 8, stride, alpha, beta); +} +#endif /* ARCH_X86_32 */ + +LF_FUNC(v, luma, 10, mmx2) +LF_IFUNC(v, luma_intra, 10, mmx2) + +/***********************************/ +/* weighted prediction */ + +#define H264_WEIGHT(W, OPT) \ +void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, int stride, \ + int height, int log2_denom, \ + int weight, int offset); + +#define H264_BIWEIGHT(W, OPT) \ +void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src, \ + int stride, int height, \ + int log2_denom, int weightd, \ + int weights, int offset); + +#define H264_BIWEIGHT_MMX(W) \ + H264_WEIGHT(W, mmx2) \ + H264_BIWEIGHT(W, mmx2) + +#define H264_BIWEIGHT_MMX_SSE(W) \ + H264_BIWEIGHT_MMX(W) \ + H264_WEIGHT(W, sse2) \ + H264_BIWEIGHT(W, sse2) \ + H264_BIWEIGHT(W, ssse3) + +H264_BIWEIGHT_MMX_SSE(16) +H264_BIWEIGHT_MMX_SSE(8) +H264_BIWEIGHT_MMX(4) + +#define H264_WEIGHT_10(W, DEPTH, OPT) \ +void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \ + int stride, \ + int height, \ + int log2_denom, \ + int weight, \ + int offset); + +#define H264_BIWEIGHT_10(W, DEPTH, OPT) \ +void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \ + uint8_t *src, \ + int stride, \ + int height, \ + int log2_denom, \ + int weightd, \ + int weights, \ + int offset); + +#define H264_BIWEIGHT_10_SSE(W, DEPTH) \ + H264_WEIGHT_10(W, DEPTH, sse2) \ + H264_WEIGHT_10(W, DEPTH, sse4) \ + H264_BIWEIGHT_10(W, DEPTH, sse2) \ + H264_BIWEIGHT_10(W, DEPTH, sse4) + +H264_BIWEIGHT_10_SSE(16, 10) +H264_BIWEIGHT_10_SSE(8, 10) +H264_BIWEIGHT_10_SSE(4, 10) + +void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, + const int chroma_format_idc) +{ +#if HAVE_YASM + int mm_flags = av_get_cpu_flags(); + + if (chroma_format_idc == 1 && mm_flags & AV_CPU_FLAG_MMXEXT) + c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmx2; + + if (bit_depth == 8) { + if (mm_flags & AV_CPU_FLAG_MMX) { + c->h264_idct_dc_add = + c->h264_idct_add = ff_h264_idct_add_8_mmx; + c->h264_idct8_dc_add = + c->h264_idct8_add = ff_h264_idct8_add_8_mmx; + + c->h264_idct_add16 = ff_h264_idct_add16_8_mmx; + c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx; + if (chroma_format_idc == 1) + c->h264_idct_add8 = ff_h264_idct_add8_8_mmx; + c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx; + if (mm_flags & AV_CPU_FLAG_CMOV) + c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx; + + if (mm_flags & AV_CPU_FLAG_MMXEXT) { + c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmx2; + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmx2; + c->h264_idct_add16 = ff_h264_idct_add16_8_mmx2; + c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx2; + if (chroma_format_idc == 1) + c->h264_idct_add8 = ff_h264_idct_add8_8_mmx2; + c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx2; + + c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_mmx2; + c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmx2; + if (chroma_format_idc == 1) { + c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmx2; + c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmx2; + } +#if ARCH_X86_32 + c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_mmx2; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmx2; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmx2; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmx2; +#endif /* ARCH_X86_32 */ + c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmx2; + c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmx2; + c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmx2; + + c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmx2; + c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmx2; + c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmx2; + + if (mm_flags & AV_CPU_FLAG_SSE2) { + c->h264_idct8_add = ff_h264_idct8_add_8_sse2; + + c->h264_idct_add16 = ff_h264_idct_add16_8_sse2; + c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2; + if (chroma_format_idc == 1) + c->h264_idct_add8 = ff_h264_idct_add8_8_sse2; + c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2; + c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2; + + c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2; + c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2; + + c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2; + c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2; + +#if HAVE_ALIGNED_STACK + c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2; +#endif /* HAVE_ALIGNED_STACK */ + } + if (mm_flags & AV_CPU_FLAG_SSSE3) { + c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3; + c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3; + } + if (mm_flags & AV_CPU_FLAG_AVX) { +#if HAVE_ALIGNED_STACK + c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx; +#endif /* HAVE_ALIGNED_STACK */ + } + } + } + } else if (bit_depth == 10) { + if (mm_flags & AV_CPU_FLAG_MMX) { + if (mm_flags & AV_CPU_FLAG_MMXEXT) { +#if ARCH_X86_32 + c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmx2; + c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmx2; + c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmx2; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmx2; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmx2; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmx2; +#endif /* ARCH_X86_32 */ + c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmx2; + if (mm_flags & AV_CPU_FLAG_SSE2) { + c->h264_idct_add = ff_h264_idct_add_10_sse2; + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2; + + c->h264_idct_add16 = ff_h264_idct_add16_10_sse2; + if (chroma_format_idc == 1) + c->h264_idct_add8 = ff_h264_idct_add8_10_sse2; + c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2; +#if HAVE_ALIGNED_STACK + c->h264_idct8_add = ff_h264_idct8_add_10_sse2; + c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2; +#endif /* HAVE_ALIGNED_STACK */ + + c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2; + c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2; + c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2; + + c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2; + c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2; + c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2; + + c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_sse2; + c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2; +#if HAVE_ALIGNED_STACK + c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2; +#endif /* HAVE_ALIGNED_STACK */ + } + if (mm_flags & AV_CPU_FLAG_SSE4) { + c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4; + c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4; + c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4; + + c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4; + c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4; + c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4; + } +#if HAVE_AVX + if (mm_flags & AV_CPU_FLAG_AVX) { + c->h264_idct_dc_add = + c->h264_idct_add = ff_h264_idct_add_10_avx; + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx; + + c->h264_idct_add16 = ff_h264_idct_add16_10_avx; + if (chroma_format_idc == 1) + c->h264_idct_add8 = ff_h264_idct_add8_10_avx; + c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx; +#if HAVE_ALIGNED_STACK + c->h264_idct8_add = ff_h264_idct8_add_10_avx; + c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx; +#endif /* HAVE_ALIGNED_STACK */ + + c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_avx; + c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx; +#if HAVE_ALIGNED_STACK + c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx; + c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx; +#endif /* HAVE_ALIGNED_STACK */ + } +#endif /* HAVE_AVX */ + } + } + } +#endif /* HAVE_YASM */ +} diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c deleted file mode 100644 index f24f751fb3..0000000000 --- a/libavcodec/x86/h264dsp_mmx.c +++ /dev/null @@ -1,385 +0,0 @@ -/* - * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavcodec/h264dsp.h" -#include "dsputil_mmx.h" - -/***********************************/ -/* IDCT */ -#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \ -void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \ - int16_t *block, \ - int stride); - -IDCT_ADD_FUNC(, 8, mmx) -IDCT_ADD_FUNC(, 10, sse2) -IDCT_ADD_FUNC(_dc, 8, mmx2) -IDCT_ADD_FUNC(_dc, 10, mmx2) -IDCT_ADD_FUNC(8_dc, 8, mmx2) -IDCT_ADD_FUNC(8_dc, 10, sse2) -IDCT_ADD_FUNC(8, 8, mmx) -IDCT_ADD_FUNC(8, 8, sse2) -IDCT_ADD_FUNC(8, 10, sse2) -#if HAVE_AVX -IDCT_ADD_FUNC(, 10, avx) -IDCT_ADD_FUNC(8_dc, 10, avx) -IDCT_ADD_FUNC(8, 10, avx) -#endif - - -#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \ -void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \ - (uint8_t *dst, const int *block_offset, \ - DCTELEM *block, int stride, const uint8_t nnzc[6 * 8]); - -IDCT_ADD_REP_FUNC(8, 4, 8, mmx) -IDCT_ADD_REP_FUNC(8, 4, 8, mmx2) -IDCT_ADD_REP_FUNC(8, 4, 8, sse2) -IDCT_ADD_REP_FUNC(8, 4, 10, sse2) -IDCT_ADD_REP_FUNC(8, 4, 10, avx) -IDCT_ADD_REP_FUNC(, 16, 8, mmx) -IDCT_ADD_REP_FUNC(, 16, 8, mmx2) -IDCT_ADD_REP_FUNC(, 16, 8, sse2) -IDCT_ADD_REP_FUNC(, 16, 10, sse2) -IDCT_ADD_REP_FUNC(, 16intra, 8, mmx) -IDCT_ADD_REP_FUNC(, 16intra, 8, mmx2) -IDCT_ADD_REP_FUNC(, 16intra, 8, sse2) -IDCT_ADD_REP_FUNC(, 16intra, 10, sse2) -#if HAVE_AVX -IDCT_ADD_REP_FUNC(, 16, 10, avx) -IDCT_ADD_REP_FUNC(, 16intra, 10, avx) -#endif - - -#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \ -void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \ - (uint8_t **dst, const int *block_offset, \ - DCTELEM *block, int stride, const uint8_t nnzc[6 * 8]); - -IDCT_ADD_REP_FUNC2(, 8, 8, mmx) -IDCT_ADD_REP_FUNC2(, 8, 8, mmx2) -IDCT_ADD_REP_FUNC2(, 8, 8, sse2) -IDCT_ADD_REP_FUNC2(, 8, 10, sse2) -#if HAVE_AVX -IDCT_ADD_REP_FUNC2(, 8, 10, avx) -#endif - -void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul); -void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul); - -/***********************************/ -/* deblocking */ - -void ff_h264_loop_filter_strength_mmx2(int16_t bS[2][4][4], uint8_t nnz[40], - int8_t ref[2][40], int16_t mv[2][40][2], - int bidir, int edges, int step, - int mask_mv0, int mask_mv1, int field); - -#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \ -void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \ - int stride, \ - int alpha, \ - int beta, \ - int8_t *tc0); -#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \ -void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \ - int stride, \ - int alpha, \ - int beta); - -#define LF_FUNCS(type, depth) \ -LF_FUNC(h, chroma, depth, mmx2) \ -LF_IFUNC(h, chroma_intra, depth, mmx2) \ -LF_FUNC(v, chroma, depth, mmx2) \ -LF_IFUNC(v, chroma_intra, depth, mmx2) \ -LF_FUNC(h, luma, depth, mmx2) \ -LF_IFUNC(h, luma_intra, depth, mmx2) \ -LF_FUNC(h, luma, depth, sse2) \ -LF_IFUNC(h, luma_intra, depth, sse2) \ -LF_FUNC(v, luma, depth, sse2) \ -LF_IFUNC(v, luma_intra, depth, sse2) \ -LF_FUNC(h, chroma, depth, sse2) \ -LF_IFUNC(h, chroma_intra, depth, sse2) \ -LF_FUNC(v, chroma, depth, sse2) \ -LF_IFUNC(v, chroma_intra, depth, sse2) \ -LF_FUNC(h, luma, depth, avx) \ -LF_IFUNC(h, luma_intra, depth, avx) \ -LF_FUNC(v, luma, depth, avx) \ -LF_IFUNC(v, luma_intra, depth, avx) \ -LF_FUNC(h, chroma, depth, avx) \ -LF_IFUNC(h, chroma_intra, depth, avx) \ -LF_FUNC(v, chroma, depth, avx) \ -LF_IFUNC(v, chroma_intra, depth, avx) - -LF_FUNCS(uint8_t, 8) -LF_FUNCS(uint16_t, 10) - -#if ARCH_X86_32 -LF_FUNC(v8, luma, 8, mmx2) -static void ff_deblock_v_luma_8_mmx2(uint8_t *pix, int stride, int alpha, - int beta, int8_t *tc0) -{ - if ((tc0[0] & tc0[1]) >= 0) - ff_deblock_v8_luma_8_mmx2(pix + 0, stride, alpha, beta, tc0); - if ((tc0[2] & tc0[3]) >= 0) - ff_deblock_v8_luma_8_mmx2(pix + 8, stride, alpha, beta, tc0 + 2); -} - -LF_IFUNC(v8, luma_intra, 8, mmx2) -static void ff_deblock_v_luma_intra_8_mmx2(uint8_t *pix, int stride, - int alpha, int beta) -{ - ff_deblock_v8_luma_intra_8_mmx2(pix + 0, stride, alpha, beta); - ff_deblock_v8_luma_intra_8_mmx2(pix + 8, stride, alpha, beta); -} -#endif /* ARCH_X86_32 */ - -LF_FUNC(v, luma, 10, mmx2) -LF_IFUNC(v, luma_intra, 10, mmx2) - -/***********************************/ -/* weighted prediction */ - -#define H264_WEIGHT(W, OPT) \ -void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, int stride, \ - int height, int log2_denom, \ - int weight, int offset); - -#define H264_BIWEIGHT(W, OPT) \ -void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src, \ - int stride, int height, \ - int log2_denom, int weightd, \ - int weights, int offset); - -#define H264_BIWEIGHT_MMX(W) \ - H264_WEIGHT(W, mmx2) \ - H264_BIWEIGHT(W, mmx2) - -#define H264_BIWEIGHT_MMX_SSE(W) \ - H264_BIWEIGHT_MMX(W) \ - H264_WEIGHT(W, sse2) \ - H264_BIWEIGHT(W, sse2) \ - H264_BIWEIGHT(W, ssse3) - -H264_BIWEIGHT_MMX_SSE(16) -H264_BIWEIGHT_MMX_SSE(8) -H264_BIWEIGHT_MMX(4) - -#define H264_WEIGHT_10(W, DEPTH, OPT) \ -void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \ - int stride, \ - int height, \ - int log2_denom, \ - int weight, \ - int offset); - -#define H264_BIWEIGHT_10(W, DEPTH, OPT) \ -void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \ - uint8_t *src, \ - int stride, \ - int height, \ - int log2_denom, \ - int weightd, \ - int weights, \ - int offset); - -#define H264_BIWEIGHT_10_SSE(W, DEPTH) \ - H264_WEIGHT_10(W, DEPTH, sse2) \ - H264_WEIGHT_10(W, DEPTH, sse4) \ - H264_BIWEIGHT_10(W, DEPTH, sse2) \ - H264_BIWEIGHT_10(W, DEPTH, sse4) - -H264_BIWEIGHT_10_SSE(16, 10) -H264_BIWEIGHT_10_SSE(8, 10) -H264_BIWEIGHT_10_SSE(4, 10) - -void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, - const int chroma_format_idc) -{ -#if HAVE_YASM - int mm_flags = av_get_cpu_flags(); - - if (chroma_format_idc == 1 && mm_flags & AV_CPU_FLAG_MMXEXT) - c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmx2; - - if (bit_depth == 8) { - if (mm_flags & AV_CPU_FLAG_MMX) { - c->h264_idct_dc_add = - c->h264_idct_add = ff_h264_idct_add_8_mmx; - c->h264_idct8_dc_add = - c->h264_idct8_add = ff_h264_idct8_add_8_mmx; - - c->h264_idct_add16 = ff_h264_idct_add16_8_mmx; - c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx; - if (chroma_format_idc == 1) - c->h264_idct_add8 = ff_h264_idct_add8_8_mmx; - c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx; - if (mm_flags & AV_CPU_FLAG_CMOV) - c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx; - - if (mm_flags & AV_CPU_FLAG_MMXEXT) { - c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmx2; - c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmx2; - c->h264_idct_add16 = ff_h264_idct_add16_8_mmx2; - c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx2; - if (chroma_format_idc == 1) - c->h264_idct_add8 = ff_h264_idct_add8_8_mmx2; - c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx2; - - c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_mmx2; - c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmx2; - if (chroma_format_idc == 1) { - c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmx2; - c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmx2; - } -#if ARCH_X86_32 - c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_mmx2; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmx2; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmx2; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmx2; -#endif /* ARCH_X86_32 */ - c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmx2; - c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmx2; - c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmx2; - - c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmx2; - c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmx2; - c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmx2; - - if (mm_flags & AV_CPU_FLAG_SSE2) { - c->h264_idct8_add = ff_h264_idct8_add_8_sse2; - - c->h264_idct_add16 = ff_h264_idct_add16_8_sse2; - c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2; - if (chroma_format_idc == 1) - c->h264_idct_add8 = ff_h264_idct_add8_8_sse2; - c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2; - c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2; - - c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2; - c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2; - - c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2; - c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2; - -#if HAVE_ALIGNED_STACK - c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2; -#endif /* HAVE_ALIGNED_STACK */ - } - if (mm_flags & AV_CPU_FLAG_SSSE3) { - c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3; - c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3; - } - if (mm_flags & AV_CPU_FLAG_AVX) { -#if HAVE_ALIGNED_STACK - c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx; -#endif /* HAVE_ALIGNED_STACK */ - } - } - } - } else if (bit_depth == 10) { - if (mm_flags & AV_CPU_FLAG_MMX) { - if (mm_flags & AV_CPU_FLAG_MMXEXT) { -#if ARCH_X86_32 - c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmx2; - c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmx2; - c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmx2; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmx2; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmx2; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmx2; -#endif /* ARCH_X86_32 */ - c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmx2; - if (mm_flags & AV_CPU_FLAG_SSE2) { - c->h264_idct_add = ff_h264_idct_add_10_sse2; - c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2; - - c->h264_idct_add16 = ff_h264_idct_add16_10_sse2; - if (chroma_format_idc == 1) - c->h264_idct_add8 = ff_h264_idct_add8_10_sse2; - c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2; -#if HAVE_ALIGNED_STACK - c->h264_idct8_add = ff_h264_idct8_add_10_sse2; - c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2; -#endif /* HAVE_ALIGNED_STACK */ - - c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2; - c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2; - c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2; - - c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2; - c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2; - c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2; - - c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_sse2; - c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2; -#if HAVE_ALIGNED_STACK - c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2; -#endif /* HAVE_ALIGNED_STACK */ - } - if (mm_flags & AV_CPU_FLAG_SSE4) { - c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4; - c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4; - c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4; - - c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4; - c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4; - c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4; - } -#if HAVE_AVX - if (mm_flags & AV_CPU_FLAG_AVX) { - c->h264_idct_dc_add = - c->h264_idct_add = ff_h264_idct_add_10_avx; - c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx; - - c->h264_idct_add16 = ff_h264_idct_add16_10_avx; - if (chroma_format_idc == 1) - c->h264_idct_add8 = ff_h264_idct_add8_10_avx; - c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx; -#if HAVE_ALIGNED_STACK - c->h264_idct8_add = ff_h264_idct8_add_10_avx; - c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx; -#endif /* HAVE_ALIGNED_STACK */ - - c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_avx; - c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx; -#if HAVE_ALIGNED_STACK - c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx; - c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx; -#endif /* HAVE_ALIGNED_STACK */ - } -#endif /* HAVE_AVX */ - } - } - } -#endif /* HAVE_YASM */ -} -- cgit v1.2.3