From 88174d2d1720fc8341ee21f0967e7aa726a2c3e8 Mon Sep 17 00:00:00 2001
From: XhmikosR
Date: Wed, 15 Sep 2010 15:46:24 +0000
Subject: merge changes from trunk r2365-r2561

git-svn-id: https://mpc-hc.svn.sourceforge.net/svnroot/mpc-hc/branches/legacy@2562 10f7b99b-c216-0410-bff0-8a66a9350fd8
---
 .../MPCVideoDec/ffmpeg/libavcodec/ac3dec.c         |    4 +-
 .../MPCVideoDec/ffmpeg/libavcodec/adpcm.c          |    4 +-
 .../MPCVideoDec/ffmpeg/libavcodec/audioconvert.h   |    8 +-
 .../MPCVideoDec/ffmpeg/libavcodec/avcodec.h        |  173 +-
 .../MPCVideoDec/ffmpeg/libavcodec/dsputil.c        |   59 +-
 .../MPCVideoDec/ffmpeg/libavcodec/dsputil.h        |   56 +-
 .../MPCVideoDec/ffmpeg/libavcodec/flvdec.c         |    2 +-
 .../transform/MPCVideoDec/ffmpeg/libavcodec/h263.h |    2 +-
 .../MPCVideoDec/ffmpeg/libavcodec/h263dec.c        |    7 +-
 .../transform/MPCVideoDec/ffmpeg/libavcodec/h264.c |   15 +-
 .../transform/MPCVideoDec/ffmpeg/libavcodec/h264.h |   12 +-
 .../MPCVideoDec/ffmpeg/libavcodec/h264_ps.c        |    2 +-
 .../MPCVideoDec/ffmpeg/libavcodec/imgconvert.c     |   53 +-
 .../MPCVideoDec/ffmpeg/libavcodec/libamr.c         |    4 +-
 .../MPCVideoDec/ffmpeg/libavcodec/mjpegdec.c       |    6 +-
 .../MPCVideoDec/ffmpeg/libavcodec/mjpegdec.h       |    2 +-
 .../MPCVideoDec/ffmpeg/libavcodec/mlpdec.c         |    4 +-
 .../MPCVideoDec/ffmpeg/libavcodec/mpeg12.c         |    4 +-
 .../MPCVideoDec/ffmpeg/libavcodec/nellymoserdec.c  |    4 +-
 .../transform/MPCVideoDec/ffmpeg/libavcodec/rv10.c |    6 +-
 .../transform/MPCVideoDec/ffmpeg/libavcodec/rv34.c |    4 +-
 .../transform/MPCVideoDec/ffmpeg/libavcodec/rv34.h |    2 +-
 .../transform/MPCVideoDec/ffmpeg/libavcodec/rv40.c |    2 +-
 .../MPCVideoDec/ffmpeg/libavcodec/sp5xdec.c        |  106 +-
 .../MPCVideoDec/ffmpeg/libavcodec/svq1dec.c        |    4 +-
 .../transform/MPCVideoDec/ffmpeg/libavcodec/svq3.c |    4 +-
 .../MPCVideoDec/ffmpeg/libavcodec/utils.c          |   66 +-
 .../MPCVideoDec/ffmpeg/libavcodec/vc1dec.c         |    4 +-
 .../MPCVideoDec/ffmpeg/libavcodec/vc1dsp.c         |    4 +-
 .../transform/MPCVideoDec/ffmpeg/libavcodec/vp3.c  |   27 +-
 .../transform/MPCVideoDec/ffmpeg/libavcodec/vp56.c |    7 +-
 .../transform/MPCVideoDec/ffmpeg/libavcodec/vp56.h |    2 +-
 .../MPCVideoDec/ffmpeg/libavcodec/vp56dsp.c        |    2 +-
 .../transform/MPCVideoDec/ffmpeg/libavcodec/vp8.c  |    8 +-
 .../MPCVideoDec/ffmpeg/libavcodec/x86/config.asm   |    1 +
 .../MPCVideoDec/ffmpeg/libavcodec/x86/cpuid.c      |  142 --
 .../libavcodec/x86/dsputil_h264_template_mmx.c     |  304 ---
 .../libavcodec/x86/dsputil_h264_template_ssse3.c   |  208 --
 .../ffmpeg/libavcodec/x86/dsputil_mmx.c            |  342 ++-
 .../ffmpeg/libavcodec/x86/dsputil_mmx.h            |   37 +-
 .../MPCVideoDec/ffmpeg/libavcodec/x86/fft.c        |   13 +-
 .../MPCVideoDec/ffmpeg/libavcodec/x86/fft_mmx.asm  |   45 +-
 .../ffmpeg/libavcodec/x86/h264_chromamc.asm        |  671 ++++++
 .../ffmpeg/libavcodec/x86/h264_deblock.asm         |  889 +++++++
 .../ffmpeg/libavcodec/x86/h264_deblock_sse2.asm    |  761 ------
 .../ffmpeg/libavcodec/x86/h264_idct.asm            |  865 +++++++
 .../ffmpeg/libavcodec/x86/h264_idct_sse2.asm       |   54 -
 .../ffmpeg/libavcodec/x86/h264_intrapred_init.c    |  103 +
 .../ffmpeg/libavcodec/x86/h264_qpel_mmx.c          | 1209 ++++++++++
 .../ffmpeg/libavcodec/x86/h264_weight.asm          |  375 +++
 .../ffmpeg/libavcodec/x86/h264_weight_sse2.asm     |  170 --
 .../ffmpeg/libavcodec/x86/h264dsp_mmx.c            | 2422 ++------------------
 .../ffmpeg/libavcodec/x86/idct_sse2_xvid.c         |    4 +-
 .../ffmpeg/libavcodec/x86/mpegvideo_mmx.c          |   13 +-
 .../ffmpeg/libavcodec/x86/simple_idct_mmx.c        |    4 +-
 .../MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_mmx.c |   15 +-
 .../ffmpeg/libavcodec/x86/vc1dsp_yasm.asm          |    6 +-
 .../MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp.asm   |  618 +++++
 .../MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_mmx.c |  436 ----
 .../MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_mmx.h |   36 -
 .../ffmpeg/libavcodec/x86/vp3dsp_sse2.c            |  187 --
 .../ffmpeg/libavcodec/x86/vp3dsp_sse2.h            |   31 -
 .../MPCVideoDec/ffmpeg/libavcodec/x86/vp56dsp.asm  |   10 +-
 .../ffmpeg/libavcodec/x86/vp56dsp_init.c           |    7 +-
 .../ffmpeg/libavcodec/x86/vp8dsp-init.c            |   17 +-
 .../MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp.asm   |   26 +-
 66 files changed, 5498 insertions(+), 5192 deletions(-)
 create mode 100644 src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/config.asm
 delete mode 100644 src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/cpuid.c
 delete mode 100644 src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_h264_template_mmx.c
 delete mode 100644 src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_h264_template_ssse3.c
 create mode 100644 src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_chromamc.asm
 create mode 100644 src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock.asm
 delete mode 100644 src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock_sse2.asm
 create mode 100644 src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_idct.asm
 delete mode 100644 src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_idct_sse2.asm
 create mode 100644 src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_intrapred_init.c
 create mode 100644 src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_qpel_mmx.c
 create mode 100644 src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_weight.asm
 delete mode 100644 src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_weight_sse2.asm
 create mode 100644 src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp.asm
 delete mode 100644 src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_mmx.c
 delete mode 100644 src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_mmx.h
 delete mode 100644 src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_sse2.c
 delete mode 100644 src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_sse2.h

(limited to 'src/filters/transform/MPCVideoDec/ffmpeg/libavcodec')

diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/ac3dec.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/ac3dec.c
index 126424440..5992715b8 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/ac3dec.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/ac3dec.c
@@ -1311,8 +1311,10 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
  * Decode a single AC-3 frame.
  */
 static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size,
-                            const uint8_t *buf, int buf_size)
+                            AVPacket *avpkt)
 {
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
     AC3DecodeContext *s = avctx->priv_data;
     int16_t *out_samples = (int16_t *)data;
     int blk, ch, err;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/adpcm.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/adpcm.c
index 6fda2f8bf..6ecd98e95 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/adpcm.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/adpcm.c
@@ -365,8 +365,10 @@ static void xa_decode(short *out, const unsigned char *in,
 static int adpcm_decode_frame(AVCodecContext *avctx,
                               void *data, int *data_size,
-                              const uint8_t *buf, int buf_size)
+                              AVPacket *avpkt)
 {
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
     ADPCMContext *c = avctx->priv_data;
     ADPCMChannelStatus *cs;
     int n, m, channel, i;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/audioconvert.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/audioconvert.h
index 81b6cded3..349065edc 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/audioconvert.h
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/audioconvert.h
@@ -29,6 +29,7 @@
  */
 
+#include "libavutil/cpu.h"
 #include "avcodec.h"
 
@@ -59,6 +60,11 @@ enum SampleFormat avcodec_get_sample_fmt(const char* name);
  */
 const char *avcodec_get_channel_name(int channel_id);
 
+/**
+ * @return channel layout that matches name, 0 if no match
+ */
+int64_t avcodec_get_channel_layout(const char *name);
+
 /**
  * Return description of channel layout
  */
@@ -88,7 +94,7 @@ typedef struct AVAudioConvert AVAudioConvert;
  * @param in_fmt Input sample format
  * @param in_channels Number of input channels
  * @param[in] matrix Channel mixing matrix (of dimension in_channel*out_channels). Set to NULL to ignore.
- * @param flags See FF_MM_xx
+ * @param flags See AV_CPU_FLAG_xx
  * @return NULL on error
  */
 AVAudioConvert *av_audio_convert_alloc(enum SampleFormat out_fmt, int out_channels,
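The ac3dec.c and adpcm.c hunks above show the pattern this merge applies to every decoder in the tree: the decode callback now receives an AVPacket instead of a raw buffer pointer and length, and unpacks the two fields into locals so the body stays unchanged. A minimal sketch of a decoder callback under the new signature (the codec itself is hypothetical; the field names are the ones used in the hunks):

    #include "avcodec.h"

    /* Hypothetical decoder callback using the new AVPacket-based
     * signature; avpkt->data/avpkt->size replace the old buf/buf_size pair. */
    static int toy_decode_frame(AVCodecContext *avctx, void *data,
                                int *data_size, AVPacket *avpkt)
    {
        const uint8_t *buf = avpkt->data;   /* coded input */
        int buf_size       = avpkt->size;   /* its length in bytes */

        if (buf_size <= 0) {
            *data_size = 0;                 /* no output produced */
            return 0;                       /* no input consumed */
        }
        /* ... parse buf and write decoded output into data ... */
        *data_size = 0;                     /* set to bytes of output */
        return buf_size;                    /* bytes of input consumed */
    }
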
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/avcodec.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/avcodec.h
index a841de108..c5f35eda2 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/avcodec.h
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/avcodec.h
@@ -42,10 +42,11 @@
 #include <errno.h>
 #include "libavutil/avutil.h"
+#include "libavutil/cpu.h"
 
 #define LIBAVCODEC_VERSION_MAJOR 52
-#define LIBAVCODEC_VERSION_MINOR 85
-#define LIBAVCODEC_VERSION_MICRO  1
+#define LIBAVCODEC_VERSION_MINOR 87
+#define LIBAVCODEC_VERSION_MICRO  5
 
 #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
                                                LIBAVCODEC_VERSION_MINOR, \
@@ -57,6 +58,17 @@
 
 #define LIBAVCODEC_IDENT        "Lavc" AV_STRINGIFY(LIBAVCODEC_VERSION)
 
+/**
+ * Those FF_API_* defines are not part of public API.
+ * They may change, break or disappear at any time.
+ */
+#ifndef FF_API_PALETTE_CONTROL
+#define FF_API_PALETTE_CONTROL  (LIBAVCODEC_VERSION_MAJOR < 54)
+#endif
+#ifndef FF_API_MM_FLAGS
+#define FF_API_MM_FLAGS         (LIBAVCODEC_VERSION_MAJOR < 53)
+#endif
+
 #define AV_NOPTS_VALUE          INT64_C(0x8000000000000000)
 #define AV_TIME_BASE            1000000
 static const AVRational AV_TIME_BASE_Q={1, AV_TIME_BASE};
@@ -1399,27 +1411,25 @@ typedef struct AVCodecContext {
      * result into program crash.)
      */
     unsigned dsp_mask;
-#define FF_MM_FORCE    0x80000000 /* Force usage of selected flags (OR) */
-    /* lower 16 bits - CPU features */
-#define FF_MM_MMX      0x0001 ///< standard MMX
-#define FF_MM_3DNOW    0x0004 ///< AMD 3DNOW
-#if LIBAVCODEC_VERSION_MAJOR < 53
-#define FF_MM_MMXEXT   0x0002 ///< SSE integer functions or AMD MMX ext
+
+#if FF_API_MM_FLAGS
+#define FF_MM_FORCE      AV_CPU_FLAG_FORCE
+#define FF_MM_MMX        AV_CPU_FLAG_MMX
+#define FF_MM_3DNOW      AV_CPU_FLAG_3DNOW
+#define FF_MM_MMXEXT     AV_CPU_FLAG_MMX2
+#define FF_MM_MMX2       AV_CPU_FLAG_MMX2
+#define FF_MM_SSE        AV_CPU_FLAG_SSE
+#define FF_MM_SSE2       AV_CPU_FLAG_SSE2
+#define FF_MM_SSE2SLOW   AV_CPU_FLAG_SSE2SLOW
+#define FF_MM_3DNOWEXT   AV_CPU_FLAG_3DNOWEXT
+#define FF_MM_SSE3       AV_CPU_FLAG_SSE3
+#define FF_MM_SSE3SLOW   AV_CPU_FLAG_SSE3SLOW
+#define FF_MM_SSSE3      AV_CPU_FLAG_SSSE3
+#define FF_MM_SSE4       AV_CPU_FLAG_SSE4
+#define FF_MM_SSE42      AV_CPU_FLAG_SSE42
+#define FF_MM_IWMMXT     AV_CPU_FLAG_IWMMXT
+#define FF_MM_ALTIVEC    AV_CPU_FLAG_ALTIVEC
 #endif
-#define FF_MM_MMX2     0x0002 ///< SSE integer functions or AMD MMX ext
-#define FF_MM_SSE      0x0008 ///< SSE functions
-#define FF_MM_SSE2     0x0010 ///< PIV SSE2 functions
-#define FF_MM_SSE2SLOW 0x40000000 ///< SSE2 supported, but usually not faster
-                                  ///< than regular MMX/SSE (e.g. Core1)
-#define FF_MM_3DNOWEXT 0x0020 ///< AMD 3DNowExt
-#define FF_MM_SSE3     0x0040 ///< Prescott SSE3 functions
-#define FF_MM_SSE3SLOW 0x20000000 ///< SSE3 supported, but usually not faster
-                                  ///< than regular MMX/SSE (e.g. Core1)
-#define FF_MM_SSSE3    0x0080 ///< Conroe SSSE3 functions
-#define FF_MM_SSE4     0x0100 ///< Penryn SSE4.1 functions
-#define FF_MM_SSE42    0x0200 ///< Nehalem SSE4.2 functions
-#define FF_MM_IWMMXT   0x0100 ///< XScale IWMMXT
-#define FF_MM_ALTIVEC  0x0001 ///< standard AltiVec
 
     /**
      * bits per sample/pixel from the demuxer (needed for huffyuv).
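The FF_MM_* bit values above become back-compat aliases for libavutil's AV_CPU_FLAG_* flags, and runtime detection moves from libavcodec's mm_support() to av_get_cpu_flags() (the h263dec.c hunk below shows one call site migrating). A sketch of the new-style runtime dispatch; the init function and its branches are made up for illustration:

    #include "libavutil/cpu.h"
    #include "dsputil.h"

    /* Sketch: pick optimized routines at runtime with the libavutil API
     * that replaces mm_support()/FF_MM_*. */
    static void toy_dsp_init(DSPContext *c)
    {
        int flags = av_get_cpu_flags();

        if (flags & AV_CPU_FLAG_MMX) {
            /* install MMX versions of the function pointers here */
        }
        if ((flags & AV_CPU_FLAG_SSE2) && !(flags & AV_CPU_FLAG_SSE2SLOW)) {
            /* prefer SSE2 only where it is actually faster (not Core1) */
        }
    }
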
@@ -1776,12 +1786,14 @@
      */
     int lmax;
 
+#if FF_API_PALETTE_CONTROL
     /**
      * palette control structure
      * - encoding: ??? (no palette-enabled encoder yet)
      * - decoding: Set by user.
      */
     struct AVPaletteControl *palctrl;
+#endif
 
     /**
      * noise reduction strength
@@ -2616,8 +2628,7 @@ typedef struct AVCodec {
     int (*init)(AVCodecContext *);
     int (*encode)(AVCodecContext *, uint8_t *buf, int buf_size, void *data);
     int (*close)(AVCodecContext *);
-    int (*decode)(AVCodecContext *, void *outdata, int *outdata_size,
-                  const uint8_t *buf, int buf_size);
+    int (*decode)(AVCodecContext *, void *outdata, int *outdata_size, AVPacket *avpkt);
     /**
      * Codec capabilities.
     * see CODEC_CAP_*
@@ -2834,59 +2845,91 @@ void avcodec_get_encoder_info(AVCodecContext *avctx,int *xvid_build,int *divx_ve
  */
 FF_EXPORT int avcodec_open(AVCodecContext *avctx, AVCodec *codec);
 
+#if LIBAVCODEC_VERSION_MAJOR < 53
 /**
- * @deprecated Use avcodec_decode_audio2() instead.
+ * Decode an audio frame from buf into samples.
+ * Wrapper function which calls avcodec_decode_audio3.
+ *
+ * @deprecated Use avcodec_decode_audio3 instead.
+ * @param avctx the codec context
+ * @param[out] samples the output buffer
+ * @param[in,out] frame_size_ptr the output buffer size in bytes
+ * @param[in] buf the input buffer
+ * @param[in] buf_size the input buffer size in bytes
+ * @return On error a negative value is returned, otherwise the number of bytes
+ * used or zero if no frame could be decompressed.
  */
-attribute_deprecated int avcodec_decode_audio(AVCodecContext *avctx, int16_t *samples,
+FF_EXPORT int avcodec_decode_audio2(AVCodecContext *avctx, int16_t *samples,
                          int *frame_size_ptr,
                          const uint8_t *buf, int buf_size);
+#endif
 
 /**
- * Decodes an audio frame from buf into samples.
- * The avcodec_decode_audio2() function decodes an audio frame from the input
- * buffer buf of size buf_size. To decode it, it makes use of the
- * audio codec which was coupled with avctx using avcodec_open(). The
- * resulting decoded frame is stored in output buffer samples. If no frame
- * could be decompressed, frame_size_ptr is zero. Otherwise, it is the
+ * Decode the audio frame of size avpkt->size from avpkt->data into samples.
+ * Some decoders may support multiple frames in a single AVPacket, such
+ * decoders would then just decode the first frame. In this case,
+ * avcodec_decode_audio3 has to be called again with an AVPacket that contains
+ * the remaining data in order to decode the second frame etc.
+ * If no frame
+ * could be outputted, frame_size_ptr is zero. Otherwise, it is the
  * decompressed frame size in bytes.
 *
 * @warning You must set frame_size_ptr to the allocated size of the
- * output buffer before calling avcodec_decode_audio2().
+ * output buffer before calling avcodec_decode_audio3().
 *
 * @warning The input buffer must be FF_INPUT_BUFFER_PADDING_SIZE larger than
 * the actual read bytes because some optimized bitstream readers read 32 or 64
 * bits at once and could read over the end.
 *
- * @warning The end of the input buffer buf should be set to 0 to ensure that
+ * @warning The end of the input buffer avpkt->data should be set to 0 to ensure that
 * no overreading happens for damaged MPEG streams.
 *
- * @note You might have to align the input buffer buf and output buffer
+ * @note You might have to align the input buffer avpkt->data and output buffer
 * samples. The alignment requirements depend on the CPU: On some CPUs it isn't
 * necessary at all, on others it won't work at all if not aligned and on others
- * it will work but it will have an impact on performance. In practice, the
- * bitstream should have 4 byte alignment at minimum and all sample data should
- * be 16 byte aligned unless the CPU doesn't need it (AltiVec and SSE do). If
- * the linesize is not a multiple of 16 then there's no sense in aligning the
- * start of the buffer to 16.
+ * it will work but it will have an impact on performance.
+ *
+ * In practice, avpkt->data should have 4 byte alignment at minimum and
+ * samples should be 16 byte aligned unless the CPU doesn't need it
+ * (AltiVec and SSE do).
 *
 * @param avctx the codec context
- * @param[out] samples the output buffer
+ * @param[out] samples the output buffer, sample type in avctx->sample_fmt
 * @param[in,out] frame_size_ptr the output buffer size in bytes
+ * @param[in] avpkt The input AVPacket containing the input buffer.
+ *            You can create such packet with av_init_packet() and by then setting
+ *            data and size, some decoders might in addition need other fields.
+ *            All decoders are designed to use the least fields possible though.
+ * @return On error a negative value is returned, otherwise the number of bytes
+ * used or zero if no frame data was decompressed (used) from the input AVPacket.
+ */
+int avcodec_decode_audio3(AVCodecContext *avctx, int16_t *samples,
+                         int *frame_size_ptr,
+                         AVPacket *avpkt);
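A sketch of the calling pattern the avcodec_decode_audio3() documentation above implies for packets that carry more than one frame: decode, step past the consumed bytes, repeat. The helper name is hypothetical; the buffer-size convention is the one from the docs:

    #include "avcodec.h"

    /* Sketch: draining a packet that may hold several audio frames. */
    static void drain_audio_packet(AVCodecContext *avctx, const AVPacket *input,
                                   int16_t *samples /* AVCODEC_MAX_AUDIO_FRAME_SIZE bytes */)
    {
        AVPacket pkt = *input;      /* shallow working copy */

        while (pkt.size > 0) {
            int out_size = AVCODEC_MAX_AUDIO_FRAME_SIZE;  /* allocated size in */
            int used = avcodec_decode_audio3(avctx, samples, &out_size, &pkt);
            if (used < 0)
                break;              /* decode error: stop draining */
            /* out_size is now the decoded frame size in bytes (0 = no frame) */
            pkt.data += used;       /* advance past the decoded frame */
            pkt.size -= used;
        }
    }
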
+
+#if LIBAVCODEC_VERSION_MAJOR < 53
+/**
+ * Decode a video frame from buf into picture.
+ * Wrapper function which calls avcodec_decode_video2.
+ *
+ * @deprecated Use avcodec_decode_video2 instead.
+ * @param avctx the codec context
+ * @param[out] picture The AVFrame in which the decoded video frame will be stored.
  * @param[in] buf the input buffer
- * @param[in] buf_size the input buffer size in bytes
+ * @param[in] buf_size the size of the input buffer in bytes
+ * @param[in,out] got_picture_ptr Zero if no frame could be decompressed, otherwise, it is nonzero.
 * @return On error a negative value is returned, otherwise the number of bytes
 * used or zero if no frame could be decompressed.
 */
-FF_EXPORT int avcodec_decode_audio2(AVCodecContext *avctx, int16_t *samples,
-                         int *frame_size_ptr,
+FF_EXPORT int avcodec_decode_video(AVCodecContext *avctx, AVFrame *picture,
+                         int *got_picture_ptr,
                          const uint8_t *buf, int buf_size);
+#endif
 
 /**
- * Decodes a video frame from buf into picture.
- * The avcodec_decode_video() function decodes a video frame from the input
- * buffer buf of size buf_size. To decode it, it makes use of the
- * video codec which was coupled with avctx using avcodec_open(). The
- * resulting decoded frame is stored in picture.
+ * Decode the video frame of size avpkt->size from avpkt->data into picture.
+ * Some decoders may support multiple frames in a single AVPacket, such
+ * decoders would then just decode the first frame.
 *
 * @warning The input buffer must be FF_INPUT_BUFFER_PADDING_SIZE larger than
 * the actual read bytes because some optimized bitstream readers read 32 or 64
@@ -2895,29 +2938,37 @@ FF_EXPORT int avcodec_decode_audio2(AVCodecContext *avctx, int16_t *samples,
 * @warning The end of the input buffer buf should be set to 0 to ensure that
 * no overreading happens for damaged MPEG streams.
 *
- * @note You might have to align the input buffer buf and output buffer
- * samples. The alignment requirements depend on the CPU: on some CPUs it isn't
+ * @note You might have to align the input buffer avpkt->data.
+ * The alignment requirements depend on the CPU: on some CPUs it isn't
 * necessary at all, on others it won't work at all if not aligned and on others
- * it will work but it will have an impact on performance. In practice, the
- * bitstream should have 4 byte alignment at minimum and all sample data should
- * be 16 byte aligned unless the CPU doesn't need it (AltiVec and SSE do). If
- * the linesize is not a multiple of 16 then there's no sense in aligning the
- * start of the buffer to 16.
+ * it will work but it will have an impact on performance.
+ *
+ * In practice, avpkt->data should have 4 byte alignment at minimum.
 *
 * @note Some codecs have a delay between input and output, these need to be
- * feeded with buf=NULL, buf_size=0 at the end to return the remaining frames.
+ * fed with avpkt->data=NULL, avpkt->size=0 at the end to return the remaining frames.
 *
 * @param avctx the codec context
 * @param[out] picture The AVFrame in which the decoded video frame will be stored.
- * @param[in] buf the input buffer
- * @param[in] buf_size the size of the input buffer in bytes
+ *             Use avcodec_alloc_frame to get an AVFrame, the codec will
+ *             allocate memory for the actual bitmap.
+ *             with default get/release_buffer(), the decoder frees/reuses the bitmap as it sees fit.
+ *             with overridden get/release_buffer() (needs CODEC_CAP_DR1) the user decides into what buffer the decoder
+ *             decodes and the decoder tells the user once it does not need the data anymore,
+ *             the user app can at this point free/reuse/keep the memory as it sees fit.
+ *
+ * @param[in] avpkt The input AVpacket containing the input buffer.
+ *            You can create such packet with av_init_packet() and by then setting
+ *            data and size, some decoders might in addition need other fields like
+ *            flags&AV_PKT_FLAG_KEY. All decoders are designed to use the least
+ *            fields possible.
 * @param[in,out] got_picture_ptr Zero if no frame could be decompressed, otherwise, it is nonzero.
 * @return On error a negative value is returned, otherwise the number of bytes
 * used or zero if no frame could be decompressed.
 */
-FF_EXPORT int avcodec_decode_video(AVCodecContext *avctx, AVFrame *picture,
+int avcodec_decode_video2(AVCodecContext *avctx, AVFrame *picture,
                          int *got_picture_ptr,
-                         const uint8_t *buf, int buf_size);
+                         AVPacket *avpkt);
 
 int avcodec_parse_frame(AVCodecContext *avctx,
                         uint8_t **pdata,
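To make the new entry point concrete, here is a sketch of feeding one coded video buffer to avcodec_decode_video2(); the helper name is hypothetical, and the packet fields are the ones documented above (the buffer is assumed to carry FF_INPUT_BUFFER_PADDING_SIZE of zeroed tail):

    #include "avcodec.h"

    /* Sketch: decode one coded buffer with the AVPacket-based API. */
    static int decode_one(AVCodecContext *avctx, AVFrame *picture,
                          uint8_t *buf, int buf_size)
    {
        AVPacket avpkt;
        int got_picture = 0;

        av_init_packet(&avpkt);   /* sane defaults for non-buffer fields */
        avpkt.data = buf;
        avpkt.size = buf_size;

        if (avcodec_decode_video2(avctx, picture, &got_picture, &avpkt) < 0)
            return -1;            /* decode error */
        return got_picture;       /* nonzero once a frame is available */
    }
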
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/dsputil.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/dsputil.c
index e4a4a7ad6..3d6b46d75 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/dsputil.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/dsputil.c
@@ -27,6 +27,7 @@
  * DSP utils
  */
 
+#include "libavcore/imgutils.h"
 #include "avcodec.h"
 #include "dsputil.h"
 #include "simple_idct.h"
@@ -121,6 +122,9 @@ void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_s
         int j;
         j = src_scantable[i];
         st->permutated[i] = permutation[j];
+#if ARCH_PPC
+        st->inverse[j] = i;
+#endif
     }
 
     end=-1;
@@ -1158,7 +1162,7 @@ CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
-CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
+av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
@@ -1172,6 +1176,9 @@ PIXOP2(put, op_put)
 #undef op_avg
 #undef op_put
 
+#define put_no_rnd_pixels8_c  put_pixels8_c
+#define put_no_rnd_pixels16_c put_pixels16_c
+
 #define avg2(a,b) ((a+b+1)>>1)
 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 
@@ -1754,10 +1761,6 @@ static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dst
     }\
 }\
 \
-static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## pixels8_c(dst, src, stride, 8);\
-}\
-\
 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
     uint8_t half[64];\
     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
@@ -1936,9 +1939,6 @@ static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
 }\
-static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## pixels16_c(dst, src, stride, 16);\
-}\
 \
 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
     uint8_t half[256];\
@@ -2133,6 +2133,13 @@ QPEL_MC(0, avg_ , _       , op_avg)
 #undef op_put
 #undef op_put_no_rnd
 
+#define put_qpel8_mc00_c  ff_put_pixels8x8_c
+#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
+#define put_qpel16_mc00_c ff_put_pixels16x16_c
+#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
+#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
+#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
+
 #if 1
 #define H264_LOWPASS(OPNAME, OP, OP2) \
 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
@@ -2399,7 +2406,7 @@ static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t
 }\
 
 #define H264_MC(OPNAME, SIZE) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
+static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
     OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
 }\
 \
@@ -2557,6 +2564,11 @@ H264_MC(avg_, 16)
 #undef op2_put
 #endif
 
+#define put_h264_qpel8_mc00_c  ff_put_pixels8x8_c
+#define avg_h264_qpel8_mc00_c  ff_avg_pixels8x8_c
+#define put_h264_qpel16_mc00_c ff_put_pixels16x16_c
+#define avg_h264_qpel16_mc00_c ff_avg_pixels16x16_c
+
 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
     int i;
@@ -2575,31 +2587,18 @@ static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int
     }
 }
 
-#if CONFIG_CAVS_DECODER
-/* AVS specific */
-void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
+void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
     put_pixels8_c(dst, src, stride, 8);
 }
-void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
+void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
     avg_pixels8_c(dst, src, stride, 8);
 }
-void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
+void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
     put_pixels16_c(dst, src, stride, 16);
 }
-void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
+void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
     avg_pixels16_c(dst, src, stride, 16);
 }
-#endif /* CONFIG_CAVS_DECODER */
-
-#if CONFIG_VC1_DECODER
-/* VC-1 specific */
-void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
-    put_pixels8_c(dst, src, stride, 8);
-}
-void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
-    avg_pixels8_c(dst, src, stride, 8);
-}
-#endif /* CONFIG_VC1_DECODER */
 
 #if CONFIG_RV40_DECODER
 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
@@ -2645,10 +2644,6 @@ static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int
     }
 }
 
-static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
-    put_pixels8_c(dst, src, stride, 8);
-}
-
 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
     uint8_t half[64];
     wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
@@ -4352,7 +4347,7 @@ av_cold void attribute_align_arg dsputil_init(DSPContext* c, AVCodecContext *avc
         c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
 #endif
 
-    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
+    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
@@ -4456,7 +4451,7 @@ av_cold void attribute_align_arg dsputil_init(DSPContext* c, AVCodecContext *avc
         c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
 #endif
 
-    c->shrink[0]= ff_img_copy_plane;
+    c->shrink[0]= av_image_copy_plane;
     c->shrink[1]= ff_shrink22;
     c->shrink[2]= ff_shrink44;
     c->shrink[3]= ff_shrink88;
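The removed qpel8/qpel16 _mc00 variants, the CAVS and VC-1 mc00 wrappers, and put_mspel8_mc00_c were all identical block copies (a zero subpel offset means no filtering), which is why the hunks above collapse them onto the shared ff_put_pixels8x8_c/ff_put_pixels16x16_c helpers and alias the mc00 names to them with macros. A plain-C sketch of what such a copy amounts to (not the dsputil implementation, which routes through put_pixels8_c):

    #include <stdint.h>
    #include <string.h>

    /* Sketch: an 8x8 block copy, the whole job of an mc00 function. */
    static void copy_block8x8(uint8_t *dst, const uint8_t *src, int stride)
    {
        int i;
        for (i = 0; i < 8; i++) {
            memcpy(dst, src, 8);   /* one 8-pixel row */
            dst += stride;
            src += stride;
        }
    }
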
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/dsputil.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/dsputil.h
index cfd1b7f33..8c1499165 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/dsputil.h
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/dsputil.h
@@ -82,6 +82,11 @@ extern const uint8_t ff_zigzag248_direct[64];
 extern uint32_t ff_squareTbl[512];
 extern uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP];
 
+void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride);
+void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride);
+void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride);
+void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride);
+
 /* VP3 DSP functions */
 void ff_vp3_idct_c(DCTELEM *block/* align 16*/);
 void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
@@ -91,22 +96,15 @@ void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM
 void ff_vp3_v_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
 void ff_vp3_h_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
 
-/* VP6 DSP functions */
-void ff_vp6_filter_diag4_c(uint8_t *dst, uint8_t *src, int stride,
-                           const int16_t *h_weights, const int16_t *v_weights);
-
-/* CAVS functions */
-void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride);
-void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride);
-void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride);
-void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride);
-
-/* VC1 functions */
-void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd);
-void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd);
-
 /* 1/2^n downscaling functions from imgconvert.c */
+#if LIBAVCODEC_VERSION_MAJOR < 53
+/**
+ * @deprecated Use av_image_copy_plane() instead.
+ */
+attribute_deprecated void ff_img_copy_plane(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
+#endif
+
 void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
 void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
 void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
@@ -181,6 +179,10 @@ typedef struct ScanTable{
     const uint8_t *scantable;
     uint8_t permutated[64];
     uint8_t raster_end[64];
+#if ARCH_PPC
+    /** Used by dct_quantize_altivec to find last-non-zero */
+    DECLARE_ALIGNED(16, uint8_t, inverse)[64];
+#endif
 } ScanTable;
 
 void ff_init_scantable(uint8_t *, ScanTable *st, const uint8_t *src_scantable);
@@ -598,11 +600,15 @@ static inline int get_penalty_factor(int lambda, int lambda2, int type){
  */
 #define emms_c()
 
-/* should be defined by architectures supporting
-   one or more MultiMedia extension */
-int mm_support(void);
-
+void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx);
+void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx);
+void dsputil_init_bfin(DSPContext* c, AVCodecContext *avctx);
+void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx);
+void dsputil_init_mmi(DSPContext* c, AVCodecContext *avctx);
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx);
+void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx);
+void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx);
+void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx);
 
 void ff_dsputil_init_dwt(DSPContext *c);
 void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
@@ -627,9 +633,19 @@ static inline void emms(void)
 
 #define emms_c() emms()
 
-#else
+#elif ARCH_ARM
+
+#if HAVE_NEON
+#   define STRIDE_ALIGN 16
+#endif
+
+#elif ARCH_PPC
+
+#define STRIDE_ALIGN 16
+
+#elif HAVE_MMI
 
-#define mm_support() 0
+#define STRIDE_ALIGN 16
 
 #endif
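For callers migrating off the deprecated ff_img_copy_plane(), the replacement named above takes the same argument list and lives in libavcore at this point in the tree. A sketch (wrapper name hypothetical; width is the byte width of a row):

    #include "libavcore/imgutils.h"

    /* Sketch: copy one plane with the API the deprecation note points to. */
    static void copy_luma(uint8_t *dst, int dst_linesize,
                          const uint8_t *src, int src_linesize,
                          int bytewidth, int height)
    {
        av_image_copy_plane(dst, dst_linesize, src, src_linesize,
                            bytewidth, height);
    }
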
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/flvdec.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/flvdec.c
index 32f595a27..8f05945cb 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/flvdec.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/flvdec.c
@@ -83,7 +83,7 @@ int ff_flv_decode_picture_header(MpegEncContext *s)
             width = height = 0;
             break;
     }
-    if(av_check_image_size(width, height, 0, s->avctx))
+    if(av_image_check_size(width, height, 0, s->avctx))
         return -1;
     s->width = width;
     s->height = height;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h263.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h263.h
index d8b7abe4f..6c05565c7 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h263.h
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h263.h
@@ -70,7 +70,7 @@ av_const int ff_h263_aspect_to_info(AVRational aspect);
 int ff_h263_decode_init(AVCodecContext *avctx);
 int ff_h263_decode_frame(AVCodecContext *avctx,
                          void *data, int *data_size,
-                         const uint8_t *buf, int buf_size);
+                         AVPacket *avpkt);
 int ff_h263_decode_end(AVCodecContext *avctx);
 void h263_encode_mb(MpegEncContext *s,
                     DCTELEM block[6][64],
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h263dec.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h263dec.c
index 503c3b6ba..92beb1bd3 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h263dec.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h263dec.c
@@ -25,6 +25,7 @@
  * H.263 decoder.
  */
 
+#include "libavutil/cpu.h"
 #include "internal.h"
 #include "avcodec.h"
 #include "dsputil.h"
@@ -321,8 +322,10 @@ static int decode_slice(MpegEncContext *s){
 
 int ff_h263_decode_frame(AVCodecContext *avctx,
                          void *data, int *data_size,
-                         const uint8_t *buf, int buf_size)
+                         AVPacket *avpkt)
 {
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
     MpegEncContext *s = avctx->priv_data;
     int ret;
     AVFrame *pict = data;
@@ -542,7 +545,7 @@ retry:
 #endif
 
 #if HAVE_MMX
-    if(s->codec_id == CODEC_ID_MPEG4 && s->xvid_build>=0 && avctx->idct_algo == FF_IDCT_AUTO && (mm_support() & FF_MM_MMX)){
+    if (s->codec_id == CODEC_ID_MPEG4 && s->xvid_build>=0 && avctx->idct_algo == FF_IDCT_AUTO && (av_get_cpu_flags() & AV_CPU_FLAG_MMX)) {
         avctx->idct_algo= FF_IDCT_XVIDMMX;
         avctx->coded_width= 0; // force reinit
 //        dsputil_init(&s->dsp, avctx);
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264.c
index bcc6e6b8f..7ab3e6311 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264.c
@@ -1250,14 +1250,9 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
                 if(is_h264){
-                    idct_add = h->h264dsp.h264_idct_add;
-                    idct_dc_add = h->h264dsp.h264_idct_dc_add;
-                    for(i=16; i<16+8; i++){
-                        if(h->non_zero_count_cache[ scan8[i] ])
-                            idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
-                        else if(h->mb[i*16])
-                            idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
-                    }
+                    h->h264dsp.h264_idct_add8(dest, block_offset,
+                                              h->mb, uvlinesize,
+                                              h->non_zero_count_cache);
                 }else{
                     for(i=16; i<16+8; i++){
                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
@@ -2891,8 +2886,10 @@ static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
 
 static int decode_frame(AVCodecContext *avctx,
                         void *data, int *data_size,
-                        const uint8_t *buf, int buf_size)
+                        AVPacket *avpkt)
 {
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
     H264Context *h = avctx->priv_data;
     MpegEncContext *s = &h->s;
     AVFrame *pict = data;
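The loop deleted from hl_decode_mb_internal() is effectively the specification of the new h->h264dsp.h264_idct_add8() entry point: for each 4x4 chroma block, run the full IDCT-and-add when the block has nonzero coefficients, or the cheaper DC-only add when only the DC term is set. Batching it behind one function pointer lets the x86 code dispatch the whole chroma pass at once. Restated from the removed lines (a fragment, using the same names as the hunk above):

    /* What h264_idct_add8 performs internally for the 8 chroma blocks. */
    for (i = 16; i < 16 + 8; i++) {
        uint8_t *dst = dest[(i & 4) >> 2] + block_offset[i];
        if (h->non_zero_count_cache[scan8[i]])
            h->h264dsp.h264_idct_add(dst, h->mb + i * 16, uvlinesize);
        else if (h->mb[i * 16])   /* only the DC coefficient is nonzero */
            h->h264dsp.h264_idct_dc_add(dst, h->mb + i * 16, uvlinesize);
    }
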
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264.h
index d87f9d01a..64db7072a 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264.h
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264.h
@@ -607,12 +607,12 @@ typedef struct H264Context{
     int sp_for_switch_flag;
     int slice_qs_delta;
     int slice_qp_delta;
-    unsigned int first_mb_in_slice;
-    int bit_offset_to_slice_data;
-    int raw_slice_type;
-    int64_t outputed_rtstart;
-    void* dxva_slice_long;
-    int ref_pic_flag;
+    unsigned int first_mb_in_slice;
+    int          bit_offset_to_slice_data;
+    int          raw_slice_type;
+    int64_t      outputed_rtstart;
+    void*        dxva_slice_long;
+    int          ref_pic_flag;
     // <== End patch MPC
 }H264Context;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264_ps.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264_ps.c
index 40b71ec24..96f99ab49 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264_ps.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264_ps.c
@@ -344,7 +344,7 @@ int ff_h264_decode_seq_parameter_set(H264Context *h){
     sps->mb_width = get_ue_golomb(&s->gb) + 1;
     sps->mb_height= get_ue_golomb(&s->gb) + 1;
     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
-       av_check_image_size(16*sps->mb_width, 16*sps->mb_height, 0, h->s.avctx)){
+       av_image_check_size(16*sps->mb_width, 16*sps->mb_height, 0, h->s.avctx)){
         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
         goto fail;
     }
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/imgconvert.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/imgconvert.c
index f6ea7a7f1..eb2cdc376 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/imgconvert.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/imgconvert.c
@@ -37,7 +37,7 @@
 #include "libavutil/pixdesc.h"
 #include "libavcore/imgutils.h"
 
-#if HAVE_MMX
+#if HAVE_MMX && HAVE_YASM
 #include "x86/dsputil_mmx.h"
 #endif
 
@@ -751,13 +751,13 @@ int ff_set_systematic_pal(uint32_t pal[256], enum PixelFormat pix_fmt){
 #if LIBAVCODEC_VERSION_MAJOR < 53
 int ff_fill_linesize(AVPicture *picture, enum PixelFormat pix_fmt, int width)
 {
-    return av_fill_image_linesizes(picture->linesize, pix_fmt, width);
+    return av_image_fill_linesizes(picture->linesize, pix_fmt, width);
 }
 
 int ff_fill_pointer(AVPicture *picture, uint8_t *ptr, enum PixelFormat pix_fmt,
                     int height)
 {
-    return av_fill_image_pointers(picture->data, pix_fmt, height, ptr, picture->linesize);
+    return av_image_fill_pointers(picture->data, pix_fmt, height, ptr, picture->linesize);
 }
 #endif
 
@@ -813,64 +813,33 @@ static int avg_bits_per_pixel(enum PixelFormat pix_fmt)
     return bits;
 }
 
+#if LIBAVCODEC_VERSION_MAJOR < 53
 void ff_img_copy_plane(uint8_t *dst, int dst_wrap,
                        const uint8_t *src, int src_wrap,
                        int width, int height)
 {
-    if((!dst) || (!src))
-        return;
-    for(;height > 0; height--) {
-        memcpy(dst, src, width);
-        dst += dst_wrap;
-        src += src_wrap;
-    }
+    av_image_copy_plane(dst, dst_wrap, src, src_wrap, width, height);
 }
 
-#if LIBAVCODEC_VERSION_MAJOR < 53
 int ff_get_plane_bytewidth(enum PixelFormat pix_fmt, int width, int plane)
 {
-    return av_get_image_linesize(pix_fmt, width, plane);
+    return av_image_get_linesize(pix_fmt, width, plane);
 }
-#endif
 
 void av_picture_data_copy(uint8_t *dst_data[4], int dst_linesize[4],
                           uint8_t *src_data[4], int src_linesize[4],
                           enum PixelFormat pix_fmt, int width, int height)
 {
-    int i;
-    const PixFmtInfo *pf = &pix_fmt_info[pix_fmt];
-    const AVPixFmtDescriptor *desc = &av_pix_fmt_descriptors[pix_fmt];
-
-    switch(pf->pixel_type) {
-    case FF_PIXEL_PACKED:
-    case FF_PIXEL_PLANAR:
-        for(i = 0; i < pf->nb_channels; i++) {
-            int h;
-            int bwidth = av_get_image_linesize(pix_fmt, width, i);
-            h = height;
-            if (i == 1 || i == 2) {
-                h= -((-height)>>desc->log2_chroma_h);
-            }
-            ff_img_copy_plane(dst_data[i], dst_linesize[i],
-                              src_data[i], src_linesize[i],
-                              bwidth, h);
-        }
-        break;
-    case FF_PIXEL_PALETTE:
-        ff_img_copy_plane(dst_data[0], dst_linesize[0],
-                          src_data[0], src_linesize[0],
-                          width, height);
-        /* copy the palette */
-        memcpy(dst_data[1], src_data[1], 4*256);
-        break;
-    }
+    av_image_copy(dst_data, dst_linesize, src_data, src_linesize,
+                  pix_fmt, width, height);
 }
+#endif
 
 void av_picture_copy(AVPicture *dst, const AVPicture *src,
                      enum PixelFormat pix_fmt, int width, int height)
 {
-    av_picture_data_copy(dst->data, dst->linesize, src->data,
-                         src->linesize, pix_fmt, width, height);
+    av_image_copy(dst->data, dst->linesize, src->data,
+                  src->linesize, pix_fmt, width, height);
 }
 
 /* 2x2 -> 1x1 */
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/libamr.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/libamr.c
index 695fc4ca7..1c0ba07a8 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/libamr.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/libamr.c
@@ -137,8 +137,10 @@ static av_cold int amr_nb_decode_close(AVCodecContext *avctx)
 }
 
 static int amr_nb_decode_frame(AVCodecContext * avctx, void *data,
-                               int *data_size, const uint8_t * buf, int buf_size)
+                               int *data_size, AVPacket *avpkt)
 {
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
     AMRContext *s = avctx->priv_data;
     const uint8_t *amrData = buf;
     static const uint8_t block_size[16] = { 12, 13, 15, 17, 19, 20, 26, 31, 5, 0, 0, 0, 0, 0, 0, 0 };
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mjpegdec.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mjpegdec.c
index fcb4f2011..8095eff9f 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mjpegdec.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mjpegdec.c
@@ -219,7 +219,7 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s)
     height= s->height;
 
     av_log(s->avctx, AV_LOG_DEBUG, "sof0: picture: %dx%d\n", width, height);
-    if(av_check_image_size(width, height, 0, s->avctx))
+    if(av_image_check_size(width, height, 0, s->avctx))
         return -1;
 
     nb_components = get_bits(&s->gb, 8);
@@ -1205,8 +1205,10 @@ found:
 int ff_mjpeg_decode_frame(AVCodecContext *avctx,
                           void *data, int *data_size,
-                          const uint8_t *buf, int buf_size)
+                          AVPacket *avpkt)
 {
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
     MJpegDecodeContext *s = avctx->priv_data;
     const uint8_t *buf_end, *buf_ptr;
     int start_code;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mjpegdec.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mjpegdec.h
index 5a9da5902..bbf734b56 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mjpegdec.h
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mjpegdec.h
@@ -111,7 +111,7 @@ int ff_mjpeg_decode_init(AVCodecContext *avctx);
 int ff_mjpeg_decode_end(AVCodecContext *avctx);
 int ff_mjpeg_decode_frame(AVCodecContext *avctx,
                           void *data, int *data_size,
-                          const uint8_t *buf, int buf_size);
+                          AVPacket *avpkt);
 int ff_mjpeg_decode_dqt(MJpegDecodeContext *s);
 int ff_mjpeg_decode_dht(MJpegDecodeContext *s);
 int ff_mjpeg_decode_sof(MJpegDecodeContext *s);
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mlpdec.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mlpdec.c
index e392d971b..46fc32891 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mlpdec.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mlpdec.c
@@ -943,8 +943,10 @@ static int output_data(MLPDecodeContext *m, unsigned int substr,
  * otherwise the number of bytes consumed.
  */
 static int read_access_unit(AVCodecContext *avctx, void* data,
                             int *data_size,
-                            const uint8_t *buf, int buf_size)
+                            AVPacket *avpkt)
 {
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
     MLPDecodeContext *m = avctx->priv_data;
     GetBitContext gb;
     unsigned int length, substr;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mpeg12.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mpeg12.c
index 93268052a..6aab625c8 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mpeg12.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mpeg12.c
@@ -2235,8 +2235,10 @@ static int decode_chunks(AVCodecContext *avctx,
 /* handle buffering and image synchronisation */
 static int mpeg_decode_frame(AVCodecContext *avctx,
                              void *data, int *data_size,
-                             const uint8_t *buf, int buf_size)
+                             AVPacket *avpkt)
 {
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
     Mpeg1Context *s = avctx->priv_data;
     AVFrame *picture = data;
     MpegEncContext *s2 = &s->mpeg_enc_ctx;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/nellymoserdec.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/nellymoserdec.c
index 1f0a51ff9..6729c13fd 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/nellymoserdec.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/nellymoserdec.c
@@ -154,7 +154,9 @@ static av_cold int decode_init(AVCodecContext * avctx) {
 
 static int decode_tag(AVCodecContext * avctx,
                       void *data, int *data_size,
-                      const uint8_t * buf, int buf_size) {
+                      AVPacket *avpkt) {
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
     NellyMoserDecodeContext *s = avctx->priv_data;
     int blocks, i;
     int16_t* samples;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv10.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv10.c
index 829c9b3b1..df3b664e7 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv10.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv10.c
@@ -370,7 +370,7 @@ static int rv20_decode_picture_header(MpegEncContext *s)
     }
     if(new_w != s->width || new_h != s->height){
         av_log(s->avctx, AV_LOG_DEBUG, "attempting to change resolution to %dx%d\n", new_w, new_h);
-        if (av_check_image_size(new_w, new_h, 0, s->avctx) < 0)
+        if (av_image_check_size(new_w, new_h, 0, s->avctx) < 0)
             return -1;
         MPV_common_end(s);
         avcodec_set_dimensions(s->avctx, new_w, new_h);
@@ -645,8 +645,10 @@ static int get_slice_offset(AVCodecContext *avctx, const uint8_t *buf, int n)
 
 static int rv10_decode_frame(AVCodecContext *avctx,
                              void *data, int *data_size,
-                             const uint8_t *buf, int buf_size)
+                             AVPacket *avpkt)
 {
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
     MpegEncContext *s = avctx->priv_data;
     int i;
     AVFrame *pict = data;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv34.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv34.c
index ce92c7850..b586aa0b8 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv34.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv34.c
@@ -1410,8 +1410,10 @@ static int get_slice_offset(AVCodecContext *avctx, const uint8_t *buf, int n)
 
 int ff_rv34_decode_frame(AVCodecContext *avctx,
                          void *data, int *data_size,
-                         const uint8_t *buf, int buf_size)
+                         AVPacket *avpkt)
 {
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
     RV34DecContext *r = avctx->priv_data;
     MpegEncContext *s = &r->s;
     AVFrame *pict = data;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv34.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv34.h
index 3d25af2b1..24a27ce48 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv34.h
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv34.h
@@ -124,7 +124,7 @@ typedef struct RV34DecContext{
  */
 int ff_rv34_get_start_offset(GetBitContext *gb, int blocks);
 int ff_rv34_decode_init(AVCodecContext *avctx);
-int ff_rv34_decode_frame(AVCodecContext *avctx, void *data, int *data_size, const uint8_t *buf, int buf_size);
+int ff_rv34_decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPacket *avpkt);
 int ff_rv34_decode_end(AVCodecContext *avctx);
 
 #endif /* AVCODEC_RV34_H */
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv40.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv40.c
index 13ba5b6ee..157169196 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv40.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv40.c
@@ -144,7 +144,7 @@ static int rv40_parse_slice_header(RV34DecContext *r, GetBitContext *gb, SliceIn
     si->pts = get_bits(gb, 13);
     if(!si->type || !get_bits1(gb))
         rv40_parse_picture_size(gb, &w, &h);
-    if(av_check_image_size(w, h, 0, r->s.avctx) < 0)
+    if(av_image_check_size(w, h, 0, r->s.avctx) < 0)
         return -1;
     si->width  = w;
     si->height = h;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/sp5xdec.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/sp5xdec.c
index a63d52259..9db7d32ed 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/sp5xdec.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/sp5xdec.c
@@ -32,11 +32,11 @@
 
 static int sp5x_decode_frame(AVCodecContext *avctx,
                              void *data, int *data_size,
-                             const uint8_t *buf, int buf_size)
+                             AVPacket *avpkt)
 {
-#if 0
-    MJpegDecodeContext *s = avctx->priv_data;
-#endif
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
+    AVPacket avpkt_recoded;
     const int qscale = 5;
     const uint8_t *buf_ptr;
     uint8_t *recoded;
@@ -47,7 +47,6 @@ static int sp5x_decode_frame(AVCodecContext *avctx,
 
     buf_ptr = buf;
 
-#if 1
     recoded = av_mallocz(buf_size + 1024);
     if (!recoded)
         return -1;
@@ -88,102 +87,13 @@ static int sp5x_decode_frame(AVCodecContext *avctx,
     recoded[j++] = 0xD9;
 
     avctx->flags &= ~CODEC_FLAG_EMU_EDGE;
-    i = ff_mjpeg_decode_frame(avctx, data, data_size, recoded, j);
+    av_init_packet(&avpkt_recoded);
+    avpkt_recoded.data = recoded;
+    avpkt_recoded.size = j;
+    i = ff_mjpeg_decode_frame(avctx, data, data_size, &avpkt_recoded);
 
     av_free(recoded);
 
-#else
-    /* SOF */
-    s->bits = 8;
-    s->width  = avctx->coded_width;
-    s->height = avctx->coded_height;
-    s->nb_components = 3;
-    s->component_id[0] = 0;
-    s->h_count[0] = 2;
-    s->v_count[0] = 2;
-    s->quant_index[0] = 0;
-    s->component_id[1] = 1;
-    s->h_count[1] = 1;
-    s->v_count[1] = 1;
-    s->quant_index[1] = 1;
-    s->component_id[2] = 2;
-    s->h_count[2] = 1;
-    s->v_count[2] = 1;
-    s->quant_index[2] = 1;
-    s->h_max = 2;
-    s->v_max = 2;
-
-    s->qscale_table = av_mallocz((s->width+15)/16);
-    avctx->pix_fmt = s->cs_itu601 ? PIX_FMT_YUV420P : PIX_FMT_YUVJ420P;
-    s->interlaced = 0;
-
-    s->picture.reference = 0;
-    if (avctx->get_buffer(avctx, &s->picture) < 0)
-    {
-        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
-        return -1;
-    }
-
-    s->picture.pict_type = FF_I_TYPE;
-    s->picture.key_frame = 1;
-
-    for (i = 0; i < 3; i++)
-        s->linesize[i] = s->picture.linesize[i] << s->interlaced;
-
-    /* DQT */
-    for (i = 0; i < 64; i++)
-    {
-        j = s->scantable.permutated[i];
-        s->quant_matrixes[0][j] = sp5x_quant_table[(qscale * 2) + i];
-    }
-    s->qscale[0] = FFMAX(
-        s->quant_matrixes[0][s->scantable.permutated[1]],
-        s->quant_matrixes[0][s->scantable.permutated[8]]) >> 1;
-
-    for (i = 0; i < 64; i++)
-    {
-        j = s->scantable.permutated[i];
-        s->quant_matrixes[1][j] = sp5x_quant_table[(qscale * 2) + 1 + i];
-    }
-    s->qscale[1] = FFMAX(
-        s->quant_matrixes[1][s->scantable.permutated[1]],
-        s->quant_matrixes[1][s->scantable.permutated[8]]) >> 1;
-
-    /* DHT */
-
-    /* SOS */
-    s->comp_index[0] = 0;
-    s->nb_blocks[0] = s->h_count[0] * s->v_count[0];
-    s->h_scount[0] = s->h_count[0];
-    s->v_scount[0] = s->v_count[0];
-    s->dc_index[0] = 0;
-    s->ac_index[0] = 0;
-
-    s->comp_index[1] = 1;
-    s->nb_blocks[1] = s->h_count[1] * s->v_count[1];
-    s->h_scount[1] = s->h_count[1];
-    s->v_scount[1] = s->v_count[1];
-    s->dc_index[1] = 1;
-    s->ac_index[1] = 1;
-
-    s->comp_index[2] = 2;
-    s->nb_blocks[2] = s->h_count[2] * s->v_count[2];
-    s->h_scount[2] = s->h_count[2];
-    s->v_scount[2] = s->v_count[2];
-    s->dc_index[2] = 1;
-    s->ac_index[2] = 1;
-
-    for (i = 0; i < 3; i++)
-        s->last_dc[i] = 1024;
-
-    s->mb_width  = (s->width  * s->h_max * 8 -1) / (s->h_max * 8);
-    s->mb_height = (s->height * s->v_max * 8 -1) / (s->v_max * 8);
-
-    init_get_bits(&s->gb, buf+14, (buf_size-14)*8);
-
-    return mjpeg_decode_scan(s);
-#endif
-
     return i;
 }
 
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/svq1dec.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/svq1dec.c
index 74fede36a..2df76316d 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/svq1dec.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/svq1dec.c
@@ -642,8 +642,10 @@ static int svq1_decode_frame_header (GetBitContext *bitbuf,MpegEncContext *s) {
 
 static int svq1_decode_frame(AVCodecContext *avctx,
                              void *data, int *data_size,
-                             const uint8_t *buf, int buf_size)
+                             AVPacket *avpkt)
 {
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
     MpegEncContext *s=avctx->priv_data;
     uint8_t        *current, *previous;
     int             result, i, x, y, width, height;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/svq3.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/svq3.c
index 752400fa5..d7fe4aa5c 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/svq3.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/svq3.c
@@ -917,8 +917,10 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx)
 
 static int svq3_decode_frame(AVCodecContext *avctx,
                              void *data, int *data_size,
-                             const uint8_t *buf, int buf_size)
+                             AVPacket *avpkt)
 {
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
     MpegEncContext *const s = avctx->priv_data;
     H264Context *const h = avctx->priv_data;
     int m, mb_type;
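The sp5xdec.c hunk above illustrates a second use of AVPacket in this migration: a decoder that rewrites its bitstream wraps the recoded buffer in a fresh packet before delegating to another decoder. The general shape, as a sketch (the wrapper function is hypothetical; ff_mjpeg_decode_frame is the delegate actually used there):

    #include "avcodec.h"

    /* Sketch: hand a transformed buffer to a delegate decoder. */
    static int decode_recoded(AVCodecContext *avctx, void *data, int *data_size,
                              uint8_t *recoded, int recoded_size)
    {
        AVPacket pkt;
        av_init_packet(&pkt);    /* defaults for the non-buffer fields */
        pkt.data = recoded;
        pkt.size = recoded_size;
        return ff_mjpeg_decode_frame(avctx, data, data_size, &pkt);
    }
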
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/utils.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/utils.c
index 240ae68f1..aad1d9521 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/utils.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/utils.c
@@ -212,7 +212,7 @@ void avcodec_align_dimensions(AVCodecContext *s, int *width, int *height){
 
 #if LIBAVCODEC_VERSION_MAJOR < 53
 int avcodec_check_dimensions(void *av_log_ctx, unsigned int w, unsigned int h){
-    return av_check_image_size(w, h, 0, av_log_ctx);
+    return av_image_check_size(w, h, 0, av_log_ctx);
 }
 #endif
 
@@ -232,7 +232,7 @@ int avcodec_default_get_buffer(AVCodecContext *s, AVFrame *pic){
         return -1;
     }
 
-    if(av_check_image_size(w, h, 0, s))
+    if(av_image_check_size(w, h, 0, s))
         return -1;
 
     if(s->internal_buffer==NULL){
@@ -280,7 +280,7 @@ int avcodec_default_get_buffer(AVCodecContext *s, AVFrame *pic){
         do {
             // NOTE: do not align linesizes individually, this breaks e.g. assumptions
             // that linesize[0] == 2*linesize[1] in the MPEG-encoder for 4:2:2
-            av_fill_image_linesizes(picture.linesize, s->pix_fmt, w);
+            av_image_fill_linesizes(picture.linesize, s->pix_fmt, w);
             // increase alignment of w for next try (rhs gives the lowest bit set in w)
             w += w & ~(w-1);
 
@@ -290,7 +290,7 @@ int avcodec_default_get_buffer(AVCodecContext *s, AVFrame *pic){
             }
         } while (unaligned);
 
-        tmpsize = av_fill_image_pointers(picture.data, s->pix_fmt, h, NULL, picture.linesize);
+        tmpsize = av_image_fill_pointers(picture.data, s->pix_fmt, h, NULL, picture.linesize);
         if (tmpsize < 0)
             return -1;
 
@@ -489,7 +489,7 @@ int attribute_align_arg avcodec_open(AVCodecContext *avctx, AVCodec *codec)
 #define SANE_NB_CHANNELS 128U
     if (((avctx->coded_width || avctx->coded_height)
-        && av_check_image_size(avctx->coded_width, avctx->coded_height, 0, avctx))
+        && av_image_check_size(avctx->coded_width, avctx->coded_height, 0, avctx))
         || avctx->channels > SANE_NB_CHANNELS) {
         ret = AVERROR(EINVAL);
         goto free_and_end;
@@ -498,14 +498,13 @@ int attribute_align_arg avcodec_open(AVCodecContext *avctx, AVCodec *codec)
     avctx->codec = codec;
     avctx->codec_id = codec->id; /* ffdshow custom code */
     avctx->frame_number = 0;
-    if(avctx->codec->init){
-        if(avctx->codec_type == AVMEDIA_TYPE_VIDEO &&
-           avctx->codec->max_lowres < avctx->lowres){
-            av_log(avctx, AV_LOG_ERROR, "The maximum value for lowres supported by the decoder is %d\n",
-                   avctx->codec->max_lowres);
-            goto free_and_end;
-        }
+    if (avctx->codec->max_lowres < avctx->lowres) {
+        av_log(avctx, AV_LOG_ERROR, "The maximum value for lowres supported by the decoder is %d\n",
+               avctx->codec->max_lowres);
+        goto free_and_end;
+    }
 
+    if(avctx->codec->init){
         ret = avctx->codec->init(avctx);
         if (ret < 0) {
             goto free_and_end;
@@ -548,7 +547,7 @@ int attribute_align_arg avcodec_encode_video(AVCodecContext *avctx, uint8_t *buf
         av_log(avctx, AV_LOG_ERROR, "buffer smaller than minimum size\n");
         return -1;
     }
-    if(av_check_image_size(avctx->width, avctx->height, 0, avctx))
+    if(av_image_check_size(avctx->width, avctx->height, 0, avctx))
         return -1;
     if((avctx->codec->capabilities & CODEC_CAP_DELAY) || pict){
         int ret = avctx->codec->encode(avctx, buf, buf_size, pict);
@@ -560,18 +559,34 @@ int attribute_align_arg avcodec_encode_video(AVCodecContext *avctx, uint8_t *buf
     return 0;
 }
 
+#if LIBAVCODEC_VERSION_MAJOR < 53
 int attribute_align_arg avcodec_decode_video(AVCodecContext *avctx, AVFrame *picture,
                          int *got_picture_ptr,
                          const uint8_t *buf, int buf_size)
+{
+    AVPacket avpkt;
+    av_init_packet(&avpkt);
+    avpkt.data = buf;
+    avpkt.size = buf_size;
+    // HACK for CorePNG to decode as normal PNG by default
+    avpkt.flags = AV_PKT_FLAG_KEY;
+
+    return avcodec_decode_video2(avctx, picture, got_picture_ptr, &avpkt);
+}
+#endif
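The legacy wrapper above forces AV_PKT_FLAG_KEY because decoders may now key behavior off packet flags; per the in-line comment, CorePNG relies on it so stand-alone PNG data decodes as a key frame. A sketch of a decoder consuming that flag (the codec is hypothetical):

    #include "avcodec.h"

    /* Hypothetical decoder that branches on the packet's key-frame flag,
     * which is why the compatibility wrapper above must set it. */
    static int toy_decode(AVCodecContext *avctx, void *data, int *data_size,
                          AVPacket *avpkt)
    {
        *data_size = 0;
        if (avpkt->flags & AV_PKT_FLAG_KEY) {
            /* decode as a self-contained key frame */
        } else {
            /* apply inter prediction against the previous frame */
        }
        return avpkt->size;
    }
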
+
+int attribute_align_arg avcodec_decode_video2(AVCodecContext *avctx, AVFrame *picture,
+                         int *got_picture_ptr,
+                         AVPacket *avpkt)
 {
     int ret;
 
     *got_picture_ptr= 0;
-    if((avctx->coded_width||avctx->coded_height) && av_check_image_size(avctx->coded_width, avctx->coded_height, 0, avctx))
+    if((avctx->coded_width||avctx->coded_height) && av_image_check_size(avctx->coded_width, avctx->coded_height, 0, avctx))
         return -1;
 
-    if((avctx->codec->capabilities & CODEC_CAP_DELAY) || buf_size){
+    if((avctx->codec->capabilities & CODEC_CAP_DELAY) || avpkt->size){
         ret = avctx->codec->decode(avctx, picture, got_picture_ptr,
-                                   buf, buf_size);
+                                   avpkt);
 
         emms_c(); //needed to avoid an emms_c() call before every return;
@@ -583,13 +598,27 @@ int attribute_align_arg avcodec_decode_video(AVCodecContext *avctx, AVFrame *pic
     return ret;
 }
 
+#if LIBAVCODEC_VERSION_MAJOR < 53
 int attribute_align_arg avcodec_decode_audio2(AVCodecContext *avctx, int16_t *samples,
                          int *frame_size_ptr,
                          const uint8_t *buf, int buf_size)
+{
+    AVPacket avpkt;
+    av_init_packet(&avpkt);
+    avpkt.data = buf;
+    avpkt.size = buf_size;
+
+    return avcodec_decode_audio3(avctx, samples, frame_size_ptr, &avpkt);
+}
+#endif
+
+int attribute_align_arg avcodec_decode_audio3(AVCodecContext *avctx, int16_t *samples,
+                         int *frame_size_ptr,
+                         AVPacket *avpkt)
 {
     int ret;
 
-    if((avctx->codec->capabilities & CODEC_CAP_DELAY) || buf_size){
+    if((avctx->codec->capabilities & CODEC_CAP_DELAY) || avpkt->size){
         //FIXME remove the check below _after_ ensuring that all audio check that the available space is enough
         if(*frame_size_ptr < AVCODEC_MAX_AUDIO_FRAME_SIZE){
             av_log(avctx, AV_LOG_ERROR, "buffer smaller than AVCODEC_MAX_AUDIO_FRAME_SIZE\n");
@@ -601,8 +630,7 @@ int attribute_align_arg avcodec_decode_audio2(AVCodecContext *avctx, int16_t *sa
             return -1;
         }
 
-        ret = avctx->codec->decode(avctx, samples, frame_size_ptr,
-                                   buf, buf_size);
+        ret = avctx->codec->decode(avctx, samples, frame_size_ptr, avpkt);
         avctx->frame_number++;
     }else{
         ret= 0;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vc1dec.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vc1dec.c
index 47cd5e811..fe960473f 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vc1dec.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vc1dec.c
@@ -3135,8 +3135,10 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
  */
 static int vc1_decode_frame(AVCodecContext *avctx,
                             void *data, int *data_size,
-                            const uint8_t *buf, int buf_size)
+                            AVPacket *avpkt)
 {
+    const uint8_t *buf = avpkt->data;
+    int buf_size = avpkt->size;
     VC1Context *v = avctx->priv_data;
     MpegEncContext *s = &v->s;
     AVFrame *pict = data;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vc1dsp.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vc1dsp.c
index 8634bef69..aab169479 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vc1dsp.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vc1dsp.c
@@ -630,7 +630,7 @@ av_cold void ff_vc1dsp_init(DSPContext* dsp, AVCodecContext *avctx) {
     dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_c;
     dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_c;
 
-    dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_c;
+    dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_pixels8x8_c;
     dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_c;
     dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_c;
     dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_c;
@@ -647,7 +647,7 @@ av_cold void ff_vc1dsp_init(DSPContext* dsp, AVCodecContext *avctx) {
     dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_c;
     dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_c;
 
-    dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_vc1_mspel_mc00_c;
+    dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_pixels8x8_c;
     dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_c;
     dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_c;
     dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_c;
put_vc1_mspel_mc33_c; - dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_vc1_mspel_mc00_c; + dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_pixels8x8_c; dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_c; dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_c; dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_c; diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp3.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp3.c index 4d0e2a4b0..437ecffa3 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp3.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp3.c @@ -1329,12 +1329,11 @@ static void vp3_draw_horiz_band(Vp3DecodeContext *s, int y) return; h= y - s->last_slice_end; + s->last_slice_end= y; y -= h; if (!s->flipped_image) { - if (y == 0) - h -= s->height - s->avctx->height; // account for non-mod16 - y = s->height - y - h; + y = s->avctx->height - y - h; } cy = y >> s->chroma_y_shift; @@ -1345,7 +1344,6 @@ static void vp3_draw_horiz_band(Vp3DecodeContext *s, int y) emms_c(); s->avctx->draw_horiz_band(s->avctx, &s->current_frame, offset, y, 3, h); - s->last_slice_end= y + h; } /* @@ -1516,7 +1514,7 @@ static void render_slice(Vp3DecodeContext *s, int slice) * dispatch (slice - 1); */ - vp3_draw_horiz_band(s, FFMIN(64*slice + 64-16, s->height-16)); + vp3_draw_horiz_band(s, FFMIN((32 << s->chroma_y_shift) * (slice + 1) -16, s->height-16)); } /* @@ -1737,8 +1735,10 @@ static int64_t theora_granule_frame(Vp3DecodeContext *s,int64_t granulepos) */ static int vp3_decode_frame(AVCodecContext *avctx, void *data, int *data_size, - const uint8_t *buf, int buf_size) + AVPacket *avpkt) { + const uint8_t *buf = avpkt->data; + int buf_size = avpkt->size; Vp3DecodeContext *s = avctx->priv_data; GetBitContext gb; static int counter = 0; @@ -1868,7 +1868,7 @@ static int vp3_decode_frame(AVCodecContext *avctx, int row = (s->height >> (3+(i && s->chroma_y_shift))) - 1; apply_loop_filter(s, i, row, row+1); } - vp3_draw_horiz_band(s, s->height); + vp3_draw_horiz_band(s, s->avctx->height); /* MPC Custom code begin */ #if 0 @@ -2009,7 +2009,7 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb) Vp3DecodeContext *s = avctx->priv_data; int visible_width, visible_height, colorspace; int offset_x = 0, offset_y = 0; - AVRational fps; + AVRational fps, aspect; s->theora = get_bits_long(gb, 24); av_log(avctx, AV_LOG_DEBUG, "Theora bitstream version %X\n", s->theora); @@ -2025,7 +2025,7 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb) visible_width = s->width = get_bits(gb, 16) << 4; visible_height = s->height = get_bits(gb, 16) << 4; - if(av_check_image_size(s->width, s->height, 0, avctx)){ + if(av_image_check_size(s->width, s->height, 0, avctx)){ av_log(avctx, AV_LOG_ERROR, "Invalid dimensions (%dx%d)\n", s->width, s->height); s->width= s->height= 0; return -1; @@ -2046,8 +2046,13 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb) fps.den, fps.num, 1<<30); } - avctx->sample_aspect_ratio.num = get_bits_long(gb, 24); - avctx->sample_aspect_ratio.den = get_bits_long(gb, 24); + aspect.num = get_bits_long(gb, 24); + aspect.den = get_bits_long(gb, 24); + if (aspect.num && aspect.den) { + av_reduce(&avctx->sample_aspect_ratio.num, + &avctx->sample_aspect_ratio.den, + aspect.num, aspect.den, 1<<30); + } if (s->theora < 0x030200) s->keyframe_frequency_force=1<<get_bits(gb, 5); diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.c --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.c @@ int ff_vp56_decode_frame(AVCodecContext *avctx, void *data, int *data_size, - const uint8_t *buf, int buf_size) + AVPacket *avpkt) { + const uint8_t *buf = avpkt->data; VP56Context *s = avctx->priv_data; AVFrame *const p = s->framep[VP56_FRAME_CURRENT]; - int remaining_buf_size = buf_size; + int
remaining_buf_size = avpkt->size; int is_alpha, av_uninit(alpha_offset); if (s->has_alpha) { @@ -635,7 +636,7 @@ int ff_vp56_decode_frame(AVCodecContext *avctx, void *data, int *data_size, *(AVFrame*)data = *p; *data_size = sizeof(AVFrame); - return buf_size; + return avpkt->size; } av_cold void ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha) diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.h index f9500cfb5..da6b1b64b 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.h +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.h @@ -174,7 +174,7 @@ void ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha); int ff_vp56_free(AVCodecContext *avctx); void ff_vp56_init_dequant(VP56Context *s, int quantizer); int ff_vp56_decode_frame(AVCodecContext *avctx, void *data, int *data_size, - const uint8_t *buf, int buf_size); + AVPacket *avpkt); /** diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56dsp.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56dsp.c index d67604b01..0fe9e3e55 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56dsp.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56dsp.c @@ -84,7 +84,7 @@ void ff_vp56dsp_init(VP56DSPContext *s, enum CodecID codec) s->edge_filter_ver = vp6_edge_filter_ver; if (CONFIG_VP6_DECODER) { - s->vp6_filter_diag4= ff_vp6_filter_diag4_c; + s->vp6_filter_diag4 = ff_vp6_filter_diag4_c; } } diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8.c index d8d7cdaa2..de97489a8 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8.c @@ -223,7 +223,7 @@ static void vp8_decode_flush(AVCodecContext *avctx) static int update_dimensions(VP8Context *s, int width, int height) { - if (av_check_image_size(width, height, 0, s->avctx)) + if (av_image_check_size(width, height, 0, s->avctx)) return AVERROR_INVALIDDATA; vp8_decode_flush(s->avctx); @@ -1471,14 +1471,14 @@ static void filter_mb_row_simple(VP8Context *s, int mb_y) } static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, - const uint8_t *buf, int buf_size) + AVPacket *avpkt) { VP8Context *s = avctx->priv_data; int ret, mb_x, mb_y, i, y, referenced; enum AVDiscard skip_thresh; AVFrame *av_uninit(curframe); - if ((ret = decode_frame_header(s, buf, buf_size)) < 0) + if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0) return ret; referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT @@ -1644,7 +1644,7 @@ skip_decode: *data_size = sizeof(AVFrame); } - return buf_size; + return avpkt->size; } static av_cold int vp8_decode_init(AVCodecContext *avctx) diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/config.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/config.asm new file mode 100644 index 000000000..8efc2c533 --- /dev/null +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/config.asm @@ -0,0 +1 @@ +%define ARCH_X86 diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/cpuid.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/cpuid.c deleted file mode 100644 index e96e3a93c..000000000 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/cpuid.c +++ /dev/null @@ -1,142 +0,0 @@ -/* - * CPU detection code, extracted from mmx.h - * (c)1997-99 by H. Dietz and R. 
Fisher - * Converted to C and improved by Fabrice Bellard. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdlib.h> -#include "libavutil/x86_cpu.h" -#include "libavcodec/dsputil.h" - -#undef printf - -/* ebx saving is necessary for PIC. gcc seems unable to see it alone */ -#define cpuid(index,eax,ebx,ecx,edx)\ - __asm__ volatile\ - ("mov %%"REG_b", %%"REG_S"\n\t"\ - "cpuid\n\t"\ - "xchg %%"REG_b", %%"REG_S\ - : "=a" (eax), "=S" (ebx),\ - "=c" (ecx), "=d" (edx)\ - : "0" (index)); - -/* Function to test if multimedia instructions are supported... */ -int mm_support(void) -{ - int rval = 0; - int eax, ebx, ecx, edx; - int max_std_level, max_ext_level, std_caps=0, ext_caps=0; - int family=0, model=0; - union { int i[3]; char c[12]; } vendor; - -#if ARCH_X86_32 - x86_reg a, c; - __asm__ volatile ( - /* See if CPUID instruction is supported ... */ - /* ... Get copies of EFLAGS into eax and ecx */ - "pushfl\n\t" - "pop %0\n\t" - "mov %0, %1\n\t" - - /* ... Toggle the ID bit in one copy and store */ - /* to the EFLAGS reg */ - "xor $0x200000, %0\n\t" - "push %0\n\t" - "popfl\n\t" - - /* ... Get the (hopefully modified) EFLAGS */ - "pushfl\n\t" - "pop %0\n\t" - : "=a" (a), "=c" (c) - : - : "cc" - ); - - if (a == c) - return 0; /* CPUID not supported */ -#endif - - cpuid(0, max_std_level, vendor.i[0], vendor.i[2], vendor.i[1]); - - if(max_std_level >= 1){ - cpuid(1, eax, ebx, ecx, std_caps); - family = ((eax>>8)&0xf) + ((eax>>20)&0xff); - model = ((eax>>4)&0xf) + ((eax>>12)&0xf0); - if (std_caps & (1<<23)) - rval |= FF_MM_MMX; - if (std_caps & (1<<25)) - rval |= FF_MM_MMX2 -#if HAVE_SSE - | FF_MM_SSE; - if (std_caps & (1<<26)) - rval |= FF_MM_SSE2; - if (ecx & 1) - rval |= FF_MM_SSE3; - if (ecx & 0x00000200 ) - rval |= FF_MM_SSSE3; - if (ecx & 0x00080000 ) - rval |= FF_MM_SSE4; - if (ecx & 0x00100000 ) - rval |= FF_MM_SSE42; -#endif - ; - } - - cpuid(0x80000000, max_ext_level, ebx, ecx, edx); - - if(max_ext_level >= 0x80000001){ - cpuid(0x80000001, eax, ebx, ecx, ext_caps); - if (ext_caps & (1<<31)) - rval |= FF_MM_3DNOW; - if (ext_caps & (1<<30)) - rval |= FF_MM_3DNOWEXT; - if (ext_caps & (1<<23)) - rval |= FF_MM_MMX; - if (ext_caps & (1<<22)) - rval |= FF_MM_MMX2; - } - - if (!strncmp(vendor.c, "GenuineIntel", 12) && - family == 6 && (model == 9 || model == 13 || model == 14)) { - /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah") - * theoretically support sse2, but it's usually slower than mmx, - * so let's just pretend they don't. */ - if (rval & FF_MM_SSE2) rval ^= FF_MM_SSE2SLOW|FF_MM_SSE2; - if (rval & FF_MM_SSE3) rval ^= FF_MM_SSE3SLOW|FF_MM_SSE3; - } - -#if 0 - av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s%s%s%s%s\n", - (rval&FF_MM_MMX) ? "MMX ":"", - (rval&FF_MM_MMX2) ? "MMX2 ":"", - (rval&FF_MM_SSE) ? "SSE ":"", - (rval&FF_MM_SSE2) ?
"SSE2 ":"", - (rval&FF_MM_SSE2SLOW) ? "SSE2(slow) ":"", - (rval&FF_MM_SSE3) ? "SSE3 ":"", - (rval&FF_MM_SSE3SLOW) ? "SSE3(slow) ":"", - (rval&FF_MM_SSSE3) ? "SSSE3 ":"", - (rval&FF_MM_SSE4) ? "SSE4.1 ":"", - (rval&FF_MM_SSE42) ? "SSE4.2 ":"", - (rval&FF_MM_3DNOW) ? "3DNow ":"", - (rval&FF_MM_3DNOWEXT) ? "3DNowExt ":""); -#endif - return rval; - - /* TODO: allow overriding with ffdshow settings for disabling extensions */ -} diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_h264_template_mmx.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_h264_template_mmx.c deleted file mode 100644 index ff359230c..000000000 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_h264_template_mmx.c +++ /dev/null @@ -1,304 +0,0 @@ -/* - * Copyright (c) 2005 Zoltan Hidvegi , - * Loren Merritt - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * MMX optimized version of (put|avg)_h264_chroma_mc8. - * H264_CHROMA_MC8_TMPL must be defined to the desired function name - * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg - * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function - */ -static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg) -{ - DECLARE_ALIGNED(8, uint64_t, AA); - DECLARE_ALIGNED(8, uint64_t, DD); - int i; - - if(y==0 && x==0) { - /* no filter needed */ - H264_CHROMA_MC8_MV0(dst, src, stride, h); - return; - } - - assert(x<8 && y<8 && x>=0 && y>=0); - - if(y==0 || x==0) - { - /* 1 dimensional filter only */ - const int dxy = x ? 
1 : stride; - - __asm__ volatile( - "movd %0, %%mm5\n\t" - "movq %1, %%mm4\n\t" - "movq %2, %%mm6\n\t" /* mm6 = rnd >> 3 */ - "punpcklwd %%mm5, %%mm5\n\t" - "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */ - "pxor %%mm7, %%mm7\n\t" - "psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8-x */ - :: "rm"(x+y), "m"(ff_pw_8), "m"(*(rnd_reg+1))); - - for(i=0; i> 3)) >> 3 */ - "paddw %%mm6, %%mm0\n\t" - "paddw %%mm6, %%mm1\n\t" - "paddw %%mm2, %%mm0\n\t" - "paddw %%mm3, %%mm1\n\t" - "psrlw $3, %%mm0\n\t" - "psrlw $3, %%mm1\n\t" - "packuswb %%mm1, %%mm0\n\t" - H264_CHROMA_OP(%0, %%mm0) - "movq %%mm0, %0\n\t" - : "=m" (dst[0])); - - src += stride; - dst += stride; - } - return; - } - - /* general case, bilinear */ - __asm__ volatile("movd %2, %%mm4\n\t" - "movd %3, %%mm6\n\t" - "punpcklwd %%mm4, %%mm4\n\t" - "punpcklwd %%mm6, %%mm6\n\t" - "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */ - "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */ - "movq %%mm4, %%mm5\n\t" - "pmullw %%mm6, %%mm4\n\t" /* mm4 = x * y */ - "psllw $3, %%mm5\n\t" - "psllw $3, %%mm6\n\t" - "movq %%mm5, %%mm7\n\t" - "paddw %%mm6, %%mm7\n\t" - "movq %%mm4, %1\n\t" /* DD = x * y */ - "psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x - xy */ - "psubw %%mm4, %%mm6\n\t" /* mm6 = C = 8y - xy */ - "paddw %4, %%mm4\n\t" - "psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy - (8x+8y) + 64 */ - "pxor %%mm7, %%mm7\n\t" - "movq %%mm4, %0\n\t" - : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64)); - - __asm__ volatile( - /* mm0 = src[0..7], mm1 = src[1..8] */ - "movq %0, %%mm0\n\t" - "movq %1, %%mm1\n\t" - : : "m" (src[0]), "m" (src[1])); - - for(i=0; i> 6 */ - "paddw %1, %%mm2\n\t" - "paddw %1, %%mm3\n\t" - "psrlw $6, %%mm2\n\t" - "psrlw $6, %%mm3\n\t" - "packuswb %%mm3, %%mm2\n\t" - H264_CHROMA_OP(%0, %%mm2) - "movq %%mm2, %0\n\t" - : "=m" (dst[0]) : "m" (*rnd_reg)); - dst+= stride; - } -} - -static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg) -{ - __asm__ volatile( - "pxor %%mm7, %%mm7 \n\t" - "movd %5, %%mm2 \n\t" - "movd %6, %%mm3 \n\t" - "movq "MANGLE(ff_pw_8)", %%mm4\n\t" - "movq "MANGLE(ff_pw_8)", %%mm5\n\t" - "punpcklwd %%mm2, %%mm2 \n\t" - "punpcklwd %%mm3, %%mm3 \n\t" - "punpcklwd %%mm2, %%mm2 \n\t" - "punpcklwd %%mm3, %%mm3 \n\t" - "psubw %%mm2, %%mm4 \n\t" - "psubw %%mm3, %%mm5 \n\t" - - "movd (%1), %%mm0 \n\t" - "movd 1(%1), %%mm6 \n\t" - "add %3, %1 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm6 \n\t" - "pmullw %%mm4, %%mm0 \n\t" - "pmullw %%mm2, %%mm6 \n\t" - "paddw %%mm0, %%mm6 \n\t" - - "1: \n\t" - "movd (%1), %%mm0 \n\t" - "movd 1(%1), %%mm1 \n\t" - "add %3, %1 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "pmullw %%mm4, %%mm0 \n\t" - "pmullw %%mm2, %%mm1 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "movq %%mm1, %%mm0 \n\t" - "pmullw %%mm5, %%mm6 \n\t" - "pmullw %%mm3, %%mm1 \n\t" - "paddw %4, %%mm6 \n\t" - "paddw %%mm6, %%mm1 \n\t" - "psrlw $6, %%mm1 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - H264_CHROMA_OP4((%0), %%mm1, %%mm6) - "movd %%mm1, (%0) \n\t" - "add %3, %0 \n\t" - "movd (%1), %%mm6 \n\t" - "movd 1(%1), %%mm1 \n\t" - "add %3, %1 \n\t" - "punpcklbw %%mm7, %%mm6 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "pmullw %%mm4, %%mm6 \n\t" - "pmullw %%mm2, %%mm1 \n\t" - "paddw %%mm6, %%mm1 \n\t" - "movq %%mm1, %%mm6 \n\t" - "pmullw %%mm5, %%mm0 \n\t" - "pmullw %%mm3, %%mm1 \n\t" - "paddw %4, %%mm0 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "psrlw $6, %%mm1 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - H264_CHROMA_OP4((%0), %%mm1, %%mm0) - "movd %%mm1, 
(%0) \n\t" - "add %3, %0 \n\t" - "sub $2, %2 \n\t" - "jnz 1b \n\t" - : "+r"(dst), "+r"(src), "+r"(h) - : "r"((x86_reg)stride), "m"(*rnd_reg), "m"(x), "m"(y) - ); -} - -#ifdef H264_CHROMA_MC2_TMPL -static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - int tmp = ((1<<16)-1)*x + 8; - int CD= tmp*y; - int AB= (tmp<<3) - CD; - __asm__ volatile( - /* mm5 = {A,B,A,B} */ - /* mm6 = {C,D,C,D} */ - "movd %0, %%mm5\n\t" - "movd %1, %%mm6\n\t" - "punpckldq %%mm5, %%mm5\n\t" - "punpckldq %%mm6, %%mm6\n\t" - "pxor %%mm7, %%mm7\n\t" - /* mm0 = src[0,1,1,2] */ - "movd %2, %%mm2\n\t" - "punpcklbw %%mm7, %%mm2\n\t" - "pshufw $0x94, %%mm2, %%mm2\n\t" - :: "r"(AB), "r"(CD), "m"(src[0])); - - - __asm__ volatile( - "1:\n\t" - "add %4, %1\n\t" - /* mm1 = A * src[0,1] + B * src[1,2] */ - "movq %%mm2, %%mm1\n\t" - "pmaddwd %%mm5, %%mm1\n\t" - /* mm0 = src[0,1,1,2] */ - "movd (%1), %%mm0\n\t" - "punpcklbw %%mm7, %%mm0\n\t" - "pshufw $0x94, %%mm0, %%mm0\n\t" - /* mm1 += C * src[0,1] + D * src[1,2] */ - "movq %%mm0, %%mm2\n\t" - "pmaddwd %%mm6, %%mm0\n\t" - "paddw %3, %%mm1\n\t" - "paddw %%mm0, %%mm1\n\t" - /* dst[0,1] = pack((mm1 + 32) >> 6) */ - "psrlw $6, %%mm1\n\t" - "packssdw %%mm7, %%mm1\n\t" - "packuswb %%mm7, %%mm1\n\t" - H264_CHROMA_OP4((%0), %%mm1, %%mm3) - "movd %%mm1, %%esi\n\t" - "movw %%si, (%0)\n\t" - "add %4, %0\n\t" - "sub $1, %2\n\t" - "jnz 1b\n\t" - : "+r" (dst), "+r"(src), "+r"(h) - : "m" (ff_pw_32), "r"((x86_reg)stride) - : "%esi"); - -} -#endif - diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_h264_template_ssse3.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_h264_template_ssse3.c deleted file mode 100644 index 0eceb74f2..000000000 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_h264_template_ssse3.c +++ /dev/null @@ -1,208 +0,0 @@ -/* - * Copyright (c) 2008 Loren Merritt - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * SSSE3 optimized version of (put|avg)_h264_chroma_mc8. 
- * H264_CHROMA_MC8_TMPL must be defined to the desired function name - * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function - * AVG_OP must be defined to empty for put and the identify for avg - */ -static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd) -{ - if(y==0 && x==0) { - /* no filter needed */ - H264_CHROMA_MC8_MV0(dst, src, stride, h); - return; - } - - assert(x<8 && y<8 && x>=0 && y>=0); - - if(y==0 || x==0) - { - /* 1 dimensional filter only */ - __asm__ volatile( - "movd %0, %%xmm7 \n\t" - "movq %1, %%xmm6 \n\t" - "pshuflw $0, %%xmm7, %%xmm7 \n\t" - "movlhps %%xmm6, %%xmm6 \n\t" - "movlhps %%xmm7, %%xmm7 \n\t" - :: "r"(255*(x+y)+8), "m"(*(rnd?&ff_pw_4.a:&ff_pw_3)) - ); - - if(x) { - __asm__ volatile( - "1: \n\t" - "movq (%1), %%xmm0 \n\t" - "movq 1(%1), %%xmm1 \n\t" - "movq (%1,%3), %%xmm2 \n\t" - "movq 1(%1,%3), %%xmm3 \n\t" - "punpcklbw %%xmm1, %%xmm0 \n\t" - "punpcklbw %%xmm3, %%xmm2 \n\t" - "pmaddubsw %%xmm7, %%xmm0 \n\t" - "pmaddubsw %%xmm7, %%xmm2 \n\t" - AVG_OP("movq (%0), %%xmm4 \n\t") - AVG_OP("movhps (%0,%3), %%xmm4 \n\t") - "paddw %%xmm6, %%xmm0 \n\t" - "paddw %%xmm6, %%xmm2 \n\t" - "psrlw $3, %%xmm0 \n\t" - "psrlw $3, %%xmm2 \n\t" - "packuswb %%xmm2, %%xmm0 \n\t" - AVG_OP("pavgb %%xmm4, %%xmm0 \n\t") - "movq %%xmm0, (%0) \n\t" - "movhps %%xmm0, (%0,%3) \n\t" - "sub $2, %2 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%0,%3,2), %0 \n\t" - "jg 1b \n\t" - :"+r"(dst), "+r"(src), "+r"(h) - :"r"((x86_reg)stride) - ); - } else { - __asm__ volatile( - "1: \n\t" - "movq (%1), %%xmm0 \n\t" - "movq (%1,%3), %%xmm1 \n\t" - "movdqa %%xmm1, %%xmm2 \n\t" - "movq (%1,%3,2), %%xmm3 \n\t" - "punpcklbw %%xmm1, %%xmm0 \n\t" - "punpcklbw %%xmm3, %%xmm2 \n\t" - "pmaddubsw %%xmm7, %%xmm0 \n\t" - "pmaddubsw %%xmm7, %%xmm2 \n\t" - AVG_OP("movq (%0), %%xmm4 \n\t") - AVG_OP("movhps (%0,%3), %%xmm4 \n\t") - "paddw %%xmm6, %%xmm0 \n\t" - "paddw %%xmm6, %%xmm2 \n\t" - "psrlw $3, %%xmm0 \n\t" - "psrlw $3, %%xmm2 \n\t" - "packuswb %%xmm2, %%xmm0 \n\t" - AVG_OP("pavgb %%xmm4, %%xmm0 \n\t") - "movq %%xmm0, (%0) \n\t" - "movhps %%xmm0, (%0,%3) \n\t" - "sub $2, %2 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%0,%3,2), %0 \n\t" - "jg 1b \n\t" - :"+r"(dst), "+r"(src), "+r"(h) - :"r"((x86_reg)stride) - ); - } - return; - } - - /* general case, bilinear */ - __asm__ volatile( - "movd %0, %%xmm7 \n\t" - "movd %1, %%xmm6 \n\t" - "movdqa %2, %%xmm5 \n\t" - "pshuflw $0, %%xmm7, %%xmm7 \n\t" - "pshuflw $0, %%xmm6, %%xmm6 \n\t" - "movlhps %%xmm7, %%xmm7 \n\t" - "movlhps %%xmm6, %%xmm6 \n\t" - :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(*(rnd?&ff_pw_32:&ff_pw_28)) - ); - - __asm__ volatile( - "movq (%1), %%xmm0 \n\t" - "movq 1(%1), %%xmm1 \n\t" - "punpcklbw %%xmm1, %%xmm0 \n\t" - "add %3, %1 \n\t" - "1: \n\t" - "movq (%1), %%xmm1 \n\t" - "movq 1(%1), %%xmm2 \n\t" - "movq (%1,%3), %%xmm3 \n\t" - "movq 1(%1,%3), %%xmm4 \n\t" - "lea (%1,%3,2), %1 \n\t" - "punpcklbw %%xmm2, %%xmm1 \n\t" - "punpcklbw %%xmm4, %%xmm3 \n\t" - "movdqa %%xmm1, %%xmm2 \n\t" - "movdqa %%xmm3, %%xmm4 \n\t" - "pmaddubsw %%xmm7, %%xmm0 \n\t" - "pmaddubsw %%xmm6, %%xmm1 \n\t" - "pmaddubsw %%xmm7, %%xmm2 \n\t" - "pmaddubsw %%xmm6, %%xmm3 \n\t" - "paddw %%xmm5, %%xmm0 \n\t" - "paddw %%xmm5, %%xmm2 \n\t" - "paddw %%xmm0, %%xmm1 \n\t" - "paddw %%xmm2, %%xmm3 \n\t" - "movdqa %%xmm4, %%xmm0 \n\t" - "psrlw $6, %%xmm1 \n\t" - "psrlw $6, %%xmm3 \n\t" - AVG_OP("movq (%0), %%xmm2 \n\t") - AVG_OP("movhps (%0,%3), %%xmm2 \n\t") - "packuswb %%xmm3, %%xmm1 \n\t" - AVG_OP("pavgb 
%%xmm2, %%xmm1 \n\t") - "movq %%xmm1, (%0)\n\t" - "movhps %%xmm1, (%0,%3)\n\t" - "sub $2, %2 \n\t" - "lea (%0,%3,2), %0 \n\t" - "jg 1b \n\t" - :"+r"(dst), "+r"(src), "+r"(h) - :"r"((x86_reg)stride) - ); -} - -static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - __asm__ volatile( - "movd %0, %%mm7 \n\t" - "movd %1, %%mm6 \n\t" - "movq %2, %%mm5 \n\t" - "pshufw $0, %%mm7, %%mm7 \n\t" - "pshufw $0, %%mm6, %%mm6 \n\t" - :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(ff_pw_32) - ); - - __asm__ volatile( - "movd (%1), %%mm0 \n\t" - "punpcklbw 1(%1), %%mm0 \n\t" - "add %3, %1 \n\t" - "1: \n\t" - "movd (%1), %%mm1 \n\t" - "movd (%1,%3), %%mm3 \n\t" - "punpcklbw 1(%1), %%mm1 \n\t" - "punpcklbw 1(%1,%3), %%mm3 \n\t" - "lea (%1,%3,2), %1 \n\t" - "movq %%mm1, %%mm2 \n\t" - "movq %%mm3, %%mm4 \n\t" - "pmaddubsw %%mm7, %%mm0 \n\t" - "pmaddubsw %%mm6, %%mm1 \n\t" - "pmaddubsw %%mm7, %%mm2 \n\t" - "pmaddubsw %%mm6, %%mm3 \n\t" - "paddw %%mm5, %%mm0 \n\t" - "paddw %%mm5, %%mm2 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "paddw %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm0 \n\t" - "psrlw $6, %%mm1 \n\t" - "psrlw $6, %%mm3 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - "packuswb %%mm3, %%mm3 \n\t" - AVG_OP("pavgb (%0), %%mm1 \n\t") - AVG_OP("pavgb (%0,%3), %%mm3 \n\t") - "movd %%mm1, (%0)\n\t" - "movd %%mm3, (%0,%3)\n\t" - "sub $2, %2 \n\t" - "lea (%0,%3,2), %0 \n\t" - "jg 1b \n\t" - :"+r"(dst), "+r"(src), "+r"(h) - :"r"((x86_reg)stride) - ); -} - diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.c index c4939ec65..995df0564 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.c @@ -22,14 +22,13 @@ * MMX optimization by Nick Kurshev */ +#include "libavutil/cpu.h" #include "libavutil/x86_cpu.h" #include "libavcodec/dsputil.h" #include "libavcodec/h264dsp.h" #include "libavcodec/mpegvideo.h" #include "libavcodec/simple_idct.h" #include "dsputil_mmx.h" -#include "vp3dsp_mmx.h" -#include "vp3dsp_sse2.h" #include "idct_xvid.h" //#undef NDEBUG @@ -62,6 +61,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL; DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0 ) = {0x0000000000000000ULL, 0x0000000000000000ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3 ) = {0x0303030303030303ULL, 0x0303030303030303ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4 ) = {0x0404040404040404ULL, 0x0404040404040404ULL}; @@ -70,7 +70,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL; DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL; DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL}; DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL; -DECLARE_ALIGNED(8, const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL; +DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1 ) = {0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL}; DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL}; DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL; DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 
0xFEFEFEFEFEFEFEFEULL}; @@ -228,7 +228,7 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; /***********************************/ /* standard MMX */ -void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) +void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) { const DCTELEM *p; uint8_t *pix; @@ -304,7 +304,7 @@ DECLARE_ASM_CONST(8, uint8_t, ff_vector128)[8] = "movq %%mm3, (%0, %3, 2) \n\t"\ "movq %%mm4, (%0, %1) \n\t" -void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) +void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) { x86_reg line_skip = line_size; x86_reg line_skip3; @@ -320,7 +320,7 @@ void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int li :"memory"); } -void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) +void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) { const DCTELEM *p; uint8_t *pix; @@ -728,35 +728,6 @@ static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){ } } -static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ - __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ... - "movd %4, %%mm0 \n\t" - "movd %5, %%mm1 \n\t" - "movd %6, %%mm2 \n\t" - "movd %7, %%mm3 \n\t" - "punpcklbw %%mm1, %%mm0 \n\t" - "punpcklbw %%mm3, %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "punpcklwd %%mm2, %%mm0 \n\t" - "punpckhwd %%mm2, %%mm1 \n\t" - "movd %%mm0, %0 \n\t" - "punpckhdq %%mm0, %%mm0 \n\t" - "movd %%mm0, %1 \n\t" - "movd %%mm1, %2 \n\t" - "punpckhdq %%mm1, %%mm1 \n\t" - "movd %%mm1, %3 \n\t" - - : "=m" (*(uint32_t*)(dst + 0*dst_stride)), - "=m" (*(uint32_t*)(dst + 1*dst_stride)), - "=m" (*(uint32_t*)(dst + 2*dst_stride)), - "=m" (*(uint32_t*)(dst + 3*dst_stride)) - : "m" (*(uint32_t*)(src + 0*src_stride)), - "m" (*(uint32_t*)(src + 1*src_stride)), - "m" (*(uint32_t*)(src + 2*src_stride)), - "m" (*(uint32_t*)(src + 3*src_stride)) - ); -} - static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){ if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { const int strength= ff_h263_loop_filter_strength[qscale]; @@ -1820,8 +1791,59 @@ PREFETCH(prefetch_mmx2, prefetcht0) PREFETCH(prefetch_3dnow, prefetch) #undef PREFETCH -#include "h264dsp_mmx.c" -#include "rv40dsp_mmx.c" +#include "h264_qpel_mmx.c" + +void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_put_vc1_chroma_mc8_mmx_nornd (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_vc1_chroma_mc8_mmx2_nornd (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_rv40_chroma_mc8_mmx2 (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_vc1_chroma_mc8_3dnow_nornd(uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_rv40_chroma_mc8_3dnow (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); + +void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src, + int stride, int h, 
int x, int y); +void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_rv40_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_rv40_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); + +void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); + +void ff_put_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_put_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); + +void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); +void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src, + int stride, int h, int x, int y); + /* CAVS specific */ void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { @@ -1851,43 +1873,43 @@ void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, in static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block) { ff_mmx_idct (block); - put_pixels_clamped_mmx(block, dest, line_size); + ff_put_pixels_clamped_mmx(block, dest, line_size); } static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block) { ff_mmx_idct (block); - add_pixels_clamped_mmx(block, dest, line_size); + ff_add_pixels_clamped_mmx(block, dest, line_size); } static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block) { ff_mmxext_idct (block); - put_pixels_clamped_mmx(block, dest, line_size); + ff_put_pixels_clamped_mmx(block, dest, line_size); } static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block) { ff_mmxext_idct (block); - add_pixels_clamped_mmx(block, dest, line_size); + ff_add_pixels_clamped_mmx(block, dest, line_size); } #endif static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block) { ff_idct_xvid_mmx (block); - put_pixels_clamped_mmx(block, dest, line_size); + ff_put_pixels_clamped_mmx(block, dest, line_size); } static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block) { ff_idct_xvid_mmx (block); - add_pixels_clamped_mmx(block, dest, line_size); + ff_add_pixels_clamped_mmx(block, dest, line_size); } static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block) { ff_idct_xvid_mmx2 (block); - put_pixels_clamped_mmx(block, dest, line_size); + ff_put_pixels_clamped_mmx(block, dest, line_size); } static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block) { ff_idct_xvid_mmx2 (block); - add_pixels_clamped_mmx(block, dest, line_size); + ff_add_pixels_clamped_mmx(block, dest, line_size); } static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize) @@ -2376,6 +2398,19 @@ static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ ); } +void ff_vp3_idct_mmx(int16_t *input_data); +void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block); +void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block); + +void 
ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block); + +void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values); +void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values); + +void ff_vp3_idct_sse2(int16_t *input_data); +void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block); +void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block); + void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); @@ -2387,20 +2422,8 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, co void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top); int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left); int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left); -void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); -void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); -void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); -void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); -void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); - -#if HAVE_YASM && ARCH_X86_32 -void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); -static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta) -{ - ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta); - ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta); -} -#elif !HAVE_YASM + +#if !HAVE_YASM #define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6) #define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) #define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) @@ -2500,10 +2523,10 @@ float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) { - int mm_flags = mm_support(); + int mm_flags = av_get_cpu_flags(); if (avctx->dsp_mask) { - if (avctx->dsp_mask & FF_MM_FORCE) + if (avctx->dsp_mask & AV_CPU_FLAG_FORCE) mm_flags |= (avctx->dsp_mask & 0xffff); else mm_flags &= ~(avctx->dsp_mask & 0xffff); @@ -2511,20 +2534,20 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) #if 0 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:"); - if (mm_flags & FF_MM_MMX) + if (mm_flags & AV_CPU_FLAG_MMX) av_log(avctx, AV_LOG_INFO, " mmx"); - if (mm_flags & FF_MM_MMX2) + if (mm_flags & AV_CPU_FLAG_MMX2) av_log(avctx, AV_LOG_INFO, " mmx2"); - if (mm_flags & FF_MM_3DNOW) + if (mm_flags & AV_CPU_FLAG_3DNOW) av_log(avctx, AV_LOG_INFO, " 3dnow"); - if (mm_flags & FF_MM_SSE) + if (mm_flags & AV_CPU_FLAG_SSE) av_log(avctx, AV_LOG_INFO, " sse"); - if (mm_flags & FF_MM_SSE2) + if (mm_flags & AV_CPU_FLAG_SSE2) av_log(avctx, AV_LOG_INFO, " sse2"); av_log(avctx, AV_LOG_INFO, "\n"); #endif - if (mm_flags & FF_MM_MMX) { + if (mm_flags & AV_CPU_FLAG_MMX) { const int idct_algo= avctx->idct_algo; if(avctx->lowres==0){ @@ -2535,7 +2558,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext 
*avctx) c->idct_permutation_type= FF_SIMPLE_IDCT_PERM; #if CONFIG_GPL }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){ - if(mm_flags & FF_MM_MMX2){ + if(mm_flags & AV_CPU_FLAG_MMX2){ c->idct_put= ff_libmpeg2mmx2_idct_put; c->idct_add= ff_libmpeg2mmx2_idct_add; c->idct = ff_mmxext_idct; @@ -2547,8 +2570,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; #endif }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) && - idct_algo==FF_IDCT_VP3){ - if(mm_flags & FF_MM_SSE2){ + idct_algo==FF_IDCT_VP3 && HAVE_YASM){ + if(mm_flags & AV_CPU_FLAG_SSE2){ c->idct_put= ff_vp3_idct_put_sse2; c->idct_add= ff_vp3_idct_add_sse2; c->idct = ff_vp3_idct_sse2; @@ -2562,12 +2585,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) }else if(idct_algo==FF_IDCT_CAVS){ c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; }else if(idct_algo==FF_IDCT_XVIDMMX){ - if(mm_flags & FF_MM_SSE2){ + if(mm_flags & AV_CPU_FLAG_SSE2){ c->idct_put= ff_idct_xvid_sse2_put; c->idct_add= ff_idct_xvid_sse2_add; c->idct = ff_idct_xvid_sse2; c->idct_permutation_type= FF_SSE2_IDCT_PERM; - }else if(mm_flags & FF_MM_MMX2){ + }else if(mm_flags & AV_CPU_FLAG_MMX2){ c->idct_put= ff_idct_xvid_mmx2_put; c->idct_add= ff_idct_xvid_mmx2_add; c->idct = ff_idct_xvid_mmx2; @@ -2579,12 +2602,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) } } - c->put_pixels_clamped = put_pixels_clamped_mmx; - c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx; - c->add_pixels_clamped = add_pixels_clamped_mmx; + c->put_pixels_clamped = ff_put_pixels_clamped_mmx; + c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx; + c->add_pixels_clamped = ff_add_pixels_clamped_mmx; c->clear_block = clear_block_mmx; c->clear_blocks = clear_blocks_mmx; - if (mm_flags & FF_MM_SSE){ + if (mm_flags & AV_CPU_FLAG_SSE){ c->clear_block = clear_block_sse; c->clear_blocks = clear_blocks_sse; } @@ -2615,14 +2638,17 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->h263_v_loop_filter= h263_v_loop_filter_mmx; c->h263_h_loop_filter= h263_h_loop_filter_mmx; } - c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd; - c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx; - c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_vc1_chroma_mc8_mmx_nornd; - c->put_rv40_chroma_pixels_tab[0]= put_rv40_chroma_mc8_mmx; - c->put_rv40_chroma_pixels_tab[1]= put_rv40_chroma_mc4_mmx; +#if HAVE_YASM + c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd; + c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx; + c->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_mmx_nornd; + + c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx; + c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx; +#endif - if (mm_flags & FF_MM_MMX2) { + if (mm_flags & AV_CPU_FLAG_MMX2) { c->prefetch = prefetch_mmx2; c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2; @@ -2647,12 +2673,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2; c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; - if (CONFIG_VP3_DECODER) { + if (CONFIG_VP3_DECODER && HAVE_YASM) { c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2; c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2; } } - if (CONFIG_VP3_DECODER) { + if (CONFIG_VP3_DECODER && HAVE_YASM) { c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2; } @@ -2699,21 +2725,21 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2); 
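/* Illustrative sketch, not part of the upstream change: the FF_MM_* ->
 * AV_CPU_FLAG_* rename in this hunk repeats the same mechanical substitution
 * at every dispatch site. A minimal init function on the libavutil API would
 * read as below; init_example is a hypothetical name, while the flag name,
 * av_get_cpu_flags() and prefetch_mmx2 are the ones actually used above:
 *
 *     #include "libavutil/cpu.h"
 *
 *     static void init_example(DSPContext *c)
 *     {
 *         int mm_flags = av_get_cpu_flags();   // was: mm_support()
 *         if (mm_flags & AV_CPU_FLAG_MMX2)     // was: FF_MM_MMX2
 *             c->prefetch = prefetch_mmx2;     // assignment itself unchanged
 *     }
 */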
SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2); - c->avg_rv40_chroma_pixels_tab[0]= avg_rv40_chroma_mc8_mmx2; - c->avg_rv40_chroma_pixels_tab[1]= avg_rv40_chroma_mc4_mmx2; +#if HAVE_YASM + c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_mmx2; + c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_mmx2; - c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_vc1_chroma_mc8_mmx2_nornd; + c->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_mmx2_nornd; - c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd; - c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2; - c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2; - c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2; + c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd; + c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2; + c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2; + c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2; -#if HAVE_YASM c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2; #endif #if HAVE_7REGS && HAVE_TEN_OPERANDS - if( mm_flags&FF_MM_3DNOW ) + if( mm_flags&AV_CPU_FLAG_3DNOW ) c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov; #endif @@ -2721,7 +2747,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) ff_vc1dsp_init_mmx(c, avctx); c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2; - } else if (mm_flags & FF_MM_3DNOW) { + } else if (mm_flags & AV_CPU_FLAG_3DNOW) { c->prefetch = prefetch_3dnow; c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; @@ -2772,11 +2798,15 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow); SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow); - c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd; - c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow; +#if HAVE_YASM + c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd; + c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow; + + c->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_3dnow_nornd; - c->avg_rv40_chroma_pixels_tab[0]= avg_rv40_chroma_mc8_3dnow; - c->avg_rv40_chroma_pixels_tab[1]= avg_rv40_chroma_mc4_3dnow; + c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_3dnow; + c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_3dnow; +#endif } @@ -2785,13 +2815,13 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\ c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\ c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU; - if((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW)){ + if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){ // these functions are slower than mmx on AMD, but faster on Intel c->put_pixels_tab[0][0] = put_pixels16_sse2; c->avg_pixels_tab[0][0] = avg_pixels16_sse2; H264_QPEL_FUNCS(0, 0, sse2); } - if(mm_flags & FF_MM_SSE2){ + if(mm_flags & AV_CPU_FLAG_SSE2){ H264_QPEL_FUNCS(0, 1, sse2); H264_QPEL_FUNCS(0, 2, sse2); H264_QPEL_FUNCS(0, 3, sse2); @@ -2806,7 +2836,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) H264_QPEL_FUNCS(3, 3, sse2); } #if HAVE_SSSE3 - if(mm_flags & FF_MM_SSSE3){ + if(mm_flags & AV_CPU_FLAG_SSSE3){ H264_QPEL_FUNCS(1, 0, ssse3); H264_QPEL_FUNCS(1, 1, ssse3); H264_QPEL_FUNCS(1, 2, ssse3); @@ -2819,16 +2849,16 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) H264_QPEL_FUNCS(3, 1, ssse3); 
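/* Illustrative sketch, not part of the upstream change: the chroma MC
 * pointers repointed to the ff_*-prefixed yasm functions keep their old
 * indexing, visible from the assignments in this file: index 0 = 8x8,
 * 1 = 4x4, 2 = 2x2 blocks, and mx/my are the 1/8-pel fractions in 0..7.
 * A hedged usage example; mc_chroma_example and its arguments are made up:
 *
 *     static void mc_chroma_example(DSPContext *c, uint8_t *dst, uint8_t *src,
 *                                   int stride, int h, int mx, int my)
 *     {
 *         // put_* overwrites dst; avg_* averages with the prediction in dst
 *         c->put_h264_chroma_pixels_tab[0](dst, src, stride, h, mx, my);
 *     }
 */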
H264_QPEL_FUNCS(3, 2, ssse3); H264_QPEL_FUNCS(3, 3, ssse3); - c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_vc1_chroma_mc8_ssse3_nornd; - c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_vc1_chroma_mc8_ssse3_nornd; - c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd; - c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd; - c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3; - c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3; c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3; #if HAVE_YASM + c->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_ssse3_nornd; + c->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_ssse3_nornd; + c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd; + c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd; + c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3; + c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3; c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3; - if (mm_flags & FF_MM_SSE4) // not really sse4, just slow on Conroe + if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4; #endif } @@ -2838,7 +2868,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) * todo: test if it still causes crashes */ #if ARCH_X86_32 - if(mm_flags & FF_MM_3DNOW){ + if(mm_flags & AV_CPU_FLAG_3DNOW){ c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; c->vector_fmul = vector_fmul_3dnow; if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ @@ -2846,14 +2876,14 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->float_to_int16_interleave = float_to_int16_interleave_3dnow; } } - if(mm_flags & FF_MM_3DNOWEXT){ + if(mm_flags & AV_CPU_FLAG_3DNOWEXT){ c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; c->vector_fmul_window = vector_fmul_window_3dnow2; if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ c->float_to_int16_interleave = float_to_int16_interleave_3dn2; } } - if(mm_flags & FF_MM_SSE){ + if(mm_flags & AV_CPU_FLAG_SSE){ c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; c->ac3_downmix = ac3_downmix_sse; c->vector_fmul = vector_fmul_sse; @@ -2869,9 +2899,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) #endif #endif } - if(mm_flags & FF_MM_3DNOW) + if(mm_flags & AV_CPU_FLAG_3DNOW) c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse - if(mm_flags & FF_MM_SSE2){ + if(mm_flags & AV_CPU_FLAG_SSE2){ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; c->float_to_int16 = float_to_int16_sse2; c->float_to_int16_interleave = float_to_int16_interleave_sse2; @@ -2883,92 +2913,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) dsputilenc_init_mmx(c, avctx); } -#if CONFIG_H264DSP -void ff_h264dsp_init_x86(H264DSPContext *c) -{ - int mm_flags = mm_support(); - - if (mm_flags & FF_MM_MMX) { - c->h264_idct_dc_add= - c->h264_idct_add= ff_h264_idct_add_mmx; - c->h264_idct8_dc_add= - c->h264_idct8_add= ff_h264_idct8_add_mmx; - - c->h264_idct_add16 = ff_h264_idct_add16_mmx; - c->h264_idct8_add4 = ff_h264_idct8_add4_mmx; - c->h264_idct_add8 = ff_h264_idct_add8_mmx; - c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx; - - if (mm_flags & FF_MM_MMX2) { - c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; - c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; - c->h264_idct_add16 = ff_h264_idct_add16_mmx2; - c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2; - c->h264_idct_add8 = ff_h264_idct_add8_mmx2; - 
c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; - - c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2; - c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2; - c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2; - c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; - c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; - c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; - c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; - - c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; - c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; - c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; - c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; - c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; - c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; - c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; - c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; - - c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; - c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; - c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; - c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; - c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; - c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; - c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; - c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; - } - if(mm_flags & FF_MM_SSE2){ - c->h264_idct8_add = ff_h264_idct8_add_sse2; - c->h264_idct8_add4= ff_h264_idct8_add4_sse2; - } - -#if HAVE_YASM - if (mm_flags & FF_MM_MMX2){ -#if ARCH_X86_32 - c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext; - c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext; -#endif - if( mm_flags&FF_MM_SSE2 ){ - c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2; - c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2; -#if ARCH_X86_64 || !defined(__ICC) || __ICC > 1110 - c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2; - c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2; - c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2; - c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2; -#endif -#if CONFIG_GPL - c->h264_idct_add16 = ff_h264_idct_add16_sse2; - c->h264_idct_add8 = ff_h264_idct_add8_sse2; - c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2; -#endif - } - if ( mm_flags&FF_MM_SSSE3 ){ - c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3; - c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3; - } - } -#endif - } -} -#endif /* CONFIG_H264DSP */ - const char* avcodec_get_current_idct_mmx(AVCodecContext *avctx,DSPContext *c) { if (c->idct_put==ff_idct_xvid_mmx_put) diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.h index 33dafed1f..58256fd40 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.h +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.h @@ -57,7 +57,7 @@ extern const uint64_t ff_pb_7; extern const uint64_t ff_pb_1F; extern const uint64_t ff_pb_3F; extern const uint64_t ff_pb_81; -extern const uint64_t ff_pb_A1; +extern const xmm_reg ff_pb_A1; extern const xmm_reg ff_pb_F8; extern const uint64_t ff_pb_FC; extern const xmm_reg ff_pb_FE; @@ -94,6 +94,35 @@ extern const double ff_pd_2[2]; SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\ 
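/* Illustrative sketch, not part of the upstream change: transpose4x4 below is
 * moved verbatim out of dsputil_mmx.c so the x86 units that include this
 * header can share it. A plain-C reference of what the MMX code computes;
 * transpose4x4_ref is an editor-supplied name, not present in the tree:
 *
 *     static inline void transpose4x4_ref(uint8_t *dst, const uint8_t *src,
 *                                         int dst_stride, int src_stride)
 *     {
 *         int i, j;
 *         for (i = 0; i < 4; i++)          // dst row i receives src column i
 *             for (j = 0; j < 4; j++)
 *                 dst[i*dst_stride + j] = src[j*src_stride + i];
 *     }
 */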
SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */ +static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ + __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ... + "movd %4, %%mm0 \n\t" + "movd %5, %%mm1 \n\t" + "movd %6, %%mm2 \n\t" + "movd %7, %%mm3 \n\t" + "punpcklbw %%mm1, %%mm0 \n\t" + "punpcklbw %%mm3, %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "punpcklwd %%mm2, %%mm0 \n\t" + "punpckhwd %%mm2, %%mm1 \n\t" + "movd %%mm0, %0 \n\t" + "punpckhdq %%mm0, %%mm0 \n\t" + "movd %%mm0, %1 \n\t" + "movd %%mm1, %2 \n\t" + "punpckhdq %%mm1, %%mm1 \n\t" + "movd %%mm1, %3 \n\t" + + : "=m" (*(uint32_t*)(dst + 0*dst_stride)), + "=m" (*(uint32_t*)(dst + 1*dst_stride)), + "=m" (*(uint32_t*)(dst + 2*dst_stride)), + "=m" (*(uint32_t*)(dst + 3*dst_stride)) + : "m" (*(uint32_t*)(src + 0*src_stride)), + "m" (*(uint32_t*)(src + 1*src_stride)), + "m" (*(uint32_t*)(src + 2*src_stride)), + "m" (*(uint32_t*)(src + 3*src_stride)) + ); +} + // e,f,g,h can be memory // out: a,d,t,c #define TRANSPOSE8x4(a,b,c,d,e,f,g,h,t)\ @@ -158,9 +187,9 @@ extern const double ff_pd_2[2]; void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx); void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx); -void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); -void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); -void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); +void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); +void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); +void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/fft.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/fft.c index eb5c65ecb..771b1e664 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/fft.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/fft.c @@ -16,25 +16,26 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "libavutil/cpu.h" #include "libavcodec/dsputil.h" #include "fft.h" av_cold void ff_fft_init_mmx(FFTContext *s) { #if HAVE_YASM - int has_vectors = mm_support(); - if (has_vectors & FF_MM_SSE && HAVE_SSE) { + int has_vectors = av_get_cpu_flags(); + if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) { /* SSE for P3/P4/K8 */ s->imdct_calc = ff_imdct_calc_sse; s->imdct_half = ff_imdct_half_sse; s->fft_permute = ff_fft_permute_sse; s->fft_calc = ff_fft_calc_sse; - } else if (has_vectors & FF_MM_3DNOWEXT && HAVE_AMD3DNOWEXT) { + } else if (has_vectors & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) { /* 3DNowEx for K7 */ s->imdct_calc = ff_imdct_calc_3dn2; s->imdct_half = ff_imdct_half_3dn2; s->fft_calc = ff_fft_calc_3dn2; - } else if (has_vectors & FF_MM_3DNOW && HAVE_AMD3DNOW) { + } else if (has_vectors & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) { /* 3DNow! 
for K6-2/3 */ s->imdct_calc = ff_imdct_calc_3dn; s->imdct_half = ff_imdct_half_3dn; @@ -46,8 +47,8 @@ av_cold void ff_fft_init_mmx(FFTContext *s) #if CONFIG_DCT av_cold void ff_dct_init_mmx(DCTContext *s) { - int has_vectors = mm_support(); - if (has_vectors & FF_MM_SSE && HAVE_SSE) + int has_vectors = av_get_cpu_flags(); + if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) s->dct32 = ff_dct32_float_sse; } #endif diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/fft_mmx.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/fft_mmx.asm index 31176d6c9..b75ec0cc5 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/fft_mmx.asm +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/fft_mmx.asm @@ -532,20 +532,15 @@ INIT_XMM unpckhps xmm0, xmm2 %endmacro -%macro PREROTATEW 3 ;addr1, addr2, xmm - movlps %1, %3 - movhps %2, %3 -%endmacro - %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5 movaps xmm6, [%4+%1*2] movaps %2, [%4+%1*2+0x10] movaps %3, xmm6 movaps xmm7, %2 - mulps xmm6, [%5+%1*1] - mulps %2, [%6+%1*1] - mulps %3, [%6+%1*1] - mulps xmm7, [%5+%1*1] + mulps xmm6, [%5+%1] + mulps %2, [%6+%1] + mulps %3, [%6+%1] + mulps xmm7, [%5+%1] subps %2, xmm6 addps %3, xmm7 %endmacro @@ -576,8 +571,6 @@ cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample %define rrevtab r10 %define rtcos r11 %define rtsin r12 - push r10 - push r11 push r12 push r13 push r14 @@ -620,21 +613,25 @@ cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample PREROTATER r4, r3, r2, rtcos, rtsin %ifdef ARCH_X86_64 - movzx r5, word [rrevtab+r4*1-4] - movzx r6, word [rrevtab+r4*1-2] - movzx r13, word [rrevtab+r3*1] - movzx r14, word [rrevtab+r3*1+2] - PREROTATEW [r1+r5 *8], [r1+r6 *8], xmm0 - PREROTATEW [r1+r13*8], [r1+r14*8], xmm1 + movzx r5, word [rrevtab+r4-4] + movzx r6, word [rrevtab+r4-2] + movzx r13, word [rrevtab+r3] + movzx r14, word [rrevtab+r3+2] + movlps [r1+r5 *8], xmm0 + movhps [r1+r6 *8], xmm0 + movlps [r1+r13*8], xmm1 + movhps [r1+r14*8], xmm1 add r4, 4 %else mov r6, [esp] - movzx r5, word [r6+r4*1-4] - movzx r4, word [r6+r4*1-2] - PREROTATEW [r1+r5*8], [r1+r4*8], xmm0 - movzx r5, word [r6+r3*1] - movzx r4, word [r6+r3*1+2] - PREROTATEW [r1+r5*8], [r1+r4*8], xmm1 + movzx r5, word [r6+r4-4] + movzx r4, word [r6+r4-2] + movlps [r1+r5*8], xmm0 + movhps [r1+r4*8], xmm0 + movzx r5, word [r6+r3] + movzx r4, word [r6+r3+2] + movlps [r1+r5*8], xmm1 + movhps [r1+r4*8], xmm1 %endif sub r3, 4 jns .pre @@ -663,8 +660,6 @@ cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample pop r14 pop r13 pop r12 - pop r11 - pop r10 %else add esp, 12 %endif diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_chromamc.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_chromamc.asm new file mode 100644 index 000000000..6df82cc52 --- /dev/null +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_chromamc.asm @@ -0,0 +1,671 @@ +;****************************************************************************** +;* MMX/SSSE3-optimized functions for H264 chroma MC +;* Copyright (c) 2005 Zoltan Hidvegi , +;* 2005-2008 Loren Merritt +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. 
+;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA + +rnd_rv40_2d_tbl: times 4 dw 0 + times 4 dw 16 + times 4 dw 32 + times 4 dw 16 + times 4 dw 32 + times 4 dw 28 + times 4 dw 32 + times 4 dw 28 + times 4 dw 0 + times 4 dw 32 + times 4 dw 16 + times 4 dw 32 + times 4 dw 32 + times 4 dw 28 + times 4 dw 32 + times 4 dw 28 +rnd_rv40_1d_tbl: times 4 dw 0 + times 4 dw 2 + times 4 dw 4 + times 4 dw 2 + times 4 dw 4 + times 4 dw 3 + times 4 dw 4 + times 4 dw 3 + times 4 dw 0 + times 4 dw 4 + times 4 dw 2 + times 4 dw 4 + times 4 dw 4 + times 4 dw 3 + times 4 dw 4 + times 4 dw 3 + +cextern pw_3 +cextern pw_4 +cextern pw_8 +cextern pw_28 +cextern pw_32 +cextern pw_64 + +SECTION .text + +%macro mv0_pixels_mc8 0 + lea r4, [r2*2 ] +.next4rows + movq mm0, [r1 ] + movq mm1, [r1+r2] + CHROMAMC_AVG mm0, [r0 ] + CHROMAMC_AVG mm1, [r0+r2] + movq [r0 ], mm0 + movq [r0+r2], mm1 + add r0, r4 + add r1, r4 + movq mm0, [r1 ] + movq mm1, [r1+r2] + CHROMAMC_AVG mm0, [r0 ] + CHROMAMC_AVG mm1, [r0+r2] + add r1, r4 + movq [r0 ], mm0 + movq [r0+r2], mm1 + add r0, r4 + sub r3d, 4 + jne .next4rows +%endmacro + +%macro chroma_mc8_mmx_func 3 +; put/avg_h264_chroma_mc8_mmx_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/, +; int stride, int h, int mx, int my) +cglobal %1_%2_chroma_mc8_%3, 6, 7, 0 +%ifdef ARCH_X86_64 + movsxd r2, r2d +%endif + mov r6d, r5d + or r6d, r4d + jne .at_least_one_non_zero + ; mx == 0 AND my == 0 - no filter needed + mv0_pixels_mc8 + REP_RET + +.at_least_one_non_zero +%ifidn %2, rv40 +%ifdef PIC +%define rnd_1d_rv40 r11 +%define rnd_2d_rv40 r11 +%else ; no-PIC +%define rnd_1d_rv40 rnd_rv40_1d_tbl +%define rnd_2d_rv40 rnd_rv40_2d_tbl +%endif +%ifdef ARCH_X86_64 + mov r10, r5 + and r10, 6 ; &~1 for mx/my=[0,7] + lea r10, [r10*4+r4] + sar r10d, 1 +%define rnd_bias r10 +%define dest_reg r0 +%else ; x86-32 + mov r0, r5 + and r0, 6 ; &~1 for mx/my=[0,7] + lea r0, [r0*4+r4] + sar r0d, 1 +%define rnd_bias r0 +%define dest_reg r5 +%endif +%else ; vc1, h264 +%define rnd_bias 0 +%define dest_reg r0 +%endif + + test r5d, r5d + mov r6, 1 + je .my_is_zero + test r4d, r4d + mov r6, r2 ; dxy = x ? 
1 : stride + jne .both_non_zero +.my_is_zero + ; mx == 0 XOR my == 0 - 1 dimensional filter only + or r4d, r5d ; x + y + +%ifidn %2, rv40 +%ifdef PIC + lea r11, [rnd_rv40_1d_tbl] +%endif +%ifndef ARCH_X86_64 + mov r5, r0m +%endif +%endif + + movd m5, r4d + movq m4, [pw_8] + movq m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3 + punpcklwd m5, m5 + punpckldq m5, m5 ; mm5 = B = x + pxor m7, m7 + psubw m4, m5 ; mm4 = A = 8-x + +.next1drow + movq m0, [r1 ] ; mm0 = src[0..7] + movq m2, [r1+r6] ; mm1 = src[1..8] + + movq m1, m0 + movq m3, m2 + punpcklbw m0, m7 + punpckhbw m1, m7 + punpcklbw m2, m7 + punpckhbw m3, m7 + pmullw m0, m4 ; [mm0,mm1] = A * src[0..7] + pmullw m1, m4 + pmullw m2, m5 ; [mm2,mm3] = B * src[1..8] + pmullw m3, m5 + + paddw m0, m6 + paddw m1, m6 + paddw m0, m2 + paddw m1, m3 + psrlw m0, 3 + psrlw m1, 3 + packuswb m0, m1 + CHROMAMC_AVG m0, [dest_reg] + movq [dest_reg], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3 + + add dest_reg, r2 + add r1, r2 + dec r3d + jne .next1drow + REP_RET + +.both_non_zero ; general case, bilinear + movd m4, r4d ; x + movd m6, r5d ; y +%ifidn %2, rv40 +%ifdef PIC + lea r11, [rnd_rv40_2d_tbl] +%endif +%ifndef ARCH_X86_64 + mov r5, r0m +%endif +%endif + mov r6, rsp ; backup stack pointer + and rsp, ~(mmsize-1) ; align stack + sub rsp, 16 ; AA and DD + + punpcklwd m4, m4 + punpcklwd m6, m6 + punpckldq m4, m4 ; mm4 = x words + punpckldq m6, m6 ; mm6 = y words + movq m5, m4 + pmullw m4, m6 ; mm4 = x * y + psllw m5, 3 + psllw m6, 3 + movq m7, m5 + paddw m7, m6 + movq [rsp+8], m4 ; DD = x * y + psubw m5, m4 ; mm5 = B = 8x - xy + psubw m6, m4 ; mm6 = C = 8y - xy + paddw m4, [pw_64] + psubw m4, m7 ; mm4 = A = xy - (8x+8y) + 64 + pxor m7, m7 + movq [rsp ], m4 + + movq m0, [r1 ] ; mm0 = src[0..7] + movq m1, [r1+1] ; mm1 = src[1..8] +.next2drow + add r1, r2 + + movq m2, m0 + movq m3, m1 + punpckhbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpckhbw m3, m7 + pmullw m0, [rsp] + pmullw m2, [rsp] + pmullw m1, m5 + pmullw m3, m5 + paddw m2, m1 ; mm2 = A * src[0..3] + B * src[1..4] + paddw m3, m0 ; mm3 = A * src[4..7] + B * src[5..8] + + movq m0, [r1] + movq m1, m0 + punpcklbw m0, m7 + punpckhbw m1, m7 + pmullw m0, m6 + pmullw m1, m6 + paddw m2, m0 + paddw m3, m1 ; [mm2,mm3] += C * src[0..7] + + movq m1, [r1+1] + movq m0, m1 + movq m4, m1 + punpcklbw m0, m7 + punpckhbw m4, m7 + pmullw m0, [rsp+8] + pmullw m4, [rsp+8] + paddw m2, m0 + paddw m3, m4 ; [mm2,mm3] += D * src[1..8] + movq m0, [r1] + + paddw m2, [rnd_2d_%2+rnd_bias*8] + paddw m3, [rnd_2d_%2+rnd_bias*8] + psrlw m2, 6 + psrlw m3, 6 + packuswb m2, m3 + CHROMAMC_AVG m2, [dest_reg] + movq [dest_reg], m2 ; dst[0..7] = ([mm2,mm3] + rnd) >> 6 + + add dest_reg, r2 + dec r3d + jne .next2drow + mov rsp, r6 ; restore stack pointer + RET +%endmacro + +%macro chroma_mc4_mmx_func 3 +cglobal %1_%2_chroma_mc4_%3, 6, 6, 0 +%ifdef ARCH_X86_64 + movsxd r2, r2d +%endif + pxor m7, m7 + movd m2, r4d ; x + movd m3, r5d ; y + movq m4, [pw_8] + movq m5, [pw_8] + punpcklwd m2, m2 + punpcklwd m3, m3 + punpcklwd m2, m2 + punpcklwd m3, m3 + psubw m4, m2 + psubw m5, m3 + +%ifidn %2, rv40 +%ifdef PIC + lea r11, [rnd_rv40_2d_tbl] +%define rnd_2d_rv40 r11 +%else +%define rnd_2d_rv40 rnd_rv40_2d_tbl +%endif + and r5, 6 ; &~1 for mx/my=[0,7] + lea r5, [r5*4+r4] + sar r5d, 1 +%define rnd_bias r5 +%else ; vc1, h264 +%define rnd_bias 0 +%endif + + movd m0, [r1 ] + movd m6, [r1+1] + add r1, r2 + punpcklbw m0, m7 + punpcklbw m6, m7 + pmullw m0, m4 + pmullw m6, m2 + paddw m6, m0 + +.next2rows + movd m0, [r1 ] + movd m1, [r1+1] 
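For reference, the A/B/C/D weights computed into mm4-mm6 and [rsp]/[rsp+8] in the 2-D path above implement the standard eighth-pel bilinear chroma interpolation. A plain-C model of what the mc8 loop computes (a sketch, not FFmpeg's actual C fallback; H.264 rounding shown, the rv40/vc1 variants differ only in the rounding constant):

/* mx,my are eighth-pel offsets in [0,7]. */
static void put_chroma_mc8_c(uint8_t *dst, const uint8_t *src,
                             int stride, int h, int mx, int my)
{
    const int A = (8 - mx) * (8 - my);   /* [rsp]   = xy-(8x+8y)+64 */
    const int B =      mx  * (8 - my);   /* mm5     = 8x-xy         */
    const int C = (8 - mx) *      my;    /* mm6     = 8y-xy         */
    const int D =      mx  *      my;    /* [rsp+8] = xy            */
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (A * src[x]          + B * src[x + 1] +
                      C * src[x + stride] + D * src[x + stride + 1] +
                      32) >> 6;          /* rnd_2d_h264 = pw_32 */
        dst += stride;
        src += stride;
    }
}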
+ add r1, r2 + punpcklbw m0, m7 + punpcklbw m1, m7 + pmullw m0, m4 + pmullw m1, m2 + paddw m1, m0 + movq m0, m1 + + pmullw m6, m5 + pmullw m1, m3 + paddw m6, [rnd_2d_%2+rnd_bias*8] + paddw m1, m6 + psrlw m1, 6 + packuswb m1, m1 + CHROMAMC_AVG4 m1, m6, [r0] + movd [r0], m1 + add r0, r2 + + movd m6, [r1 ] + movd m1, [r1+1] + add r1, r2 + punpcklbw m6, m7 + punpcklbw m1, m7 + pmullw m6, m4 + pmullw m1, m2 + paddw m1, m6 + movq m6, m1 + pmullw m0, m5 + pmullw m1, m3 + paddw m0, [rnd_2d_%2+rnd_bias*8] + paddw m1, m0 + psrlw m1, 6 + packuswb m1, m1 + CHROMAMC_AVG4 m1, m0, [r0] + movd [r0], m1 + add r0, r2 + sub r3d, 2 + jnz .next2rows + REP_RET +%endmacro + +%macro chroma_mc2_mmx_func 3 +cglobal %1_%2_chroma_mc2_%3, 6, 7, 0 +%ifdef ARCH_X86_64 + movsxd r2, r2d +%endif + + mov r6d, r4d + shl r4d, 16 + sub r4d, r6d + add r4d, 8 + imul r5d, r4d ; x*y<<16 | y*(8-x) + shl r4d, 3 + sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y) + + movd m5, r4d + movd m6, r5d + punpckldq m5, m5 ; mm5 = {A,B,A,B} + punpckldq m6, m6 ; mm6 = {C,D,C,D} + pxor m7, m7 + movd m2, [r1] + punpcklbw m2, m7 + pshufw m2, m2, 0x94 ; mm0 = src[0,1,1,2] + +.nextrow + add r1, r2 + movq m1, m2 + pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2] + movd m0, [r1] + punpcklbw m0, m7 + pshufw m0, m0, 0x94 ; mm0 = src[0,1,1,2] + movq m2, m0 + pmaddwd m0, m6 + paddw m1, [rnd_2d_%2] + paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2] + psrlw m1, 6 + packssdw m1, m7 + packuswb m1, m7 + CHROMAMC_AVG4 m1, m3, [r0] + movd r5d, m1 + mov [r0], r5w + add r0, r2 + sub r3d, 1 + jnz .nextrow + REP_RET +%endmacro + +%define rnd_1d_h264 pw_4 +%define rnd_2d_h264 pw_32 +%define rnd_1d_vc1 pw_3 +%define rnd_2d_vc1 pw_28 + +%macro NOTHING 2-3 +%endmacro +%macro DIRECT_AVG 2 + PAVG %1, %2 +%endmacro +%macro COPY_AVG 3 + movd %2, %3 + PAVG %1, %2 +%endmacro + +INIT_MMX +%define CHROMAMC_AVG NOTHING +%define CHROMAMC_AVG4 NOTHING +chroma_mc8_mmx_func put, h264, mmx_rnd +chroma_mc8_mmx_func put, vc1, mmx_nornd +chroma_mc8_mmx_func put, rv40, mmx +chroma_mc4_mmx_func put, h264, mmx +chroma_mc4_mmx_func put, rv40, mmx +chroma_mc2_mmx_func put, h264, mmx2 + +%define CHROMAMC_AVG DIRECT_AVG +%define CHROMAMC_AVG4 COPY_AVG +%define PAVG pavgb +chroma_mc8_mmx_func avg, h264, mmx2_rnd +chroma_mc8_mmx_func avg, vc1, mmx2_nornd +chroma_mc8_mmx_func avg, rv40, mmx2 +chroma_mc4_mmx_func avg, h264, mmx2 +chroma_mc4_mmx_func avg, rv40, mmx2 +chroma_mc2_mmx_func avg, h264, mmx2 + +%define PAVG pavgusb +chroma_mc8_mmx_func avg, h264, 3dnow_rnd +chroma_mc8_mmx_func avg, vc1, 3dnow_nornd +chroma_mc8_mmx_func avg, rv40, 3dnow +chroma_mc4_mmx_func avg, h264, 3dnow +chroma_mc4_mmx_func avg, rv40, 3dnow + +%macro chroma_mc8_ssse3_func 3 +cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 +%ifdef ARCH_X86_64 + movsxd r2, r2d +%endif + mov r6d, r5d + or r6d, r4d + jne .at_least_one_non_zero + ; mx == 0 AND my == 0 - no filter needed + mv0_pixels_mc8 + REP_RET + +.at_least_one_non_zero + test r5d, r5d + je .my_is_zero + test r4d, r4d + je .mx_is_zero + + ; general case, bilinear + mov r6d, r4d + shl r4d, 8 + sub r4, r6 + add r4, 8 ; x*288+8 = x<<8 | (8-x) + mov r6, 8 + sub r6d, r5d + imul r6, r4 ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x) + imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x) + + movd m7, r6d + movd m6, r4d + movdqa m5, [rnd_2d_%2] + pshuflw m7, m7, 0 + pshuflw m6, m6, 0 + movlhps m7, m7 + movlhps m6, m6 + + movq m0, [r1 ] + movq m1, [r1 +1] + punpcklbw m0, m1 + add r1, r2 +.next2rows + movq m1, [r1 ] + movq m2, [r1 +1] + movq m3, [r1+r2 ] + movq m4, [r1+r2+1] + lea r1, [r1+r2*2] + 
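The mc2 routine above packs two 16-bit taps per dword so that a single pmaddwd applies A+B (or C+D) per output pixel. A C sketch of the weight packing built in r4d/r5d, matching the inline comments:

#include <stdint.h>

/* r4d = x*(8-y)<<16 | (8-x)*(8-y) -> mm5 = {A,B,A,B}
 * r5d = x*y    <<16 | (8-x)*y     -> mm6 = {C,D,C,D} */
static void mc2_pack_weights(int mx, int my, uint32_t *AB, uint32_t *CD)
{
    *AB = ((uint32_t)(mx * (8 - my)) << 16) | (uint32_t)((8 - mx) * (8 - my));
    *CD = ((uint32_t)(mx *      my)  << 16) | (uint32_t)((8 - mx) *      my);
}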
punpcklbw m1, m2 + punpcklbw m3, m4 + movdqa m2, m1 + movdqa m4, m3 + pmaddubsw m0, m7 + pmaddubsw m1, m6 + pmaddubsw m2, m7 + pmaddubsw m3, m6 + paddw m0, m5 + paddw m2, m5 + paddw m1, m0 + paddw m3, m2 + movdqa m0, m4 + psrlw m1, 6 + psrlw m3, 6 +%ifidn %1, avg + movq m2, [r0 ] + movhps m2, [r0+r2] +%endif + packuswb m1, m3 + CHROMAMC_AVG m1, m2 + movq [r0 ], m1 + movhps [r0+r2], m1 + sub r3d, 2 + lea r0, [r0+r2*2] + jg .next2rows + REP_RET + +.my_is_zero + mov r5d, r4d + shl r4d, 8 + add r4, 8 + sub r4, r5 ; 255*x+8 = x<<8 | (8-x) + movd m7, r4d + movq m6, [rnd_1d_%2] + pshuflw m7, m7, 0 + movlhps m6, m6 + movlhps m7, m7 + +.next2xrows + movq m0, [r1 ] + movq m1, [r1 +1] + movq m2, [r1+r2 ] + movq m3, [r1+r2+1] + punpcklbw m0, m1 + punpcklbw m2, m3 + pmaddubsw m0, m7 + pmaddubsw m2, m7 +%ifidn %1, avg + movq m4, [r0 ] + movhps m4, [r0+r2] +%endif + paddw m0, m6 + paddw m2, m6 + psrlw m0, 3 + psrlw m2, 3 + packuswb m0, m2 + CHROMAMC_AVG m0, m4 + movq [r0 ], m0 + movhps [r0+r2], m0 + sub r3d, 2 + lea r0, [r0+r2*2] + lea r1, [r1+r2*2] + jg .next2xrows + REP_RET + +.mx_is_zero + mov r4d, r5d + shl r5d, 8 + add r5, 8 + sub r5, r4 ; 255*y+8 = y<<8 | (8-y) + movd m7, r5d + movq m6, [rnd_1d_%2] + pshuflw m7, m7, 0 + movlhps m6, m6 + movlhps m7, m7 + +.next2yrows + movq m0, [r1 ] + movq m1, [r1+r2 ] + movdqa m2, m1 + movq m3, [r1+r2*2] + punpcklbw m0, m1 + punpcklbw m2, m3 + pmaddubsw m0, m7 + pmaddubsw m2, m7 +%ifidn %1, avg + movq m4, [r0 ] + movhps m4, [r0+r2] +%endif + paddw m0, m6 + paddw m2, m6 + psrlw m0, 3 + psrlw m2, 3 + packuswb m0, m2 + CHROMAMC_AVG m0, m4 + movq [r0 ], m0 + movhps [r0+r2], m0 + sub r3d, 2 + lea r0, [r0+r2*2] + lea r1, [r1+r2*2] + jg .next2yrows + REP_RET +%endmacro + +%macro chroma_mc4_ssse3_func 3 +cglobal %1_%2_chroma_mc4_%3, 6, 7, 0 +%ifdef ARCH_X86_64 + movsxd r2, r2d +%endif + mov r6, r4 + shl r4d, 8 + sub r4d, r6d + add r4d, 8 ; x*288+8 + mov r6, 8 + sub r6d, r5d + imul r6d, r4d ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x) + imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x) + + movd m7, r6d + movd m6, r4d + movq m5, [pw_32] + pshufw m7, m7, 0 + pshufw m6, m6, 0 + + movd m0, [r1 ] + punpcklbw m0, [r1 +1] + add r1, r2 +.next2rows + movd m1, [r1 ] + movd m3, [r1+r2 ] + punpcklbw m1, [r1 +1] + punpcklbw m3, [r1+r2+1] + lea r1, [r1+r2*2] + movq m2, m1 + movq m4, m3 + pmaddubsw m0, m7 + pmaddubsw m1, m6 + pmaddubsw m2, m7 + pmaddubsw m3, m6 + paddw m0, m5 + paddw m2, m5 + paddw m1, m0 + paddw m3, m2 + movq m0, m4 + psrlw m1, 6 + psrlw m3, 6 + packuswb m1, m1 + packuswb m3, m3 + CHROMAMC_AVG m1, [r0 ] + CHROMAMC_AVG m3, [r0+r2] + movd [r0 ], m1 + movd [r0+r2], m3 + sub r3d, 2 + lea r0, [r0+r2*2] + jg .next2rows + REP_RET +%endmacro + +%define CHROMAMC_AVG NOTHING +INIT_XMM +chroma_mc8_ssse3_func put, h264, ssse3_rnd +chroma_mc8_ssse3_func put, vc1, ssse3_nornd +INIT_MMX +chroma_mc4_ssse3_func put, h264, ssse3 + +%define CHROMAMC_AVG DIRECT_AVG +%define PAVG pavgb +INIT_XMM +chroma_mc8_ssse3_func avg, h264, ssse3_rnd +chroma_mc8_ssse3_func avg, vc1, ssse3_nornd +INIT_MMX +chroma_mc4_ssse3_func avg, h264, ssse3 diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock.asm new file mode 100644 index 000000000..fb9cacfd1 --- /dev/null +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock.asm @@ -0,0 +1,889 @@ +;***************************************************************************** +;* MMX/SSE2-optimized H.264 deblocking code 
+;***************************************************************************** +;* Copyright (C) 2005-2008 x264 project +;* +;* Authors: Loren Merritt +;* Jason Garrett-Glaser +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA + +cextern pb_0 +cextern pb_1 +cextern pb_3 +cextern pb_A1 + +SECTION .text + +; expands to [base],...,[base+7*stride] +%define PASS8ROWS(base, base3, stride, stride3) \ + [base], [base+stride], [base+stride*2], [base3], \ + [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] + +; in: 8 rows of 4 bytes in %1..%8 +; out: 4 rows of 8 bytes in m0..m3 +%macro TRANSPOSE4x8_LOAD 8 + movd m0, %1 + movd m2, %2 + movd m1, %3 + movd m3, %4 + punpcklbw m0, m2 + punpcklbw m1, m3 + movq m2, m0 + punpcklwd m0, m1 + punpckhwd m2, m1 + + movd m4, %5 + movd m6, %6 + movd m5, %7 + movd m7, %8 + punpcklbw m4, m6 + punpcklbw m5, m7 + movq m6, m4 + punpcklwd m4, m5 + punpckhwd m6, m5 + + movq m1, m0 + movq m3, m2 + punpckldq m0, m4 + punpckhdq m1, m4 + punpckldq m2, m6 + punpckhdq m3, m6 +%endmacro + +; in: 4 rows of 8 bytes in m0..m3 +; out: 8 rows of 4 bytes in %1..%8 +%macro TRANSPOSE8x4_STORE 8 + movq m4, m0 + movq m5, m1 + movq m6, m2 + punpckhdq m4, m4 + punpckhdq m5, m5 + punpckhdq m6, m6 + + punpcklbw m0, m1 + punpcklbw m2, m3 + movq m1, m0 + punpcklwd m0, m2 + punpckhwd m1, m2 + movd %1, m0 + punpckhdq m0, m0 + movd %2, m0 + movd %3, m1 + punpckhdq m1, m1 + movd %4, m1 + + punpckhdq m3, m3 + punpcklbw m4, m5 + punpcklbw m6, m3 + movq m5, m4 + punpcklwd m4, m6 + punpckhwd m5, m6 + movd %5, m4 + punpckhdq m4, m4 + movd %6, m4 + movd %7, m5 + punpckhdq m5, m5 + movd %8, m5 +%endmacro + +%macro SBUTTERFLY3 4 + movq %4, %2 + punpckl%1 %2, %3 + punpckh%1 %4, %3 +%endmacro + +; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 +; out: 6 rows of 8 in [%9+0*16] .. 
[%9+5*16] +%macro TRANSPOSE6x8_MEM 9 + movq m0, %1 + movq m1, %2 + movq m2, %3 + movq m3, %4 + movq m4, %5 + movq m5, %6 + movq m6, %7 + SBUTTERFLY3 bw, m0, m1, m7 + SBUTTERFLY3 bw, m2, m3, m1 + SBUTTERFLY3 bw, m4, m5, m3 + movq [%9+0x10], m1 + SBUTTERFLY3 bw, m6, %8, m5 + SBUTTERFLY3 wd, m0, m2, m1 + SBUTTERFLY3 wd, m4, m6, m2 + punpckhdq m0, m4 + movq [%9+0x00], m0 + SBUTTERFLY3 wd, m7, [%9+0x10], m6 + SBUTTERFLY3 wd, m3, m5, m4 + SBUTTERFLY3 dq, m7, m3, m0 + SBUTTERFLY3 dq, m1, m2, m5 + punpckldq m6, m4 + movq [%9+0x10], m1 + movq [%9+0x20], m5 + movq [%9+0x30], m7 + movq [%9+0x40], m0 + movq [%9+0x50], m6 +%endmacro + +; in: 8 rows of 8 in %1..%8 +; out: 8 rows of 8 in %9..%16 +%macro TRANSPOSE8x8_MEM 16 + movq m0, %1 + movq m1, %2 + movq m2, %3 + movq m3, %4 + movq m4, %5 + movq m5, %6 + movq m6, %7 + SBUTTERFLY3 bw, m0, m1, m7 + SBUTTERFLY3 bw, m2, m3, m1 + SBUTTERFLY3 bw, m4, m5, m3 + SBUTTERFLY3 bw, m6, %8, m5 + movq %9, m3 + SBUTTERFLY3 wd, m0, m2, m3 + SBUTTERFLY3 wd, m4, m6, m2 + SBUTTERFLY3 wd, m7, m1, m6 + movq %11, m2 + movq m2, %9 + SBUTTERFLY3 wd, m2, m5, m1 + SBUTTERFLY3 dq, m0, m4, m5 + SBUTTERFLY3 dq, m7, m2, m4 + movq %9, m0 + movq %10, m5 + movq %13, m7 + movq %14, m4 + SBUTTERFLY3 dq, m3, %11, m0 + SBUTTERFLY3 dq, m6, m1, m5 + movq %11, m3 + movq %12, m0 + movq %15, m6 + movq %16, m5 +%endmacro + +; out: %4 = |%1-%2|>%3 +; clobbers: %5 +%macro DIFF_GT 5 + mova %5, %2 + mova %4, %1 + psubusb %5, %1 + psubusb %4, %2 + por %4, %5 + psubusb %4, %3 +%endmacro + +; out: %4 = |%1-%2|>%3 +; clobbers: %5 +%macro DIFF_GT2 5 + mova %5, %2 + mova %4, %1 + psubusb %5, %1 + psubusb %4, %2 + psubusb %5, %3 + psubusb %4, %3 + pcmpeqb %4, %5 +%endmacro + +%macro SPLATW 1 +%ifidn m0, xmm0 + pshuflw %1, %1, 0 + punpcklqdq %1, %1 +%else + pshufw %1, %1, 0 +%endif +%endmacro + +; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1 +; out: m5=beta-1, m7=mask, %3=alpha-1 +; clobbers: m4,m6 +%macro LOAD_MASK 2-3 + movd m4, %1 + movd m5, %2 + SPLATW m4 + SPLATW m5 + packuswb m4, m4 ; 16x alpha-1 + packuswb m5, m5 ; 16x beta-1 +%if %0>2 + mova %3, m4 +%endif + DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1 + DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1 + por m7, m4 + DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1 + por m7, m4 + pxor m6, m6 + pcmpeqb m7, m6 +%endmacro + +; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask) +; out: m1=p0' m2=q0' +; clobbers: m0,3-6 +%macro DEBLOCK_P0_Q0 0 + mova m5, m1 + pxor m5, m2 ; p0^q0 + pand m5, [pb_1] ; (p0^q0)&1 + pcmpeqb m4, m4 + pxor m3, m4 + pavgb m3, m0 ; (p1 - q1 + 256)>>1 + pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 + pxor m4, m1 + pavgb m4, m2 ; (q0 - p0 + 256)>>1 + pavgb m3, m5 + paddusb m3, m4 ; d+128+33 + mova m6, [pb_A1] + psubusb m6, m3 + psubusb m3, [pb_A1] + pminub m6, m7 + pminub m3, m7 + psubusb m1, m6 + psubusb m2, m3 + paddusb m1, m3 + paddusb m2, m6 +%endmacro + +; in: m1=p0 m2=q0 +; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp +; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) +; clobbers: q2, tmp, tc0 +%macro LUMA_Q1 6 + mova %6, m1 + pavgb %6, m2 + pavgb %2, %6 ; avg(p2,avg(p0,q0)) + pxor %6, %3 + pand %6, [pb_1] ; (p2^avg(p0,q0))&1 + psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 + mova %6, %1 + psubusb %6, %5 + paddusb %5, %1 + pmaxub %2, %6 + pminub %2, %5 + mova %4, %2 +%endmacro + +%ifdef ARCH_X86_64 +;----------------------------------------------------------------------------- +; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) 
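The DEBLOCK_P0_Q0 and LUMA_Q1 macros above evaluate the normal-strength (bS < 4) H.264 edge filter entirely with saturating byte arithmetic; LOAD_MASK's filtering condition is |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta. In plain C the p0/q0 update is (spec formula, one pixel column, sketch only):

#include "libavutil/common.h"

static void filter_p0_q0_c(uint8_t *p0, uint8_t *q0, int p1, int q1, int tc)
{
    /* delta = clip3(-tc, tc, ((q0-p0)*4 + (p1-q1) + 4) >> 3) */
    int delta  = av_clip(((*q0 - *p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
    int new_p0 = av_clip_uint8(*p0 + delta);
    *q0 = av_clip_uint8(*q0 - delta);
    *p0 = new_p0;
}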
+;----------------------------------------------------------------------------- +INIT_XMM +cglobal x264_deblock_v_luma_sse2, 5,5,10 + movd m8, [r4] ; tc0 + lea r4, [r1*3] + dec r2d ; alpha-1 + neg r4 + dec r3d ; beta-1 + add r4, r0 ; pix-3*stride + + mova m0, [r4+r1] ; p1 + mova m1, [r4+2*r1] ; p0 + mova m2, [r0] ; q0 + mova m3, [r0+r1] ; q1 + LOAD_MASK r2d, r3d + + punpcklbw m8, m8 + punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] + pcmpeqb m9, m9 + pcmpeqb m9, m8 + pandn m9, m7 + pand m8, m9 + + movdqa m3, [r4] ; p2 + DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 + pand m6, m9 + mova m7, m8 + psubb m7, m6 + pand m6, m8 + LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 + + movdqa m4, [r0+2*r1] ; q2 + DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 + pand m6, m9 + pand m8, m6 + psubb m7, m6 + mova m3, [r0+r1] + LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6 + + DEBLOCK_P0_Q0 + mova [r4+2*r1], m1 + mova [r0], m2 + RET + +;----------------------------------------------------------------------------- +; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +INIT_MMX +cglobal x264_deblock_h_luma_sse2, 5,7 + movsxd r10, r1d + lea r11, [r10+r10*2] + lea r6, [r0-4] + lea r5, [r0-4+r11] +%ifdef WIN64 + sub rsp, 0x98 + %define pix_tmp rsp+0x30 +%else + sub rsp, 0x68 + %define pix_tmp rsp +%endif + + ; transpose 6x16 -> tmp space + TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp + lea r6, [r6+r10*8] + lea r5, [r5+r10*8] + TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8 + + ; vertical filter + ; alpha, beta, tc0 are still in r2d, r3d, r4 + ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them + lea r0, [pix_tmp+0x30] + mov r1d, 0x10 +%ifdef WIN64 + mov [rsp+0x20], r4 +%endif + call x264_deblock_v_luma_sse2 + + ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) + add r6, 2 + add r5, 2 + movq m0, [pix_tmp+0x18] + movq m1, [pix_tmp+0x28] + movq m2, [pix_tmp+0x38] + movq m3, [pix_tmp+0x48] + TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) + + shl r10, 3 + sub r6, r10 + sub r5, r10 + shr r10, 3 + movq m0, [pix_tmp+0x10] + movq m1, [pix_tmp+0x20] + movq m2, [pix_tmp+0x30] + movq m3, [pix_tmp+0x40] + TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) + +%ifdef WIN64 + add rsp, 0x98 +%else + add rsp, 0x68 +%endif + RET + +%else + +%macro DEBLOCK_LUMA 3 +;----------------------------------------------------------------------------- +; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +cglobal x264_deblock_%2_luma_%1, 5,5 + lea r4, [r1*3] + dec r2 ; alpha-1 + neg r4 + dec r3 ; beta-1 + add r4, r0 ; pix-3*stride + %assign pad 2*%3+12-(stack_offset&15) + SUB esp, pad + + mova m0, [r4+r1] ; p1 + mova m1, [r4+2*r1] ; p0 + mova m2, [r0] ; q0 + mova m3, [r0+r1] ; q1 + LOAD_MASK r2, r3 + + mov r3, r4mp + movd m4, [r3] ; tc0 + punpcklbw m4, m4 + punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] + mova [esp+%3], m4 ; tc + pcmpeqb m3, m3 + pcmpgtb m4, m3 + pand m4, m7 + mova [esp], m4 ; mask + + mova m3, [r4] ; p2 + DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 + pand m6, m4 + pand m4, [esp+%3] ; tc + mova m7, m4 + psubb m7, m6 + pand m6, m4 + LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 + + mova m4, [r0+2*r1] ; q2 + DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 + mova m5, [esp] ; mask + pand 
m6, m5 + mova m5, [esp+%3] ; tc + pand m5, m6 + psubb m7, m6 + mova m3, [r0+r1] + LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6 + + DEBLOCK_P0_Q0 + mova [r4+2*r1], m1 + mova [r0], m2 + ADD esp, pad + RET + +;----------------------------------------------------------------------------- +; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +INIT_MMX +cglobal x264_deblock_h_luma_%1, 0,5 + mov r0, r0mp + mov r3, r1m + lea r4, [r3*3] + sub r0, 4 + lea r1, [r0+r4] + %assign pad 0x78-(stack_offset&15) + SUB esp, pad +%define pix_tmp esp+12 + + ; transpose 6x16 -> tmp space + TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp + lea r0, [r0+r3*8] + lea r1, [r1+r3*8] + TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8 + + ; vertical filter + lea r0, [pix_tmp+0x30] + PUSH dword r4m + PUSH dword r3m + PUSH dword r2m + PUSH dword 16 + PUSH dword r0 + call x264_deblock_%2_luma_%1 +%ifidn %2, v8 + add dword [esp ], 8 ; pix_tmp+0x38 + add dword [esp+16], 2 ; tc0+2 + call x264_deblock_%2_luma_%1 +%endif + ADD esp, 20 + + ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) + mov r0, r0mp + sub r0, 2 + lea r1, [r0+r4] + + movq m0, [pix_tmp+0x10] + movq m1, [pix_tmp+0x20] + movq m2, [pix_tmp+0x30] + movq m3, [pix_tmp+0x40] + TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + + lea r0, [r0+r3*8] + lea r1, [r1+r3*8] + movq m0, [pix_tmp+0x18] + movq m1, [pix_tmp+0x28] + movq m2, [pix_tmp+0x38] + movq m3, [pix_tmp+0x48] + TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + + ADD esp, pad + RET +%endmacro ; DEBLOCK_LUMA + +INIT_MMX +DEBLOCK_LUMA mmxext, v8, 8 +INIT_XMM +DEBLOCK_LUMA sse2, v, 16 + +%endif ; ARCH + + + +%macro LUMA_INTRA_P012 4 ; p0..p3 in memory + mova t0, p2 + mova t1, p0 + pavgb t0, p1 + pavgb t1, q0 + pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2 + mova t5, t1 + mova t2, p2 + mova t3, p0 + paddb t2, p1 + paddb t3, q0 + paddb t2, t3 + mova t3, t2 + mova t4, t2 + psrlw t2, 1 + pavgb t2, mpb_0 + pxor t2, t0 + pand t2, mpb_1 + psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4; + + mova t1, p2 + mova t2, p2 + pavgb t1, q1 + psubb t2, q1 + paddb t3, t3 + psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1 + pand t2, mpb_1 + psubb t1, t2 + pavgb t1, p1 + pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2 + psrlw t3, 2 + pavgb t3, mpb_0 + pxor t3, t1 + pand t3, mpb_1 + psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8 + + mova t3, p0 + mova t2, p0 + pxor t3, q1 + pavgb t2, q1 + pand t3, mpb_1 + psubb t2, t3 + pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4 + + pxor t1, t2 + pxor t2, p0 + pand t1, mask1p + pand t2, mask0 + pxor t1, t2 + pxor t1, p0 + mova %1, t1 ; store p0 + + mova t1, %4 ; p3 + mova t2, t1 + pavgb t1, p2 + paddb t2, p2 + pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4 + paddb t2, t2 + paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0 + psrlw t2, 2 + pavgb t2, mpb_0 + pxor t2, t1 + pand t2, mpb_1 + psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8 + + pxor t0, p1 + pxor t1, p2 + pand t0, mask1p + pand t1, mask1p + pxor t0, p1 + pxor t1, p2 + mova %2, t0 ; store p1 + mova %3, t1 ; store p2 +%endmacro + +%macro LUMA_INTRA_SWAP_PQ 0 + %define q1 m0 + %define q0 m1 + %define p0 m2 + %define p1 m3 + %define p2 q2 + %define mask1p mask1q +%endmacro + +%macro DEBLOCK_LUMA_INTRA 2 + %define p1 m0 + %define p0 m1 + %define q0 m2 + %define q1 m3 + %define t0 m4 + %define t1 m5 + %define t2 m6 + %define t3 m7 +%ifdef ARCH_X86_64 + %define p2 m8 + %define q2 m9 + %define t4 m10 + %define t5 m11 + 
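LUMA_INTRA_P012 above computes the strong (bS = 4) intra filter with pavgb arithmetic; the integer formulas it implements, per its inline comments, are shown below for the p side (the q side is symmetric via LUMA_INTRA_SWAP_PQ). A sketch of one pixel column, threshold conditions omitted:

static void luma_intra_p012_c(uint8_t *pix, int stride)
{
    int p3 = pix[-4*stride], p2 = pix[-3*stride];
    int p1 = pix[-2*stride], p0 = pix[-1*stride];
    int q0 = pix[0],         q1 = pix[ 1*stride];

    pix[-1*stride] = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3; /* p0' */
    pix[-2*stride] = (p2 + p1 + p0 + q0 + 2)            >> 2; /* p1' */
    pix[-3*stride] = (2*p3 + 3*p2 + p1 + p0 + q0 + 4)   >> 3; /* p2' */
}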
%define mask0 m12 + %define mask1p m13 + %define mask1q [rsp-24] + %define mpb_0 m14 + %define mpb_1 m15 +%else + %define spill(x) [esp+16*x+((stack_offset+4)&15)] + %define p2 [r4+r1] + %define q2 [r0+2*r1] + %define t4 spill(0) + %define t5 spill(1) + %define mask0 spill(2) + %define mask1p spill(3) + %define mask1q spill(4) + %define mpb_0 [pb_0] + %define mpb_1 [pb_1] +%endif + +;----------------------------------------------------------------------------- +; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 +%ifndef ARCH_X86_64 + sub esp, 0x60 +%endif + lea r4, [r1*4] + lea r5, [r1*3] ; 3*stride + dec r2d ; alpha-1 + jl .end + neg r4 + dec r3d ; beta-1 + jl .end + add r4, r0 ; pix-4*stride + mova p1, [r4+2*r1] + mova p0, [r4+r5] + mova q0, [r0] + mova q1, [r0+r1] +%ifdef ARCH_X86_64 + pxor mpb_0, mpb_0 + mova mpb_1, [pb_1] + LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 + SWAP 7, 12 ; m12=mask0 + pavgb t5, mpb_0 + pavgb t5, mpb_1 ; alpha/4+1 + movdqa p2, [r4+r1] + movdqa q2, [r0+2*r1] + DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1 + DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1 + DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1 + pand t0, mask0 + pand t4, t0 + pand t2, t0 + mova mask1q, t4 + mova mask1p, t2 +%else + LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 + mova m4, t5 + mova mask0, m7 + pavgb m4, [pb_0] + pavgb m4, [pb_1] ; alpha/4+1 + DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 + pand m6, mask0 + DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 + pand m4, m6 + mova mask1p, m4 + DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1 + pand m4, m6 + mova mask1q, m4 +%endif + LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4] + LUMA_INTRA_SWAP_PQ + LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5] +.end: +%ifndef ARCH_X86_64 + add esp, 0x60 +%endif + RET + +INIT_MMX +%ifdef ARCH_X86_64 +;----------------------------------------------------------------------------- +; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal x264_deblock_h_luma_intra_%1, 4,7 + movsxd r10, r1d + lea r11, [r10*3] + lea r6, [r0-4] + lea r5, [r0-4+r11] + sub rsp, 0x88 + %define pix_tmp rsp + + ; transpose 8x16 -> tmp space + TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) + lea r6, [r6+r10*8] + lea r5, [r5+r10*8] + TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) + + lea r0, [pix_tmp+0x40] + mov r1, 0x10 + call x264_deblock_v_luma_intra_%1 + + ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) + lea r5, [r6+r11] + TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) + shl r10, 3 + sub r6, r10 + sub r5, r10 + shr r10, 3 + TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) + add rsp, 0x88 + RET +%else +cglobal x264_deblock_h_luma_intra_%1, 2,4 + lea r3, [r1*3] + sub r0, 4 + lea r2, [r0+r3] +%assign pad 0x8c-(stack_offset&15) + SUB rsp, pad + %define pix_tmp rsp + + ; transpose 8x16 -> tmp space + TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) + lea r0, [r0+r1*8] + lea r2, [r2+r1*8] + TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), 
PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) + + lea r0, [pix_tmp+0x40] + PUSH dword r3m + PUSH dword r2m + PUSH dword 16 + PUSH r0 + call x264_deblock_%2_luma_intra_%1 +%ifidn %2, v8 + add dword [rsp], 8 ; pix_tmp+8 + call x264_deblock_%2_luma_intra_%1 +%endif + ADD esp, 16 + + mov r1, r1m + mov r0, r0mp + lea r3, [r1*3] + sub r0, 4 + lea r2, [r0+r3] + ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) + TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) + lea r0, [r0+r1*8] + lea r2, [r2+r1*8] + TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) + ADD rsp, pad + RET +%endif ; ARCH_X86_64 +%endmacro ; DEBLOCK_LUMA_INTRA + +INIT_XMM +DEBLOCK_LUMA_INTRA sse2, v +%ifndef ARCH_X86_64 +INIT_MMX +DEBLOCK_LUMA_INTRA mmxext, v8 +%endif + + + +INIT_MMX + +%macro CHROMA_V_START 0 + dec r2d ; alpha-1 + dec r3d ; beta-1 + mov t5, r0 + sub t5, r1 + sub t5, r1 +%endmacro + +%macro CHROMA_H_START 0 + dec r2d + dec r3d + sub r0, 2 + lea t6, [r1*3] + mov t5, r0 + add r0, t6 +%endmacro + +%define t5 r5 +%define t6 r6 + +;----------------------------------------------------------------------------- +; void x264_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +cglobal x264_deblock_v_chroma_mmxext, 5,6 + CHROMA_V_START + movq m0, [t5] + movq m1, [t5+r1] + movq m2, [r0] + movq m3, [r0+r1] + call x264_chroma_inter_body_mmxext + movq [t5+r1], m1 + movq [r0], m2 + RET + +;----------------------------------------------------------------------------- +; void x264_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +cglobal x264_deblock_h_chroma_mmxext, 5,7 +%ifdef ARCH_X86_64 + %define buf0 [rsp-24] + %define buf1 [rsp-16] +%else + %define buf0 r0m + %define buf1 r2m +%endif + CHROMA_H_START + TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) + movq buf0, m0 + movq buf1, m3 + call x264_chroma_inter_body_mmxext + movq m0, buf0 + movq m3, buf1 + TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + RET + +ALIGN 16 +x264_chroma_inter_body_mmxext: + LOAD_MASK r2d, r3d + movd m6, [r4] ; tc0 + punpcklbw m6, m6 + pand m7, m6 + DEBLOCK_P0_Q0 + ret + + + +; in: %1=p0 %2=p1 %3=q1 +; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 +%macro CHROMA_INTRA_P0 3 + movq m4, %1 + pxor m4, %3 + pand m4, [pb_1] ; m4 = (p0^q1)&1 + pavgb %1, %3 + psubusb %1, m4 + pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) +%endmacro + +%define t5 r4 +%define t6 r5 + +;----------------------------------------------------------------------------- +; void x264_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal x264_deblock_v_chroma_intra_mmxext, 4,5 + CHROMA_V_START + movq m0, [t5] + movq m1, [t5+r1] + movq m2, [r0] + movq m3, [r0+r1] + call x264_chroma_intra_body_mmxext + movq [t5+r1], m1 + movq [r0], m2 + RET + +;----------------------------------------------------------------------------- +; void x264_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal x264_deblock_h_chroma_intra_mmxext, 4,6 + CHROMA_H_START + TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) + call x264_chroma_intra_body_mmxext + TRANSPOSE8x4_STORE PASS8ROWS(t5, 
r0, r1, t6) + RET + +ALIGN 16 +x264_chroma_intra_body_mmxext: + LOAD_MASK r2d, r3d + movq m5, m1 + movq m6, m2 + CHROMA_INTRA_P0 m1, m0, m3 + CHROMA_INTRA_P0 m2, m3, m0 + psubb m1, m5 + psubb m2, m6 + pand m1, m7 + pand m2, m7 + paddb m1, m5 + paddb m2, m6 + ret diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock_sse2.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock_sse2.asm deleted file mode 100644 index a9e6dea3d..000000000 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock_sse2.asm +++ /dev/null @@ -1,761 +0,0 @@ -;***************************************************************************** -;* MMX/SSE2-optimized H.264 deblocking code -;***************************************************************************** -;* Copyright (C) 2005-2008 x264 project -;* -;* Authors: Loren Merritt -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "x86inc.asm" - -SECTION_RODATA -pb_00: times 16 db 0x00 -pb_01: times 16 db 0x01 -pb_03: times 16 db 0x03 -pb_a1: times 16 db 0xa1 - -SECTION .text - -; expands to [base],...,[base+7*stride] -%define PASS8ROWS(base, base3, stride, stride3) \ - [base], [base+stride], [base+stride*2], [base3], \ - [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] - -; in: 8 rows of 4 bytes in %1..%8 -; out: 4 rows of 8 bytes in m0..m3 -%macro TRANSPOSE4x8_LOAD 8 - movd m0, %1 - movd m2, %2 - movd m1, %3 - movd m3, %4 - punpcklbw m0, m2 - punpcklbw m1, m3 - movq m2, m0 - punpcklwd m0, m1 - punpckhwd m2, m1 - - movd m4, %5 - movd m6, %6 - movd m5, %7 - movd m7, %8 - punpcklbw m4, m6 - punpcklbw m5, m7 - movq m6, m4 - punpcklwd m4, m5 - punpckhwd m6, m5 - - movq m1, m0 - movq m3, m2 - punpckldq m0, m4 - punpckhdq m1, m4 - punpckldq m2, m6 - punpckhdq m3, m6 -%endmacro - -; in: 4 rows of 8 bytes in m0..m3 -; out: 8 rows of 4 bytes in %1..%8 -%macro TRANSPOSE8x4_STORE 8 - movq m4, m0 - movq m5, m1 - movq m6, m2 - punpckhdq m4, m4 - punpckhdq m5, m5 - punpckhdq m6, m6 - - punpcklbw m0, m1 - punpcklbw m2, m3 - movq m1, m0 - punpcklwd m0, m2 - punpckhwd m1, m2 - movd %1, m0 - punpckhdq m0, m0 - movd %2, m0 - movd %3, m1 - punpckhdq m1, m1 - movd %4, m1 - - punpckhdq m3, m3 - punpcklbw m4, m5 - punpcklbw m6, m3 - movq m5, m4 - punpcklwd m4, m6 - punpckhwd m5, m6 - movd %5, m4 - punpckhdq m4, m4 - movd %6, m4 - movd %7, m5 - punpckhdq m5, m5 - movd %8, m5 -%endmacro - -%macro SBUTTERFLY 4 - movq %4, %2 - punpckl%1 %2, %3 - punpckh%1 %4, %3 -%endmacro - -; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 -; out: 6 rows of 8 in [%9+0*16] .. 
[%9+5*16] -%macro TRANSPOSE6x8_MEM 9 - movq m0, %1 - movq m1, %2 - movq m2, %3 - movq m3, %4 - movq m4, %5 - movq m5, %6 - movq m6, %7 - SBUTTERFLY bw, m0, m1, m7 - SBUTTERFLY bw, m2, m3, m1 - SBUTTERFLY bw, m4, m5, m3 - movq [%9+0x10], m1 - SBUTTERFLY bw, m6, %8, m5 - SBUTTERFLY wd, m0, m2, m1 - SBUTTERFLY wd, m4, m6, m2 - punpckhdq m0, m4 - movq [%9+0x00], m0 - SBUTTERFLY wd, m7, [%9+0x10], m6 - SBUTTERFLY wd, m3, m5, m4 - SBUTTERFLY dq, m7, m3, m0 - SBUTTERFLY dq, m1, m2, m5 - punpckldq m6, m4 - movq [%9+0x10], m1 - movq [%9+0x20], m5 - movq [%9+0x30], m7 - movq [%9+0x40], m0 - movq [%9+0x50], m6 -%endmacro - -; in: 8 rows of 8 in %1..%8 -; out: 8 rows of 8 in %9..%16 -%macro TRANSPOSE8x8_MEM 16 - movq m0, %1 - movq m1, %2 - movq m2, %3 - movq m3, %4 - movq m4, %5 - movq m5, %6 - movq m6, %7 - SBUTTERFLY bw, m0, m1, m7 - SBUTTERFLY bw, m2, m3, m1 - SBUTTERFLY bw, m4, m5, m3 - SBUTTERFLY bw, m6, %8, m5 - movq %9, m3 - SBUTTERFLY wd, m0, m2, m3 - SBUTTERFLY wd, m4, m6, m2 - SBUTTERFLY wd, m7, m1, m6 - movq %11, m2 - movq m2, %9 - SBUTTERFLY wd, m2, m5, m1 - SBUTTERFLY dq, m0, m4, m5 - SBUTTERFLY dq, m7, m2, m4 - movq %9, m0 - movq %10, m5 - movq %13, m7 - movq %14, m4 - SBUTTERFLY dq, m3, %11, m0 - SBUTTERFLY dq, m6, m1, m5 - movq %11, m3 - movq %12, m0 - movq %15, m6 - movq %16, m5 -%endmacro - -; out: %4 = |%1-%2|>%3 -; clobbers: %5 -%macro DIFF_GT 5 - mova %5, %2 - mova %4, %1 - psubusb %5, %1 - psubusb %4, %2 - por %4, %5 - psubusb %4, %3 -%endmacro - -; out: %4 = |%1-%2|>%3 -; clobbers: %5 -%macro DIFF_GT2 5 - mova %5, %2 - mova %4, %1 - psubusb %5, %1 - psubusb %4, %2 - psubusb %5, %3 - psubusb %4, %3 - pcmpeqb %4, %5 -%endmacro - -%macro SPLATW 1 -%ifidn m0, xmm0 - pshuflw %1, %1, 0 - punpcklqdq %1, %1 -%else - pshufw %1, %1, 0 -%endif -%endmacro - -; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1 -; out: m5=beta-1, m7=mask, %3=alpha-1 -; clobbers: m4,m6 -%macro LOAD_MASK 2-3 - movd m4, %1 - movd m5, %2 - SPLATW m4 - SPLATW m5 - packuswb m4, m4 ; 16x alpha-1 - packuswb m5, m5 ; 16x beta-1 -%if %0>2 - mova %3, m4 -%endif - DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1 - DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1 - por m7, m4 - DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1 - por m7, m4 - pxor m6, m6 - pcmpeqb m7, m6 -%endmacro - -; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask) -; out: m1=p0' m2=q0' -; clobbers: m0,3-6 -%macro DEBLOCK_P0_Q0 0 - mova m5, m1 - pxor m5, m2 ; p0^q0 - pand m5, [pb_01] ; (p0^q0)&1 - pcmpeqb m4, m4 - pxor m3, m4 - pavgb m3, m0 ; (p1 - q1 + 256)>>1 - pavgb m3, [pb_03] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 - pxor m4, m1 - pavgb m4, m2 ; (q0 - p0 + 256)>>1 - pavgb m3, m5 - paddusb m3, m4 ; d+128+33 - mova m6, [pb_a1] - psubusb m6, m3 - psubusb m3, [pb_a1] - pminub m6, m7 - pminub m3, m7 - psubusb m1, m6 - psubusb m2, m3 - paddusb m1, m3 - paddusb m2, m6 -%endmacro - -; in: m1=p0 m2=q0 -; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp -; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) -; clobbers: q2, tmp, tc0 -%macro LUMA_Q1 6 - mova %6, m1 - pavgb %6, m2 - pavgb %2, %6 ; avg(p2,avg(p0,q0)) - pxor %6, %3 - pand %6, [pb_01] ; (p2^avg(p0,q0))&1 - psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 - mova %6, %1 - psubusb %6, %5 - paddusb %5, %1 - pmaxub %2, %6 - pminub %2, %5 - mova %4, %2 -%endmacro - -%ifdef ARCH_X86_64 -;----------------------------------------------------------------------------- -; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) 
-;----------------------------------------------------------------------------- -INIT_XMM -cglobal x264_deblock_v_luma_sse2, 5,5,10 - movd m8, [r4] ; tc0 - lea r4, [r1*3] - dec r2d ; alpha-1 - neg r4 - dec r3d ; beta-1 - add r4, r0 ; pix-3*stride - - mova m0, [r4+r1] ; p1 - mova m1, [r4+2*r1] ; p0 - mova m2, [r0] ; q0 - mova m3, [r0+r1] ; q1 - LOAD_MASK r2d, r3d - - punpcklbw m8, m8 - punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] - pcmpeqb m9, m9 - pcmpeqb m9, m8 - pandn m9, m7 - pand m8, m9 - - movdqa m3, [r4] ; p2 - DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 - pand m6, m9 - mova m7, m8 - psubb m7, m6 - pand m6, m8 - LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 - - movdqa m4, [r0+2*r1] ; q2 - DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 - pand m6, m9 - pand m8, m6 - psubb m7, m6 - mova m3, [r0+r1] - LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6 - - DEBLOCK_P0_Q0 - mova [r4+2*r1], m1 - mova [r0], m2 - RET - -;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -INIT_MMX -cglobal x264_deblock_h_luma_sse2, 5,7 - movsxd r10, r1d - lea r11, [r10+r10*2] - lea r6, [r0-4] - lea r5, [r0-4+r11] -%ifdef WIN64 - sub rsp, 0x98 - %define pix_tmp rsp+0x30 -%else - sub rsp, 0x68 - %define pix_tmp rsp -%endif - - ; transpose 6x16 -> tmp space - TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp - lea r6, [r6+r10*8] - lea r5, [r5+r10*8] - TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8 - - ; vertical filter - ; alpha, beta, tc0 are still in r2d, r3d, r4 - ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them - lea r0, [pix_tmp+0x30] - mov r1d, 0x10 -%ifdef WIN64 - mov [rsp+0x20], r4 -%endif - call x264_deblock_v_luma_sse2 - - ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) - add r6, 2 - add r5, 2 - movq m0, [pix_tmp+0x18] - movq m1, [pix_tmp+0x28] - movq m2, [pix_tmp+0x38] - movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) - - shl r10, 3 - sub r6, r10 - sub r5, r10 - shr r10, 3 - movq m0, [pix_tmp+0x10] - movq m1, [pix_tmp+0x20] - movq m2, [pix_tmp+0x30] - movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) - -%ifdef WIN64 - add rsp, 0x98 -%else - add rsp, 0x68 -%endif - RET - -%else - -%macro DEBLOCK_LUMA 3 -;----------------------------------------------------------------------------- -; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -cglobal x264_deblock_%2_luma_%1, 5,5 - lea r4, [r1*3] - dec r2 ; alpha-1 - neg r4 - dec r3 ; beta-1 - add r4, r0 ; pix-3*stride - %assign pad 2*%3+12-(stack_offset&15) - SUB esp, pad - - mova m0, [r4+r1] ; p1 - mova m1, [r4+2*r1] ; p0 - mova m2, [r0] ; q0 - mova m3, [r0+r1] ; q1 - LOAD_MASK r2, r3 - - mov r3, r4mp - movd m4, [r3] ; tc0 - punpcklbw m4, m4 - punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] - mova [esp+%3], m4 ; tc - pcmpeqb m3, m3 - pcmpgtb m4, m3 - pand m4, m7 - mova [esp], m4 ; mask - - mova m3, [r4] ; p2 - DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 - pand m6, m4 - pand m4, [esp+%3] ; tc - mova m7, m4 - psubb m7, m6 - pand m6, m4 - LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 - - mova m4, [r0+2*r1] ; q2 - DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 - mova m5, [esp] ; mask - pand 
m6, m5 - mova m5, [esp+%3] ; tc - pand m5, m6 - psubb m7, m6 - mova m3, [r0+r1] - LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6 - - DEBLOCK_P0_Q0 - mova [r4+2*r1], m1 - mova [r0], m2 - ADD esp, pad - RET - -;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) -;----------------------------------------------------------------------------- -INIT_MMX -cglobal x264_deblock_h_luma_%1, 0,5 - mov r0, r0mp - mov r3, r1m - lea r4, [r3*3] - sub r0, 4 - lea r1, [r0+r4] - %assign pad 0x78-(stack_offset&15) - SUB esp, pad -%define pix_tmp esp+12 - - ; transpose 6x16 -> tmp space - TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp - lea r0, [r0+r3*8] - lea r1, [r1+r3*8] - TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8 - - ; vertical filter - lea r0, [pix_tmp+0x30] - PUSH dword r4m - PUSH dword r3m - PUSH dword r2m - PUSH dword 16 - PUSH dword r0 - call x264_deblock_%2_luma_%1 -%ifidn %2, v8 - add dword [esp ], 8 ; pix_tmp+0x38 - add dword [esp+16], 2 ; tc0+2 - call x264_deblock_%2_luma_%1 -%endif - ADD esp, 20 - - ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) - mov r0, r0mp - sub r0, 2 - lea r1, [r0+r4] - - movq m0, [pix_tmp+0x10] - movq m1, [pix_tmp+0x20] - movq m2, [pix_tmp+0x30] - movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) - - lea r0, [r0+r3*8] - lea r1, [r1+r3*8] - movq m0, [pix_tmp+0x18] - movq m1, [pix_tmp+0x28] - movq m2, [pix_tmp+0x38] - movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) - - ADD esp, pad - RET -%endmacro ; DEBLOCK_LUMA - -INIT_XMM -DEBLOCK_LUMA sse2, v, 16 - -%endif ; ARCH - - - -%macro LUMA_INTRA_P012 4 ; p0..p3 in memory - mova t0, p2 - mova t1, p0 - pavgb t0, p1 - pavgb t1, q0 - pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2 - mova t5, t1 - mova t2, p2 - mova t3, p0 - paddb t2, p1 - paddb t3, q0 - paddb t2, t3 - mova t3, t2 - mova t4, t2 - psrlw t2, 1 - pavgb t2, mpb_00 - pxor t2, t0 - pand t2, mpb_01 - psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4; - - mova t1, p2 - mova t2, p2 - pavgb t1, q1 - psubb t2, q1 - paddb t3, t3 - psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1 - pand t2, mpb_01 - psubb t1, t2 - pavgb t1, p1 - pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2 - psrlw t3, 2 - pavgb t3, mpb_00 - pxor t3, t1 - pand t3, mpb_01 - psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8 - - mova t3, p0 - mova t2, p0 - pxor t3, q1 - pavgb t2, q1 - pand t3, mpb_01 - psubb t2, t3 - pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4 - - pxor t1, t2 - pxor t2, p0 - pand t1, mask1p - pand t2, mask0 - pxor t1, t2 - pxor t1, p0 - mova %1, t1 ; store p0 - - mova t1, %4 ; p3 - mova t2, t1 - pavgb t1, p2 - paddb t2, p2 - pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4 - paddb t2, t2 - paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0 - psrlw t2, 2 - pavgb t2, mpb_00 - pxor t2, t1 - pand t2, mpb_01 - psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8 - - pxor t0, p1 - pxor t1, p2 - pand t0, mask1p - pand t1, mask1p - pxor t0, p1 - pxor t1, p2 - mova %2, t0 ; store p1 - mova %3, t1 ; store p2 -%endmacro - -%macro LUMA_INTRA_SWAP_PQ 0 - %define q1 m0 - %define q0 m1 - %define p0 m2 - %define p1 m3 - %define p2 q2 - %define mask1p mask1q -%endmacro - -%macro DEBLOCK_LUMA_INTRA 2 - %define p1 m0 - %define p0 m1 - %define q0 m2 - %define q1 m3 - %define t0 m4 - %define t1 m5 - %define t2 m6 - %define t3 m7 -%ifdef ARCH_X86_64 - %define p2 m8 - %define q2 m9 - %define t4 m10 - %define t5 m11 - %define mask0 m12 - %define 
mask1p m13 - %define mask1q [rsp-24] - %define mpb_00 m14 - %define mpb_01 m15 -%else - %define spill(x) [esp+16*x+((stack_offset+4)&15)] - %define p2 [r4+r1] - %define q2 [r0+2*r1] - %define t4 spill(0) - %define t5 spill(1) - %define mask0 spill(2) - %define mask1p spill(3) - %define mask1q spill(4) - %define mpb_00 [pb_00] - %define mpb_01 [pb_01] -%endif - -;----------------------------------------------------------------------------- -; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) -;----------------------------------------------------------------------------- -cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 -%ifndef ARCH_X86_64 - sub esp, 0x60 -%endif - lea r4, [r1*4] - lea r5, [r1*3] ; 3*stride - dec r2d ; alpha-1 - jl .end - neg r4 - dec r3d ; beta-1 - jl .end - add r4, r0 ; pix-4*stride - mova p1, [r4+2*r1] - mova p0, [r4+r5] - mova q0, [r0] - mova q1, [r0+r1] -%ifdef ARCH_X86_64 - pxor mpb_00, mpb_00 - mova mpb_01, [pb_01] - LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 - SWAP 7, 12 ; m12=mask0 - pavgb t5, mpb_00 - pavgb t5, mpb_01 ; alpha/4+1 - movdqa p2, [r4+r1] - movdqa q2, [r0+2*r1] - DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1 - DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1 - DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1 - pand t0, mask0 - pand t4, t0 - pand t2, t0 - mova mask1q, t4 - mova mask1p, t2 -%else - LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 - mova m4, t5 - mova mask0, m7 - pavgb m4, [pb_00] - pavgb m4, [pb_01] ; alpha/4+1 - DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 - pand m6, mask0 - DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 - pand m4, m6 - mova mask1p, m4 - DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1 - pand m4, m6 - mova mask1q, m4 -%endif - LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4] - LUMA_INTRA_SWAP_PQ - LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5] -.end: -%ifndef ARCH_X86_64 - add esp, 0x60 -%endif - RET - -INIT_MMX -%ifdef ARCH_X86_64 -;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) -;----------------------------------------------------------------------------- -cglobal x264_deblock_h_luma_intra_%1, 4,7 - movsxd r10, r1d - lea r11, [r10*3] - lea r6, [r0-4] - lea r5, [r0-4+r11] - sub rsp, 0x88 - %define pix_tmp rsp - - ; transpose 8x16 -> tmp space - TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) - lea r6, [r6+r10*8] - lea r5, [r5+r10*8] - TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) - - lea r0, [pix_tmp+0x40] - mov r1, 0x10 - call x264_deblock_v_luma_intra_%1 - - ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) - lea r5, [r6+r11] - TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) - shl r10, 3 - sub r6, r10 - sub r5, r10 - shr r10, 3 - TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11) - add rsp, 0x88 - RET -%else -cglobal x264_deblock_h_luma_intra_%1, 2,4 - lea r3, [r1*3] - sub r0, 4 - lea r2, [r0+r3] -%assign pad 0x8c-(stack_offset&15) - SUB rsp, pad - %define pix_tmp rsp - - ; transpose 8x16 -> tmp space - TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) - lea r0, [r0+r1*8] - lea r2, [r2+r1*8] - TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, 
pix_tmp+0x38, 0x10, 0x30) - - lea r0, [pix_tmp+0x40] - PUSH dword r3m - PUSH dword r2m - PUSH dword 16 - PUSH r0 - call x264_deblock_%2_luma_intra_%1 -%ifidn %2, v8 - add dword [rsp], 8 ; pix_tmp+8 - call x264_deblock_%2_luma_intra_%1 -%endif - ADD esp, 16 - - mov r1, r1m - mov r0, r0mp - lea r3, [r1*3] - sub r0, 4 - lea r2, [r0+r3] - ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) - TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) - lea r0, [r0+r1*8] - lea r2, [r2+r1*8] - TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) - ADD rsp, pad - RET -%endif ; ARCH_X86_64 -%endmacro ; DEBLOCK_LUMA_INTRA - -INIT_XMM -DEBLOCK_LUMA_INTRA sse2, v -%ifndef ARCH_X86_64 -INIT_MMX -DEBLOCK_LUMA_INTRA mmxext, v8 -%endif diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_idct.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_idct.asm new file mode 100644 index 000000000..3311ab559 --- /dev/null +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_idct.asm @@ -0,0 +1,865 @@ +;***************************************************************************** +;* MMX/SSE2-optimized H.264 iDCT +;***************************************************************************** +;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt +;* Copyright (C) 2003-2008 x264 project +;* +;* Authors: Laurent Aimar +;* Loren Merritt +;* Holger Lubitz +;* Min Chen +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;***************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA + +; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split +scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8 + db 6+1*8, 7+1*8, 6+2*8, 7+2*8 + db 4+3*8, 5+3*8, 4+4*8, 5+4*8 + db 6+3*8, 7+3*8, 6+4*8, 7+4*8 + db 1+1*8, 2+1*8 + db 1+2*8, 2+2*8 + db 1+4*8, 2+4*8 + db 1+5*8, 2+5*8 +%ifdef PIC +%define scan8 r11 +%else +%define scan8 scan8_mem +%endif + +cextern pw_32 + +SECTION .text + +; %1=uint8_t *dst, %2=int16_t *block, %3=int stride +%macro IDCT4_ADD 3 + ; Load dct coeffs + movq m0, [%2] + movq m1, [%2+8] + movq m2, [%2+16] + movq m3, [%2+24] + + IDCT4_1D 0, 1, 2, 3, 4, 5 + mova m6, [pw_32] + TRANSPOSE4x4W 0, 1, 2, 3, 4 + paddw m0, m6 + IDCT4_1D 0, 1, 2, 3, 4, 5 + pxor m7, m7 + + STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3 + lea %1, [%1+%3*2] + STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3 +%endmacro + +INIT_MMX +; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride) +cglobal h264_idct_add_mmx, 3, 3, 0 + IDCT4_ADD r0, r1, r2 + RET + +%macro IDCT8_1D 2 + mova m4, m5 + mova m0, m1 + psraw m4, 1 + psraw m1, 1 + paddw m4, m5 + paddw m1, m0 + paddw m4, m7 + paddw m1, m5 + psubw m4, m0 + paddw m1, m3 + + psubw m0, m3 + psubw m5, m3 + paddw m0, m7 + psubw m5, m7 + psraw m3, 1 + psraw m7, 1 + psubw m0, m3 + psubw m5, m7 + + mova m3, m4 + mova m7, m1 + psraw m1, 2 + psraw m3, 2 + paddw m3, m0 + psraw m0, 2 + paddw m1, m5 + psraw m5, 2 + psubw m0, m4 + psubw m7, m5 + + mova m4, m2 + mova m5, m6 + psraw m4, 1 + psraw m6, 1 + psubw m4, m5 + paddw m6, m2 + + mova m2, %1 + mova m5, %2 + SUMSUB_BA m5, m2 + SUMSUB_BA m6, m5 + SUMSUB_BA m4, m2 + SUMSUB_BA m7, m6 + SUMSUB_BA m0, m4 + SUMSUB_BA m3, m2 + SUMSUB_BA m1, m5 + SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567 +%endmacro + +%macro IDCT8_1D_FULL 1 + mova m7, [%1+112] + mova m6, [%1+ 96] + mova m5, [%1+ 80] + mova m3, [%1+ 48] + mova m2, [%1+ 32] + mova m1, [%1+ 16] + IDCT8_1D [%1], [%1+ 64] +%endmacro + +; %1=int16_t *block, %2=int16_t *dstblock +%macro IDCT8_ADD_MMX_START 2 + IDCT8_1D_FULL %1 + mova [%1], m7 + TRANSPOSE4x4W 0, 1, 2, 3, 7 + mova m7, [%1] + mova [%2 ], m0 + mova [%2+16], m1 + mova [%2+32], m2 + mova [%2+48], m3 + TRANSPOSE4x4W 4, 5, 6, 7, 3 + mova [%2+ 8], m4 + mova [%2+24], m5 + mova [%2+40], m6 + mova [%2+56], m7 +%endmacro + +; %1=uint8_t *dst, %2=int16_t *block, %3=int stride +%macro IDCT8_ADD_MMX_END 3 + IDCT8_1D_FULL %2 + mova [%2 ], m5 + mova [%2+16], m6 + mova [%2+32], m7 + + pxor m7, m7 + STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3 + lea %1, [%1+%3*2] + STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3 + mova m0, [%2 ] + mova m1, [%2+16] + mova m2, [%2+32] + lea %1, [%1+%3*2] + STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3 + lea %1, [%1+%3*2] + STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3 +%endmacro + +INIT_MMX +; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) +cglobal h264_idct8_add_mmx, 3, 4, 0 + %assign pad 128+4-(stack_offset&7) + SUB rsp, pad + + add word [r1], 32 + IDCT8_ADD_MMX_START r1 , rsp + IDCT8_ADD_MMX_START r1+8, rsp+64 + lea r3, [r0+4] + IDCT8_ADD_MMX_END r0 , rsp, r2 + IDCT8_ADD_MMX_END r3 , rsp+8, r2 + + ADD rsp, pad + RET + +; %1=uint8_t *dst, %2=int16_t *block, %3=int stride +%macro 
IDCT8_ADD_SSE 4 + IDCT8_1D_FULL %2 +%ifdef ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16] +%endif + paddw m0, [pw_32] + +%ifndef ARCH_X86_64 + mova [%2 ], m0 + mova [%2+16], m4 + IDCT8_1D [%2], [%2+ 16] + mova [%2 ], m6 + mova [%2+16], m7 +%else + SWAP 0, 8 + SWAP 4, 9 + IDCT8_1D m8, m9 + SWAP 6, 8 + SWAP 7, 9 +%endif + + pxor m7, m7 + lea %4, [%3*3] + STORE_DIFF m0, m6, m7, [%1 ] + STORE_DIFF m1, m6, m7, [%1+%3 ] + STORE_DIFF m2, m6, m7, [%1+%3*2] + STORE_DIFF m3, m6, m7, [%1+%4 ] +%ifndef ARCH_X86_64 + mova m0, [%2 ] + mova m1, [%2+16] +%else + SWAP 0, 8 + SWAP 1, 9 +%endif + lea %1, [%1+%3*4] + STORE_DIFF m4, m6, m7, [%1 ] + STORE_DIFF m5, m6, m7, [%1+%3 ] + STORE_DIFF m0, m6, m7, [%1+%3*2] + STORE_DIFF m1, m6, m7, [%1+%4 ] +%endmacro + +INIT_XMM +; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride) +cglobal h264_idct8_add_sse2, 3, 4, 10 + IDCT8_ADD_SSE r0, r1, r2, r3 + RET + +%macro DC_ADD_MMX2_INIT 2-3 +%if %0 == 2 + movsx %1, word [%1] + add %1, 32 + sar %1, 6 + movd m0, %1 + lea %1, [%2*3] +%else + add %3, 32 + sar %3, 6 + movd m0, %3 + lea %3, [%2*3] +%endif + pshufw m0, m0, 0 + pxor m1, m1 + psubw m1, m0 + packuswb m0, m0 + packuswb m1, m1 +%endmacro + +%macro DC_ADD_MMX2_OP 3-4 + %1 m2, [%2 ] + %1 m3, [%2+%3 ] + %1 m4, [%2+%3*2] + %1 m5, [%2+%4 ] + paddusb m2, m0 + paddusb m3, m0 + paddusb m4, m0 + paddusb m5, m0 + psubusb m2, m1 + psubusb m3, m1 + psubusb m4, m1 + psubusb m5, m1 + %1 [%2 ], m2 + %1 [%2+%3 ], m3 + %1 [%2+%3*2], m4 + %1 [%2+%4 ], m5 +%endmacro + +INIT_MMX +; ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) +cglobal h264_idct_dc_add_mmx2, 3, 3, 0 + DC_ADD_MMX2_INIT r1, r2 + DC_ADD_MMX2_OP movh, r0, r2, r1 + RET + +; ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) +cglobal h264_idct8_dc_add_mmx2, 3, 3, 0 + DC_ADD_MMX2_INIT r1, r2 + DC_ADD_MMX2_OP mova, r0, r2, r1 + lea r0, [r0+r2*4] + DC_ADD_MMX2_OP mova, r0, r2, r1 + RET + +; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct_add16_mmx, 5, 7, 0 + xor r5, r5 +%ifdef PIC + lea r11, [scan8_mem] +%endif +.nextblock + movzx r6, byte [scan8+r5] + movzx r6, byte [r4+r6] + test r6, r6 + jz .skipblock + mov r6d, dword [r1+r5*4] + lea r6, [r0+r6] + IDCT4_ADD r6, r2, r3 +.skipblock + inc r5 + add r2, 32 + cmp r5, 16 + jl .nextblock + REP_RET + +; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct8_add4_mmx, 5, 7, 0 + %assign pad 128+4-(stack_offset&7) + SUB rsp, pad + + xor r5, r5 +%ifdef PIC + lea r11, [scan8_mem] +%endif +.nextblock + movzx r6, byte [scan8+r5] + movzx r6, byte [r4+r6] + test r6, r6 + jz .skipblock + mov r6d, dword [r1+r5*4] + lea r6, [r0+r6] + add word [r2], 32 + IDCT8_ADD_MMX_START r2 , rsp + IDCT8_ADD_MMX_START r2+8, rsp+64 + IDCT8_ADD_MMX_END r6 , rsp, r3 + mov r6d, dword [r1+r5*4] + lea r6, [r0+r6+4] + IDCT8_ADD_MMX_END r6 , rsp+8, r3 +.skipblock + add r5, 4 + add r2, 128 + cmp r5, 16 + jl .nextblock + ADD rsp, pad + RET + +; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct_add16_mmx2, 5, 7, 0 + xor r5, r5 +%ifdef PIC + lea r11, [scan8_mem] +%endif +.nextblock + movzx r6, byte [scan8+r5] + movzx r6, byte [r4+r6] + test r6, r6 + jz .skipblock + cmp r6, 1 + jnz .no_dc + movsx r6, word [r2] + test r6, r6 + jz .no_dc + 
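; DC-only 4x4 block (nnzc == 1 and DC != 0): splat (dc+32)>>6 and add it with saturation instead of a full transform +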
DC_ADD_MMX2_INIT r2, r3, r6 +%ifdef ARCH_X86_64 +%define dst_reg r10 +%define dst_regd r10d +%else +%define dst_reg r1 +%define dst_regd r1d +%endif + mov dst_regd, dword [r1+r5*4] + lea dst_reg, [r0+dst_reg] + DC_ADD_MMX2_OP movh, dst_reg, r3, r6 +%ifndef ARCH_X86_64 + mov r1, r1m +%endif + inc r5 + add r2, 32 + cmp r5, 16 + jl .nextblock + REP_RET +.no_dc + mov r6d, dword [r1+r5*4] + lea r6, [r0+r6] + IDCT4_ADD r6, r2, r3 +.skipblock + inc r5 + add r2, 32 + cmp r5, 16 + jl .nextblock + REP_RET + +; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct_add16intra_mmx, 5, 7, 0 + xor r5, r5 +%ifdef PIC + lea r11, [scan8_mem] +%endif +.nextblock + movzx r6, byte [scan8+r5] + movzx r6, byte [r4+r6] + or r6w, word [r2] + test r6, r6 + jz .skipblock + mov r6d, dword [r1+r5*4] + lea r6, [r0+r6] + IDCT4_ADD r6, r2, r3 +.skipblock + inc r5 + add r2, 32 + cmp r5, 16 + jl .nextblock + REP_RET + +; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct_add16intra_mmx2, 5, 7, 0 + xor r5, r5 +%ifdef PIC + lea r11, [scan8_mem] +%endif +.nextblock + movzx r6, byte [scan8+r5] + movzx r6, byte [r4+r6] + test r6, r6 + jz .try_dc + mov r6d, dword [r1+r5*4] + lea r6, [r0+r6] + IDCT4_ADD r6, r2, r3 + inc r5 + add r2, 32 + cmp r5, 16 + jl .nextblock + REP_RET +.try_dc + movsx r6, word [r2] + test r6, r6 + jz .skipblock + DC_ADD_MMX2_INIT r2, r3, r6 +%ifdef ARCH_X86_64 +%define dst_reg r10 +%define dst_regd r10d +%else +%define dst_reg r1 +%define dst_regd r1d +%endif + mov dst_regd, dword [r1+r5*4] + lea dst_reg, [r0+dst_reg] + DC_ADD_MMX2_OP movh, dst_reg, r3, r6 +%ifndef ARCH_X86_64 + mov r1, r1m +%endif +.skipblock + inc r5 + add r2, 32 + cmp r5, 16 + jl .nextblock + REP_RET + +; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct8_add4_mmx2, 5, 7, 0 + %assign pad 128+4-(stack_offset&7) + SUB rsp, pad + + xor r5, r5 +%ifdef PIC + lea r11, [scan8_mem] +%endif +.nextblock + movzx r6, byte [scan8+r5] + movzx r6, byte [r4+r6] + test r6, r6 + jz .skipblock + cmp r6, 1 + jnz .no_dc + movsx r6, word [r2] + test r6, r6 + jz .no_dc + DC_ADD_MMX2_INIT r2, r3, r6 +%ifdef ARCH_X86_64 +%define dst_reg r10 +%define dst_regd r10d +%else +%define dst_reg r1 +%define dst_regd r1d +%endif + mov dst_regd, dword [r1+r5*4] + lea dst_reg, [r0+dst_reg] + DC_ADD_MMX2_OP mova, dst_reg, r3, r6 + lea dst_reg, [dst_reg+r3*4] + DC_ADD_MMX2_OP mova, dst_reg, r3, r6 +%ifndef ARCH_X86_64 + mov r1, r1m +%endif + add r5, 4 + add r2, 128 + cmp r5, 16 + jl .nextblock + + ADD rsp, pad + RET +.no_dc + mov r6d, dword [r1+r5*4] + lea r6, [r0+r6] + add word [r2], 32 + IDCT8_ADD_MMX_START r2 , rsp + IDCT8_ADD_MMX_START r2+8, rsp+64 + IDCT8_ADD_MMX_END r6 , rsp, r3 + mov r6d, dword [r1+r5*4] + lea r6, [r0+r6+4] + IDCT8_ADD_MMX_END r6 , rsp+8, r3 +.skipblock + add r5, 4 + add r2, 128 + cmp r5, 16 + jl .nextblock + + ADD rsp, pad + RET + +INIT_XMM +; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct8_add4_sse2, 5, 7, 10 + xor r5, r5 +%ifdef PIC + lea r11, [scan8_mem] +%endif +.nextblock + movzx r6, byte [scan8+r5] + movzx r6, byte [r4+r6] + test r6, r6 + jz .skipblock + cmp r6, 1 + jnz .no_dc + movsx r6, word [r2] + test r6, r6 + jz .no_dc +INIT_MMX + DC_ADD_MMX2_INIT r2, r3, r6 +%ifdef ARCH_X86_64 +%define 
dst_reg r10 +%define dst_regd r10d +%else +%define dst_reg r1 +%define dst_regd r1d +%endif + mov dst_regd, dword [r1+r5*4] + lea dst_reg, [r0+dst_reg] + DC_ADD_MMX2_OP mova, dst_reg, r3, r6 + lea dst_reg, [dst_reg+r3*4] + DC_ADD_MMX2_OP mova, dst_reg, r3, r6 +%ifndef ARCH_X86_64 + mov r1, r1m +%endif + add r5, 4 + add r2, 128 + cmp r5, 16 + jl .nextblock + REP_RET +.no_dc +INIT_XMM + mov dst_regd, dword [r1+r5*4] + lea dst_reg, [r0+dst_reg] + IDCT8_ADD_SSE dst_reg, r2, r3, r6 +%ifndef ARCH_X86_64 + mov r1, r1m +%endif +.skipblock + add r5, 4 + add r2, 128 + cmp r5, 16 + jl .nextblock + REP_RET + +INIT_MMX +h264_idct_add8_mmx_plane: +.nextblock + movzx r6, byte [scan8+r5] + movzx r6, byte [r4+r6] + or r6w, word [r2] + test r6, r6 + jz .skipblock +%ifdef ARCH_X86_64 + mov r0d, dword [r1+r5*4] + add r0, [r10] +%else + mov r0, r1m ; XXX r1m here is actually r0m of the calling func + mov r0, [r0] + add r0, dword [r1+r5*4] +%endif + IDCT4_ADD r0, r2, r3 +.skipblock + inc r5 + add r2, 32 + test r5, 3 + jnz .nextblock + rep ret + +; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct_add8_mmx, 5, 7, 0 + mov r5, 16 + add r2, 512 +%ifdef PIC + lea r11, [scan8_mem] +%endif +%ifdef ARCH_X86_64 + mov r10, r0 +%endif + call h264_idct_add8_mmx_plane +%ifdef ARCH_X86_64 + add r10, gprsize +%else + add r0mp, gprsize +%endif + call h264_idct_add8_mmx_plane + RET + +h264_idct_add8_mmx2_plane +.nextblock + movzx r6, byte [scan8+r5] + movzx r6, byte [r4+r6] + test r6, r6 + jz .try_dc +%ifdef ARCH_X86_64 + mov r0d, dword [r1+r5*4] + add r0, [r10] +%else + mov r0, r1m ; XXX r1m here is actually r0m of the calling func + mov r0, [r0] + add r0, dword [r1+r5*4] +%endif + IDCT4_ADD r0, r2, r3 + inc r5 + add r2, 32 + test r5, 3 + jnz .nextblock + rep ret +.try_dc + movsx r6, word [r2] + test r6, r6 + jz .skipblock + DC_ADD_MMX2_INIT r2, r3, r6 +%ifdef ARCH_X86_64 + mov r0d, dword [r1+r5*4] + add r0, [r10] +%else + mov r0, r1m ; XXX r1m here is actually r0m of the calling func + mov r0, [r0] + add r0, dword [r1+r5*4] +%endif + DC_ADD_MMX2_OP movh, r0, r3, r6 +.skipblock + inc r5 + add r2, 32 + test r5, 3 + jnz .nextblock + rep ret + +; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct_add8_mmx2, 5, 7, 0 + mov r5, 16 + add r2, 512 +%ifdef ARCH_X86_64 + mov r10, r0 +%endif +%ifdef PIC + lea r11, [scan8_mem] +%endif + call h264_idct_add8_mmx2_plane +%ifdef ARCH_X86_64 + add r10, gprsize +%else + add r0mp, gprsize +%endif + call h264_idct_add8_mmx2_plane + RET + +INIT_MMX +; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered +h264_idct_dc_add8_mmx2: + movd m0, [r2 ] ; 0 0 X D + punpcklwd m0, [r2+32] ; x X d D + paddsw m0, [pw_32] + psraw m0, 6 + punpcklwd m0, m0 ; d d D D + pxor m1, m1 ; 0 0 0 0 + psubw m1, m0 ; -d-d-D-D + packuswb m0, m1 ; -d-d-D-D d d D D + pshufw m1, m0, 0xFA ; -d-d-d-d-D-D-D-D + punpcklwd m0, m0 ; d d d d D D D D + lea r6, [r3*3] + DC_ADD_MMX2_OP movq, r0, r3, r6 + ret + +ALIGN 16 +INIT_XMM +; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride +x264_add8x4_idct_sse2: + movq m0, [r2+ 0] + movq m1, [r2+ 8] + movq m2, [r2+16] + movq m3, [r2+24] + movhps m0, [r2+32] + movhps m1, [r2+40] + movhps m2, [r2+48] + movhps m3, [r2+56] + IDCT4_1D 0,1,2,3,4,5 + TRANSPOSE2x4x4W 0,1,2,3,4 + paddw m0, [pw_32] + IDCT4_1D 0,1,2,3,4,5 + pxor m7, m7 + STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3 + lea r0, [r0+r3*2] 
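+ ; second pass of stores: rows 2-3 of the two side-by-side 4x4 blocks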
+ STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3 + ret + +%macro add16_sse2_cycle 2 + movzx r0, word [r4+%2] + test r0, r0 + jz .cycle%1end + mov r0d, dword [r1+%1*8] +%ifdef ARCH_X86_64 + add r0, r10 +%else + add r0, r0m +%endif + call x264_add8x4_idct_sse2 +.cycle%1end +%if %1 < 7 + add r2, 64 +%endif +%endmacro + +; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct_add16_sse2, 5, 5, 8 +%ifdef ARCH_X86_64 + mov r10, r0 +%endif + ; unrolling of the loop leads to an average performance gain of + ; 20-25% + add16_sse2_cycle 0, 0xc + add16_sse2_cycle 1, 0x14 + add16_sse2_cycle 2, 0xe + add16_sse2_cycle 3, 0x16 + add16_sse2_cycle 4, 0x1c + add16_sse2_cycle 5, 0x24 + add16_sse2_cycle 6, 0x1e + add16_sse2_cycle 7, 0x26 + RET + +; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct_add16intra_sse2, 5, 7, 8 + xor r5, r5 +%ifdef ARCH_X86_64 + mov r10, r0 +%endif +%ifdef PIC + lea r11, [scan8_mem] +%endif +.next2blocks + movzx r0, byte [scan8+r5] + movzx r0, word [r4+r0] + test r0, r0 + jz .try_dc + mov r0d, dword [r1+r5*4] +%ifdef ARCH_X86_64 + add r0, r10 +%else + add r0, r0m +%endif + call x264_add8x4_idct_sse2 + add r5, 2 + add r2, 64 + cmp r5, 16 + jl .next2blocks + REP_RET +.try_dc + movsx r0, word [r2 ] + or r0w, word [r2+32] + jz .skip2blocks + mov r0d, dword [r1+r5*4] +%ifdef ARCH_X86_64 + add r0, r10 +%else + add r0, r0m +%endif + call h264_idct_dc_add8_mmx2 +.skip2blocks + add r5, 2 + add r2, 64 + cmp r5, 16 + jl .next2blocks + REP_RET + +h264_idct_add8_sse2_plane: +.next2blocks + movzx r0, byte [scan8+r5] + movzx r0, word [r4+r0] + test r0, r0 + jz .try_dc +%ifdef ARCH_X86_64 + mov r0d, dword [r1+r5*4] + add r0, [r10] +%else + mov r0, r1m ; XXX r1m here is actually r0m of the calling func + mov r0, [r0] + add r0, dword [r1+r5*4] +%endif + call x264_add8x4_idct_sse2 + add r5, 2 + add r2, 64 + test r5, 3 + jnz .next2blocks + rep ret +.try_dc + movsx r0, word [r2 ] + or r0w, word [r2+32] + jz .skip2blocks +%ifdef ARCH_X86_64 + mov r0d, dword [r1+r5*4] + add r0, [r10] +%else + mov r0, r1m ; XXX r1m here is actually r0m of the calling func + mov r0, [r0] + add r0, dword [r1+r5*4] +%endif + call h264_idct_dc_add8_mmx2 +.skip2blocks + add r5, 2 + add r2, 64 + test r5, 3 + jnz .next2blocks + rep ret + +; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct_add8_sse2, 5, 7, 8 + mov r5, 16 + add r2, 512 +%ifdef PIC + lea r11, [scan8_mem] +%endif +%ifdef ARCH_X86_64 + mov r10, r0 +%endif + call h264_idct_add8_sse2_plane +%ifdef ARCH_X86_64 + add r10, gprsize +%else + add r0mp, gprsize +%endif + call h264_idct_add8_sse2_plane + RET diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_idct_sse2.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_idct_sse2.asm deleted file mode 100644 index 86c1e66c7..000000000 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_idct_sse2.asm +++ /dev/null @@ -1,54 +0,0 @@ -;***************************************************************************** -;* SSE2-optimized H.264 iDCT -;***************************************************************************** -;* Copyright (C) 2003-2008 x264 project -;* -;* Authors: Laurent Aimar -;* Loren Merritt -;* Holger Lubitz -;* Min Chen -;* -;* This program is free software; you can redistribute it and/or modify -;* it 
under the terms of the GNU General Public License as published by -;* the Free Software Foundation; either version 2 of the License, or -;* (at your option) any later version. -;* -;* This program is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -;* GNU General Public License for more details. -;* -;* You should have received a copy of the GNU General Public License -;* along with this program; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. -;***************************************************************************** - -%include "x86inc.asm" -%include "x86util.asm" - -SECTION_RODATA -pw_32: times 8 dw 32 - -SECTION .text - -INIT_XMM -cglobal x264_add8x4_idct_sse2, 3,3,8 - movq m0, [r1+ 0] - movq m1, [r1+ 8] - movq m2, [r1+16] - movq m3, [r1+24] - movhps m0, [r1+32] - movhps m1, [r1+40] - movhps m2, [r1+48] - movhps m3, [r1+56] - IDCT4_1D 0,1,2,3,4,5 - TRANSPOSE2x4x4W 0,1,2,3,4 - paddw m0, [pw_32] - IDCT4_1D 0,1,2,3,4,5 - pxor m7, m7 - STORE_DIFF m0, m4, m7, [r0] - STORE_DIFF m1, m4, m7, [r0+r2] - lea r0, [r0+r2*2] - STORE_DIFF m2, m4, m7, [r0] - STORE_DIFF m3, m4, m7, [r0+r2] - RET diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_intrapred_init.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_intrapred_init.c new file mode 100644 index 000000000..e01a17bd6 --- /dev/null +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_intrapred_init.c @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2010 Jason Garrett-Glaser + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/cpu.h" +#include "libavcodec/h264pred.h" + +void ff_pred16x16_vertical_mmx (uint8_t *src, int stride); +void ff_pred16x16_vertical_sse (uint8_t *src, int stride); +void ff_pred16x16_horizontal_mmx (uint8_t *src, int stride); +void ff_pred16x16_horizontal_mmxext(uint8_t *src, int stride); +void ff_pred16x16_horizontal_ssse3 (uint8_t *src, int stride); +void ff_pred16x16_dc_mmxext (uint8_t *src, int stride); +void ff_pred16x16_dc_sse2 (uint8_t *src, int stride); +void ff_pred16x16_dc_ssse3 (uint8_t *src, int stride); +void ff_pred16x16_tm_vp8_mmx (uint8_t *src, int stride); +void ff_pred16x16_tm_vp8_mmxext (uint8_t *src, int stride); +void ff_pred16x16_tm_vp8_sse2 (uint8_t *src, int stride); +void ff_pred8x8_dc_rv40_mmxext (uint8_t *src, int stride); +void ff_pred8x8_vertical_mmx (uint8_t *src, int stride); +void ff_pred8x8_horizontal_mmx (uint8_t *src, int stride); +void ff_pred8x8_horizontal_mmxext (uint8_t *src, int stride); +void ff_pred8x8_horizontal_ssse3 (uint8_t *src, int stride); +void ff_pred8x8_tm_vp8_mmx (uint8_t *src, int stride); +void ff_pred8x8_tm_vp8_mmxext (uint8_t *src, int stride); +void ff_pred8x8_tm_vp8_sse2 (uint8_t *src, int stride); +void ff_pred8x8_tm_vp8_ssse3 (uint8_t *src, int stride); +void ff_pred4x4_dc_mmxext (uint8_t *src, const uint8_t *topright, int stride); +void ff_pred4x4_tm_vp8_mmx (uint8_t *src, const uint8_t *topright, int stride); +void ff_pred4x4_tm_vp8_mmxext (uint8_t *src, const uint8_t *topright, int stride); +void ff_pred4x4_tm_vp8_ssse3 (uint8_t *src, const uint8_t *topright, int stride); +void ff_pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride); + +void ff_h264_pred_init_x86(H264PredContext *h, int codec_id) +{ + int mm_flags = av_get_cpu_flags(); + +#if HAVE_YASM + if (mm_flags & AV_CPU_FLAG_MMX) { + h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_mmx; + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_mmx; + h->pred8x8 [VERT_PRED8x8] = ff_pred8x8_vertical_mmx; + h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_mmx; + if (codec_id == CODEC_ID_VP8) { + h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_tm_vp8_mmx; + h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_mmx; + h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_mmx; + } + } + + if (mm_flags & AV_CPU_FLAG_MMX2) { + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_mmxext; + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_mmxext; + h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_mmxext; + h->pred4x4 [DC_PRED ] = ff_pred4x4_dc_mmxext; + if (codec_id == CODEC_ID_VP8) { + h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_tm_vp8_mmxext; + h->pred8x8 [DC_PRED8x8 ] = ff_pred8x8_dc_rv40_mmxext; + h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_mmxext; + h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_mmxext; + h->pred4x4 [VERT_PRED ] = ff_pred4x4_vertical_vp8_mmxext; + } + } + + if (mm_flags & AV_CPU_FLAG_SSE) { + h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_sse; + } + + if (mm_flags & AV_CPU_FLAG_SSE2) { + h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_sse2; + if (codec_id == CODEC_ID_VP8) { + h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_tm_vp8_sse2; + h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_sse2; + } + } + + if (mm_flags & AV_CPU_FLAG_SSSE3) { + h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_ssse3; + 
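/* later CPU-flag branches overwrite function pointers installed by the earlier, slower ones */ +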
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_ssse3; + h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_ssse3; + if (codec_id == CODEC_ID_VP8) { + h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_ssse3; + h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_ssse3; + } + } +#endif +} diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_qpel_mmx.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_qpel_mmx.c new file mode 100644 index 000000000..e94ed0935 --- /dev/null +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_qpel_mmx.c @@ -0,0 +1,1209 @@ +/* + * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "dsputil_mmx.h" + +/***********************************/ +/* motion compensation */ + +#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\ + "mov"#q" "#C", "#T" \n\t"\ + "mov"#d" (%0), "#F" \n\t"\ + "paddw "#D", "#T" \n\t"\ + "psllw $2, "#T" \n\t"\ + "psubw "#B", "#T" \n\t"\ + "psubw "#E", "#T" \n\t"\ + "punpcklbw "#Z", "#F" \n\t"\ + "pmullw %4, "#T" \n\t"\ + "paddw %5, "#A" \n\t"\ + "add %2, %0 \n\t"\ + "paddw "#F", "#A" \n\t"\ + "paddw "#A", "#T" \n\t"\ + "psraw $5, "#T" \n\t"\ + "packuswb "#T", "#T" \n\t"\ + OP(T, (%1), A, d)\ + "add %3, %1 \n\t" + +#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\ + "mov"#q" "#C", "#T" \n\t"\ + "mov"#d" (%0), "#F" \n\t"\ + "paddw "#D", "#T" \n\t"\ + "psllw $2, "#T" \n\t"\ + "paddw %4, "#A" \n\t"\ + "psubw "#B", "#T" \n\t"\ + "psubw "#E", "#T" \n\t"\ + "punpcklbw "#Z", "#F" \n\t"\ + "pmullw %3, "#T" \n\t"\ + "paddw "#F", "#A" \n\t"\ + "add %2, %0 \n\t"\ + "paddw "#A", "#T" \n\t"\ + "mov"#q" "#T", "#OF"(%1) \n\t" + +#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q) +#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q) +#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa) +#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa) + + +#define QPEL_H264(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + int h=4;\ +\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\ + "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\ + "1: \n\t"\ + "movd -1(%0), %%mm1 \n\t"\ + "movd (%0), %%mm2 \n\t"\ + "movd 1(%0), %%mm3 \n\t"\ + "movd 2(%0), %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "paddw %%mm0, %%mm1 \n\t"\ + "paddw %%mm3, %%mm2 \n\t"\ + "movd -2(%0), %%mm0 \n\t"\ + "movd 3(%0), %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "paddw %%mm3, %%mm0 \n\t"\ + "psllw $2, %%mm2 
\n\t"\ + "psubw %%mm1, %%mm2 \n\t"\ + "pmullw %%mm4, %%mm2 \n\t"\ + "paddw %%mm5, %%mm0 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "packuswb %%mm0, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm6, d)\ + "add %3, %0 \n\t"\ + "add %4, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(src), "+c"(dst), "+g"(h)\ + : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ + : "memory"\ + );\ +}\ +static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + int h=4;\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movq %0, %%mm4 \n\t"\ + "movq %1, %%mm5 \n\t"\ + :: "m"(ff_pw_5), "m"(ff_pw_16)\ + );\ + do{\ + __asm__ volatile(\ + "movd -1(%0), %%mm1 \n\t"\ + "movd (%0), %%mm2 \n\t"\ + "movd 1(%0), %%mm3 \n\t"\ + "movd 2(%0), %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "paddw %%mm0, %%mm1 \n\t"\ + "paddw %%mm3, %%mm2 \n\t"\ + "movd -2(%0), %%mm0 \n\t"\ + "movd 3(%0), %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "paddw %%mm3, %%mm0 \n\t"\ + "psllw $2, %%mm2 \n\t"\ + "psubw %%mm1, %%mm2 \n\t"\ + "pmullw %%mm4, %%mm2 \n\t"\ + "paddw %%mm5, %%mm0 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "movd (%2), %%mm3 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "packuswb %%mm0, %%mm0 \n\t"\ + PAVGB" %%mm3, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm6, d)\ + "add %4, %0 \n\t"\ + "add %4, %1 \n\t"\ + "add %3, %2 \n\t"\ + : "+a"(src), "+c"(dst), "+d"(src2)\ + : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\ + : "memory"\ + );\ + }while(--h);\ +}\ +static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + src -= 2*srcStride;\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movd (%0), %%mm0 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm1 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm2 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm3 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm4 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ + QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ + QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ + QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ + \ + : "+a"(src), "+c"(dst)\ + : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ +}\ +static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + int h=4;\ + int w=3;\ + src -= 2*srcStride+2;\ + while(w--){\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movd (%0), %%mm0 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm1 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm2 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm3 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm4 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\ + QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\ + QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\ + QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 
%%mm2, 3*8*3)\ + \ + : "+a"(src)\ + : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ + tmp += 4;\ + src += 4 - 9*srcStride;\ + }\ + tmp -= 3*4;\ + __asm__ volatile(\ + "1: \n\t"\ + "movq (%0), %%mm0 \n\t"\ + "paddw 10(%0), %%mm0 \n\t"\ + "movq 2(%0), %%mm1 \n\t"\ + "paddw 8(%0), %%mm1 \n\t"\ + "movq 4(%0), %%mm2 \n\t"\ + "paddw 6(%0), %%mm2 \n\t"\ + "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\ + "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\ + "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\ + "paddsw %%mm2, %%mm0 \n\t"\ + "psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\ + "paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\ + "psraw $6, %%mm0 \n\t"\ + "packuswb %%mm0, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm7, d)\ + "add $24, %0 \n\t"\ + "add %3, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(tmp), "+c"(dst), "+g"(h)\ + : "S"((x86_reg)dstStride)\ + : "memory"\ + );\ +}\ +\ +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + int h=8;\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\ + "1: \n\t"\ + "movq (%0), %%mm0 \n\t"\ + "movq 1(%0), %%mm2 \n\t"\ + "movq %%mm0, %%mm1 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpckhbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm3, %%mm1 \n\t"\ + "psllw $2, %%mm0 \n\t"\ + "psllw $2, %%mm1 \n\t"\ + "movq -1(%0), %%mm2 \n\t"\ + "movq 2(%0), %%mm4 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "movq %%mm4, %%mm5 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + "punpckhbw %%mm7, %%mm5 \n\t"\ + "paddw %%mm4, %%mm2 \n\t"\ + "paddw %%mm3, %%mm5 \n\t"\ + "psubw %%mm2, %%mm0 \n\t"\ + "psubw %%mm5, %%mm1 \n\t"\ + "pmullw %%mm6, %%mm0 \n\t"\ + "pmullw %%mm6, %%mm1 \n\t"\ + "movd -2(%0), %%mm2 \n\t"\ + "movd 7(%0), %%mm5 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm5 \n\t"\ + "paddw %%mm3, %%mm2 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\ + "paddw %%mm5, %%mm2 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm4, %%mm1 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "packuswb %%mm1, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm5, q)\ + "add %3, %0 \n\t"\ + "add %4, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(src), "+c"(dst), "+g"(h)\ + : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ + : "memory"\ + );\ +}\ +\ +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + int h=8;\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movq %0, %%mm6 \n\t"\ + :: "m"(ff_pw_5)\ + );\ + do{\ + __asm__ volatile(\ + "movq (%0), %%mm0 \n\t"\ + "movq 1(%0), %%mm2 \n\t"\ + "movq %%mm0, %%mm1 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpckhbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm3, %%mm1 \n\t"\ + "psllw $2, %%mm0 \n\t"\ + "psllw $2, %%mm1 \n\t"\ + "movq -1(%0), %%mm2 \n\t"\ + "movq 2(%0), %%mm4 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "movq %%mm4, %%mm5 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + "punpckhbw %%mm7, %%mm5 \n\t"\ + "paddw %%mm4, %%mm2 \n\t"\ + "paddw %%mm3, %%mm5 \n\t"\ + "psubw %%mm2, %%mm0 \n\t"\ + "psubw %%mm5, %%mm1 \n\t"\ + "pmullw 
%%mm6, %%mm0 \n\t"\ + "pmullw %%mm6, %%mm1 \n\t"\ + "movd -2(%0), %%mm2 \n\t"\ + "movd 7(%0), %%mm5 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm5 \n\t"\ + "paddw %%mm3, %%mm2 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + "movq %5, %%mm5 \n\t"\ + "paddw %%mm5, %%mm2 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm4, %%mm1 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "movq (%2), %%mm4 \n\t"\ + "packuswb %%mm1, %%mm0 \n\t"\ + PAVGB" %%mm4, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm5, q)\ + "add %4, %0 \n\t"\ + "add %4, %1 \n\t"\ + "add %3, %2 \n\t"\ + : "+a"(src), "+c"(dst), "+d"(src2)\ + : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ + "m"(ff_pw_16)\ + : "memory"\ + );\ + }while(--h);\ +}\ +\ +static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ + int w= 2;\ + src -= 2*srcStride;\ + \ + while(w--){\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movd (%0), %%mm0 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm1 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm2 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm3 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm4 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ + QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ + QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ + QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ + QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ + QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ + QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ + QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ + \ + : "+a"(src), "+c"(dst)\ + : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ + if(h==16){\ + __asm__ volatile(\ + QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ + QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ + QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ + QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ + QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ + QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ + QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ + QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ + \ + : "+a"(src), "+c"(dst)\ + : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ + }\ + src += 4-(h+5)*srcStride;\ + dst += 4-h*dstStride;\ + }\ +}\ +static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\ + int w = (size+8)>>2;\ + src -= 2*srcStride+2;\ + while(w--){\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movd (%0), %%mm0 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm1 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm2 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm3 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm4 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\ + QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\ + QPEL_H264HV(%%mm2, %%mm3, %%mm4, 
%%mm5, %%mm0, %%mm1, 2*48)\ + QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\ + QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\ + QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\ + QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\ + QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\ + : "+a"(src)\ + : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ + if(size==16){\ + __asm__ volatile(\ + QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\ + QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\ + QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\ + QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\ + QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\ + QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\ + QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\ + QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\ + : "+a"(src)\ + : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ + }\ + tmp += 4;\ + src += 4 - (size+5)*srcStride;\ + }\ +}\ +static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ + int w = size>>4;\ + do{\ + int h = size;\ + __asm__ volatile(\ + "1: \n\t"\ + "movq (%0), %%mm0 \n\t"\ + "movq 8(%0), %%mm3 \n\t"\ + "movq 2(%0), %%mm1 \n\t"\ + "movq 10(%0), %%mm4 \n\t"\ + "paddw %%mm4, %%mm0 \n\t"\ + "paddw %%mm3, %%mm1 \n\t"\ + "paddw 18(%0), %%mm3 \n\t"\ + "paddw 16(%0), %%mm4 \n\t"\ + "movq 4(%0), %%mm2 \n\t"\ + "movq 12(%0), %%mm5 \n\t"\ + "paddw 6(%0), %%mm2 \n\t"\ + "paddw 14(%0), %%mm5 \n\t"\ + "psubw %%mm1, %%mm0 \n\t"\ + "psubw %%mm4, %%mm3 \n\t"\ + "psraw $2, %%mm0 \n\t"\ + "psraw $2, %%mm3 \n\t"\ + "psubw %%mm1, %%mm0 \n\t"\ + "psubw %%mm4, %%mm3 \n\t"\ + "paddsw %%mm2, %%mm0 \n\t"\ + "paddsw %%mm5, %%mm3 \n\t"\ + "psraw $2, %%mm0 \n\t"\ + "psraw $2, %%mm3 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm5, %%mm3 \n\t"\ + "psraw $6, %%mm0 \n\t"\ + "psraw $6, %%mm3 \n\t"\ + "packuswb %%mm3, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm7, q)\ + "add $48, %0 \n\t"\ + "add %3, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(tmp), "+c"(dst), "+g"(h)\ + : "S"((x86_reg)dstStride)\ + : "memory"\ + );\ + tmp += 8 - size*24;\ + dst += 8 - size*dstStride;\ + }while(w--);\ +}\ +\ +static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ +}\ +static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ +}\ +\ +static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ + src += 8*srcStride;\ + dst += 8*dstStride;\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ +}\ +\ +static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + OPNAME ## 
h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ + src += 8*dstStride;\ + dst += 8*dstStride;\ + src2 += 8*src2Stride;\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ +}\ +\ +static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ + put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\ + OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ +}\ +static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\ +}\ +\ +static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\ +}\ +\ +static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ +{\ + __asm__ volatile(\ + "movq (%1), %%mm0 \n\t"\ + "movq 24(%1), %%mm1 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "packuswb %%mm0, %%mm0 \n\t"\ + "packuswb %%mm1, %%mm1 \n\t"\ + PAVGB" (%0), %%mm0 \n\t"\ + PAVGB" (%0,%3), %%mm1 \n\t"\ + OP(%%mm0, (%2), %%mm4, d)\ + OP(%%mm1, (%2,%4), %%mm5, d)\ + "lea (%0,%3,2), %0 \n\t"\ + "lea (%2,%4,2), %2 \n\t"\ + "movq 48(%1), %%mm0 \n\t"\ + "movq 72(%1), %%mm1 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "packuswb %%mm0, %%mm0 \n\t"\ + "packuswb %%mm1, %%mm1 \n\t"\ + PAVGB" (%0), %%mm0 \n\t"\ + PAVGB" (%0,%3), %%mm1 \n\t"\ + OP(%%mm0, (%2), %%mm4, d)\ + OP(%%mm1, (%2,%4), %%mm5, d)\ + :"+a"(src8), "+c"(src16), "+d"(dst)\ + :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\ + :"memory");\ +}\ +static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ +{\ + do{\ + __asm__ volatile(\ + "movq (%1), %%mm0 \n\t"\ + "movq 8(%1), %%mm1 \n\t"\ + "movq 48(%1), %%mm2 \n\t"\ + "movq 8+48(%1), %%mm3 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "psraw $5, %%mm2 \n\t"\ + "psraw $5, %%mm3 \n\t"\ + "packuswb %%mm1, %%mm0 \n\t"\ + "packuswb %%mm3, %%mm2 \n\t"\ + PAVGB" (%0), %%mm0 \n\t"\ + PAVGB" (%0,%3), %%mm2 \n\t"\ + OP(%%mm0, (%2), %%mm5, q)\ + OP(%%mm2, (%2,%4), %%mm5, q)\ + ::"a"(src8), "c"(src16), "d"(dst),\ + "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\ + :"memory");\ + src8 += 2L*src8Stride;\ + src16 += 48;\ + dst += 2L*dstStride;\ + }while(h-=2);\ +}\ +static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ +{\ + OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\ + OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\ +}\ + + +#if ARCH_X86_64 +#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + int h=16;\ + __asm__ volatile(\ + "pxor %%xmm15, %%xmm15 
\n\t"\ + "movdqa %6, %%xmm14 \n\t"\ + "movdqa %7, %%xmm13 \n\t"\ + "1: \n\t"\ + "lddqu 6(%0), %%xmm1 \n\t"\ + "lddqu -2(%0), %%xmm7 \n\t"\ + "movdqa %%xmm1, %%xmm0 \n\t"\ + "punpckhbw %%xmm15, %%xmm1 \n\t"\ + "punpcklbw %%xmm15, %%xmm0 \n\t"\ + "punpcklbw %%xmm15, %%xmm7 \n\t"\ + "movdqa %%xmm1, %%xmm2 \n\t"\ + "movdqa %%xmm0, %%xmm6 \n\t"\ + "movdqa %%xmm1, %%xmm3 \n\t"\ + "movdqa %%xmm0, %%xmm8 \n\t"\ + "movdqa %%xmm1, %%xmm4 \n\t"\ + "movdqa %%xmm0, %%xmm9 \n\t"\ + "movdqa %%xmm0, %%xmm12 \n\t"\ + "movdqa %%xmm1, %%xmm11 \n\t"\ + "palignr $10,%%xmm0, %%xmm11\n\t"\ + "palignr $10,%%xmm7, %%xmm12\n\t"\ + "palignr $2, %%xmm0, %%xmm4 \n\t"\ + "palignr $2, %%xmm7, %%xmm9 \n\t"\ + "palignr $4, %%xmm0, %%xmm3 \n\t"\ + "palignr $4, %%xmm7, %%xmm8 \n\t"\ + "palignr $6, %%xmm0, %%xmm2 \n\t"\ + "palignr $6, %%xmm7, %%xmm6 \n\t"\ + "paddw %%xmm0 ,%%xmm11 \n\t"\ + "palignr $8, %%xmm0, %%xmm1 \n\t"\ + "palignr $8, %%xmm7, %%xmm0 \n\t"\ + "paddw %%xmm12,%%xmm7 \n\t"\ + "paddw %%xmm3, %%xmm2 \n\t"\ + "paddw %%xmm8, %%xmm6 \n\t"\ + "paddw %%xmm4, %%xmm1 \n\t"\ + "paddw %%xmm9, %%xmm0 \n\t"\ + "psllw $2, %%xmm2 \n\t"\ + "psllw $2, %%xmm6 \n\t"\ + "psubw %%xmm1, %%xmm2 \n\t"\ + "psubw %%xmm0, %%xmm6 \n\t"\ + "paddw %%xmm13,%%xmm11 \n\t"\ + "paddw %%xmm13,%%xmm7 \n\t"\ + "pmullw %%xmm14,%%xmm2 \n\t"\ + "pmullw %%xmm14,%%xmm6 \n\t"\ + "lddqu (%2), %%xmm3 \n\t"\ + "paddw %%xmm11,%%xmm2 \n\t"\ + "paddw %%xmm7, %%xmm6 \n\t"\ + "psraw $5, %%xmm2 \n\t"\ + "psraw $5, %%xmm6 \n\t"\ + "packuswb %%xmm2,%%xmm6 \n\t"\ + "pavgb %%xmm3, %%xmm6 \n\t"\ + OP(%%xmm6, (%1), %%xmm4, dqa)\ + "add %5, %0 \n\t"\ + "add %5, %1 \n\t"\ + "add %4, %2 \n\t"\ + "decl %3 \n\t"\ + "jg 1b \n\t"\ + : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\ + : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ + "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ +} +#else // ARCH_X86_64 +#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ + src += 8*dstStride;\ + dst += 8*dstStride;\ + src2 += 8*src2Stride;\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ +} +#endif // ARCH_X86_64 + +#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + int h=8;\ + __asm__ volatile(\ + "pxor %%xmm7, %%xmm7 \n\t"\ + "movdqa %0, %%xmm6 \n\t"\ + :: "m"(ff_pw_5)\ + );\ + do{\ + __asm__ volatile(\ + "lddqu -2(%0), %%xmm1 \n\t"\ + "movdqa %%xmm1, %%xmm0 \n\t"\ + "punpckhbw %%xmm7, %%xmm1 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "movdqa %%xmm1, %%xmm2 \n\t"\ + "movdqa %%xmm1, %%xmm3 \n\t"\ + "movdqa %%xmm1, %%xmm4 \n\t"\ + "movdqa %%xmm1, %%xmm5 \n\t"\ + "palignr $2, %%xmm0, %%xmm4 \n\t"\ + "palignr $4, %%xmm0, %%xmm3 \n\t"\ + "palignr $6, %%xmm0, %%xmm2 \n\t"\ + "palignr $8, %%xmm0, %%xmm1 \n\t"\ + "palignr $10,%%xmm0, %%xmm5 \n\t"\ + "paddw %%xmm5, %%xmm0 \n\t"\ + "paddw %%xmm3, %%xmm2 \n\t"\ + "paddw %%xmm4, %%xmm1 \n\t"\ + "psllw $2, %%xmm2 \n\t"\ + "movq (%2), %%xmm3 \n\t"\ + "psubw %%xmm1, %%xmm2 \n\t"\ + "paddw %5, %%xmm0 \n\t"\ + "pmullw %%xmm6, %%xmm2 \n\t"\ + "paddw %%xmm0, %%xmm2 \n\t"\ + "psraw 
$5, %%xmm2 \n\t"\ + "packuswb %%xmm2, %%xmm2 \n\t"\ + "pavgb %%xmm3, %%xmm2 \n\t"\ + OP(%%xmm2, (%1), %%xmm4, q)\ + "add %4, %0 \n\t"\ + "add %4, %1 \n\t"\ + "add %3, %2 \n\t"\ + : "+a"(src), "+c"(dst), "+d"(src2)\ + : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ + "m"(ff_pw_16)\ + : "memory"\ + );\ + }while(--h);\ +}\ +QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ +\ +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + int h=8;\ + __asm__ volatile(\ + "pxor %%xmm7, %%xmm7 \n\t"\ + "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\ + "1: \n\t"\ + "lddqu -2(%0), %%xmm1 \n\t"\ + "movdqa %%xmm1, %%xmm0 \n\t"\ + "punpckhbw %%xmm7, %%xmm1 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "movdqa %%xmm1, %%xmm2 \n\t"\ + "movdqa %%xmm1, %%xmm3 \n\t"\ + "movdqa %%xmm1, %%xmm4 \n\t"\ + "movdqa %%xmm1, %%xmm5 \n\t"\ + "palignr $2, %%xmm0, %%xmm4 \n\t"\ + "palignr $4, %%xmm0, %%xmm3 \n\t"\ + "palignr $6, %%xmm0, %%xmm2 \n\t"\ + "palignr $8, %%xmm0, %%xmm1 \n\t"\ + "palignr $10,%%xmm0, %%xmm5 \n\t"\ + "paddw %%xmm5, %%xmm0 \n\t"\ + "paddw %%xmm3, %%xmm2 \n\t"\ + "paddw %%xmm4, %%xmm1 \n\t"\ + "psllw $2, %%xmm2 \n\t"\ + "psubw %%xmm1, %%xmm2 \n\t"\ + "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\ + "pmullw %%xmm6, %%xmm2 \n\t"\ + "paddw %%xmm0, %%xmm2 \n\t"\ + "psraw $5, %%xmm2 \n\t"\ + "packuswb %%xmm2, %%xmm2 \n\t"\ + OP(%%xmm2, (%1), %%xmm4, q)\ + "add %3, %0 \n\t"\ + "add %4, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(src), "+c"(dst), "+g"(h)\ + : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ + : "memory"\ + );\ +}\ +static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ + src += 8*srcStride;\ + dst += 8*dstStride;\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ +}\ + +#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ + src -= 2*srcStride;\ + \ + __asm__ volatile(\ + "pxor %%xmm7, %%xmm7 \n\t"\ + "movq (%0), %%xmm0 \n\t"\ + "add %2, %0 \n\t"\ + "movq (%0), %%xmm1 \n\t"\ + "add %2, %0 \n\t"\ + "movq (%0), %%xmm2 \n\t"\ + "add %2, %0 \n\t"\ + "movq (%0), %%xmm3 \n\t"\ + "add %2, %0 \n\t"\ + "movq (%0), %%xmm4 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "punpcklbw %%xmm7, %%xmm1 \n\t"\ + "punpcklbw %%xmm7, %%xmm2 \n\t"\ + "punpcklbw %%xmm7, %%xmm3 \n\t"\ + "punpcklbw %%xmm7, %%xmm4 \n\t"\ + QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ + QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ + QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ + QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ + QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ + QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ + QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ + QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ + \ + : "+a"(src), "+c"(dst)\ + : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ + if(h==16){\ + __asm__ volatile(\ + QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ + 
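/* each QPEL_H264V_XMM step loads one new source row and emits one 6-tap-filtered output row, rotating the register window */\ +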
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ + QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ + QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ + QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ + QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ + QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ + QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ + \ + : "+a"(src), "+c"(dst)\ + : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ + }\ +}\ +static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ +}\ +static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ +} + +static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){ + int w = (size+8)>>3; + src -= 2*srcStride+2; + while(w--){ + __asm__ volatile( + "pxor %%xmm7, %%xmm7 \n\t" + "movq (%0), %%xmm0 \n\t" + "add %2, %0 \n\t" + "movq (%0), %%xmm1 \n\t" + "add %2, %0 \n\t" + "movq (%0), %%xmm2 \n\t" + "add %2, %0 \n\t" + "movq (%0), %%xmm3 \n\t" + "add %2, %0 \n\t" + "movq (%0), %%xmm4 \n\t" + "add %2, %0 \n\t" + "punpcklbw %%xmm7, %%xmm0 \n\t" + "punpcklbw %%xmm7, %%xmm1 \n\t" + "punpcklbw %%xmm7, %%xmm2 \n\t" + "punpcklbw %%xmm7, %%xmm3 \n\t" + "punpcklbw %%xmm7, %%xmm4 \n\t" + QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48) + QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48) + QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48) + QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48) + QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48) + QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48) + QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48) + QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48) + : "+a"(src) + : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) + : "memory" + ); + if(size==16){ + __asm__ volatile( + QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48) + QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48) + QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48) + QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48) + QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48) + QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48) + QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48) + QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48) + : "+a"(src) + : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) + : "memory" + ); + } + tmp += 8; + src += 8 - (size+5)*srcStride; + } +} + +#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\ +static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ + int h = size;\ + if(size == 16){\ + __asm__ volatile(\ + "1: \n\t"\ + "movdqa 32(%0), %%xmm4 \n\t"\ + "movdqa 16(%0), %%xmm5 
\n\t"\ + "movdqa (%0), %%xmm7 \n\t"\ + "movdqa %%xmm4, %%xmm3 \n\t"\ + "movdqa %%xmm4, %%xmm2 \n\t"\ + "movdqa %%xmm4, %%xmm1 \n\t"\ + "movdqa %%xmm4, %%xmm0 \n\t"\ + "palignr $10, %%xmm5, %%xmm0 \n\t"\ + "palignr $8, %%xmm5, %%xmm1 \n\t"\ + "palignr $6, %%xmm5, %%xmm2 \n\t"\ + "palignr $4, %%xmm5, %%xmm3 \n\t"\ + "palignr $2, %%xmm5, %%xmm4 \n\t"\ + "paddw %%xmm5, %%xmm0 \n\t"\ + "paddw %%xmm4, %%xmm1 \n\t"\ + "paddw %%xmm3, %%xmm2 \n\t"\ + "movdqa %%xmm5, %%xmm6 \n\t"\ + "movdqa %%xmm5, %%xmm4 \n\t"\ + "movdqa %%xmm5, %%xmm3 \n\t"\ + "palignr $8, %%xmm7, %%xmm4 \n\t"\ + "palignr $2, %%xmm7, %%xmm6 \n\t"\ + "palignr $10, %%xmm7, %%xmm3 \n\t"\ + "paddw %%xmm6, %%xmm4 \n\t"\ + "movdqa %%xmm5, %%xmm6 \n\t"\ + "palignr $6, %%xmm7, %%xmm5 \n\t"\ + "palignr $4, %%xmm7, %%xmm6 \n\t"\ + "paddw %%xmm7, %%xmm3 \n\t"\ + "paddw %%xmm6, %%xmm5 \n\t"\ + \ + "psubw %%xmm1, %%xmm0 \n\t"\ + "psubw %%xmm4, %%xmm3 \n\t"\ + "psraw $2, %%xmm0 \n\t"\ + "psraw $2, %%xmm3 \n\t"\ + "psubw %%xmm1, %%xmm0 \n\t"\ + "psubw %%xmm4, %%xmm3 \n\t"\ + "paddw %%xmm2, %%xmm0 \n\t"\ + "paddw %%xmm5, %%xmm3 \n\t"\ + "psraw $2, %%xmm0 \n\t"\ + "psraw $2, %%xmm3 \n\t"\ + "paddw %%xmm2, %%xmm0 \n\t"\ + "paddw %%xmm5, %%xmm3 \n\t"\ + "psraw $6, %%xmm0 \n\t"\ + "psraw $6, %%xmm3 \n\t"\ + "packuswb %%xmm0, %%xmm3 \n\t"\ + OP(%%xmm3, (%1), %%xmm7, dqa)\ + "add $48, %0 \n\t"\ + "add %3, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(tmp), "+c"(dst), "+g"(h)\ + : "S"((x86_reg)dstStride)\ + : "memory"\ + );\ + }else{\ + __asm__ volatile(\ + "1: \n\t"\ + "movdqa 16(%0), %%xmm1 \n\t"\ + "movdqa (%0), %%xmm0 \n\t"\ + "movdqa %%xmm1, %%xmm2 \n\t"\ + "movdqa %%xmm1, %%xmm3 \n\t"\ + "movdqa %%xmm1, %%xmm4 \n\t"\ + "movdqa %%xmm1, %%xmm5 \n\t"\ + "palignr $10, %%xmm0, %%xmm5 \n\t"\ + "palignr $8, %%xmm0, %%xmm4 \n\t"\ + "palignr $6, %%xmm0, %%xmm3 \n\t"\ + "palignr $4, %%xmm0, %%xmm2 \n\t"\ + "palignr $2, %%xmm0, %%xmm1 \n\t"\ + "paddw %%xmm5, %%xmm0 \n\t"\ + "paddw %%xmm4, %%xmm1 \n\t"\ + "paddw %%xmm3, %%xmm2 \n\t"\ + "psubw %%xmm1, %%xmm0 \n\t"\ + "psraw $2, %%xmm0 \n\t"\ + "psubw %%xmm1, %%xmm0 \n\t"\ + "paddw %%xmm2, %%xmm0 \n\t"\ + "psraw $2, %%xmm0 \n\t"\ + "paddw %%xmm2, %%xmm0 \n\t"\ + "psraw $6, %%xmm0 \n\t"\ + "packuswb %%xmm0, %%xmm0 \n\t"\ + OP(%%xmm0, (%1), %%xmm7, q)\ + "add $48, %0 \n\t"\ + "add %3, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(tmp), "+c"(dst), "+g"(h)\ + : "S"((x86_reg)dstStride)\ + : "memory"\ + );\ + }\ +} + +#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ + put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\ + OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ +}\ +static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\ +}\ +static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\ +}\ + +#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2 +#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2 +#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2 +#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2 +#define 
put_pixels8_l2_ssse3 put_pixels8_l2_mmx2 +#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2 +#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2 +#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2 + +#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2 +#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2 +#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2 +#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2 +#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2 +#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2 +#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2 +#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2 + +#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2 +#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2 +#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2 +#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2 + +#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2 +#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2 +#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2 +#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2 + +#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2 +#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2 + +#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \ +H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\ +H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ +H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ +H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\ + +static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ + put_pixels16_sse2(dst, src, stride, 16); +} +static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ + avg_pixels16_sse2(dst, src, stride, 16); +} +#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2 +#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2 + +#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \ +static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\ +}\ + +#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \ +static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\ +}\ + +#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \ +static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + 
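/* mc03 = (0, 3/4): average the half-pel vertical interpolation with the source shifted down one row */\ +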
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\ +}\ + +#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \ +static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\ + OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ + uint8_t * const halfHV= temp;\ + int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ + assert(((int)temp & 7) == 0);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ + uint8_t * const halfHV= temp;\ + int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ + assert(((int)temp & 7) == 0);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ + uint8_t * const halfHV= temp;\ + int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ + assert(((int)temp & 7) == 0);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ + uint8_t * const halfHV= temp;\ + int16_t * const halfV= (int16_t*)(temp + 
SIZE*SIZE);\ + assert(((int)temp & 7) == 0);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ +}\ + +#define H264_MC_4816(MMX)\ +H264_MC(put_, 4, MMX, 8)\ +H264_MC(put_, 8, MMX, 8)\ +H264_MC(put_, 16,MMX, 8)\ +H264_MC(avg_, 4, MMX, 8)\ +H264_MC(avg_, 8, MMX, 8)\ +H264_MC(avg_, 16,MMX, 8)\ + +#define H264_MC_816(QPEL, XMM)\ +QPEL(put_, 8, XMM, 16)\ +QPEL(put_, 16,XMM, 16)\ +QPEL(avg_, 8, XMM, 16)\ +QPEL(avg_, 16,XMM, 16)\ + + +#define AVG_3DNOW_OP(a,b,temp, size) \ +"mov" #size " " #b ", " #temp " \n\t"\ +"pavgusb " #temp ", " #a " \n\t"\ +"mov" #size " " #a ", " #b " \n\t" +#define AVG_MMX2_OP(a,b,temp, size) \ +"mov" #size " " #b ", " #temp " \n\t"\ +"pavgb " #temp ", " #a " \n\t"\ +"mov" #size " " #a ", " #b " \n\t" + +#define PAVGB "pavgusb" +QPEL_H264(put_, PUT_OP, 3dnow) +QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow) +#undef PAVGB +#define PAVGB "pavgb" +QPEL_H264(put_, PUT_OP, mmx2) +QPEL_H264(avg_, AVG_MMX2_OP, mmx2) +QPEL_H264_V_XMM(put_, PUT_OP, sse2) +QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2) +QPEL_H264_HV_XMM(put_, PUT_OP, sse2) +QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2) +#if HAVE_SSSE3 +QPEL_H264_H_XMM(put_, PUT_OP, ssse3) +QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3) +QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3) +QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3) +QPEL_H264_HV_XMM(put_, PUT_OP, ssse3) +QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3) +#endif +#undef PAVGB + +H264_MC_4816(3dnow) +H264_MC_4816(mmx2) +H264_MC_816(H264_MC_V, sse2) +H264_MC_816(H264_MC_HV, sse2) +#if HAVE_SSSE3 +H264_MC_816(H264_MC_H, ssse3) +H264_MC_816(H264_MC_HV, ssse3) +#endif diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_weight.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_weight.asm new file mode 100644 index 000000000..53aa21047 --- /dev/null +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_weight.asm @@ -0,0 +1,375 @@ +;***************************************************************************** +;* SSE2-optimized weighted prediction code +;***************************************************************************** +;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt +;* Copyright (C) 2010 Eli Friedman +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" + +SECTION .text + +;----------------------------------------------------------------------------- +; biweight pred: +; +; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride, +; int log2_denom, int weightd, int weights, +; int offset); +; and +; void h264_weight_16x16_sse2(uint8_t *dst, int stride, +; int log2_denom, int weight, +; int offset); +;----------------------------------------------------------------------------- + +%macro WEIGHT_SETUP 0 + add r4, r4 + inc r4 + movd m3, r3d + movd m5, r4d + movd m6, r2d + pslld m5, m6 + psrld m5, 1 +%if mmsize == 16 + pshuflw m3, m3, 0 + pshuflw m5, m5, 0 + punpcklqdq m3, m3 + punpcklqdq m5, m5 +%else + pshufw m3, m3, 0 + pshufw m5, m5, 0 +%endif + pxor m7, m7 +%endmacro + +%macro WEIGHT_OP 2 + movh m0, [r0+%1] + movh m1, [r0+%2] + punpcklbw m0, m7 + punpcklbw m1, m7 + pmullw m0, m3 + pmullw m1, m3 + paddsw m0, m5 + paddsw m1, m5 + psraw m0, m6 + psraw m1, m6 + packuswb m0, m1 +%endmacro + +%macro WEIGHT_FUNC_DBL_MM 1 +cglobal h264_weight_16x%1_mmx2, 5, 5, 0 + WEIGHT_SETUP + mov r2, %1 +%if %1 == 16 +.nextrow + WEIGHT_OP 0, 4 + mova [r0 ], m0 + WEIGHT_OP 8, 12 + mova [r0+8], m0 + add r0, r1 + dec r2 + jnz .nextrow + REP_RET +%else + jmp mangle(ff_h264_weight_16x16_mmx2.nextrow) +%endif +%endmacro + +INIT_MMX +WEIGHT_FUNC_DBL_MM 16 +WEIGHT_FUNC_DBL_MM 8 + +%macro WEIGHT_FUNC_MM 4 +cglobal h264_weight_%1x%2_%4, 7, 7, %3 + WEIGHT_SETUP + mov r2, %2 +%if %2 == 16 +.nextrow + WEIGHT_OP 0, mmsize/2 + mova [r0], m0 + add r0, r1 + dec r2 + jnz .nextrow + REP_RET +%else + jmp mangle(ff_h264_weight_%1x16_%4.nextrow) +%endif +%endmacro + +INIT_MMX +WEIGHT_FUNC_MM 8, 16, 0, mmx2 +WEIGHT_FUNC_MM 8, 8, 0, mmx2 +WEIGHT_FUNC_MM 8, 4, 0, mmx2 +INIT_XMM +WEIGHT_FUNC_MM 16, 16, 8, sse2 +WEIGHT_FUNC_MM 16, 8, 8, sse2 + +%macro WEIGHT_FUNC_HALF_MM 5 +cglobal h264_weight_%1x%2_%5, 5, 5, %4 + WEIGHT_SETUP + mov r2, %2/2 + lea r3, [r1*2] +%if %2 == mmsize +.nextrow + WEIGHT_OP 0, r1 + movh [r0], m0 +%if mmsize == 16 + movhps [r0+r1], m0 +%else + psrlq m0, 32 + movh [r0+r1], m0 +%endif + add r0, r3 + dec r2 + jnz .nextrow + REP_RET +%else + jmp mangle(ff_h264_weight_%1x%3_%5.nextrow) +%endif +%endmacro + +INIT_MMX +WEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 +WEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 +WEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 +INIT_XMM +WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 +WEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 +WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 + +%macro BIWEIGHT_SETUP 0 + add r6, 1 + or r6, 1 + add r3, 1 + movd m3, r4d + movd m4, r5d + movd m5, r6d + movd m6, r3d + pslld m5, m6 + psrld m5, 1 +%if mmsize == 16 + pshuflw m3, m3, 0 + pshuflw m4, m4, 0 + pshuflw m5, m5, 0 + punpcklqdq m3, m3 + punpcklqdq m4, m4 + punpcklqdq m5, m5 +%else + pshufw m3, m3, 0 + pshufw m4, m4, 0 + pshufw m5, m5, 0 +%endif + pxor m7, m7 +%endmacro + +%macro BIWEIGHT_STEPA 3 + movh m%1, [r0+%3] + movh m%2, [r1+%3] + punpcklbw m%1, m7 + punpcklbw m%2, m7 + pmullw m%1, m3 + pmullw m%2, m4 + paddsw m%1, m%2 +%endmacro + +%macro BIWEIGHT_STEPB 0 + paddsw m0, m5 + paddsw m1, m5 + psraw m0, m6 + psraw m1, m6 + packuswb m0, m1 +%endmacro + +%macro BIWEIGHT_FUNC_DBL_MM 1 +cglobal h264_biweight_16x%1_mmx2, 7, 7, 0 + BIWEIGHT_SETUP + mov r3, %1 +%if %1 == 16 
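+; editor's annotation (not in upstream FFmpeg): per BIWEIGHT_SETUP above, m3=weightd, m4=weights, m6=log2_denom+1, +; and m5=((offset+1)|1)<<log2_denom, so each pass appears to compute +; clip_uint8((dst*weightd + src*weights + m5) >> (log2_denom+1)), the same rounding as the reference C biweight.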
+.nextrow + BIWEIGHT_STEPA 0, 1, 0 + BIWEIGHT_STEPA 1, 2, 4 + BIWEIGHT_STEPB + mova [r0], m0 + BIWEIGHT_STEPA 0, 1, 8 + BIWEIGHT_STEPA 1, 2, 12 + BIWEIGHT_STEPB + mova [r0+8], m0 + add r0, r2 + add r1, r2 + dec r3 + jnz .nextrow + REP_RET +%else + jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow) +%endif +%endmacro + +INIT_MMX +BIWEIGHT_FUNC_DBL_MM 16 +BIWEIGHT_FUNC_DBL_MM 8 + +%macro BIWEIGHT_FUNC_MM 4 +cglobal h264_biweight_%1x%2_%4, 7, 7, %3 + BIWEIGHT_SETUP + mov r3, %2 +%if %2 == 16 +.nextrow + BIWEIGHT_STEPA 0, 1, 0 + BIWEIGHT_STEPA 1, 2, mmsize/2 + BIWEIGHT_STEPB + mova [r0], m0 + add r0, r2 + add r1, r2 + dec r3 + jnz .nextrow + REP_RET +%else + jmp mangle(ff_h264_biweight_%1x16_%4.nextrow) +%endif +%endmacro + +INIT_MMX +BIWEIGHT_FUNC_MM 8, 16, 0, mmx2 +BIWEIGHT_FUNC_MM 8, 8, 0, mmx2 +BIWEIGHT_FUNC_MM 8, 4, 0, mmx2 +INIT_XMM +BIWEIGHT_FUNC_MM 16, 16, 8, sse2 +BIWEIGHT_FUNC_MM 16, 8, 8, sse2 + +%macro BIWEIGHT_FUNC_HALF_MM 5 +cglobal h264_biweight_%1x%2_%5, 7, 7, %4 + BIWEIGHT_SETUP + mov r3, %2/2 + lea r4, [r2*2] +%if %2 == mmsize +.nextrow + BIWEIGHT_STEPA 0, 1, 0 + BIWEIGHT_STEPA 1, 2, r2 + BIWEIGHT_STEPB + movh [r0], m0 +%if mmsize == 16 + movhps [r0+r2], m0 +%else + psrlq m0, 32 + movh [r0+r2], m0 +%endif + add r0, r4 + add r1, r4 + dec r3 + jnz .nextrow + REP_RET +%else + jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow) +%endif +%endmacro + +INIT_MMX +BIWEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 +BIWEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 +BIWEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 +INIT_XMM +BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 +BIWEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 +BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 + +%macro BIWEIGHT_SSSE3_SETUP 0 + add r6, 1 + or r6, 1 + add r3, 1 + movd m4, r4d + movd m0, r5d + movd m5, r6d + movd m6, r3d + pslld m5, m6 + psrld m5, 1 + punpcklbw m4, m0 + pshuflw m4, m4, 0 + pshuflw m5, m5, 0 + punpcklqdq m4, m4 + punpcklqdq m5, m5 +%endmacro + +%macro BIWEIGHT_SSSE3_OP 0 + pmaddubsw m0, m4 + pmaddubsw m2, m4 + paddsw m0, m5 + paddsw m2, m5 + psraw m0, m6 + psraw m2, m6 + packuswb m0, m2 +%endmacro + +%macro BIWEIGHT_SSSE3_16 1 +cglobal h264_biweight_16x%1_ssse3, 7, 7, 8 + BIWEIGHT_SSSE3_SETUP + mov r3, %1 + +%if %1 == 16 +.nextrow + movh m0, [r0] + movh m2, [r0+8] + movh m3, [r1+8] + punpcklbw m0, [r1] + punpcklbw m2, m3 + BIWEIGHT_SSSE3_OP + mova [r0], m0 + add r0, r2 + add r1, r2 + dec r3 + jnz .nextrow + REP_RET +%else + jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow) +%endif +%endmacro + +INIT_XMM +BIWEIGHT_SSSE3_16 16 +BIWEIGHT_SSSE3_16 8 + +%macro BIWEIGHT_SSSE3_8 1 +cglobal h264_biweight_8x%1_ssse3, 7, 7, 8 + BIWEIGHT_SSSE3_SETUP + mov r3, %1/2 + lea r4, [r2*2] + +%if %1 == 16 +.nextrow + movh m0, [r0] + movh m1, [r1] + movh m2, [r0+r2] + movh m3, [r1+r2] + punpcklbw m0, m1 + punpcklbw m2, m3 + BIWEIGHT_SSSE3_OP + movh [r0], m0 + movhps [r0+r2], m0 + add r0, r4 + add r1, r4 + dec r3 + jnz .nextrow + REP_RET +%else + jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow) +%endif +%endmacro + +INIT_XMM +BIWEIGHT_SSSE3_8 16 +BIWEIGHT_SSSE3_8 8 +BIWEIGHT_SSSE3_8 4 diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_weight_sse2.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_weight_sse2.asm deleted file mode 100644 index 8667f0690..000000000 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_weight_sse2.asm +++ /dev/null @@ -1,170 +0,0 @@ -;***************************************************************************** -;* SSE2-optimized weighted prediction code 
-;***************************************************************************** -;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt -;* Copyright (C) 2010 Eli Friedman -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "x86inc.asm" - -SECTION .text -INIT_XMM - -;----------------------------------------------------------------------------- -; biweight pred: -; -; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride, -; int log2_denom, int weightd, int weights, -; int offset); -;----------------------------------------------------------------------------- - -%macro BIWEIGHT_SSE2_SETUP 0 - add r6, 1 - or r6, 1 - add r3, 1 - movd m3, r4 - movd m4, r5 - movd m5, r6 - movd m6, r3 - pslld m5, m6 - psrld m5, 1 - pshuflw m3, m3, 0 - pshuflw m4, m4, 0 - pshuflw m5, m5, 0 - punpcklqdq m3, m3 - punpcklqdq m4, m4 - punpcklqdq m5, m5 - pxor m7, m7 -%endmacro - -%macro BIWEIGHT_SSE2_STEPA 3 - movh m%1, [r0+%3] - movh m%2, [r1+%3] - punpcklbw m%1, m7 - punpcklbw m%2, m7 - pmullw m%1, m3 - pmullw m%2, m4 - paddsw m%1, m%2 -%endmacro - -%macro BIWEIGHT_SSE2_STEPB 0 - paddsw m0, m5 - paddsw m1, m5 - psraw m0, m6 - psraw m1, m6 - packuswb m0, m1 -%endmacro - -cglobal h264_biweight_16x16_sse2, 7, 7, 8 - BIWEIGHT_SSE2_SETUP - mov r3, 16 - -.nextrow - BIWEIGHT_SSE2_STEPA 0, 1, 0 - BIWEIGHT_SSE2_STEPA 1, 2, 8 - BIWEIGHT_SSE2_STEPB - mova [r0], m0 - add r0, r2 - add r1, r2 - dec r3 - jnz .nextrow - REP_RET - -cglobal h264_biweight_8x8_sse2, 7, 7, 8 - BIWEIGHT_SSE2_SETUP - mov r3, 4 - lea r4, [r2*2] - -.nextrow - BIWEIGHT_SSE2_STEPA 0, 1, 0 - BIWEIGHT_SSE2_STEPA 1, 2, r2 - BIWEIGHT_SSE2_STEPB - movh [r0], m0 - movhps [r0+r2], m0 - add r0, r4 - add r1, r4 - dec r3 - jnz .nextrow - REP_RET - -%macro BIWEIGHT_SSSE3_SETUP 0 - add r6, 1 - or r6, 1 - add r3, 1 - movd m4, r4 - movd m0, r5 - movd m5, r6 - movd m6, r3 - pslld m5, m6 - psrld m5, 1 - punpcklbw m4, m0 - pshuflw m4, m4, 0 - pshuflw m5, m5, 0 - punpcklqdq m4, m4 - punpcklqdq m5, m5 -%endmacro - -%macro BIWEIGHT_SSSE3_OP 0 - pmaddubsw m0, m4 - pmaddubsw m2, m4 - paddsw m0, m5 - paddsw m2, m5 - psraw m0, m6 - psraw m2, m6 - packuswb m0, m2 -%endmacro - -cglobal h264_biweight_16x16_ssse3, 7, 7, 8 - BIWEIGHT_SSSE3_SETUP - mov r3, 16 - -.nextrow - movh m0, [r0] - movh m2, [r0+8] - movh m3, [r1+8] - punpcklbw m0, [r1] - punpcklbw m2, m3 - BIWEIGHT_SSSE3_OP - mova [r0], m0 - add r0, r2 - add r1, r2 - dec r3 - jnz .nextrow - REP_RET - -cglobal h264_biweight_8x8_ssse3, 7, 7, 8 - BIWEIGHT_SSSE3_SETUP - mov r3, 4 - lea r4, [r2*2] - -.nextrow - movh m0, [r0] - movh m1, [r1] - movh m2, [r0+r2] - movh m3, [r1+r2] - punpcklbw m0, m1 - punpcklbw m2, m3 - BIWEIGHT_SSSE3_OP - movh [r0], m0 - movhps [r0+r2], m0 - add r0, r4 - add r1, r4 - dec r3 - jnz 
.nextrow - REP_RET diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264dsp_mmx.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264dsp_mmx.c index 4b2e54603..efd8b78f1 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264dsp_mmx.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264dsp_mmx.c @@ -18,8 +18,10 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "libavutil/cpu.h" +#include "libavutil/x86_cpu.h" +#include "libavcodec/h264dsp.h" #include "dsputil_mmx.h" -#include "libavcodec/h264pred.h" DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL; DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL; @@ -27,772 +29,41 @@ DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL; /***********************************/ /* IDCT */ -#define SUMSUB_BADC( a, b, c, d ) \ - "paddw "#b", "#a" \n\t"\ - "paddw "#d", "#c" \n\t"\ - "paddw "#b", "#b" \n\t"\ - "paddw "#d", "#d" \n\t"\ - "psubw "#a", "#b" \n\t"\ - "psubw "#c", "#d" \n\t" - -#define SUMSUBD2_AB( a, b, t ) \ - "movq "#b", "#t" \n\t"\ - "psraw $1 , "#b" \n\t"\ - "paddw "#a", "#b" \n\t"\ - "psraw $1 , "#a" \n\t"\ - "psubw "#t", "#a" \n\t" - -#define IDCT4_1D( s02, s13, d02, d13, t ) \ - SUMSUB_BA ( s02, d02 )\ - SUMSUBD2_AB( s13, d13, t )\ - SUMSUB_BADC( d13, s02, s13, d02 ) - -#define STORE_DIFF_4P( p, t, z ) \ - "psraw $6, "#p" \n\t"\ - "movd (%0), "#t" \n\t"\ - "punpcklbw "#z", "#t" \n\t"\ - "paddsw "#t", "#p" \n\t"\ - "packuswb "#z", "#p" \n\t"\ - "movd "#p", (%0) \n\t" - -static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride) -{ - /* Load dct coeffs */ - __asm__ volatile( - "movq (%0), %%mm0 \n\t" - "movq 8(%0), %%mm1 \n\t" - "movq 16(%0), %%mm2 \n\t" - "movq 24(%0), %%mm3 \n\t" - :: "r"(block) ); - - __asm__ volatile( - /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */ - IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 ) - - "movq %0, %%mm6 \n\t" - /* in: 1,4,0,2 out: 1,2,3,0 */ - TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 ) - - "paddw %%mm6, %%mm3 \n\t" - - /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */ - IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 ) - - "pxor %%mm7, %%mm7 \n\t" - :: "m"(ff_pw_32)); - - __asm__ volatile( - STORE_DIFF_4P( %%mm0, %%mm1, %%mm7) - "add %1, %0 \n\t" - STORE_DIFF_4P( %%mm2, %%mm1, %%mm7) - "add %1, %0 \n\t" - STORE_DIFF_4P( %%mm3, %%mm1, %%mm7) - "add %1, %0 \n\t" - STORE_DIFF_4P( %%mm4, %%mm1, %%mm7) - : "+r"(dst) - : "r" ((x86_reg)stride) - ); -} - -static inline void h264_idct8_1d(int16_t *block) -{ - __asm__ volatile( - "movq 112(%0), %%mm7 \n\t" - "movq 80(%0), %%mm0 \n\t" - "movq 48(%0), %%mm3 \n\t" - "movq 16(%0), %%mm5 \n\t" - - "movq %%mm0, %%mm4 \n\t" - "movq %%mm5, %%mm1 \n\t" - "psraw $1, %%mm4 \n\t" - "psraw $1, %%mm1 \n\t" - "paddw %%mm0, %%mm4 \n\t" - "paddw %%mm5, %%mm1 \n\t" - "paddw %%mm7, %%mm4 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "psubw %%mm5, %%mm4 \n\t" - "paddw %%mm3, %%mm1 \n\t" - - "psubw %%mm3, %%mm5 \n\t" - "psubw %%mm3, %%mm0 \n\t" - "paddw %%mm7, %%mm5 \n\t" - "psubw %%mm7, %%mm0 \n\t" - "psraw $1, %%mm3 \n\t" - "psraw $1, %%mm7 \n\t" - "psubw %%mm3, %%mm5 \n\t" - "psubw %%mm7, %%mm0 \n\t" - - "movq %%mm4, %%mm3 \n\t" - "movq %%mm1, %%mm7 \n\t" - "psraw $2, %%mm1 \n\t" - "psraw $2, %%mm3 \n\t" - "paddw %%mm5, %%mm3 \n\t" - "psraw $2, %%mm5 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "psraw $2, %%mm0 \n\t" - "psubw %%mm4, %%mm5 \n\t" - "psubw %%mm0, %%mm7 \n\t" - - "movq 32(%0), 
%%mm2 \n\t" - "movq 96(%0), %%mm6 \n\t" - "movq %%mm2, %%mm4 \n\t" - "movq %%mm6, %%mm0 \n\t" - "psraw $1, %%mm4 \n\t" - "psraw $1, %%mm6 \n\t" - "psubw %%mm0, %%mm4 \n\t" - "paddw %%mm2, %%mm6 \n\t" - - "movq (%0), %%mm2 \n\t" - "movq 64(%0), %%mm0 \n\t" - SUMSUB_BA( %%mm0, %%mm2 ) - SUMSUB_BA( %%mm6, %%mm0 ) - SUMSUB_BA( %%mm4, %%mm2 ) - SUMSUB_BA( %%mm7, %%mm6 ) - SUMSUB_BA( %%mm5, %%mm4 ) - SUMSUB_BA( %%mm3, %%mm2 ) - SUMSUB_BA( %%mm1, %%mm0 ) - :: "r"(block) - ); -} - -static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) -{ - int i; - DECLARE_ALIGNED(8, int16_t, b2)[64]; - - block[0] += 32; - - for(i=0; i<2; i++){ - DECLARE_ALIGNED(8, uint64_t, tmp); - - h264_idct8_1d(block+4*i); - - __asm__ volatile( - "movq %%mm7, %0 \n\t" - TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) - "movq %%mm0, 8(%1) \n\t" - "movq %%mm6, 24(%1) \n\t" - "movq %%mm7, 40(%1) \n\t" - "movq %%mm4, 56(%1) \n\t" - "movq %0, %%mm7 \n\t" - TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 ) - "movq %%mm7, (%1) \n\t" - "movq %%mm1, 16(%1) \n\t" - "movq %%mm0, 32(%1) \n\t" - "movq %%mm3, 48(%1) \n\t" - : "=m"(tmp) - : "r"(b2+32*i) - : "memory" - ); - } - - for(i=0; i<2; i++){ - h264_idct8_1d(b2+4*i); - - __asm__ volatile( - "psraw $6, %%mm7 \n\t" - "psraw $6, %%mm6 \n\t" - "psraw $6, %%mm5 \n\t" - "psraw $6, %%mm4 \n\t" - "psraw $6, %%mm3 \n\t" - "psraw $6, %%mm2 \n\t" - "psraw $6, %%mm1 \n\t" - "psraw $6, %%mm0 \n\t" - - "movq %%mm7, (%0) \n\t" - "movq %%mm5, 16(%0) \n\t" - "movq %%mm3, 32(%0) \n\t" - "movq %%mm1, 48(%0) \n\t" - "movq %%mm0, 64(%0) \n\t" - "movq %%mm2, 80(%0) \n\t" - "movq %%mm4, 96(%0) \n\t" - "movq %%mm6, 112(%0) \n\t" - :: "r"(b2+4*i) - : "memory" - ); - } - - add_pixels_clamped_mmx(b2, dst, stride); -} - -#define STORE_DIFF_8P( p, d, t, z )\ - "movq "#d", "#t" \n"\ - "psraw $6, "#p" \n"\ - "punpcklbw "#z", "#t" \n"\ - "paddsw "#t", "#p" \n"\ - "packuswb "#p", "#p" \n"\ - "movq "#p", "#d" \n" - -#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\ - "movdqa "#c", "#a" \n"\ - "movdqa "#g", "#e" \n"\ - "psraw $1, "#c" \n"\ - "psraw $1, "#g" \n"\ - "psubw "#e", "#c" \n"\ - "paddw "#a", "#g" \n"\ - "movdqa "#b", "#e" \n"\ - "psraw $1, "#e" \n"\ - "paddw "#b", "#e" \n"\ - "paddw "#d", "#e" \n"\ - "paddw "#f", "#e" \n"\ - "movdqa "#f", "#a" \n"\ - "psraw $1, "#a" \n"\ - "paddw "#f", "#a" \n"\ - "paddw "#h", "#a" \n"\ - "psubw "#b", "#a" \n"\ - "psubw "#d", "#b" \n"\ - "psubw "#d", "#f" \n"\ - "paddw "#h", "#b" \n"\ - "psubw "#h", "#f" \n"\ - "psraw $1, "#d" \n"\ - "psraw $1, "#h" \n"\ - "psubw "#d", "#b" \n"\ - "psubw "#h", "#f" \n"\ - "movdqa "#e", "#d" \n"\ - "movdqa "#a", "#h" \n"\ - "psraw $2, "#d" \n"\ - "psraw $2, "#h" \n"\ - "paddw "#f", "#d" \n"\ - "paddw "#b", "#h" \n"\ - "psraw $2, "#f" \n"\ - "psraw $2, "#b" \n"\ - "psubw "#f", "#e" \n"\ - "psubw "#a", "#b" \n"\ - "movdqa 0x00(%1), "#a" \n"\ - "movdqa 0x40(%1), "#f" \n"\ - SUMSUB_BA(f, a)\ - SUMSUB_BA(g, f)\ - SUMSUB_BA(c, a)\ - SUMSUB_BA(e, g)\ - SUMSUB_BA(b, c)\ - SUMSUB_BA(h, a)\ - SUMSUB_BA(d, f) - -static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride) -{ - __asm__ volatile( - "movdqa 0x10(%1), %%xmm1 \n" - "movdqa 0x20(%1), %%xmm2 \n" - "movdqa 0x30(%1), %%xmm3 \n" - "movdqa 0x50(%1), %%xmm5 \n" - "movdqa 0x60(%1), %%xmm6 \n" - "movdqa 0x70(%1), %%xmm7 \n" - H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7) - TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1)) - "paddw %4, %%xmm4 \n" - "movdqa %%xmm4, 0x00(%1) \n" - "movdqa %%xmm2, 0x40(%1) 
\n" - H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1) - "movdqa %%xmm6, 0x60(%1) \n" - "movdqa %%xmm7, 0x70(%1) \n" - "pxor %%xmm7, %%xmm7 \n" - STORE_DIFF_8P(%%xmm2, (%0), %%xmm6, %%xmm7) - STORE_DIFF_8P(%%xmm0, (%0,%2), %%xmm6, %%xmm7) - STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7) - STORE_DIFF_8P(%%xmm3, (%0,%3), %%xmm6, %%xmm7) - "lea (%0,%2,4), %0 \n" - STORE_DIFF_8P(%%xmm5, (%0), %%xmm6, %%xmm7) - STORE_DIFF_8P(%%xmm4, (%0,%2), %%xmm6, %%xmm7) - "movdqa 0x60(%1), %%xmm0 \n" - "movdqa 0x70(%1), %%xmm1 \n" - STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7) - STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7) - :"+r"(dst) - :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32) - ); -} - -static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) -{ - int dc = (block[0] + 32) >> 6; - __asm__ volatile( - "movd %0, %%mm0 \n\t" - "pshufw $0, %%mm0, %%mm0 \n\t" - "pxor %%mm1, %%mm1 \n\t" - "psubw %%mm0, %%mm1 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - ::"r"(dc) - ); - __asm__ volatile( - "movd %0, %%mm2 \n\t" - "movd %1, %%mm3 \n\t" - "movd %2, %%mm4 \n\t" - "movd %3, %%mm5 \n\t" - "paddusb %%mm0, %%mm2 \n\t" - "paddusb %%mm0, %%mm3 \n\t" - "paddusb %%mm0, %%mm4 \n\t" - "paddusb %%mm0, %%mm5 \n\t" - "psubusb %%mm1, %%mm2 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm1, %%mm5 \n\t" - "movd %%mm2, %0 \n\t" - "movd %%mm3, %1 \n\t" - "movd %%mm4, %2 \n\t" - "movd %%mm5, %3 \n\t" - :"+m"(*(uint32_t*)(dst+0*stride)), - "+m"(*(uint32_t*)(dst+1*stride)), - "+m"(*(uint32_t*)(dst+2*stride)), - "+m"(*(uint32_t*)(dst+3*stride)) - ); -} - -static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) -{ - int dc = (block[0] + 32) >> 6; - int y; - __asm__ volatile( - "movd %0, %%mm0 \n\t" - "pshufw $0, %%mm0, %%mm0 \n\t" - "pxor %%mm1, %%mm1 \n\t" - "psubw %%mm0, %%mm1 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - ::"r"(dc) - ); - for(y=2; y--; dst += 4*stride){ - __asm__ volatile( - "movq %0, %%mm2 \n\t" - "movq %1, %%mm3 \n\t" - "movq %2, %%mm4 \n\t" - "movq %3, %%mm5 \n\t" - "paddusb %%mm0, %%mm2 \n\t" - "paddusb %%mm0, %%mm3 \n\t" - "paddusb %%mm0, %%mm4 \n\t" - "paddusb %%mm0, %%mm5 \n\t" - "psubusb %%mm1, %%mm2 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm1, %%mm5 \n\t" - "movq %%mm2, %0 \n\t" - "movq %%mm3, %1 \n\t" - "movq %%mm4, %2 \n\t" - "movq %%mm5, %3 \n\t" - :"+m"(*(uint64_t*)(dst+0*stride)), - "+m"(*(uint64_t*)(dst+1*stride)), - "+m"(*(uint64_t*)(dst+2*stride)), - "+m"(*(uint64_t*)(dst+3*stride)) - ); - } -} - -//FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split -static const uint8_t scan8[16 + 2*4]={ - 4+1*8, 5+1*8, 4+2*8, 5+2*8, - 6+1*8, 7+1*8, 6+2*8, 7+2*8, - 4+3*8, 5+3*8, 4+4*8, 5+4*8, - 6+3*8, 7+3*8, 6+4*8, 7+4*8, - 1+1*8, 2+1*8, - 1+2*8, 2+2*8, - 1+4*8, 2+4*8, - 1+5*8, 2+5*8, -}; - -static void ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i++){ - if(nnzc[ scan8[i] ]) - ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride); - } -} - -static void ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i+=4){ - if(nnzc[ scan8[i] ]) - ff_h264_idct8_add_mmx(dst + block_offset[i], block + i*16, stride); - } -} - - -static void 
ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i++){ - int nnz = nnzc[ scan8[i] ]; - if(nnz){ - if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); - else ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride); - } - } -} - -static void ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i++){ - if(nnzc[ scan8[i] ] || block[i*16]) - ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride); - } -} - -static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i++){ - if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride); - else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); - } -} - -static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i+=4){ - int nnz = nnzc[ scan8[i] ]; - if(nnz){ - if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); - else ff_h264_idct8_add_mmx (dst + block_offset[i], block + i*16, stride); - } - } -} - -static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i+=4){ - int nnz = nnzc[ scan8[i] ]; - if(nnz){ - if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride); - else ff_h264_idct8_add_sse2 (dst + block_offset[i], block + i*16, stride); - } - } -} - -static void ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=16; i<16+8; i++){ - if(nnzc[ scan8[i] ] || block[i*16]) - ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); - } -} - -static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=16; i<16+8; i++){ - if(nnzc[ scan8[i] ]) - ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); - else if(block[i*16]) - ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); - } -} - -#if CONFIG_GPL && HAVE_YASM -static void ff_h264_idct_dc_add8_mmx2(uint8_t *dst, int16_t *block, int stride) -{ - __asm__ volatile( - "movd %0, %%mm0 \n\t" // 0 0 X D - "punpcklwd %1, %%mm0 \n\t" // x X d D - "paddsw %2, %%mm0 \n\t" - "psraw $6, %%mm0 \n\t" - "punpcklwd %%mm0, %%mm0 \n\t" // d d D D - "pxor %%mm1, %%mm1 \n\t" // 0 0 0 0 - "psubw %%mm0, %%mm1 \n\t" // -d-d-D-D - "packuswb %%mm1, %%mm0 \n\t" // -d-d-D-D d d D D - "pshufw $0xFA, %%mm0, %%mm1 \n\t" // -d-d-d-d-D-D-D-D - "punpcklwd %%mm0, %%mm0 \n\t" // d d d d D D D D - ::"m"(block[ 0]), - "m"(block[16]), - "m"(ff_pw_32) - ); - __asm__ volatile( - "movq %0, %%mm2 \n\t" - "movq %1, %%mm3 \n\t" - "movq %2, %%mm4 \n\t" - "movq %3, %%mm5 \n\t" - "paddusb %%mm0, %%mm2 \n\t" - "paddusb %%mm0, %%mm3 \n\t" - "paddusb %%mm0, %%mm4 \n\t" - "paddusb %%mm0, %%mm5 \n\t" - "psubusb %%mm1, %%mm2 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm1, %%mm5 \n\t" - "movq %%mm2, %0 \n\t" - "movq %%mm3, %1 \n\t" - "movq %%mm4, %2 \n\t" - "movq %%mm5, %3 \n\t" - 
:"+m"(*(uint64_t*)(dst+0*stride)), - "+m"(*(uint64_t*)(dst+1*stride)), - "+m"(*(uint64_t*)(dst+2*stride)), - "+m"(*(uint64_t*)(dst+3*stride)) - ); -} - -extern void ff_x264_add8x4_idct_sse2(uint8_t *dst, int16_t *block, int stride); - -static void ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i+=2) - if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ]) - ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride); -} - -static void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=0; i<16; i+=2){ - if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ]) - ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride); - else if(block[i*16]|block[i*16+16]) - ff_h264_idct_dc_add8_mmx2(dst + block_offset[i], block + i*16, stride); - } -} - -static void ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ - int i; - for(i=16; i<16+8; i+=2){ - if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ]) - ff_x264_add8x4_idct_sse2 (dest[(i&4)>>2] + block_offset[i], block + i*16, stride); - else if(block[i*16]|block[i*16+16]) - ff_h264_idct_dc_add8_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); - } -} -#endif +void ff_h264_idct_add_mmx (uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct8_add_mmx (uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct8_add_sse2 (uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct_dc_add_mmx2 (uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride); + +void ff_h264_idct_add16_mmx (uint8_t *dst, const int *block_offset, + DCTELEM *block, int stride, const uint8_t nnzc[6*8]); +void ff_h264_idct8_add4_mmx (uint8_t *dst, const int *block_offset, + DCTELEM *block, int stride, const uint8_t nnzc[6*8]); +void ff_h264_idct_add16_mmx2 (uint8_t *dst, const int *block_offset, + DCTELEM *block, int stride, const uint8_t nnzc[6*8]); +void ff_h264_idct_add16intra_mmx (uint8_t *dst, const int *block_offset, + DCTELEM *block, int stride, const uint8_t nnzc[6*8]); +void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, + DCTELEM *block, int stride, const uint8_t nnzc[6*8]); +void ff_h264_idct8_add4_mmx2 (uint8_t *dst, const int *block_offset, + DCTELEM *block, int stride, const uint8_t nnzc[6*8]); +void ff_h264_idct8_add4_sse2 (uint8_t *dst, const int *block_offset, + DCTELEM *block, int stride, const uint8_t nnzc[6*8]); +void ff_h264_idct_add8_mmx (uint8_t **dest, const int *block_offset, + DCTELEM *block, int stride, const uint8_t nnzc[6*8]); +void ff_h264_idct_add8_mmx2 (uint8_t **dest, const int *block_offset, + DCTELEM *block, int stride, const uint8_t nnzc[6*8]); + +void ff_h264_idct_add16_sse2 (uint8_t *dst, const int *block_offset, DCTELEM *block, + int stride, const uint8_t nnzc[6*8]); +void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, + int stride, const uint8_t nnzc[6*8]); +void ff_h264_idct_add8_sse2 (uint8_t **dest, const int *block_offset, DCTELEM *block, + int stride, const uint8_t nnzc[6*8]); /***********************************/ /* deblocking */ -// out: o = |x-y|>a -// clobbers: t -#define DIFF_GT_MMX(x,y,a,o,t)\ - "movq "#y", "#t" \n\t"\ - "movq "#x", "#o" \n\t"\ - "psubusb "#x", "#t" \n\t"\ - "psubusb "#y", "#o" \n\t"\ - "por "#t", "#o" \n\t"\ - "psubusb "#a", "#o" \n\t" - -// out: 
o = |x-y|>a -// clobbers: t -#define DIFF_GT2_MMX(x,y,a,o,t)\ - "movq "#y", "#t" \n\t"\ - "movq "#x", "#o" \n\t"\ - "psubusb "#x", "#t" \n\t"\ - "psubusb "#y", "#o" \n\t"\ - "psubusb "#a", "#t" \n\t"\ - "psubusb "#a", "#o" \n\t"\ - "pcmpeqb "#t", "#o" \n\t"\ - -// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 -// out: mm5=beta-1, mm7=mask -// clobbers: mm4,mm6 -#define H264_DEBLOCK_MASK(alpha1, beta1) \ - "pshufw $0, "#alpha1", %%mm4 \n\t"\ - "pshufw $0, "#beta1 ", %%mm5 \n\t"\ - "packuswb %%mm4, %%mm4 \n\t"\ - "packuswb %%mm5, %%mm5 \n\t"\ - DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\ - DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\ - "por %%mm4, %%mm7 \n\t"\ - DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\ - "por %%mm4, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "pcmpeqb %%mm6, %%mm7 \n\t" - -// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) -// out: mm1=p0' mm2=q0' -// clobbers: mm0,3-6 -#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\ - "movq %%mm1 , %%mm5 \n\t"\ - "pxor %%mm2 , %%mm5 \n\t" /* p0^q0*/\ - "pand "#pb_01" , %%mm5 \n\t" /* (p0^q0)&1*/\ - "pcmpeqb %%mm4 , %%mm4 \n\t"\ - "pxor %%mm4 , %%mm3 \n\t"\ - "pavgb %%mm0 , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\ - "pavgb "MANGLE(ff_pb_3)" , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\ - "pxor %%mm1 , %%mm4 \n\t"\ - "pavgb %%mm2 , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\ - "pavgb %%mm5 , %%mm3 \n\t"\ - "paddusb %%mm4 , %%mm3 \n\t" /* d+128+33*/\ - "movq "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\ - "psubusb %%mm3 , %%mm6 \n\t"\ - "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\ - "pminub %%mm7 , %%mm6 \n\t"\ - "pminub %%mm7 , %%mm3 \n\t"\ - "psubusb %%mm6 , %%mm1 \n\t"\ - "psubusb %%mm3 , %%mm2 \n\t"\ - "paddusb %%mm3 , %%mm1 \n\t"\ - "paddusb %%mm6 , %%mm2 \n\t" - -// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=ff_bone -// out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) -// clobbers: q2, tmp, tc0 -#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\ - "movq %%mm1, "#tmp" \n\t"\ - "pavgb %%mm2, "#tmp" \n\t"\ - "pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\ - "pxor "q2addr", "#tmp" \n\t"\ - "pand %9, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\ - "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\ - "movq "#p1", "#tmp" \n\t"\ - "psubusb "#tc0", "#tmp" \n\t"\ - "paddusb "#p1", "#tc0" \n\t"\ - "pmaxub "#tmp", "#q2" \n\t"\ - "pminub "#tc0", "#q2" \n\t"\ - "movq "#q2", "q1addr" \n\t" - -static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) -{ - DECLARE_ALIGNED(8, uint64_t, tmp0)[2]; - - __asm__ volatile( - "movq (%2,%4), %%mm0 \n\t" //p1 - "movq (%2,%4,2), %%mm1 \n\t" //p0 - "movq (%3), %%mm2 \n\t" //q0 - "movq (%3,%4), %%mm3 \n\t" //q1 - H264_DEBLOCK_MASK(%7, %8) - - "movd %6, %%mm4 \n\t" - "punpcklbw %%mm4, %%mm4 \n\t" - "punpcklwd %%mm4, %%mm4 \n\t" - "pcmpeqb %%mm3, %%mm3 \n\t" - "movq %%mm4, %%mm6 \n\t" - "pcmpgtb %%mm3, %%mm4 \n\t" - "movq %%mm6, %1 \n\t" - "pand %%mm4, %%mm7 \n\t" - "movq %%mm7, %0 \n\t" - - /* filter p1 */ - "movq (%2), %%mm3 \n\t" //p2 - DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1 - "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|beta-1 - "pand %0, %%mm6 \n\t" - "movq %1, %%mm5 \n\t" // can be merged with the and below but is slower then - "pand %%mm6, %%mm5 \n\t" - "psubb %%mm6, %%mm7 \n\t" - "movq (%3,%4), %%mm3 \n\t" - H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%3,%4,2)", "(%3,%4)", %%mm5, %%mm6) - - /* filter p0, q0 */ - H264_DEBLOCK_P0_Q0(%9, unused) - "movq %%mm1, 
(%2,%4,2) \n\t" - "movq %%mm2, (%3) \n\t" - - : "=m"(tmp0[0]), "=m"(tmp0[1]) - : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride), - "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1), - "m"(ff_bone) - ); -} - -static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) -{ - if((tc0[0] & tc0[1]) >= 0) - h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0); - if((tc0[2] & tc0[3]) >= 0) - h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2); -} -static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) -{ - //FIXME: could cut some load/stores by merging transpose with filter - // also, it only needs to transpose 6x8 - DECLARE_ALIGNED(8, uint8_t, trans)[8*8]; - int i; - for(i=0; i<2; i++, pix+=8*stride, tc0+=2) { - if((tc0[0] & tc0[1]) < 0) - continue; - transpose4x4(trans, pix-4, 8, stride); - transpose4x4(trans +4*8, pix, 8, stride); - transpose4x4(trans+4, pix-4+4*stride, 8, stride); - transpose4x4(trans+4+4*8, pix +4*stride, 8, stride); - h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0); - transpose4x4(pix-2, trans +2*8, stride, 8); - transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8); - } -} - -static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) -{ - __asm__ volatile( - "movq (%0), %%mm0 \n\t" //p1 - "movq (%0,%2), %%mm1 \n\t" //p0 - "movq (%1), %%mm2 \n\t" //q0 - "movq (%1,%2), %%mm3 \n\t" //q1 - H264_DEBLOCK_MASK(%4, %5) - "movd %3, %%mm6 \n\t" - "punpcklbw %%mm6, %%mm6 \n\t" - "pand %%mm6, %%mm7 \n\t" // mm7 = tc&mask - H264_DEBLOCK_P0_Q0(%6, %7) - "movq %%mm1, (%0,%2) \n\t" - "movq %%mm2, (%1) \n\t" - - :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride), - "r"(*(uint32_t*)tc0), - "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F) - ); -} - -static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) -{ - h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0); -} - -static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) -{ - //FIXME: could cut some load/stores by merging transpose with filter - DECLARE_ALIGNED(8, uint8_t, trans)[8*4]; - transpose4x4(trans, pix-2, 8, stride); - transpose4x4(trans+4, pix-2+4*stride, 8, stride); - h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0); - transpose4x4(pix-2, trans, stride, 8); - transpose4x4(pix-2+4*stride, trans+4, stride, 8); -} - -// p0 = (p0 + q1 + 2*p1 + 2) >> 2 -#define H264_FILTER_CHROMA4(p0, p1, q1, one) \ - "movq "#p0", %%mm4 \n\t"\ - "pxor "#q1", %%mm4 \n\t"\ - "pand "#one", %%mm4 \n\t" /* mm4 = (p0^q1)&1 */\ - "pavgb "#q1", "#p0" \n\t"\ - "psubusb %%mm4, "#p0" \n\t"\ - "pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\ - -static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1) -{ - __asm__ volatile( - "movq (%0), %%mm0 \n\t" - "movq (%0,%2), %%mm1 \n\t" - "movq (%1), %%mm2 \n\t" - "movq (%1,%2), %%mm3 \n\t" - H264_DEBLOCK_MASK(%3, %4) - "movq %%mm1, %%mm5 \n\t" - "movq %%mm2, %%mm6 \n\t" - H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0' - H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0' - "psubb %%mm5, %%mm1 \n\t" - "psubb %%mm6, %%mm2 \n\t" - "pand %%mm7, %%mm1 \n\t" - "pand %%mm7, %%mm2 \n\t" - "paddb %%mm5, %%mm1 \n\t" - "paddb %%mm6, %%mm2 \n\t" - "movq %%mm1, (%0,%2) \n\t" - "movq %%mm2, (%1) \n\t" - :: "r"(pix-2*stride), "r"(pix), 
"r"((x86_reg)stride), - "m"(alpha1), "m"(beta1), "m"(ff_bone) - ); -} - -static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) -{ - h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1); -} - -static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) -{ - //FIXME: could cut some load/stores by merging transpose with filter - DECLARE_ALIGNED(8, uint8_t, trans)[8*4]; - transpose4x4(trans, pix-2, 8, stride); - transpose4x4(trans+4, pix-2+4*stride, 8, stride); - h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1); - transpose4x4(pix-2, trans, stride, 8); - transpose4x4(pix-2+4*stride, trans+4, stride, 8); -} - static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { int dir; @@ -917,1507 +188,162 @@ static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40] ); } -/***********************************/ -/* motion compensation */ - -#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\ - "mov"#q" "#C", "#T" \n\t"\ - "mov"#d" (%0), "#F" \n\t"\ - "paddw "#D", "#T" \n\t"\ - "psllw $2, "#T" \n\t"\ - "psubw "#B", "#T" \n\t"\ - "psubw "#E", "#T" \n\t"\ - "punpcklbw "#Z", "#F" \n\t"\ - "pmullw %4, "#T" \n\t"\ - "paddw %5, "#A" \n\t"\ - "add %2, %0 \n\t"\ - "paddw "#F", "#A" \n\t"\ - "paddw "#A", "#T" \n\t"\ - "psraw $5, "#T" \n\t"\ - "packuswb "#T", "#T" \n\t"\ - OP(T, (%1), A, d)\ - "add %3, %1 \n\t" - -#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\ - "mov"#q" "#C", "#T" \n\t"\ - "mov"#d" (%0), "#F" \n\t"\ - "paddw "#D", "#T" \n\t"\ - "psllw $2, "#T" \n\t"\ - "paddw %4, "#A" \n\t"\ - "psubw "#B", "#T" \n\t"\ - "psubw "#E", "#T" \n\t"\ - "punpcklbw "#Z", "#F" \n\t"\ - "pmullw %3, "#T" \n\t"\ - "paddw "#F", "#A" \n\t"\ - "add %2, %0 \n\t"\ - "paddw "#A", "#T" \n\t"\ - "mov"#q" "#T", "#OF"(%1) \n\t" - -#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q) -#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q) -#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa) -#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa) - - -#define QPEL_H264(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - int h=4;\ -\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\ - "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\ - "1: \n\t"\ - "movd -1(%0), %%mm1 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "movd 1(%0), %%mm3 \n\t"\ - "movd 2(%0), %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "paddw %%mm0, %%mm1 \n\t"\ - "paddw %%mm3, %%mm2 \n\t"\ - "movd -2(%0), %%mm0 \n\t"\ - "movd 3(%0), %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "paddw %%mm3, %%mm0 \n\t"\ - "psllw $2, %%mm2 \n\t"\ - "psubw %%mm1, %%mm2 \n\t"\ - "pmullw %%mm4, %%mm2 \n\t"\ - "paddw %%mm5, %%mm0 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm6, d)\ - "add %3, %0 \n\t"\ - "add %4, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+g"(h)\ - : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ - : "memory"\ - );\ -}\ -static av_noinline void OPNAME ## 
h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - int h=4;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movq %0, %%mm4 \n\t"\ - "movq %1, %%mm5 \n\t"\ - :: "m"(ff_pw_5), "m"(ff_pw_16)\ - );\ - do{\ - __asm__ volatile(\ - "movd -1(%0), %%mm1 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "movd 1(%0), %%mm3 \n\t"\ - "movd 2(%0), %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "paddw %%mm0, %%mm1 \n\t"\ - "paddw %%mm3, %%mm2 \n\t"\ - "movd -2(%0), %%mm0 \n\t"\ - "movd 3(%0), %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "paddw %%mm3, %%mm0 \n\t"\ - "psllw $2, %%mm2 \n\t"\ - "psubw %%mm1, %%mm2 \n\t"\ - "pmullw %%mm4, %%mm2 \n\t"\ - "paddw %%mm5, %%mm0 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "movd (%2), %%mm3 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ - PAVGB" %%mm3, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm6, d)\ - "add %4, %0 \n\t"\ - "add %4, %1 \n\t"\ - "add %3, %2 \n\t"\ - : "+a"(src), "+c"(dst), "+d"(src2)\ - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\ - : "memory"\ - );\ - }while(--h);\ -}\ -static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - src -= 2*srcStride;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd (%0), %%mm0 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm1 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm3 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ - QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ - QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ - QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ -}\ -static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - int h=4;\ - int w=3;\ - src -= 2*srcStride+2;\ - while(w--){\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd (%0), %%mm0 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm1 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm3 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\ - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\ - QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\ - QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\ - \ - : "+a"(src)\ - : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ - tmp += 4;\ - src += 4 - 9*srcStride;\ - }\ - tmp -= 3*4;\ - __asm__ volatile(\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "paddw 10(%0), %%mm0 \n\t"\ - "movq 2(%0), %%mm1 \n\t"\ - "paddw 8(%0), %%mm1 \n\t"\ - "movq 4(%0), %%mm2 \n\t"\ - "paddw 6(%0), %%mm2 \n\t"\ - "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\ - "psraw 
$2, %%mm0 \n\t"/*(a-b)/4 */\ - "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\ - "paddsw %%mm2, %%mm0 \n\t"\ - "psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\ - "paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\ - "psraw $6, %%mm0 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm7, d)\ - "add $24, %0 \n\t"\ - "add %3, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(tmp), "+c"(dst), "+g"(h)\ - : "S"((x86_reg)dstStride)\ - : "memory"\ - );\ -}\ -\ -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq 1(%0), %%mm2 \n\t"\ - "movq %%mm0, %%mm1 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpckhbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm3, %%mm1 \n\t"\ - "psllw $2, %%mm0 \n\t"\ - "psllw $2, %%mm1 \n\t"\ - "movq -1(%0), %%mm2 \n\t"\ - "movq 2(%0), %%mm4 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "movq %%mm4, %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - "punpckhbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm4, %%mm2 \n\t"\ - "paddw %%mm3, %%mm5 \n\t"\ - "psubw %%mm2, %%mm0 \n\t"\ - "psubw %%mm5, %%mm1 \n\t"\ - "pmullw %%mm6, %%mm0 \n\t"\ - "pmullw %%mm6, %%mm1 \n\t"\ - "movd -2(%0), %%mm2 \n\t"\ - "movd 7(%0), %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm3, %%mm2 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\ - "paddw %%mm5, %%mm2 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm4, %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm5, q)\ - "add %3, %0 \n\t"\ - "add %4, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+g"(h)\ - : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ - : "memory"\ - );\ -}\ -\ -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movq %0, %%mm6 \n\t"\ - :: "m"(ff_pw_5)\ - );\ - do{\ - __asm__ volatile(\ - "movq (%0), %%mm0 \n\t"\ - "movq 1(%0), %%mm2 \n\t"\ - "movq %%mm0, %%mm1 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpckhbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm3, %%mm1 \n\t"\ - "psllw $2, %%mm0 \n\t"\ - "psllw $2, %%mm1 \n\t"\ - "movq -1(%0), %%mm2 \n\t"\ - "movq 2(%0), %%mm4 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "movq %%mm4, %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - "punpckhbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm4, %%mm2 \n\t"\ - "paddw %%mm3, %%mm5 \n\t"\ - "psubw %%mm2, %%mm0 \n\t"\ - "psubw %%mm5, %%mm1 \n\t"\ - "pmullw %%mm6, %%mm0 \n\t"\ - "pmullw %%mm6, %%mm1 \n\t"\ - "movd -2(%0), %%mm2 \n\t"\ - "movd 7(%0), %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm3, %%mm2 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - "movq %5, %%mm5 \n\t"\ - "paddw %%mm5, %%mm2 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm4, %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "movq (%2), 
%%mm4 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - PAVGB" %%mm4, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm5, q)\ - "add %4, %0 \n\t"\ - "add %4, %1 \n\t"\ - "add %3, %2 \n\t"\ - : "+a"(src), "+c"(dst), "+d"(src2)\ - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ - "m"(ff_pw_16)\ - : "memory"\ - );\ - }while(--h);\ -}\ -\ -static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - int w= 2;\ - src -= 2*srcStride;\ - \ - while(w--){\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd (%0), %%mm0 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm1 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm3 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ - QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ - QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ - QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ - QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ - QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ - QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ - QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ - if(h==16){\ - __asm__ volatile(\ - QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ - QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ - QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ - QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ - QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ - QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ - QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ - QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ - }\ - src += 4-(h+5)*srcStride;\ - dst += 4-h*dstStride;\ - }\ -}\ -static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\ - int w = (size+8)>>2;\ - src -= 2*srcStride+2;\ - while(w--){\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd (%0), %%mm0 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm1 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm3 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\ - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\ - QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\ - QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\ - QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\ - QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\ - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\ - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\ - : "+a"(src)\ - : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : 
"memory"\ - );\ - if(size==16){\ - __asm__ volatile(\ - QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\ - QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\ - QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\ - QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\ - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\ - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\ - QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\ - QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\ - : "+a"(src)\ - : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ - }\ - tmp += 4;\ - src += 4 - (size+5)*srcStride;\ - }\ -}\ -static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ - int w = size>>4;\ - do{\ - int h = size;\ - __asm__ volatile(\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq 8(%0), %%mm3 \n\t"\ - "movq 2(%0), %%mm1 \n\t"\ - "movq 10(%0), %%mm4 \n\t"\ - "paddw %%mm4, %%mm0 \n\t"\ - "paddw %%mm3, %%mm1 \n\t"\ - "paddw 18(%0), %%mm3 \n\t"\ - "paddw 16(%0), %%mm4 \n\t"\ - "movq 4(%0), %%mm2 \n\t"\ - "movq 12(%0), %%mm5 \n\t"\ - "paddw 6(%0), %%mm2 \n\t"\ - "paddw 14(%0), %%mm5 \n\t"\ - "psubw %%mm1, %%mm0 \n\t"\ - "psubw %%mm4, %%mm3 \n\t"\ - "psraw $2, %%mm0 \n\t"\ - "psraw $2, %%mm3 \n\t"\ - "psubw %%mm1, %%mm0 \n\t"\ - "psubw %%mm4, %%mm3 \n\t"\ - "paddsw %%mm2, %%mm0 \n\t"\ - "paddsw %%mm5, %%mm3 \n\t"\ - "psraw $2, %%mm0 \n\t"\ - "psraw $2, %%mm3 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm5, %%mm3 \n\t"\ - "psraw $6, %%mm0 \n\t"\ - "psraw $6, %%mm3 \n\t"\ - "packuswb %%mm3, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm7, q)\ - "add $48, %0 \n\t"\ - "add %3, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(tmp), "+c"(dst), "+g"(h)\ - : "S"((x86_reg)dstStride)\ - : "memory"\ - );\ - tmp += 8 - size*24;\ - dst += 8 - size*dstStride;\ - }while(w--);\ -}\ -\ -static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ -}\ -static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ -}\ -\ -static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ - src += 8*srcStride;\ - dst += 8*dstStride;\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ -}\ -\ -static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ - src += 8*dstStride;\ - dst += 8*dstStride;\ - src2 += 8*src2Stride;\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ -}\ -\ 
-static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ - put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\ - OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ -}\ -static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\ -}\ -\ -static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\ -}\ -\ -static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ -{\ - __asm__ volatile(\ - "movq (%1), %%mm0 \n\t"\ - "movq 24(%1), %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ - "packuswb %%mm1, %%mm1 \n\t"\ - PAVGB" (%0), %%mm0 \n\t"\ - PAVGB" (%0,%3), %%mm1 \n\t"\ - OP(%%mm0, (%2), %%mm4, d)\ - OP(%%mm1, (%2,%4), %%mm5, d)\ - "lea (%0,%3,2), %0 \n\t"\ - "lea (%2,%4,2), %2 \n\t"\ - "movq 48(%1), %%mm0 \n\t"\ - "movq 72(%1), %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ - "packuswb %%mm1, %%mm1 \n\t"\ - PAVGB" (%0), %%mm0 \n\t"\ - PAVGB" (%0,%3), %%mm1 \n\t"\ - OP(%%mm0, (%2), %%mm4, d)\ - OP(%%mm1, (%2,%4), %%mm5, d)\ - :"+a"(src8), "+c"(src16), "+d"(dst)\ - :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\ - :"memory");\ -}\ -static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ -{\ - do{\ - __asm__ volatile(\ - "movq (%1), %%mm0 \n\t"\ - "movq 8(%1), %%mm1 \n\t"\ - "movq 48(%1), %%mm2 \n\t"\ - "movq 8+48(%1), %%mm3 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "psraw $5, %%mm2 \n\t"\ - "psraw $5, %%mm3 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - "packuswb %%mm3, %%mm2 \n\t"\ - PAVGB" (%0), %%mm0 \n\t"\ - PAVGB" (%0,%3), %%mm2 \n\t"\ - OP(%%mm0, (%2), %%mm5, q)\ - OP(%%mm2, (%2,%4), %%mm5, q)\ - ::"a"(src8), "c"(src16), "d"(dst),\ - "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\ - :"memory");\ - src8 += 2L*src8Stride;\ - src16 += 48;\ - dst += 2L*dstStride;\ - }while(h-=2);\ -}\ -static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ -{\ - OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\ - OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\ -}\ - - -#if ARCH_X86_64 -#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - int h=16;\ - __asm__ volatile(\ - "pxor %%xmm15, %%xmm15 \n\t"\ - "movdqa %6, %%xmm14 \n\t"\ - "movdqa %7, %%xmm13 \n\t"\ - "1: \n\t"\ - "lddqu 6(%0), %%xmm1 \n\t"\ - "lddqu -2(%0), %%xmm7 \n\t"\ - "movdqa %%xmm1, %%xmm0 \n\t"\ - "punpckhbw %%xmm15, %%xmm1 \n\t"\ - "punpcklbw %%xmm15, %%xmm0 \n\t"\ - "punpcklbw %%xmm15, %%xmm7 \n\t"\ - "movdqa %%xmm1, %%xmm2 \n\t"\ - "movdqa %%xmm0, %%xmm6 \n\t"\ - "movdqa %%xmm1, %%xmm3 \n\t"\ - "movdqa %%xmm0, %%xmm8 \n\t"\ - "movdqa %%xmm1, 
%%xmm4 \n\t"\ - "movdqa %%xmm0, %%xmm9 \n\t"\ - "movdqa %%xmm0, %%xmm12 \n\t"\ - "movdqa %%xmm1, %%xmm11 \n\t"\ - "palignr $10,%%xmm0, %%xmm11\n\t"\ - "palignr $10,%%xmm7, %%xmm12\n\t"\ - "palignr $2, %%xmm0, %%xmm4 \n\t"\ - "palignr $2, %%xmm7, %%xmm9 \n\t"\ - "palignr $4, %%xmm0, %%xmm3 \n\t"\ - "palignr $4, %%xmm7, %%xmm8 \n\t"\ - "palignr $6, %%xmm0, %%xmm2 \n\t"\ - "palignr $6, %%xmm7, %%xmm6 \n\t"\ - "paddw %%xmm0 ,%%xmm11 \n\t"\ - "palignr $8, %%xmm0, %%xmm1 \n\t"\ - "palignr $8, %%xmm7, %%xmm0 \n\t"\ - "paddw %%xmm12,%%xmm7 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "paddw %%xmm8, %%xmm6 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "paddw %%xmm9, %%xmm0 \n\t"\ - "psllw $2, %%xmm2 \n\t"\ - "psllw $2, %%xmm6 \n\t"\ - "psubw %%xmm1, %%xmm2 \n\t"\ - "psubw %%xmm0, %%xmm6 \n\t"\ - "paddw %%xmm13,%%xmm11 \n\t"\ - "paddw %%xmm13,%%xmm7 \n\t"\ - "pmullw %%xmm14,%%xmm2 \n\t"\ - "pmullw %%xmm14,%%xmm6 \n\t"\ - "lddqu (%2), %%xmm3 \n\t"\ - "paddw %%xmm11,%%xmm2 \n\t"\ - "paddw %%xmm7, %%xmm6 \n\t"\ - "psraw $5, %%xmm2 \n\t"\ - "psraw $5, %%xmm6 \n\t"\ - "packuswb %%xmm2,%%xmm6 \n\t"\ - "pavgb %%xmm3, %%xmm6 \n\t"\ - OP(%%xmm6, (%1), %%xmm4, dqa)\ - "add %5, %0 \n\t"\ - "add %5, %1 \n\t"\ - "add %4, %2 \n\t"\ - "decl %3 \n\t"\ - "jg 1b \n\t"\ - : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\ - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ - "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ -} -#else // ARCH_X86_64 -#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ - src += 8*dstStride;\ - dst += 8*dstStride;\ - src2 += 8*src2Stride;\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ -} -#endif // ARCH_X86_64 - -#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%xmm7, %%xmm7 \n\t"\ - "movdqa %0, %%xmm6 \n\t"\ - :: "m"(ff_pw_5)\ - );\ - do{\ - __asm__ volatile(\ - "lddqu -2(%0), %%xmm1 \n\t"\ - "movdqa %%xmm1, %%xmm0 \n\t"\ - "punpckhbw %%xmm7, %%xmm1 \n\t"\ - "punpcklbw %%xmm7, %%xmm0 \n\t"\ - "movdqa %%xmm1, %%xmm2 \n\t"\ - "movdqa %%xmm1, %%xmm3 \n\t"\ - "movdqa %%xmm1, %%xmm4 \n\t"\ - "movdqa %%xmm1, %%xmm5 \n\t"\ - "palignr $2, %%xmm0, %%xmm4 \n\t"\ - "palignr $4, %%xmm0, %%xmm3 \n\t"\ - "palignr $6, %%xmm0, %%xmm2 \n\t"\ - "palignr $8, %%xmm0, %%xmm1 \n\t"\ - "palignr $10,%%xmm0, %%xmm5 \n\t"\ - "paddw %%xmm5, %%xmm0 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "psllw $2, %%xmm2 \n\t"\ - "movq (%2), %%xmm3 \n\t"\ - "psubw %%xmm1, %%xmm2 \n\t"\ - "paddw %5, %%xmm0 \n\t"\ - "pmullw %%xmm6, %%xmm2 \n\t"\ - "paddw %%xmm0, %%xmm2 \n\t"\ - "psraw $5, %%xmm2 \n\t"\ - "packuswb %%xmm2, %%xmm2 \n\t"\ - "pavgb %%xmm3, %%xmm2 \n\t"\ - OP(%%xmm2, (%1), %%xmm4, q)\ - "add %4, %0 \n\t"\ - "add %4, %1 \n\t"\ - "add %3, %2 \n\t"\ - : "+a"(src), "+c"(dst), "+d"(src2)\ - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ - "m"(ff_pw_16)\ - : "memory"\ - );\ - }while(--h);\ -}\ -QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ -\ -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ 
## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%xmm7, %%xmm7 \n\t"\ - "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\ - "1: \n\t"\ - "lddqu -2(%0), %%xmm1 \n\t"\ - "movdqa %%xmm1, %%xmm0 \n\t"\ - "punpckhbw %%xmm7, %%xmm1 \n\t"\ - "punpcklbw %%xmm7, %%xmm0 \n\t"\ - "movdqa %%xmm1, %%xmm2 \n\t"\ - "movdqa %%xmm1, %%xmm3 \n\t"\ - "movdqa %%xmm1, %%xmm4 \n\t"\ - "movdqa %%xmm1, %%xmm5 \n\t"\ - "palignr $2, %%xmm0, %%xmm4 \n\t"\ - "palignr $4, %%xmm0, %%xmm3 \n\t"\ - "palignr $6, %%xmm0, %%xmm2 \n\t"\ - "palignr $8, %%xmm0, %%xmm1 \n\t"\ - "palignr $10,%%xmm0, %%xmm5 \n\t"\ - "paddw %%xmm5, %%xmm0 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "psllw $2, %%xmm2 \n\t"\ - "psubw %%xmm1, %%xmm2 \n\t"\ - "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\ - "pmullw %%xmm6, %%xmm2 \n\t"\ - "paddw %%xmm0, %%xmm2 \n\t"\ - "psraw $5, %%xmm2 \n\t"\ - "packuswb %%xmm2, %%xmm2 \n\t"\ - OP(%%xmm2, (%1), %%xmm4, q)\ - "add %3, %0 \n\t"\ - "add %4, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+g"(h)\ - : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ - : "memory"\ - );\ -}\ -static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ - src += 8*srcStride;\ - dst += 8*dstStride;\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ -}\ - -#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - src -= 2*srcStride;\ - \ - __asm__ volatile(\ - "pxor %%xmm7, %%xmm7 \n\t"\ - "movq (%0), %%xmm0 \n\t"\ - "add %2, %0 \n\t"\ - "movq (%0), %%xmm1 \n\t"\ - "add %2, %0 \n\t"\ - "movq (%0), %%xmm2 \n\t"\ - "add %2, %0 \n\t"\ - "movq (%0), %%xmm3 \n\t"\ - "add %2, %0 \n\t"\ - "movq (%0), %%xmm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%xmm7, %%xmm0 \n\t"\ - "punpcklbw %%xmm7, %%xmm1 \n\t"\ - "punpcklbw %%xmm7, %%xmm2 \n\t"\ - "punpcklbw %%xmm7, %%xmm3 \n\t"\ - "punpcklbw %%xmm7, %%xmm4 \n\t"\ - QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ - QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ - QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ - QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ - QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ - QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ - QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ - QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ - if(h==16){\ - __asm__ volatile(\ - QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ - QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ - QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ - QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ - QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ - QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ - QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ - 
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ - }\ -}\ -static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ -}\ -static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ -} - -static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){ - int w = (size+8)>>3; - src -= 2*srcStride+2; - while(w--){ - __asm__ volatile( - "pxor %%xmm7, %%xmm7 \n\t" - "movq (%0), %%xmm0 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm1 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm2 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm3 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm4 \n\t" - "add %2, %0 \n\t" - "punpcklbw %%xmm7, %%xmm0 \n\t" - "punpcklbw %%xmm7, %%xmm1 \n\t" - "punpcklbw %%xmm7, %%xmm2 \n\t" - "punpcklbw %%xmm7, %%xmm3 \n\t" - "punpcklbw %%xmm7, %%xmm4 \n\t" - QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48) - QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48) - QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48) - QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48) - QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48) - QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48) - QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48) - QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48) - : "+a"(src) - : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) - : "memory" - ); - if(size==16){ - __asm__ volatile( - QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48) - QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48) - QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48) - QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48) - QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48) - QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48) - QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48) - QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48) - : "+a"(src) - : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) - : "memory" - ); - } - tmp += 8; - src += 8 - (size+5)*srcStride; - } -} - -#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\ -static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ - int h = size;\ - if(size == 16){\ - __asm__ volatile(\ - "1: \n\t"\ - "movdqa 32(%0), %%xmm4 \n\t"\ - "movdqa 16(%0), %%xmm5 \n\t"\ - "movdqa (%0), %%xmm7 \n\t"\ - "movdqa %%xmm4, %%xmm3 \n\t"\ - "movdqa %%xmm4, %%xmm2 \n\t"\ - "movdqa %%xmm4, %%xmm1 \n\t"\ - "movdqa %%xmm4, %%xmm0 \n\t"\ - "palignr $10, %%xmm5, %%xmm0 \n\t"\ - "palignr $8, %%xmm5, %%xmm1 \n\t"\ - "palignr $6, %%xmm5, %%xmm2 \n\t"\ - "palignr $4, %%xmm5, %%xmm3 \n\t"\ - "palignr $2, %%xmm5, %%xmm4 \n\t"\ - "paddw %%xmm5, %%xmm0 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "paddw 
%%xmm3, %%xmm2 \n\t"\ - "movdqa %%xmm5, %%xmm6 \n\t"\ - "movdqa %%xmm5, %%xmm4 \n\t"\ - "movdqa %%xmm5, %%xmm3 \n\t"\ - "palignr $8, %%xmm7, %%xmm4 \n\t"\ - "palignr $2, %%xmm7, %%xmm6 \n\t"\ - "palignr $10, %%xmm7, %%xmm3 \n\t"\ - "paddw %%xmm6, %%xmm4 \n\t"\ - "movdqa %%xmm5, %%xmm6 \n\t"\ - "palignr $6, %%xmm7, %%xmm5 \n\t"\ - "palignr $4, %%xmm7, %%xmm6 \n\t"\ - "paddw %%xmm7, %%xmm3 \n\t"\ - "paddw %%xmm6, %%xmm5 \n\t"\ - \ - "psubw %%xmm1, %%xmm0 \n\t"\ - "psubw %%xmm4, %%xmm3 \n\t"\ - "psraw $2, %%xmm0 \n\t"\ - "psraw $2, %%xmm3 \n\t"\ - "psubw %%xmm1, %%xmm0 \n\t"\ - "psubw %%xmm4, %%xmm3 \n\t"\ - "paddw %%xmm2, %%xmm0 \n\t"\ - "paddw %%xmm5, %%xmm3 \n\t"\ - "psraw $2, %%xmm0 \n\t"\ - "psraw $2, %%xmm3 \n\t"\ - "paddw %%xmm2, %%xmm0 \n\t"\ - "paddw %%xmm5, %%xmm3 \n\t"\ - "psraw $6, %%xmm0 \n\t"\ - "psraw $6, %%xmm3 \n\t"\ - "packuswb %%xmm0, %%xmm3 \n\t"\ - OP(%%xmm3, (%1), %%xmm7, dqa)\ - "add $48, %0 \n\t"\ - "add %3, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(tmp), "+c"(dst), "+g"(h)\ - : "S"((x86_reg)dstStride)\ - : "memory"\ - );\ - }else{\ - __asm__ volatile(\ - "1: \n\t"\ - "movdqa 16(%0), %%xmm1 \n\t"\ - "movdqa (%0), %%xmm0 \n\t"\ - "movdqa %%xmm1, %%xmm2 \n\t"\ - "movdqa %%xmm1, %%xmm3 \n\t"\ - "movdqa %%xmm1, %%xmm4 \n\t"\ - "movdqa %%xmm1, %%xmm5 \n\t"\ - "palignr $10, %%xmm0, %%xmm5 \n\t"\ - "palignr $8, %%xmm0, %%xmm4 \n\t"\ - "palignr $6, %%xmm0, %%xmm3 \n\t"\ - "palignr $4, %%xmm0, %%xmm2 \n\t"\ - "palignr $2, %%xmm0, %%xmm1 \n\t"\ - "paddw %%xmm5, %%xmm0 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "psubw %%xmm1, %%xmm0 \n\t"\ - "psraw $2, %%xmm0 \n\t"\ - "psubw %%xmm1, %%xmm0 \n\t"\ - "paddw %%xmm2, %%xmm0 \n\t"\ - "psraw $2, %%xmm0 \n\t"\ - "paddw %%xmm2, %%xmm0 \n\t"\ - "psraw $6, %%xmm0 \n\t"\ - "packuswb %%xmm0, %%xmm0 \n\t"\ - OP(%%xmm0, (%1), %%xmm7, q)\ - "add $48, %0 \n\t"\ - "add %3, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(tmp), "+c"(dst), "+g"(h)\ - : "S"((x86_reg)dstStride)\ - : "memory"\ - );\ - }\ -} - -#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ - put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\ - OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ -}\ -static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\ -}\ -static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\ -}\ - -#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2 -#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2 -#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2 -#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2 -#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2 -#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2 -#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2 -#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2 - -#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2 -#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2 -#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2 -#define avg_pixels16_l2_shift5_sse2 
avg_pixels16_l2_shift5_mmx2 -#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2 -#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2 -#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2 -#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2 - -#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2 -#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2 -#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2 -#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2 - -#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2 -#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2 -#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2 -#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2 - -#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2 -#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2 - -#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \ -H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\ -H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ -H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ -H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\ - -static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ - put_pixels16_sse2(dst, src, stride, 16); -} -static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ - avg_pixels16_sse2(dst, src, stride, 16); -} -#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2 -#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2 - -#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\ -}\ - -#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\ -}\ - -#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\ -}\ - -#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## 
SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\ - OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ - uint8_t * const halfHV= temp;\ - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ - uint8_t * const halfHV= temp;\ - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ - uint8_t * const halfHV= temp;\ - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ - uint8_t * const halfHV= temp;\ - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ -}\ - -#define H264_MC_4816(MMX)\ -H264_MC(put_, 4, MMX, 8)\ -H264_MC(put_, 8, MMX, 8)\ -H264_MC(put_, 16,MMX, 8)\ -H264_MC(avg_, 4, MMX, 8)\ -H264_MC(avg_, 8, MMX, 8)\ -H264_MC(avg_, 16,MMX, 8)\ - -#define 
H264_MC_816(QPEL, XMM)\ -QPEL(put_, 8, XMM, 16)\ -QPEL(put_, 16,XMM, 16)\ -QPEL(avg_, 8, XMM, 16)\ -QPEL(avg_, 16,XMM, 16)\ - - -#define AVG_3DNOW_OP(a,b,temp, size) \ -"mov" #size " " #b ", " #temp " \n\t"\ -"pavgusb " #temp ", " #a " \n\t"\ -"mov" #size " " #a ", " #b " \n\t" -#define AVG_MMX2_OP(a,b,temp, size) \ -"mov" #size " " #b ", " #temp " \n\t"\ -"pavgb " #temp ", " #a " \n\t"\ -"mov" #size " " #a ", " #b " \n\t" - -#define PAVGB "pavgusb" -QPEL_H264(put_, PUT_OP, 3dnow) -QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow) -#undef PAVGB -#define PAVGB "pavgb" -QPEL_H264(put_, PUT_OP, mmx2) -QPEL_H264(avg_, AVG_MMX2_OP, mmx2) -QPEL_H264_V_XMM(put_, PUT_OP, sse2) -QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2) -QPEL_H264_HV_XMM(put_, PUT_OP, sse2) -QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2) -#if HAVE_SSSE3 -QPEL_H264_H_XMM(put_, PUT_OP, ssse3) -QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3) -QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3) -QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3) -QPEL_H264_HV_XMM(put_, PUT_OP, ssse3) -QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3) -#endif -#undef PAVGB - -H264_MC_4816(3dnow) -H264_MC_4816(mmx2) -H264_MC_816(H264_MC_V, sse2) -H264_MC_816(H264_MC_HV, sse2) -#if HAVE_SSSE3 -H264_MC_816(H264_MC_H, ssse3) -H264_MC_816(H264_MC_HV, ssse3) -#endif - -/* rnd interleaved with rnd div 8, use p+1 to access rnd div 8 */ -DECLARE_ALIGNED(8, static const uint64_t, h264_rnd_reg)[4] = { - 0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL -}; - -#define H264_CHROMA_OP(S,D) -#define H264_CHROMA_OP4(S,D,T) -#define H264_CHROMA_MC8_TMPL put_h264_chroma_generic_mc8_mmx -#define H264_CHROMA_MC4_TMPL put_h264_chroma_generic_mc4_mmx -#define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2 -#define H264_CHROMA_MC8_MV0 put_pixels8_mmx -#include "dsputil_h264_template_mmx.c" - -static void put_h264_chroma_mc8_mmx_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg); -} -static void put_vc1_chroma_mc8_mmx_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg+2); -} -static void put_h264_chroma_mc4_mmx(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - put_h264_chroma_generic_mc4_mmx(dst, src, stride, h, x, y, h264_rnd_reg); -} - -#undef H264_CHROMA_OP -#undef H264_CHROMA_OP4 -#undef H264_CHROMA_MC8_TMPL -#undef H264_CHROMA_MC4_TMPL -#undef H264_CHROMA_MC2_TMPL -#undef H264_CHROMA_MC8_MV0 +#define LF_FUNC(DIR, TYPE, OPT) \ +void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \ + int alpha, int beta, int8_t *tc0); +#define LF_IFUNC(DIR, TYPE, OPT) \ +void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \ + int alpha, int beta); -#define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t" -#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\ - "pavgb " #T ", " #D " \n\t" -#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_mmx2 -#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_mmx2 -#define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2 -#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2 -#include "dsputil_h264_template_mmx.c" -static void avg_h264_chroma_mc8_mmx2_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - avg_h264_chroma_generic_mc8_mmx2(dst, src, stride, h, x, y, h264_rnd_reg); -} -static void 
avg_vc1_chroma_mc8_mmx2_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - avg_h264_chroma_generic_mc8_mmx2(dst, src, stride, h, x, y, h264_rnd_reg+2); -} -static void avg_h264_chroma_mc4_mmx2(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - avg_h264_chroma_generic_mc4_mmx2(dst, src, stride, h, x, y, h264_rnd_reg); -} -#undef H264_CHROMA_OP -#undef H264_CHROMA_OP4 -#undef H264_CHROMA_MC8_TMPL -#undef H264_CHROMA_MC4_TMPL -#undef H264_CHROMA_MC2_TMPL -#undef H264_CHROMA_MC8_MV0 +LF_FUNC (h, chroma, mmxext) +LF_IFUNC(h, chroma_intra, mmxext) +LF_FUNC (v, chroma, mmxext) +LF_IFUNC(v, chroma_intra, mmxext) -#define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t" -#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\ - "pavgusb " #T ", " #D " \n\t" -#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_3dnow -#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_3dnow -#define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow -#include "dsputil_h264_template_mmx.c" -static void avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - avg_h264_chroma_generic_mc8_3dnow(dst, src, stride, h, x, y, h264_rnd_reg); -} -static void avg_h264_chroma_mc4_3dnow(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) +LF_FUNC (h, luma, mmxext) +LF_IFUNC(h, luma_intra, mmxext) +#if HAVE_YASM && ARCH_X86_32 +LF_FUNC (v8, luma, mmxext) +static void ff_x264_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { - avg_h264_chroma_generic_mc4_3dnow(dst, src, stride, h, x, y, h264_rnd_reg); -} -#undef H264_CHROMA_OP -#undef H264_CHROMA_OP4 -#undef H264_CHROMA_MC8_TMPL -#undef H264_CHROMA_MC4_TMPL -#undef H264_CHROMA_MC8_MV0 - -#if HAVE_SSSE3 -#define AVG_OP(X) -#undef H264_CHROMA_MC8_TMPL -#undef H264_CHROMA_MC4_TMPL -#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3 -#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3 -#define H264_CHROMA_MC8_MV0 put_pixels8_mmx -#include "dsputil_h264_template_ssse3.c" -static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1); -} -static void put_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 0); -} - -#undef AVG_OP -#undef H264_CHROMA_MC8_TMPL -#undef H264_CHROMA_MC4_TMPL -#undef H264_CHROMA_MC8_MV0 -#define AVG_OP(X) X -#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3 -#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3 -#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2 -#include "dsputil_h264_template_ssse3.c" -static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1); + if((tc0[0] & tc0[1]) >= 0) + ff_x264_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0); + if((tc0[2] & tc0[3]) >= 0) + ff_x264_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2); } -static void avg_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) +LF_IFUNC(v8, luma_intra, mmxext) +static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta) { - avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 0); + 
ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta); + ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta); } -#undef AVG_OP -#undef H264_CHROMA_MC8_TMPL -#undef H264_CHROMA_MC4_TMPL -#undef H264_CHROMA_MC8_MV0 #endif +LF_FUNC (h, luma, sse2) +LF_IFUNC(h, luma_intra, sse2) +LF_FUNC (v, luma, sse2) +LF_IFUNC(v, luma_intra, sse2) + /***********************************/ /* weighted prediction */ -static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h) +#define H264_WEIGHT(W, H, OPT) \ +void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \ + int stride, int log2_denom, int weight, int offset); + +#define H264_BIWEIGHT(W, H, OPT) \ +void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \ + uint8_t *src, int stride, int log2_denom, int weightd, \ + int weights, int offset); + +#define H264_BIWEIGHT_MMX(W,H) \ +H264_WEIGHT (W, H, mmx2) \ +H264_BIWEIGHT(W, H, mmx2) + +#define H264_BIWEIGHT_MMX_SSE(W,H) \ +H264_BIWEIGHT_MMX(W, H) \ +H264_WEIGHT (W, H, sse2) \ +H264_BIWEIGHT (W, H, sse2) \ +H264_BIWEIGHT (W, H, ssse3) + +H264_BIWEIGHT_MMX_SSE(16, 16) +H264_BIWEIGHT_MMX_SSE(16, 8) +H264_BIWEIGHT_MMX_SSE( 8, 16) +H264_BIWEIGHT_MMX_SSE( 8, 8) +H264_BIWEIGHT_MMX_SSE( 8, 4) +H264_BIWEIGHT_MMX ( 4, 8) +H264_BIWEIGHT_MMX ( 4, 4) +H264_BIWEIGHT_MMX ( 4, 2) + +void ff_h264dsp_init_x86(H264DSPContext *c) { - int x, y; - offset <<= log2_denom; - offset += (1 << log2_denom) >> 1; - __asm__ volatile( - "movd %0, %%mm4 \n\t" - "movd %1, %%mm5 \n\t" - "movd %2, %%mm6 \n\t" - "pshufw $0, %%mm4, %%mm4 \n\t" - "pshufw $0, %%mm5, %%mm5 \n\t" - "pxor %%mm7, %%mm7 \n\t" - :: "g"(weight), "g"(offset), "g"(log2_denom) - ); - for(y=0; yh264_loop_filter_strength= h264_loop_filter_strength_mmx2; } -} - -#define H264_WEIGHT(W,H) \ -static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \ - ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \ -} \ -static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \ - ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \ -} - -H264_WEIGHT(16,16) -H264_WEIGHT(16, 8) -H264_WEIGHT( 8,16) -H264_WEIGHT( 8, 8) -H264_WEIGHT( 8, 4) -H264_WEIGHT( 4, 8) -H264_WEIGHT( 4, 4) -H264_WEIGHT( 4, 2) - -void ff_h264_biweight_8x8_sse2(uint8_t *dst, uint8_t *src, int stride, - int log2_denom, int weightd, int weights, - int offset); - -void ff_h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride, - int log2_denom, int weightd, int weights, - int offset); - -void ff_h264_biweight_8x8_ssse3(uint8_t *dst, uint8_t *src, int stride, - int log2_denom, int weightd, int weights, - int offset); - -void ff_h264_biweight_16x16_ssse3(uint8_t *dst, uint8_t *src, int stride, - int log2_denom, int weightd, int weights, - int offset); - -void ff_pred16x16_vertical_mmx (uint8_t *src, int stride); -void ff_pred16x16_vertical_sse (uint8_t *src, int stride); -void ff_pred16x16_horizontal_mmx (uint8_t *src, int stride); -void ff_pred16x16_horizontal_mmxext(uint8_t *src, int stride); -void ff_pred16x16_horizontal_ssse3 (uint8_t *src, int stride); -void ff_pred16x16_dc_mmxext (uint8_t *src, int stride); -void ff_pred16x16_dc_sse2 (uint8_t *src, int stride); -void ff_pred16x16_dc_ssse3 (uint8_t *src, int stride); -void ff_pred16x16_tm_vp8_mmx (uint8_t *src, int stride); -void 
ff_pred16x16_tm_vp8_mmxext (uint8_t *src, int stride); -void ff_pred16x16_tm_vp8_sse2 (uint8_t *src, int stride); -void ff_pred8x8_dc_rv40_mmxext (uint8_t *src, int stride); -void ff_pred8x8_vertical_mmx (uint8_t *src, int stride); -void ff_pred8x8_horizontal_mmx (uint8_t *src, int stride); -void ff_pred8x8_horizontal_mmxext (uint8_t *src, int stride); -void ff_pred8x8_horizontal_ssse3 (uint8_t *src, int stride); -void ff_pred8x8_tm_vp8_mmx (uint8_t *src, int stride); -void ff_pred8x8_tm_vp8_mmxext (uint8_t *src, int stride); -void ff_pred8x8_tm_vp8_sse2 (uint8_t *src, int stride); -void ff_pred8x8_tm_vp8_ssse3 (uint8_t *src, int stride); -void ff_pred4x4_dc_mmxext (uint8_t *src, const uint8_t *topright, int stride); -void ff_pred4x4_tm_vp8_mmx (uint8_t *src, const uint8_t *topright, int stride); -void ff_pred4x4_tm_vp8_mmxext (uint8_t *src, const uint8_t *topright, int stride); -void ff_pred4x4_tm_vp8_ssse3 (uint8_t *src, const uint8_t *topright, int stride); -void ff_pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride); - -#if CONFIG_H264PRED -void ff_h264_pred_init_x86(H264PredContext *h, int codec_id) -{ - int mm_flags = mm_support(); - #if HAVE_YASM - if (mm_flags & FF_MM_MMX) { - h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_mmx; - h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_mmx; - h->pred8x8 [VERT_PRED8x8] = ff_pred8x8_vertical_mmx; - h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_mmx; - if (codec_id == CODEC_ID_VP8) { - h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_tm_vp8_mmx; - h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_mmx; - h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_mmx; - } - } - - if (mm_flags & FF_MM_MMX2) { - h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_mmxext; - h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_mmxext; - h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_mmxext; - h->pred4x4 [DC_PRED ] = ff_pred4x4_dc_mmxext; - if (codec_id == CODEC_ID_VP8) { - h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_tm_vp8_mmxext; - h->pred8x8 [DC_PRED8x8 ] = ff_pred8x8_dc_rv40_mmxext; - h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_mmxext; - h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_mmxext; - h->pred4x4 [VERT_PRED ] = ff_pred4x4_vertical_vp8_mmxext; - } - } - - if (mm_flags & FF_MM_SSE) { - h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_sse; - } - - if (mm_flags & FF_MM_SSE2) { - h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_sse2; - if (codec_id == CODEC_ID_VP8) { - h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_tm_vp8_sse2; - h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_sse2; - } - } - - if (mm_flags & FF_MM_SSSE3) { - h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_ssse3; - h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_ssse3; - h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_ssse3; - if (codec_id == CODEC_ID_VP8) { - h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_ssse3; - h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_ssse3; + if (mm_flags & AV_CPU_FLAG_MMX) { + c->h264_idct_dc_add= + c->h264_idct_add= ff_h264_idct_add_mmx; + c->h264_idct8_dc_add= + c->h264_idct8_add= ff_h264_idct8_add_mmx; + + c->h264_idct_add16 = ff_h264_idct_add16_mmx; + c->h264_idct8_add4 = ff_h264_idct8_add4_mmx; + c->h264_idct_add8 = ff_h264_idct_add8_mmx; + c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx; + + if (mm_flags & AV_CPU_FLAG_MMX2) { + c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; + c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; + c->h264_idct_add16 = ff_h264_idct_add16_mmx2; + c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2; + 
c->h264_idct_add8 = ff_h264_idct_add8_mmx2; + c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; + + c->h264_v_loop_filter_chroma= ff_x264_deblock_v_chroma_mmxext; + c->h264_h_loop_filter_chroma= ff_x264_deblock_h_chroma_mmxext; + c->h264_v_loop_filter_chroma_intra= ff_x264_deblock_v_chroma_intra_mmxext; + c->h264_h_loop_filter_chroma_intra= ff_x264_deblock_h_chroma_intra_mmxext; +#if ARCH_X86_32 + c->h264_v_loop_filter_luma= ff_x264_deblock_v_luma_mmxext; + c->h264_h_loop_filter_luma= ff_x264_deblock_h_luma_mmxext; + c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext; + c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext; +#endif + c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; + c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; + c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; + c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; + c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; + c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; + c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; + c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; + + c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; + c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; + c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; + c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; + c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; + c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; + c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; + c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; + + if (mm_flags&AV_CPU_FLAG_SSE2) { + c->h264_idct8_add = ff_h264_idct8_add_sse2; + c->h264_idct8_add4= ff_h264_idct8_add4_sse2; + + c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2; + c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2; + c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2; + c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2; + c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2; + + c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2; + c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2; + c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2; + c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2; + c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2; + +#if ARCH_X86_64 || !defined(__ICC) || __ICC > 1110 + c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2; + c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2; + c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2; + c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2; +#endif + c->h264_idct_add16 = ff_h264_idct_add16_sse2; + c->h264_idct_add8 = ff_h264_idct_add8_sse2; + c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2; + } + if (mm_flags&AV_CPU_FLAG_SSSE3) { + c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3; + c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3; + c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3; + c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3; + c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3; + } } } #endif } -#endif diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/idct_sse2_xvid.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/idct_sse2_xvid.c index fc670e25d..d8a534240 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/idct_sse2_xvid.c +++ 
b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/idct_sse2_xvid.c @@ -385,11 +385,11 @@ inline void ff_idct_xvid_sse2(short *block) void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block) { ff_idct_xvid_sse2(block); - put_pixels_clamped_mmx(block, dest, line_size); + ff_put_pixels_clamped_mmx(block, dest, line_size); } void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block) { ff_idct_xvid_sse2(block); - add_pixels_clamped_mmx(block, dest, line_size); + ff_add_pixels_clamped_mmx(block, dest, line_size); } diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/mpegvideo_mmx.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/mpegvideo_mmx.c index 75ec4b2cf..f3d0eb336 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/mpegvideo_mmx.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/mpegvideo_mmx.c @@ -22,6 +22,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "libavutil/cpu.h" #include "libavutil/x86_cpu.h" #include "libavcodec/avcodec.h" #include "libavcodec/dsputil.h" @@ -625,9 +626,9 @@ static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){ void MPV_common_init_mmx(MpegEncContext *s) { - int mm_flags = mm_support(); + int mm_flags = av_get_cpu_flags(); - if (mm_flags & FF_MM_MMX) { + if (mm_flags & AV_CPU_FLAG_MMX) { const int dct_algo = s->avctx->dct_algo; s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; @@ -638,7 +639,7 @@ void MPV_common_init_mmx(MpegEncContext *s) s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; - if (mm_flags & FF_MM_SSE2) { + if (mm_flags & AV_CPU_FLAG_SSE2) { s->denoise_dct= denoise_dct_sse2; } else { s->denoise_dct= denoise_dct_mmx; @@ -646,13 +647,13 @@ void MPV_common_init_mmx(MpegEncContext *s) if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ #if HAVE_SSSE3 - if(mm_flags & FF_MM_SSSE3){ + if(mm_flags & AV_CPU_FLAG_SSSE3){ s->dct_quantize= dct_quantize_SSSE3; } else #endif - if(mm_flags & FF_MM_SSE2){ + if(mm_flags & AV_CPU_FLAG_SSE2){ s->dct_quantize= dct_quantize_SSE2; - } else if(mm_flags & FF_MM_MMX2){ + } else if(mm_flags & AV_CPU_FLAG_MMX2){ s->dct_quantize= dct_quantize_MMX2; } else { s->dct_quantize= dct_quantize_MMX; diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/simple_idct_mmx.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/simple_idct_mmx.c index e32b8f0b4..8ad0d3192 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/simple_idct_mmx.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/simple_idct_mmx.c @@ -1287,10 +1287,10 @@ void ff_simple_idct_mmx(int16_t *block) void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block) { idct(block); - put_pixels_clamped_mmx(block, dest, line_size); + ff_put_pixels_clamped_mmx(block, dest, line_size); } void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block) { idct(block); - add_pixels_clamped_mmx(block, dest, line_size); + ff_add_pixels_clamped_mmx(block, dest, line_size); } diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_mmx.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_mmx.c index eb3ad2c32..8889bb36e 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_mmx.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_mmx.c @@ -24,6 +24,7 @@ * OTHER DEALINGS IN THE SOFTWARE. 
*/ +#include "libavutil/cpu.h" #include "libavutil/x86_cpu.h" #include "libavcodec/dsputil.h" #include "dsputil_mmx.h" @@ -714,7 +715,7 @@ static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq) #endif void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) { - int mm_flags = mm_support(); + int mm_flags = av_get_cpu_flags(); dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx; dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx; @@ -736,7 +737,7 @@ void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) { dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx; dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx; - if (mm_flags & FF_MM_MMX2){ + if (mm_flags & AV_CPU_FLAG_MMX2){ dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_vc1_mspel_mc00_mmx2; dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmx2; dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmx2; @@ -772,23 +773,23 @@ void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) { dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_ ## EXT #if HAVE_YASM - if (mm_flags & FF_MM_MMX) { + if (mm_flags & AV_CPU_FLAG_MMX) { ASSIGN_LF(mmx); } return; - if (mm_flags & FF_MM_MMX2) { + if (mm_flags & AV_CPU_FLAG_MMX2) { ASSIGN_LF(mmx2); } - if (mm_flags & FF_MM_SSE2) { + if (mm_flags & AV_CPU_FLAG_SSE2) { dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2; dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse2; dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2; dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2; } - if (mm_flags & FF_MM_SSSE3) { + if (mm_flags & AV_CPU_FLAG_SSSE3) { ASSIGN_LF(ssse3); } - if (mm_flags & FF_MM_SSE4) { + if (mm_flags & AV_CPU_FLAG_SSE4) { dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse4; dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse4; } diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_yasm.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_yasm.asm index 660ff1169..3ea9d8db4 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_yasm.asm +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_yasm.asm @@ -36,7 +36,7 @@ section .text %endmacro %macro STORE_4_WORDS_MMX 6 - movd %6, %5 + movd %6d, %5 %if mmsize==16 psrldq %5, 4 %else @@ -45,7 +45,7 @@ section .text mov %1, %6w shr %6, 16 mov %2, %6w - movd %6, %5 + movd %6d, %5 mov %3, %6w shr %6, 16 mov %4, %6w @@ -88,7 +88,7 @@ section .text pxor m7, m3 ; d_sign ^= a0_sign pxor m5, m5 - movd m3, r2 + movd m3, r2d %if %1 > 4 punpcklbw m3, m3 %endif diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp.asm new file mode 100644 index 000000000..f2b0af326 --- /dev/null +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp.asm @@ -0,0 +1,618 @@ +;****************************************************************************** +;* MMX/SSE2-optimized functions for the VP3 decoder +;* Copyright (c) 2007 Aurelien Jacobs +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+; MMX-optimized functions cribbed from the original VP3 source code.
+
+SECTION_RODATA
+
+vp3_idct_data: times 8 dw 64277
+ times 8 dw 60547
+ times 8 dw 54491
+ times 8 dw 46341
+ times 8 dw 36410
+ times 8 dw 25080
+ times 8 dw 12785
+
+cextern pb_1
+cextern pb_3
+cextern pb_7
+cextern pb_1F
+cextern pb_81
+
+cextern pw_8
+
+cextern put_signed_pixels_clamped_mmx
+cextern add_pixels_clamped_mmx
+
+SECTION .text
+
+; this is off by one or two for some cases when filter_limit is greater than 63
+; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
+; out: p1 in mm4, p2 in mm3
+%macro VP3_LOOP_FILTER 0
+ movq m7, m6
+ pand m6, [pb_7] ; p0&7
+ psrlw m7, 3
+ pand m7, [pb_1F] ; p0>>3
+ movq m3, m2 ; p2
+ pxor m2, m4
+ pand m2, [pb_1] ; (p2^p1)&1
+ movq m5, m2
+ paddb m2, m2
+ paddb m2, m5 ; 3*(p2^p1)&1
+ paddb m2, m6 ; extra bits lost in shifts
+ pcmpeqb m0, m0
+ pxor m1, m0 ; 255 - p3
+ pavgb m1, m2 ; (256 - p3 + extrabits) >> 1
+ pxor m0, m4 ; 255 - p1
+ pavgb m0, m3 ; (256 + p2-p1) >> 1
+ paddb m1, [pb_3]
+ pavgb m1, m0 ; 128+2+( p2-p1 - p3) >> 2
+ pavgb m1, m0 ; 128+1+(3*(p2-p1) - p3) >> 3
+ paddusb m7, m1 ; d+128+1
+ movq m6, [pb_81]
+ psubusb m6, m7
+ psubusb m7, [pb_81]
+
+ movq m5, [r2+516] ; flim
+ pminub m6, m5
+ pminub m7, m5
+ movq m0, m6
+ movq m1, m7
+ paddb m6, m6
+ paddb m7, m7
+ pminub m6, m5
+ pminub m7, m5
+ psubb m6, m0
+ psubb m7, m1
+ paddusb m4, m7
+ psubusb m4, m6
+ psubusb m3, m7
+ paddusb m3, m6
+%endmacro
+
+%macro STORE_4_WORDS 1
+ movd r2d, %1
+ mov [r0 -1], r2w
+ psrlq %1, 32
+ shr r2, 16
+ mov [r0+r1 -1], r2w
+ movd r2d, %1
+ mov [r0+r1*2-1], r2w
+ shr r2, 16
+ mov [r0+r3 -1], r2w
+%endmacro
+
+INIT_MMX
+cglobal vp3_v_loop_filter_mmx2, 3, 4
+%ifdef ARCH_X86_64
+ movsxd r1, r1d
+%endif
+ mov r3, r1
+ neg r1
+ movq m6, [r0+r1*2]
+ movq m4, [r0+r1 ]
+ movq m2, [r0 ]
+ movq m1, [r0+r3 ]
+
+ VP3_LOOP_FILTER
+
+ movq [r0+r1], m4
+ movq [r0 ], m3
+ RET
+
+cglobal vp3_h_loop_filter_mmx2, 3, 4
+%ifdef ARCH_X86_64
+ movsxd r1, r1d
+%endif
+ lea r3, [r1*3]
+
+ movd m6, [r0 -2]
+ movd m4, [r0+r1 -2]
+ movd m2, [r0+r1*2-2]
+ movd m1, [r0+r3 -2]
+ lea r0, [r0+r1*4 ]
+ punpcklbw m6, [r0 -2]
+ punpcklbw m4, [r0+r1 -2]
+ punpcklbw m2, [r0+r1*2-2]
+ punpcklbw m1, [r0+r3 -2]
+ sub r0, r3
+ sub r0, r1
+
+ TRANSPOSE4x4B 6, 4, 2, 1, 0
+ VP3_LOOP_FILTER
+ SBUTTERFLY bw, 4, 3, 5
+
+ STORE_4_WORDS m4
+ lea r0, [r0+r1*4 ]
+ STORE_4_WORDS m3
+ RET
+
+; from original comments: The Macro does IDct on 4 1-D Dcts
+%macro BeginIDCT 0
+ movq m2, I(3)
+ movq m6, C(3)
+ movq m4, m2
+ movq m7, J(5)
+ pmulhw m4, m6 ; r4 = c3*i3 - i3
+ movq m1, C(5)
+ pmulhw m6, m7 ; r6 = c3*i5 - i5
+ movq m5, m1
+ pmulhw m1, m2 ; r1 = c5*i3 - i3
+ movq m3, I(1)
+ pmulhw m5, m7 ; r5 = c5*i5 - i5
+ movq m0, C(1)
+ paddw m4, m2 ; r4 = c3*i3
+ paddw m6, m7 ; r6 = c3*i5
+ paddw m2, m1 ; r2 = c5*i3
+ movq m1, J(7)
+ paddw m7, m5 ; r7 = c5*i5
+ movq m5, m0 ; r5 = c1
+ pmulhw m0, m3 ; r0 = c1*i1 - i1
+ paddsw m4, m7 ; r4 = C = c3*i3 + c5*i5
+ pmulhw m5, m1 ; r5 = c1*i7 - i7
+ movq m7, C(7)
+ psubsw m6, m2 ; r6 = D = c3*i5 - c5*i3
+ paddw m0, m3 ; r0 = c1*i1
+ pmulhw m3, m7 ; r3 = c7*i1
+ movq m2, I(2)
+ pmulhw m7, m1 ; r7 = c7*i7
+ paddw m5, m1 ; r5 = c1*i7
+ movq m1, m2 ; r1 = i2
+ pmulhw m2, C(2) ; r2 = c2*i2 - i2
+ psubsw m3, m5 ; r3 = B = c7*i1 - c1*i7
+ movq m5, J(6)
+ paddsw m0, m7 ; r0 = A = c1*i1 + c7*i7
+ movq m7, m5 ; r7 = i6
+ psubsw m0, m4 ; r0 = A - C
+ pmulhw m5, C(2) ; r5 = c2*i6 - i6
+ paddw m2, m1 ; r2 = c2*i2
+ pmulhw m1, C(6) ; r1 = c6*i2
+ paddsw m4, m4 ; r4 = C + C
+ paddsw m4, m0 ; r4 = C. = A + C
+ psubsw m3, m6 ; r3 = B - D
+ paddw m5, m7 ; r5 = c2*i6
+ paddsw m6, m6 ; r6 = D + D
+ pmulhw m7, C(6) ; r7 = c6*i6
+ paddsw m6, m3 ; r6 = D. = B + D
+ movq I(1), m4 ; save C. at I(1)
+ psubsw m1, m5 ; r1 = H = c6*i2 - c2*i6
+ movq m4, C(4)
+ movq m5, m3 ; r5 = B - D
+ pmulhw m3, m4 ; r3 = (c4 - 1) * (B - D)
+ paddsw m7, m2 ; r7 = G = c2*i2 + c6*i6
+ movq I(2), m6 ; save D. at I(2)
+ movq m2, m0 ; r2 = A - C
+ movq m6, I(0)
+ pmulhw m0, m4 ; r0 = (c4 - 1) * (A - C)
+ paddw m5, m3 ; r5 = B. = c4 * (B - D)
+ movq m3, J(4)
+ psubsw m5, m1 ; r5 = B.. = B. - H
+ paddw m2, m0 ; r2 = A. = c4 * (A - C)
+ psubsw m6, m3 ; r6 = i0 - i4
+ movq m0, m6
+ pmulhw m6, m4 ; r6 = (c4 - 1) * (i0 - i4)
+ paddsw m3, m3 ; r3 = i4 + i4
+ paddsw m1, m1 ; r1 = H + H
+ paddsw m3, m0 ; r3 = i0 + i4
+ paddsw m1, m5 ; r1 = H. = B. + H
+ pmulhw m4, m3 ; r4 = (c4 - 1) * (i0 + i4)
+ paddsw m6, m0 ; r6 = F = c4 * (i0 - i4)
+ psubsw m6, m2 ; r6 = F. = F - A.
+ paddsw m2, m2 ; r2 = A. + A.
+ movq m0, I(1) ; r0 = C.
+ paddsw m2, m6 ; r2 = A.. = F + A.
+ paddw m4, m3 ; r4 = E = c4 * (i0 + i4)
+ psubsw m2, m1 ; r2 = R2 = A.. - H.
+%endmacro
+
+; RowIDCT gets ready to transpose
+%macro RowIDCT 0
+ BeginIDCT
+ movq m3, I(2) ; r3 = D.
+ psubsw m4, m7 ; r4 = E. = E - G
+ paddsw m1, m1 ; r1 = H. + H.
+ paddsw m7, m7 ; r7 = G + G
+ paddsw m1, m2 ; r1 = R1 = A.. + H.
+ paddsw m7, m4 ; r7 = G. = E + G
+ psubsw m4, m3 ; r4 = R4 = E. - D.
+ paddsw m3, m3
+ psubsw m6, m5 ; r6 = R6 = F. - B..
+ paddsw m5, m5
+ paddsw m3, m4 ; r3 = R3 = E. + D.
+ paddsw m5, m6 ; r5 = R5 = F. + B..
+ psubsw m7, m0 ; r7 = R7 = G. - C.
+ paddsw m0, m0
+ movq I(1), m1 ; save R1
+ paddsw m0, m7 ; r0 = R0 = G. + C.
+%endmacro
+
+; Column IDCT normalizes and stores final results
+%macro ColumnIDCT 0
+ BeginIDCT
+ paddsw m2, OC_8 ; adjust R2 (and R1) for shift
+ paddsw m1, m1 ; r1 = H. + H.
+ paddsw m1, m2 ; r1 = R1 = A.. + H.
+ psraw m2, 4 ; r2 = NR2
+ psubsw m4, m7 ; r4 = E. = E - G
+ psraw m1, 4 ; r1 = NR1
+ movq m3, I(2) ; r3 = D.
+ paddsw m7, m7 ; r7 = G + G
+ movq I(2), m2 ; store NR2 at I2
+ paddsw m7, m4 ; r7 = G. = E + G
+ movq I(1), m1 ; store NR1 at I1
+ psubsw m4, m3 ; r4 = R4 = E. - D.
+ paddsw m4, OC_8 ; adjust R4 (and R3) for shift
+ paddsw m3, m3 ; r3 = D. + D.
+ paddsw m3, m4 ; r3 = R3 = E. + D.
+ psraw m4, 4 ; r4 = NR4
+ psubsw m6, m5 ; r6 = R6 = F. - B..
+ psraw m3, 4 ; r3 = NR3
+ paddsw m6, OC_8 ; adjust R6 (and R5) for shift
+ paddsw m5, m5 ; r5 = B.. + B..
+ paddsw m5, m6 ; r5 = R5 = F. + B..
+ psraw m6, 4 ; r6 = NR6
+ movq J(4), m4 ; store NR4 at J4
+ psraw m5, 4 ; r5 = NR5
+ movq I(3), m3 ; store NR3 at I3
+ psubsw m7, m0 ; r7 = R7 = G. - C.
+ paddsw m7, OC_8 ; adjust R7 (and R0) for shift
+ paddsw m0, m0 ; r0 = C. + C.
+ paddsw m0, m7 ; r0 = R0 = G. + C.
+ psraw m7, 4 ; r7 = NR7
+ movq J(6), m6 ; store NR6 at J6
+ psraw m0, 4 ; r0 = NR0
+ movq J(5), m5 ; store NR5 at J5
+ movq J(7), m7 ; store NR7 at J7
+ movq I(0), m0 ; store NR0 at I0
+%endmacro
+
+; Following macro does two 4x4 transposes in place.
+; +; At entry (we assume): +; +; r0 = a3 a2 a1 a0 +; I(1) = b3 b2 b1 b0 +; r2 = c3 c2 c1 c0 +; r3 = d3 d2 d1 d0 +; +; r4 = e3 e2 e1 e0 +; r5 = f3 f2 f1 f0 +; r6 = g3 g2 g1 g0 +; r7 = h3 h2 h1 h0 +; +; At exit, we have: +; +; I(0) = d0 c0 b0 a0 +; I(1) = d1 c1 b1 a1 +; I(2) = d2 c2 b2 a2 +; I(3) = d3 c3 b3 a3 +; +; J(4) = h0 g0 f0 e0 +; J(5) = h1 g1 f1 e1 +; J(6) = h2 g2 f2 e2 +; J(7) = h3 g3 f3 e3 +; +; I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. +; J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. +; +; Since r1 is free at entry, we calculate the Js first. +%macro Transpose 0 + movq m1, m4 ; r1 = e3 e2 e1 e0 + punpcklwd m4, m5 ; r4 = f1 e1 f0 e0 + movq I(0), m0 ; save a3 a2 a1 a0 + punpckhwd m1, m5 ; r1 = f3 e3 f2 e2 + movq m0, m6 ; r0 = g3 g2 g1 g0 + punpcklwd m6, m7 ; r6 = h1 g1 h0 g0 + movq m5, m4 ; r5 = f1 e1 f0 e0 + punpckldq m4, m6 ; r4 = h0 g0 f0 e0 = R4 + punpckhdq m5, m6 ; r5 = h1 g1 f1 e1 = R5 + movq m6, m1 ; r6 = f3 e3 f2 e2 + movq J(4), m4 + punpckhwd m0, m7 ; r0 = h3 g3 h2 g2 + movq J(5), m5 + punpckhdq m6, m0 ; r6 = h3 g3 f3 e3 = R7 + movq m4, I(0) ; r4 = a3 a2 a1 a0 + punpckldq m1, m0 ; r1 = h2 g2 f2 e2 = R6 + movq m5, I(1) ; r5 = b3 b2 b1 b0 + movq m0, m4 ; r0 = a3 a2 a1 a0 + movq J(7), m6 + punpcklwd m0, m5 ; r0 = b1 a1 b0 a0 + movq J(6), m1 + punpckhwd m4, m5 ; r4 = b3 a3 b2 a2 + movq m5, m2 ; r5 = c3 c2 c1 c0 + punpcklwd m2, m3 ; r2 = d1 c1 d0 c0 + movq m1, m0 ; r1 = b1 a1 b0 a0 + punpckldq m0, m2 ; r0 = d0 c0 b0 a0 = R0 + punpckhdq m1, m2 ; r1 = d1 c1 b1 a1 = R1 + movq m2, m4 ; r2 = b3 a3 b2 a2 + movq I(0), m0 + punpckhwd m5, m3 ; r5 = d3 c3 d2 c2 + movq I(1), m1 + punpckhdq m4, m5 ; r4 = d3 c3 b3 a3 = R3 + punpckldq m2, m5 ; r2 = d2 c2 b2 a2 = R2 + movq I(3), m4 + movq I(2), m2 +%endmacro + +%macro VP3_IDCT_mmx 1 + ; eax = quantized input + ; ebx = dequantizer matrix + ; ecx = IDCT constants + ; M(I) = ecx + MaskOffset(0) + I * 8 + ; C(I) = ecx + CosineOffset(32) + (I-1) * 8 + ; edx = output + ; r0..r7 = mm0..mm7 +%define OC_8 [pw_8] +%define C(x) [vp3_idct_data+16*(x-1)] + + ; at this point, function has completed dequantization + dezigzag + + ; partial transposition; now do the idct itself +%define I(x) [%1+16* x ] +%define J(x) [%1+16*(x-4)+8] + RowIDCT + Transpose + +%define I(x) [%1+16* x +64] +%define J(x) [%1+16*(x-4)+72] + RowIDCT + Transpose + +%define I(x) [%1+16*x] +%define J(x) [%1+16*x] + ColumnIDCT + +%define I(x) [%1+16*x+8] +%define J(x) [%1+16*x+8] + ColumnIDCT +%endmacro + +%macro VP3_1D_IDCT_SSE2 0 + movdqa m2, I(3) ; xmm2 = i3 + movdqa m6, C(3) ; xmm6 = c3 + movdqa m4, m2 ; xmm4 = i3 + movdqa m7, I(5) ; xmm7 = i5 + pmulhw m4, m6 ; xmm4 = c3 * i3 - i3 + movdqa m1, C(5) ; xmm1 = c5 + pmulhw m6, m7 ; xmm6 = c3 * i5 - i5 + movdqa m5, m1 ; xmm5 = c5 + pmulhw m1, m2 ; xmm1 = c5 * i3 - i3 + movdqa m3, I(1) ; xmm3 = i1 + pmulhw m5, m7 ; xmm5 = c5 * i5 - i5 + movdqa m0, C(1) ; xmm0 = c1 + paddw m4, m2 ; xmm4 = c3 * i3 + paddw m6, m7 ; xmm6 = c3 * i5 + paddw m2, m1 ; xmm2 = c5 * i3 + movdqa m1, I(7) ; xmm1 = i7 + paddw m7, m5 ; xmm7 = c5 * i5 + movdqa m5, m0 ; xmm5 = c1 + pmulhw m0, m3 ; xmm0 = c1 * i1 - i1 + paddsw m4, m7 ; xmm4 = c3 * i3 + c5 * i5 = C + pmulhw m5, m1 ; xmm5 = c1 * i7 - i7 + movdqa m7, C(7) ; xmm7 = c7 + psubsw m6, m2 ; xmm6 = c3 * i5 - c5 * i3 = D + paddw m0, m3 ; xmm0 = c1 * i1 + pmulhw m3, m7 ; xmm3 = c7 * i1 + movdqa m2, I(2) ; xmm2 = i2 + pmulhw m7, m1 ; xmm7 = c7 * i7 + paddw m5, m1 ; xmm5 = c1 * i7 + movdqa m1, m2 ; xmm1 = i2 + pmulhw m2, C(2) ; xmm2 = i2 * c2 -i2 + psubsw m3, m5 ; xmm3 = c7 * i1 - c1 * i7 = B + movdqa 
m5, I(6) ; xmm5 = i6 + paddsw m0, m7 ; xmm0 = c1 * i1 + c7 * i7 = A + movdqa m7, m5 ; xmm7 = i6 + psubsw m0, m4 ; xmm0 = A - C + pmulhw m5, C(2) ; xmm5 = c2 * i6 - i6 + paddw m2, m1 ; xmm2 = i2 * c2 + pmulhw m1, C(6) ; xmm1 = c6 * i2 + paddsw m4, m4 ; xmm4 = C + C + paddsw m4, m0 ; xmm4 = A + C = C. + psubsw m3, m6 ; xmm3 = B - D + paddw m5, m7 ; xmm5 = c2 * i6 + paddsw m6, m6 ; xmm6 = D + D + pmulhw m7, C(6) ; xmm7 = c6 * i6 + paddsw m6, m3 ; xmm6 = B + D = D. + movdqa I(1), m4 ; Save C. at I(1) + psubsw m1, m5 ; xmm1 = c6 * i2 - c2 * i6 = H + movdqa m4, C(4) ; xmm4 = c4 + movdqa m5, m3 ; xmm5 = B - D + pmulhw m3, m4 ; xmm3 = ( c4 - 1 ) * ( B - D ) + paddsw m7, m2 ; xmm7 = c2 * i2 + c6 * i6 = G + movdqa I(2), m6 ; save D. at I(2) + movdqa m2, m0 ; xmm2 = A - C + movdqa m6, I(0) ; xmm6 = i0 + pmulhw m0, m4 ; xmm0 = ( c4 - 1 ) * ( A - C ) = A. + paddw m5, m3 ; xmm5 = c4 * ( B - D ) = B. + movdqa m3, I(4) ; xmm3 = i4 + psubsw m5, m1 ; xmm5 = B. - H = B.. + paddw m2, m0 ; xmm2 = c4 * ( A - C ) = A. + psubsw m6, m3 ; xmm6 = i0 - i4 + movdqa m0, m6 ; xmm0 = i0 - i4 + pmulhw m6, m4 ; xmm6 = (c4 - 1) * (i0 - i4) = F + paddsw m3, m3 ; xmm3 = i4 + i4 + paddsw m1, m1 ; xmm1 = H + H + paddsw m3, m0 ; xmm3 = i0 + i4 + paddsw m1, m5 ; xmm1 = B. + H = H. + pmulhw m4, m3 ; xmm4 = ( c4 - 1 ) * ( i0 + i4 ) + paddw m6, m0 ; xmm6 = c4 * ( i0 - i4 ) + psubsw m6, m2 ; xmm6 = F - A. = F. + paddsw m2, m2 ; xmm2 = A. + A. + movdqa m0, I(1) ; Load C. from I(1) + paddsw m2, m6 ; xmm2 = F + A. = A.. + paddw m4, m3 ; xmm4 = c4 * ( i0 + i4 ) = E + psubsw m2, m1 ; xmm2 = A.. - H. = R2 + ADD(m2) ; Adjust R2 and R1 before shifting + paddsw m1, m1 ; xmm1 = H. + H. + paddsw m1, m2 ; xmm1 = A.. + H. = R1 + SHIFT(m2) ; xmm2 = op2 + psubsw m4, m7 ; xmm4 = E - G = E. + SHIFT(m1) ; xmm1 = op1 + movdqa m3, I(2) ; Load D. from I(2) + paddsw m7, m7 ; xmm7 = G + G + paddsw m7, m4 ; xmm7 = E + G = G. + psubsw m4, m3 ; xmm4 = E. - D. = R4 + ADD(m4) ; Adjust R4 and R3 before shifting + paddsw m3, m3 ; xmm3 = D. + D. + paddsw m3, m4 ; xmm3 = E. + D. = R3 + SHIFT(m4) ; xmm4 = op4 + psubsw m6, m5 ; xmm6 = F. - B.. = R6 + SHIFT(m3) ; xmm3 = op3 + ADD(m6) ; Adjust R6 and R5 before shifting + paddsw m5, m5 ; xmm5 = B.. + B.. + paddsw m5, m6 ; xmm5 = F. + B.. = R5 + SHIFT(m6) ; xmm6 = op6 + SHIFT(m5) ; xmm5 = op5 + psubsw m7, m0 ; xmm7 = G. - C. = R7 + ADD(m7) ; Adjust R7 and R0 before shifting + paddsw m0, m0 ; xmm0 = C. + C. + paddsw m0, m7 ; xmm0 = G. + C.
+ SHIFT(m7) ; xmm7 = op7 + SHIFT(m0) ; xmm0 = op0 +%endmacro + +%macro PUT_BLOCK 8 + movdqa O(0), m%1 + movdqa O(1), m%2 + movdqa O(2), m%3 + movdqa O(3), m%4 + movdqa O(4), m%5 + movdqa O(5), m%6 + movdqa O(6), m%7 + movdqa O(7), m%8 +%endmacro + +%macro VP3_IDCT_sse2 1 +%define I(x) [%1+16*x] +%define O(x) [%1+16*x] +%define C(x) [vp3_idct_data+16*(x-1)] +%define SHIFT(x) +%define ADD(x) + VP3_1D_IDCT_SSE2 +%ifdef ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16] +%endif + PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7 + +%define SHIFT(x) psraw x, 4 +%define ADD(x) paddsw x, [pw_8] + VP3_1D_IDCT_SSE2 + PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7 +%endmacro + +%macro vp3_idct_funcs 3 +cglobal vp3_idct_%1, 1, 1, %2 + VP3_IDCT_%1 r0 + RET + +cglobal vp3_idct_put_%1, 3, %3, %2 + VP3_IDCT_%1 r2 +%ifdef ARCH_X86_64 + mov r3, r2 + mov r2, r1 + mov r1, r0 + mov r0, r3 +%else + mov r0m, r2 + mov r1m, r0 + mov r2m, r1 +%endif +%ifdef WIN64 + call put_signed_pixels_clamped_mmx + RET +%else + jmp put_signed_pixels_clamped_mmx +%endif + +cglobal vp3_idct_add_%1, 3, %3, %2 + VP3_IDCT_%1 r2 +%ifdef ARCH_X86_64 + mov r3, r2 + mov r2, r1 + mov r1, r0 + mov r0, r3 +%else + mov r0m, r2 + mov r1m, r0 + mov r2m, r1 +%endif +%ifdef WIN64 + call add_pixels_clamped_mmx + RET +%else + jmp add_pixels_clamped_mmx +%endif +%endmacro + +%ifdef ARCH_X86_64 +%define REGS 4 +%else +%define REGS 3 +%endif +INIT_MMX +vp3_idct_funcs mmx, 0, REGS +INIT_XMM +vp3_idct_funcs sse2, 9, REGS +%undef REGS + +%macro DC_ADD 0 + movq m2, [r0 ] + movq m3, [r0+r1 ] + paddusb m2, m0 + movq m4, [r0+r1*2] + paddusb m3, m0 + movq m5, [r0+r3 ] + paddusb m4, m0 + paddusb m5, m0 + psubusb m2, m1 + psubusb m3, m1 + movq [r0 ], m2 + psubusb m4, m1 + movq [r0+r1 ], m3 + psubusb m5, m1 + movq [r0+r1*2], m4 + movq [r0+r3 ], m5 +%endmacro + +INIT_MMX +cglobal vp3_idct_dc_add_mmx2, 3, 4 +%ifdef ARCH_X86_64 + movsxd r1, r1d +%endif + lea r3, [r1*3] + movsx r2, word [r2] + add r2, 15 + sar r2, 5 + movd m0, r2d + pshufw m0, m0, 0x0 + pxor m1, m1 + psubw m1, m0 + packuswb m0, m0 + packuswb m1, m1 + DC_ADD + lea r0, [r0+r1*4] + DC_ADD + RET diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_mmx.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_mmx.c deleted file mode 100644 index 92985921e..000000000 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_mmx.c +++ /dev/null @@ -1,436 +0,0 @@ -/* - * Copyright (C) 2004 the ffmpeg project - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * MMX-optimized functions cribbed from the original VP3 source code. 
- */ - -#include "libavutil/x86_cpu.h" -#include "libavcodec/dsputil.h" -#include "dsputil_mmx.h" -#include "vp3dsp_mmx.h" - -extern const uint16_t ff_vp3_idct_data[]; - -// this is off by one or two for some cases when filter_limit is greater than 63 -// in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1 -// out: p1 in mm4, p2 in mm3 -#define VP3_LOOP_FILTER(flim) \ - "movq %%mm6, %%mm7 \n\t" \ - "pand "MANGLE(ff_pb_7 )", %%mm6 \n\t" /* p0&7 */ \ - "psrlw $3, %%mm7 \n\t" \ - "pand "MANGLE(ff_pb_1F)", %%mm7 \n\t" /* p0>>3 */ \ - "movq %%mm2, %%mm3 \n\t" /* mm3 = p2 */ \ - "pxor %%mm4, %%mm2 \n\t" \ - "pand "MANGLE(ff_pb_1 )", %%mm2 \n\t" /* (p2^p1)&1 */ \ - "movq %%mm2, %%mm5 \n\t" \ - "paddb %%mm2, %%mm2 \n\t" \ - "paddb %%mm5, %%mm2 \n\t" /* 3*(p2^p1)&1 */ \ - "paddb %%mm6, %%mm2 \n\t" /* extra bits lost in shifts */ \ - "pcmpeqb %%mm0, %%mm0 \n\t" \ - "pxor %%mm0, %%mm1 \n\t" /* 255 - p3 */ \ - "pavgb %%mm2, %%mm1 \n\t" /* (256 - p3 + extrabits) >> 1 */ \ - "pxor %%mm4, %%mm0 \n\t" /* 255 - p1 */ \ - "pavgb %%mm3, %%mm0 \n\t" /* (256 + p2-p1) >> 1 */ \ - "paddb "MANGLE(ff_pb_3 )", %%mm1 \n\t" \ - "pavgb %%mm0, %%mm1 \n\t" /* 128+2+( p2-p1 - p3) >> 2 */ \ - "pavgb %%mm0, %%mm1 \n\t" /* 128+1+(3*(p2-p1) - p3) >> 3 */ \ - "paddusb %%mm1, %%mm7 \n\t" /* d+128+1 */ \ - "movq "MANGLE(ff_pb_81)", %%mm6 \n\t" \ - "psubusb %%mm7, %%mm6 \n\t" \ - "psubusb "MANGLE(ff_pb_81)", %%mm7 \n\t" \ -\ - "movq "#flim", %%mm5 \n\t" \ - "pminub %%mm5, %%mm6 \n\t" \ - "pminub %%mm5, %%mm7 \n\t" \ - "movq %%mm6, %%mm0 \n\t" \ - "movq %%mm7, %%mm1 \n\t" \ - "paddb %%mm6, %%mm6 \n\t" \ - "paddb %%mm7, %%mm7 \n\t" \ - "pminub %%mm5, %%mm6 \n\t" \ - "pminub %%mm5, %%mm7 \n\t" \ - "psubb %%mm0, %%mm6 \n\t" \ - "psubb %%mm1, %%mm7 \n\t" \ - "paddusb %%mm7, %%mm4 \n\t" \ - "psubusb %%mm6, %%mm4 \n\t" \ - "psubusb %%mm7, %%mm3 \n\t" \ - "paddusb %%mm6, %%mm3 \n\t" - -#define STORE_4_WORDS(dst0, dst1, dst2, dst3, mm) \ - "movd "#mm", %0 \n\t" \ - "movw %w0, -1"#dst0" \n\t" \ - "psrlq $32, "#mm" \n\t" \ - "shr $16, %0 \n\t" \ - "movw %w0, -1"#dst1" \n\t" \ - "movd "#mm", %0 \n\t" \ - "movw %w0, -1"#dst2" \n\t" \ - "shr $16, %0 \n\t" \ - "movw %w0, -1"#dst3" \n\t" - -void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values) -{ - __asm__ volatile( - "movq %0, %%mm6 \n\t" - "movq %1, %%mm4 \n\t" - "movq %2, %%mm2 \n\t" - "movq %3, %%mm1 \n\t" - - VP3_LOOP_FILTER(%4) - - "movq %%mm4, %1 \n\t" - "movq %%mm3, %2 \n\t" - - : "+m" (*(uint64_t*)(src - 2*stride)), - "+m" (*(uint64_t*)(src - 1*stride)), - "+m" (*(uint64_t*)(src + 0*stride)), - "+m" (*(uint64_t*)(src + 1*stride)) - : "m"(*(uint64_t*)(bounding_values+129)) - ); -} - -void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values) -{ - x86_reg tmp; - - __asm__ volatile( - "movd -2(%1), %%mm6 \n\t" - "movd -2(%1,%3), %%mm0 \n\t" - "movd -2(%1,%3,2), %%mm1 \n\t" - "movd -2(%1,%4), %%mm4 \n\t" - - TRANSPOSE8x4(%%mm6, %%mm0, %%mm1, %%mm4, -2(%2), -2(%2,%3), -2(%2,%3,2), -2(%2,%4), %%mm2) - VP3_LOOP_FILTER(%5) - SBUTTERFLY(%%mm4, %%mm3, %%mm5, bw, q) - - STORE_4_WORDS((%1), (%1,%3), (%1,%3,2), (%1,%4), %%mm4) - STORE_4_WORDS((%2), (%2,%3), (%2,%3,2), (%2,%4), %%mm5) - - : "=&r"(tmp) - : "r"(src), "r"(src+4*stride), "r"((x86_reg)stride), "r"((x86_reg)3*stride), - "m"(*(uint64_t*)(bounding_values+129)) - : "memory" - ); -} - -/* from original comments: The Macro does IDct on 4 1-D Dcts */ -#define BeginIDCT() \ - "movq "I(3)", %%mm2 \n\t" \ - "movq "C(3)", %%mm6 \n\t" \ - "movq %%mm2, %%mm4 \n\t" \ - "movq "J(5)", %%mm7 \n\t" \ - "pmulhw 
%%mm6, %%mm4 \n\t" /* r4 = c3*i3 - i3 */ \ - "movq "C(5)", %%mm1 \n\t" \ - "pmulhw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 - i5 */ \ - "movq %%mm1, %%mm5 \n\t" \ - "pmulhw %%mm2, %%mm1 \n\t" /* r1 = c5*i3 - i3 */ \ - "movq "I(1)", %%mm3 \n\t" \ - "pmulhw %%mm7, %%mm5 \n\t" /* r5 = c5*i5 - i5 */ \ - "movq "C(1)", %%mm0 \n\t" \ - "paddw %%mm2, %%mm4 \n\t" /* r4 = c3*i3 */ \ - "paddw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 */ \ - "paddw %%mm1, %%mm2 \n\t" /* r2 = c5*i3 */ \ - "movq "J(7)", %%mm1 \n\t" \ - "paddw %%mm5, %%mm7 \n\t" /* r7 = c5*i5 */ \ - "movq %%mm0, %%mm5 \n\t" /* r5 = c1 */ \ - "pmulhw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 - i1 */ \ - "paddsw %%mm7, %%mm4 \n\t" /* r4 = C = c3*i3 + c5*i5 */ \ - "pmulhw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 - i7 */ \ - "movq "C(7)", %%mm7 \n\t" \ - "psubsw %%mm2, %%mm6 \n\t" /* r6 = D = c3*i5 - c5*i3 */ \ - "paddw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 */ \ - "pmulhw %%mm7, %%mm3 \n\t" /* r3 = c7*i1 */ \ - "movq "I(2)", %%mm2 \n\t" \ - "pmulhw %%mm1, %%mm7 \n\t" /* r7 = c7*i7 */ \ - "paddw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 */ \ - "movq %%mm2, %%mm1 \n\t" /* r1 = i2 */ \ - "pmulhw "C(2)", %%mm2 \n\t" /* r2 = c2*i2 - i2 */ \ - "psubsw %%mm5, %%mm3 \n\t" /* r3 = B = c7*i1 - c1*i7 */ \ - "movq "J(6)", %%mm5 \n\t" \ - "paddsw %%mm7, %%mm0 \n\t" /* r0 = A = c1*i1 + c7*i7 */ \ - "movq %%mm5, %%mm7 \n\t" /* r7 = i6 */ \ - "psubsw %%mm4, %%mm0 \n\t" /* r0 = A - C */ \ - "pmulhw "C(2)", %%mm5 \n\t" /* r5 = c2*i6 - i6 */ \ - "paddw %%mm1, %%mm2 \n\t" /* r2 = c2*i2 */ \ - "pmulhw "C(6)", %%mm1 \n\t" /* r1 = c6*i2 */ \ - "paddsw %%mm4, %%mm4 \n\t" /* r4 = C + C */ \ - "paddsw %%mm0, %%mm4 \n\t" /* r4 = C. = A + C */ \ - "psubsw %%mm6, %%mm3 \n\t" /* r3 = B - D */ \ - "paddw %%mm7, %%mm5 \n\t" /* r5 = c2*i6 */ \ - "paddsw %%mm6, %%mm6 \n\t" /* r6 = D + D */ \ - "pmulhw "C(6)", %%mm7 \n\t" /* r7 = c6*i6 */ \ - "paddsw %%mm3, %%mm6 \n\t" /* r6 = D. = B + D */ \ - "movq %%mm4, "I(1)"\n\t" /* save C. at I(1) */ \ - "psubsw %%mm5, %%mm1 \n\t" /* r1 = H = c6*i2 - c2*i6 */ \ - "movq "C(4)", %%mm4 \n\t" \ - "movq %%mm3, %%mm5 \n\t" /* r5 = B - D */ \ - "pmulhw %%mm4, %%mm3 \n\t" /* r3 = (c4 - 1) * (B - D) */ \ - "paddsw %%mm2, %%mm7 \n\t" /* r3 = (c4 - 1) * (B - D) */ \ - "movq %%mm6, "I(2)"\n\t" /* save D. at I(2) */ \ - "movq %%mm0, %%mm2 \n\t" /* r2 = A - C */ \ - "movq "I(0)", %%mm6 \n\t" \ - "pmulhw %%mm4, %%mm0 \n\t" /* r0 = (c4 - 1) * (A - C) */ \ - "paddw %%mm3, %%mm5 \n\t" /* r5 = B. = c4 * (B - D) */ \ - "movq "J(4)", %%mm3 \n\t" \ - "psubsw %%mm1, %%mm5 \n\t" /* r5 = B.. = B. - H */ \ - "paddw %%mm0, %%mm2 \n\t" /* r0 = A. = c4 * (A - C) */ \ - "psubsw %%mm3, %%mm6 \n\t" /* r6 = i0 - i4 */ \ - "movq %%mm6, %%mm0 \n\t" \ - "pmulhw %%mm4, %%mm6 \n\t" /* r6 = (c4 - 1) * (i0 - i4) */ \ - "paddsw %%mm3, %%mm3 \n\t" /* r3 = i4 + i4 */ \ - "paddsw %%mm1, %%mm1 \n\t" /* r1 = H + H */ \ - "paddsw %%mm0, %%mm3 \n\t" /* r3 = i0 + i4 */ \ - "paddsw %%mm5, %%mm1 \n\t" /* r1 = H. = B + H */ \ - "pmulhw %%mm3, %%mm4 \n\t" /* r4 = (c4 - 1) * (i0 + i4) */ \ - "paddsw %%mm0, %%mm6 \n\t" /* r6 = F = c4 * (i0 - i4) */ \ - "psubsw %%mm2, %%mm6 \n\t" /* r6 = F. = F - A. */ \ - "paddsw %%mm2, %%mm2 \n\t" /* r2 = A. + A. */ \ - "movq "I(1)", %%mm0 \n\t" /* r0 = C. */ \ - "paddsw %%mm6, %%mm2 \n\t" /* r2 = A.. = F + A. */ \ - "paddw %%mm3, %%mm4 \n\t" /* r4 = E = c4 * (i0 + i4) */ \ - "psubsw %%mm1, %%mm2 \n\t" /* r2 = R2 = A.. - H. */ - -/* RowIDCT gets ready to transpose */ -#define RowIDCT() \ - BeginIDCT() \ - "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \ - "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. 
= E - G */ \ - "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \ - "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \ - "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \ - "paddsw %%mm4, %%mm7 \n\t" /* r1 = R1 = A.. + H. */ \ - "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \ - "paddsw %%mm3, %%mm3 \n\t" \ - "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \ - "paddsw %%mm5, %%mm5 \n\t" \ - "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \ - "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \ - "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \ - "paddsw %%mm0, %%mm0 \n\t" \ - "movq %%mm1, "I(1)"\n\t" /* save R1 */ \ - "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */ - -/* Column IDCT normalizes and stores final results */ -#define ColumnIDCT() \ - BeginIDCT() \ - "paddsw "OC_8", %%mm2 \n\t" /* adjust R2 (and R1) for shift */ \ - "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \ - "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \ - "psraw $4, %%mm2 \n\t" /* r2 = NR2 */ \ - "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \ - "psraw $4, %%mm1 \n\t" /* r1 = NR1 */ \ - "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \ - "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \ - "movq %%mm2, "I(2)"\n\t" /* store NR2 at I2 */ \ - "paddsw %%mm4, %%mm7 \n\t" /* r7 = G. = E + G */ \ - "movq %%mm1, "I(1)"\n\t" /* store NR1 at I1 */ \ - "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \ - "paddsw "OC_8", %%mm4 \n\t" /* adjust R4 (and R3) for shift */ \ - "paddsw %%mm3, %%mm3 \n\t" /* r3 = D. + D. */ \ - "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \ - "psraw $4, %%mm4 \n\t" /* r4 = NR4 */ \ - "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \ - "psraw $4, %%mm3 \n\t" /* r3 = NR3 */ \ - "paddsw "OC_8", %%mm6 \n\t" /* adjust R6 (and R5) for shift */ \ - "paddsw %%mm5, %%mm5 \n\t" /* r5 = B.. + B.. */ \ - "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \ - "psraw $4, %%mm6 \n\t" /* r6 = NR6 */ \ - "movq %%mm4, "J(4)"\n\t" /* store NR4 at J4 */ \ - "psraw $4, %%mm5 \n\t" /* r5 = NR5 */ \ - "movq %%mm3, "I(3)"\n\t" /* store NR3 at I3 */ \ - "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \ - "paddsw "OC_8", %%mm7 \n\t" /* adjust R7 (and R0) for shift */ \ - "paddsw %%mm0, %%mm0 \n\t" /* r0 = C. + C. */ \ - "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */ \ - "psraw $4, %%mm7 \n\t" /* r7 = NR7 */ \ - "movq %%mm6, "J(6)"\n\t" /* store NR6 at J6 */ \ - "psraw $4, %%mm0 \n\t" /* r0 = NR0 */ \ - "movq %%mm5, "J(5)"\n\t" /* store NR5 at J5 */ \ - "movq %%mm7, "J(7)"\n\t" /* store NR7 at J7 */ \ - "movq %%mm0, "I(0)"\n\t" /* store NR0 at I0 */ - -/* Following macro does two 4x4 transposes in place. - - At entry (we assume): - - r0 = a3 a2 a1 a0 - I(1) = b3 b2 b1 b0 - r2 = c3 c2 c1 c0 - r3 = d3 d2 d1 d0 - - r4 = e3 e2 e1 e0 - r5 = f3 f2 f1 f0 - r6 = g3 g2 g1 g0 - r7 = h3 h2 h1 h0 - - At exit, we have: - - I(0) = d0 c0 b0 a0 - I(1) = d1 c1 b1 a1 - I(2) = d2 c2 b2 a2 - I(3) = d3 c3 b3 a3 - - J(4) = h0 g0 f0 e0 - J(5) = h1 g1 f1 e1 - J(6) = h2 g2 f2 e2 - J(7) = h3 g3 f3 e3 - - I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. - J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. - - Since r1 is free at entry, we calculate the Js first. 
*/ -#define Transpose() \ - "movq %%mm4, %%mm1 \n\t" /* r1 = e3 e2 e1 e0 */ \ - "punpcklwd %%mm5, %%mm4 \n\t" /* r4 = f1 e1 f0 e0 */ \ - "movq %%mm0, "I(0)"\n\t" /* save a3 a2 a1 a0 */ \ - "punpckhwd %%mm5, %%mm1 \n\t" /* r1 = f3 e3 f2 e2 */ \ - "movq %%mm6, %%mm0 \n\t" /* r0 = g3 g2 g1 g0 */ \ - "punpcklwd %%mm7, %%mm6 \n\t" /* r6 = h1 g1 h0 g0 */ \ - "movq %%mm4, %%mm5 \n\t" /* r5 = f1 e1 f0 e0 */ \ - "punpckldq %%mm6, %%mm4 \n\t" /* r4 = h0 g0 f0 e0 = R4 */ \ - "punpckhdq %%mm6, %%mm5 \n\t" /* r5 = h1 g1 f1 e1 = R5 */ \ - "movq %%mm1, %%mm6 \n\t" /* r6 = f3 e3 f2 e2 */ \ - "movq %%mm4, "J(4)"\n\t" \ - "punpckhwd %%mm7, %%mm0 \n\t" /* r0 = h3 g3 h2 g2 */ \ - "movq %%mm5, "J(5)"\n\t" \ - "punpckhdq %%mm0, %%mm6 \n\t" /* r6 = h3 g3 f3 e3 = R7 */ \ - "movq "I(0)", %%mm4 \n\t" /* r4 = a3 a2 a1 a0 */ \ - "punpckldq %%mm0, %%mm1 \n\t" /* r1 = h2 g2 f2 e2 = R6 */ \ - "movq "I(1)", %%mm5 \n\t" /* r5 = b3 b2 b1 b0 */ \ - "movq %%mm4, %%mm0 \n\t" /* r0 = a3 a2 a1 a0 */ \ - "movq %%mm6, "J(7)"\n\t" \ - "punpcklwd %%mm5, %%mm0 \n\t" /* r0 = b1 a1 b0 a0 */ \ - "movq %%mm1, "J(6)"\n\t" \ - "punpckhwd %%mm5, %%mm4 \n\t" /* r4 = b3 a3 b2 a2 */ \ - "movq %%mm2, %%mm5 \n\t" /* r5 = c3 c2 c1 c0 */ \ - "punpcklwd %%mm3, %%mm2 \n\t" /* r2 = d1 c1 d0 c0 */ \ - "movq %%mm0, %%mm1 \n\t" /* r1 = b1 a1 b0 a0 */ \ - "punpckldq %%mm2, %%mm0 \n\t" /* r0 = d0 c0 b0 a0 = R0 */ \ - "punpckhdq %%mm2, %%mm1 \n\t" /* r1 = d1 c1 b1 a1 = R1 */ \ - "movq %%mm4, %%mm2 \n\t" /* r2 = b3 a3 b2 a2 */ \ - "movq %%mm0, "I(0)"\n\t" \ - "punpckhwd %%mm3, %%mm5 \n\t" /* r5 = d3 c3 d2 c2 */ \ - "movq %%mm1, "I(1)"\n\t" \ - "punpckhdq %%mm5, %%mm4 \n\t" /* r4 = d3 c3 b3 a3 = R3 */ \ - "punpckldq %%mm5, %%mm2 \n\t" /* r2 = d2 c2 b2 a2 = R2 */ \ - "movq %%mm4, "I(3)"\n\t" \ - "movq %%mm2, "I(2)"\n\t" - -void ff_vp3_idct_mmx(int16_t *output_data) -{ - /* eax = quantized input - * ebx = dequantizer matrix - * ecx = IDCT constants - * M(I) = ecx + MaskOffset(0) + I * 8 - * C(I) = ecx + CosineOffset(32) + (I-1) * 8 - * edx = output - * r0..r7 = mm0..mm7 - */ - -#define C(x) AV_STRINGIFY(16*(x-1))"(%1)" -#define OC_8 "%2" - - /* at this point, function has completed dequantization + dezigzag + - * partial transposition; now do the idct itself */ -#define I(x) AV_STRINGIFY(16* x )"(%0)" -#define J(x) AV_STRINGIFY(16*(x-4) + 8)"(%0)" - - __asm__ volatile ( - RowIDCT() - Transpose() - -#undef I -#undef J -#define I(x) AV_STRINGIFY(16* x + 64)"(%0)" -#define J(x) AV_STRINGIFY(16*(x-4) + 72)"(%0)" - - RowIDCT() - Transpose() - -#undef I -#undef J -#define I(x) AV_STRINGIFY(16*x)"(%0)" -#define J(x) AV_STRINGIFY(16*x)"(%0)" - - ColumnIDCT() - -#undef I -#undef J -#define I(x) AV_STRINGIFY(16*x + 8)"(%0)" -#define J(x) AV_STRINGIFY(16*x + 8)"(%0)" - - ColumnIDCT() - :: "r"(output_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8) - ); -#undef I -#undef J - -} - -void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_vp3_idct_mmx(block); - put_signed_pixels_clamped_mmx(block, dest, line_size); -} - -void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_vp3_idct_mmx(block); - add_pixels_clamped_mmx(block, dest, line_size); -} - -void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int linesize, const DCTELEM *block) -{ - int dc = (block[0] + 15) >> 5; - - __asm__ volatile( - "movd %3, %%mm0 \n\t" - "pshufw $0, %%mm0, %%mm0 \n\t" - "pxor %%mm1, %%mm1 \n\t" - "psubw %%mm0, %%mm1 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - -#define DC_ADD \ - "movq (%0), %%mm2 \n\t" \ - "movq (%0,%1), %%mm3 \n\t" \ 
- "paddusb %%mm0, %%mm2 \n\t" \ - "movq (%0,%1,2), %%mm4 \n\t" \ - "paddusb %%mm0, %%mm3 \n\t" \ - "movq (%0,%2), %%mm5 \n\t" \ - "paddusb %%mm0, %%mm4 \n\t" \ - "paddusb %%mm0, %%mm5 \n\t" \ - "psubusb %%mm1, %%mm2 \n\t" \ - "psubusb %%mm1, %%mm3 \n\t" \ - "movq %%mm2, (%0) \n\t" \ - "psubusb %%mm1, %%mm4 \n\t" \ - "movq %%mm3, (%0,%1) \n\t" \ - "psubusb %%mm1, %%mm5 \n\t" \ - "movq %%mm4, (%0,%1,2) \n\t" \ - "movq %%mm5, (%0,%2) \n\t" - - DC_ADD - "lea (%0,%1,4), %0 \n\t" - DC_ADD - - : "+r"(dest) - : "r"((x86_reg)linesize), "r"((x86_reg)3*linesize), "r"(dc) - ); -} diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_mmx.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_mmx.h deleted file mode 100644 index e0ebf0b0f..000000000 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_mmx.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * vp3dsp MMX function declarations - * Copyright (c) 2007 Aurelien Jacobs - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_X86_VP3DSP_MMX_H -#define AVCODEC_X86_VP3DSP_MMX_H - -#include -#include "libavcodec/dsputil.h" - -void ff_vp3_idct_mmx(int16_t *data); -void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block); -void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block); -void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block); - -void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values); -void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values); - -#endif /* AVCODEC_X86_VP3DSP_MMX_H */ diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_sse2.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_sse2.c deleted file mode 100644 index b54ffa39e..000000000 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_sse2.c +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Copyright (C) 2004 the ffmpeg project - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * SSE2-optimized functions cribbed from the original VP3 source code. - */ - -#include "libavcodec/dsputil.h" -#include "dsputil_mmx.h" -#include "vp3dsp_sse2.h" - -DECLARE_ALIGNED(16, const uint16_t, ff_vp3_idct_data)[7 * 8] = -{ - 64277,64277,64277,64277,64277,64277,64277,64277, - 60547,60547,60547,60547,60547,60547,60547,60547, - 54491,54491,54491,54491,54491,54491,54491,54491, - 46341,46341,46341,46341,46341,46341,46341,46341, - 36410,36410,36410,36410,36410,36410,36410,36410, - 25080,25080,25080,25080,25080,25080,25080,25080, - 12785,12785,12785,12785,12785,12785,12785,12785 -}; - - -#define VP3_1D_IDCT_SSE2(ADD, SHIFT) \ - "movdqa "I(3)", %%xmm2 \n\t" /* xmm2 = i3 */ \ - "movdqa "C(3)", %%xmm6 \n\t" /* xmm6 = c3 */ \ - "movdqa %%xmm2, %%xmm4 \n\t" /* xmm4 = i3 */ \ - "movdqa "I(5)", %%xmm7 \n\t" /* xmm7 = i5 */ \ - "pmulhw %%xmm6, %%xmm4 \n\t" /* xmm4 = c3 * i3 - i3 */ \ - "movdqa "C(5)", %%xmm1 \n\t" /* xmm1 = c5 */ \ - "pmulhw %%xmm7, %%xmm6 \n\t" /* xmm6 = c3 * i5 - i5 */ \ - "movdqa %%xmm1, %%xmm5 \n\t" /* xmm5 = c5 */ \ - "pmulhw %%xmm2, %%xmm1 \n\t" /* xmm1 = c5 * i3 - i3 */ \ - "movdqa "I(1)", %%xmm3 \n\t" /* xmm3 = i1 */ \ - "pmulhw %%xmm7, %%xmm5 \n\t" /* xmm5 = c5 * i5 - i5 */ \ - "movdqa "C(1)", %%xmm0 \n\t" /* xmm0 = c1 */ \ - "paddw %%xmm2, %%xmm4 \n\t" /* xmm4 = c3 * i3 */ \ - "paddw %%xmm7, %%xmm6 \n\t" /* xmm6 = c3 * i5 */ \ - "paddw %%xmm1, %%xmm2 \n\t" /* xmm2 = c5 * i3 */ \ - "movdqa "I(7)", %%xmm1 \n\t" /* xmm1 = i7 */ \ - "paddw %%xmm5, %%xmm7 \n\t" /* xmm7 = c5 * i5 */ \ - "movdqa %%xmm0, %%xmm5 \n\t" /* xmm5 = c1 */ \ - "pmulhw %%xmm3, %%xmm0 \n\t" /* xmm0 = c1 * i1 - i1 */ \ - "paddsw %%xmm7, %%xmm4 \n\t" /* xmm4 = c3 * i3 + c5 * i5 = C */ \ - "pmulhw %%xmm1, %%xmm5 \n\t" /* xmm5 = c1 * i7 - i7 */ \ - "movdqa "C(7)", %%xmm7 \n\t" /* xmm7 = c7 */ \ - "psubsw %%xmm2, %%xmm6 \n\t" /* xmm6 = c3 * i5 - c5 * i3 = D */ \ - "paddw %%xmm3, %%xmm0 \n\t" /* xmm0 = c1 * i1 */ \ - "pmulhw %%xmm7, %%xmm3 \n\t" /* xmm3 = c7 * i1 */ \ - "movdqa "I(2)", %%xmm2 \n\t" /* xmm2 = i2 */ \ - "pmulhw %%xmm1, %%xmm7 \n\t" /* xmm7 = c7 * i7 */ \ - "paddw %%xmm1, %%xmm5 \n\t" /* xmm5 = c1 * i7 */ \ - "movdqa %%xmm2, %%xmm1 \n\t" /* xmm1 = i2 */ \ - "pmulhw "C(2)", %%xmm2 \n\t" /* xmm2 = i2 * c2 -i2 */ \ - "psubsw %%xmm5, %%xmm3 \n\t" /* xmm3 = c7 * i1 - c1 * i7 = B */ \ - "movdqa "I(6)", %%xmm5 \n\t" /* xmm5 = i6 */ \ - "paddsw %%xmm7, %%xmm0 \n\t" /* xmm0 = c1 * i1 + c7 * i7 = A */ \ - "movdqa %%xmm5, %%xmm7 \n\t" /* xmm7 = i6 */ \ - "psubsw %%xmm4, %%xmm0 \n\t" /* xmm0 = A - C */ \ - "pmulhw "C(2)", %%xmm5 \n\t" /* xmm5 = c2 * i6 - i6 */ \ - "paddw %%xmm1, %%xmm2 \n\t" /* xmm2 = i2 * c2 */ \ - "pmulhw "C(6)", %%xmm1 \n\t" /* xmm1 = c6 * i2 */ \ - "paddsw %%xmm4, %%xmm4 \n\t" /* xmm4 = C + C */ \ - "paddsw %%xmm0, %%xmm4 \n\t" /* xmm4 = A + C = C. */ \ - "psubsw %%xmm6, %%xmm3 \n\t" /* xmm3 = B - D */ \ - "paddw %%xmm7, %%xmm5 \n\t" /* xmm5 = c2 * i6 */ \ - "paddsw %%xmm6, %%xmm6 \n\t" /* xmm6 = D + D */ \ - "pmulhw "C(6)", %%xmm7 \n\t" /* xmm7 = c6 * i6 */ \ - "paddsw %%xmm3, %%xmm6 \n\t" /* xmm6 = B + D = D. */ \ - "movdqa %%xmm4, "I(1)" \n\t" /* Save C. 
at I(1) */ \ - "psubsw %%xmm5, %%xmm1 \n\t" /* xmm1 = c6 * i2 - c2 * i6 = H */ \ - "movdqa "C(4)", %%xmm4 \n\t" /* xmm4 = c4 */ \ - "movdqa %%xmm3, %%xmm5 \n\t" /* xmm5 = B - D */ \ - "pmulhw %%xmm4, %%xmm3 \n\t" /* xmm3 = ( c4 -1 ) * ( B - D ) */ \ - "paddsw %%xmm2, %%xmm7 \n\t" /* xmm7 = c2 * i2 + c6 * i6 = G */ \ - "movdqa %%xmm6, "I(2)" \n\t" /* Save D. at I(2) */ \ - "movdqa %%xmm0, %%xmm2 \n\t" /* xmm2 = A - C */ \ - "movdqa "I(0)", %%xmm6 \n\t" /* xmm6 = i0 */ \ - "pmulhw %%xmm4, %%xmm0 \n\t" /* xmm0 = ( c4 - 1 ) * ( A - C ) = A. */ \ - "paddw %%xmm3, %%xmm5 \n\t" /* xmm5 = c4 * ( B - D ) = B. */ \ - "movdqa "I(4)", %%xmm3 \n\t" /* xmm3 = i4 */ \ - "psubsw %%xmm1, %%xmm5 \n\t" /* xmm5 = B. - H = B.. */ \ - "paddw %%xmm0, %%xmm2 \n\t" /* xmm2 = c4 * ( A - C) = A. */ \ - "psubsw %%xmm3, %%xmm6 \n\t" /* xmm6 = i0 - i4 */ \ - "movdqa %%xmm6, %%xmm0 \n\t" /* xmm0 = i0 - i4 */ \ - "pmulhw %%xmm4, %%xmm6 \n\t" /* xmm6 = (c4 - 1) * (i0 - i4) = F */ \ - "paddsw %%xmm3, %%xmm3 \n\t" /* xmm3 = i4 + i4 */ \ - "paddsw %%xmm1, %%xmm1 \n\t" /* xmm1 = H + H */ \ - "paddsw %%xmm0, %%xmm3 \n\t" /* xmm3 = i0 + i4 */ \ - "paddsw %%xmm5, %%xmm1 \n\t" /* xmm1 = B. + H = H. */ \ - "pmulhw %%xmm3, %%xmm4 \n\t" /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \ - "paddw %%xmm0, %%xmm6 \n\t" /* xmm6 = c4 * ( i0 - i4 ) */ \ - "psubsw %%xmm2, %%xmm6 \n\t" /* xmm6 = F - A. = F. */ \ - "paddsw %%xmm2, %%xmm2 \n\t" /* xmm2 = A. + A. */ \ - "movdqa "I(1)", %%xmm0 \n\t" /* Load C. from I(1) */ \ - "paddsw %%xmm6, %%xmm2 \n\t" /* xmm2 = F + A. = A.. */ \ - "paddw %%xmm3, %%xmm4 \n\t" /* xmm4 = c4 * ( i0 + i4 ) = 3 */ \ - "psubsw %%xmm1, %%xmm2 \n\t" /* xmm2 = A.. - H. = R2 */ \ - ADD(%%xmm2) /* Adjust R2 and R1 before shifting */ \ - "paddsw %%xmm1, %%xmm1 \n\t" /* xmm1 = H. + H. */ \ - "paddsw %%xmm2, %%xmm1 \n\t" /* xmm1 = A.. + H. = R1 */ \ - SHIFT(%%xmm2) /* xmm2 = op2 */ \ - "psubsw %%xmm7, %%xmm4 \n\t" /* xmm4 = E - G = E. */ \ - SHIFT(%%xmm1) /* xmm1 = op1 */ \ - "movdqa "I(2)", %%xmm3 \n\t" /* Load D. from I(2) */ \ - "paddsw %%xmm7, %%xmm7 \n\t" /* xmm7 = G + G */ \ - "paddsw %%xmm4, %%xmm7 \n\t" /* xmm7 = E + G = G. */ \ - "psubsw %%xmm3, %%xmm4 \n\t" /* xmm4 = E. - D. = R4 */ \ - ADD(%%xmm4) /* Adjust R4 and R3 before shifting */ \ - "paddsw %%xmm3, %%xmm3 \n\t" /* xmm3 = D. + D. */ \ - "paddsw %%xmm4, %%xmm3 \n\t" /* xmm3 = E. + D. = R3 */ \ - SHIFT(%%xmm4) /* xmm4 = op4 */ \ - "psubsw %%xmm5, %%xmm6 \n\t" /* xmm6 = F. - B..= R6 */ \ - SHIFT(%%xmm3) /* xmm3 = op3 */ \ - ADD(%%xmm6) /* Adjust R6 and R5 before shifting */ \ - "paddsw %%xmm5, %%xmm5 \n\t" /* xmm5 = B.. + B.. */ \ - "paddsw %%xmm6, %%xmm5 \n\t" /* xmm5 = F. + B.. = R5 */ \ - SHIFT(%%xmm6) /* xmm6 = op6 */ \ - SHIFT(%%xmm5) /* xmm5 = op5 */ \ - "psubsw %%xmm0, %%xmm7 \n\t" /* xmm7 = G. - C. = R7 */ \ - ADD(%%xmm7) /* Adjust R7 and R0 before shifting */ \ - "paddsw %%xmm0, %%xmm0 \n\t" /* xmm0 = C. + C. */ \ - "paddsw %%xmm7, %%xmm0 \n\t" /* xmm0 = G. + C. 
*/ \ - SHIFT(%%xmm7) /* xmm7 = op7 */ \ - SHIFT(%%xmm0) /* xmm0 = op0 */ - -#define PUT_BLOCK(r0, r1, r2, r3, r4, r5, r6, r7) \ - "movdqa " #r0 ", " O(0) "\n\t" \ - "movdqa " #r1 ", " O(1) "\n\t" \ - "movdqa " #r2 ", " O(2) "\n\t" \ - "movdqa " #r3 ", " O(3) "\n\t" \ - "movdqa " #r4 ", " O(4) "\n\t" \ - "movdqa " #r5 ", " O(5) "\n\t" \ - "movdqa " #r6 ", " O(6) "\n\t" \ - "movdqa " #r7 ", " O(7) "\n\t" - -#define NOP(xmm) -#define SHIFT4(xmm) "psraw $4, "#xmm"\n\t" -#define ADD8(xmm) "paddsw %2, "#xmm"\n\t" - -void ff_vp3_idct_sse2(int16_t *input_data) -{ -#define I(x) AV_STRINGIFY(16*x)"(%0)" -#define O(x) I(x) -#define C(x) AV_STRINGIFY(16*(x-1))"(%1)" - - __asm__ volatile ( - VP3_1D_IDCT_SSE2(NOP, NOP) - - TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%0)) - PUT_BLOCK(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1) - - VP3_1D_IDCT_SSE2(ADD8, SHIFT4) - PUT_BLOCK(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7) - :: "r"(input_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8) - ); -} - -void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_vp3_idct_sse2(block); - put_signed_pixels_clamped_mmx(block, dest, line_size); -} - -void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block) -{ - ff_vp3_idct_sse2(block); - add_pixels_clamped_mmx(block, dest, line_size); -} diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_sse2.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_sse2.h deleted file mode 100644 index 9094620eb..000000000 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_sse2.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * vp3dsp SSE2 function declarations - * Copyright (c) 2007 Aurelien Jacobs - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_X86_VP3DSP_SSE2_H -#define AVCODEC_X86_VP3DSP_SSE2_H - -#include "libavcodec/dsputil.h" - -void ff_vp3_idct_sse2(int16_t *input_data); -void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block); -void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block); - -#endif /* AVCODEC_X86_VP3DSP_SSE2_H */ diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp56dsp.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp56dsp.asm index 1b3165e54..0543ba00c 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp56dsp.asm +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp56dsp.asm @@ -48,8 +48,8 @@ SECTION .text movq m5, m2 punpcklbw m1, m7 punpcklbw m2, m7 - punpcklbw m4, m7 - punpcklbw m5, m7 + punpckhbw m4, m7 + punpckhbw m5, m7 pmullw m1, [rsp+8*13] ; src[x+8 ] * biweight [2] pmullw m2, [rsp+8*14] ; src[x+16] * biweight [3] pmullw m4, [rsp+8*13] ; src[x+8 ] * biweight [2] @@ -95,13 +95,13 @@ SECTION .text punpckldq m3, m3 punpckhdq m4, m4 punpckhwd m5, m5 - movq m6, m5 - punpckhdq m6, m6 + movq m2, m5 + punpckhdq m2, m2 punpckldq m5, m5 movq [rsp+8*11], m3 movq [rsp+8*12], m4 movq [rsp+8*13], m5 - movq [rsp+8*14], m6 + movq [rsp+8*14], m2 %endmacro %macro SPLAT4REGS_SSE2 0 diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp56dsp_init.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp56dsp_init.c index 5120ed231..87fc93531 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp56dsp_init.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp56dsp_init.c @@ -20,6 +20,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "libavutil/cpu.h" #include "libavutil/x86_cpu.h" #include "libavcodec/dsputil.h" #include "libavcodec/vp56dsp.h" @@ -32,14 +33,14 @@ void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, int stride, av_cold void ff_vp56dsp_init_x86(VP56DSPContext* c, enum CodecID codec) { #if HAVE_YASM - int mm_flags = mm_support(); + int mm_flags = av_get_cpu_flags(); if (CONFIG_VP6_DECODER && codec == CODEC_ID_VP6) { - if (mm_flags & FF_MM_MMX) { + if (mm_flags & AV_CPU_FLAG_MMX) { c->vp6_filter_diag4 = ff_vp6_filter_diag4_mmx; } - if (mm_flags & FF_MM_SSE2) { + if (mm_flags & AV_CPU_FLAG_SSE2) { c->vp6_filter_diag4 = ff_vp6_filter_diag4_sse2; } } diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp-init.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp-init.c index ed5cf4602..201b34e24 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp-init.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp-init.c @@ -20,6 +20,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "libavutil/cpu.h" #include "libavutil/x86_cpu.h" #include "libavcodec/vp8dsp.h" @@ -282,10 +283,10 @@ DECLARE_LOOP_FILTER(sse4) av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) { - int mm_flags = mm_support(); + int mm_flags = av_get_cpu_flags(); #if HAVE_YASM - if (mm_flags & FF_MM_MMX) { + if (mm_flags & AV_CPU_FLAG_MMX) { c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx; c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx; c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx; @@ -312,7 +313,7 @@ 
av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) /* note that 4-tap width=16 functions are missing because w=16 * is only used for luma, and luma is always a copy or sixtap. */ - if (mm_flags & FF_MM_MMX2) { + if (mm_flags & AV_CPU_FLAG_MMX2) { VP8_LUMA_MC_FUNC(0, 16, mmxext); VP8_MC_FUNC(1, 8, mmxext); VP8_MC_FUNC(2, 4, mmxext); @@ -334,14 +335,14 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext; } - if (mm_flags & FF_MM_SSE) { + if (mm_flags & AV_CPU_FLAG_SSE) { c->vp8_idct_add = ff_vp8_idct_add_sse; c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse; c->put_vp8_epel_pixels_tab[0][0][0] = c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; } - if (mm_flags & (FF_MM_SSE2|FF_MM_SSE2SLOW)) { + if (mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) { VP8_LUMA_MC_FUNC(0, 16, sse2); VP8_MC_FUNC(1, 8, sse2); VP8_BILINEAR_MC_FUNC(0, 16, sse2); @@ -356,7 +357,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2; } - if (mm_flags & FF_MM_SSE2) { + if (mm_flags & AV_CPU_FLAG_SSE2) { c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2; c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2; @@ -368,7 +369,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2; } - if (mm_flags & FF_MM_SSSE3) { + if (mm_flags & AV_CPU_FLAG_SSSE3) { VP8_LUMA_MC_FUNC(0, 16, ssse3); VP8_MC_FUNC(1, 8, ssse3); VP8_MC_FUNC(2, 4, ssse3); @@ -390,7 +391,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3; } - if (mm_flags & FF_MM_SSE4) { + if (mm_flags & AV_CPU_FLAG_SSE4) { c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4; c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4; diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp.asm index 8cdbb3c7a..bc5ccc8e3 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp.asm +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp.asm @@ -1342,7 +1342,7 @@ VP8_DC_WHT sse psrldq m%2, 4 %if %10 == 8 movd [%5+%8*2], m%1 - movd %5, m%3 + movd %5d, m%3 %endif psrldq m%3, 4 psrldq m%4, 4 @@ -1379,26 +1379,26 @@ VP8_DC_WHT sse ; 4 is a pointer to the destination's 4th line ; 5/6 is -stride and +stride %macro WRITE_2x4W 6 - movd %3, %1 + movd %3d, %1 punpckhdq %1, %1 mov [%4+%5*4], %3w shr %3, 16 add %4, %6 mov [%4+%5*4], %3w - movd %3, %1 + movd %3d, %1 add %4, %5 mov [%4+%5*2], %3w shr %3, 16 mov [%4+%5 ], %3w - movd %3, %2 + movd %3d, %2 punpckhdq %2, %2 mov [%4 ], %3w shr %3, 16 mov [%4+%6 ], %3w - movd %3, %2 + movd %3d, %2 add %4, %6 mov [%4+%6 ], %3w shr %3, 16 @@ -1407,27 +1407,27 @@ VP8_DC_WHT sse %endmacro %macro WRITE_8W_SSE2 5 - movd %2, %1 + movd %2d, %1 psrldq %1, 4 mov [%3+%4*4], %2w shr %2, 16 add %3, %5 mov [%3+%4*4], %2w - movd %2, %1 + movd %2d, %1 psrldq %1, 4 add %3, %4 mov [%3+%4*2], %2w shr %2, 16 mov [%3+%4 ], %2w - movd %2, %1 + movd %2d, %1 psrldq %1, 4 mov [%3 ], %2w shr %2, 16 mov [%3+%5 ], %2w - movd %2, %1 + movd %2d, %1 add %3, %5 mov [%3+%5 ], %2w shr %2, 16 @@ -1446,27 +1446,27 @@ VP8_DC_WHT sse %endmacro %macro SPLATB_REG_MMX 2-3 - movd %1, %2 + movd %1, %2d punpcklbw %1, %1 punpcklwd %1, %1 punpckldq %1, %1 %endmacro %macro SPLATB_REG_MMXEXT 2-3 - movd %1, %2 + movd %1, %2d punpcklbw %1, %1 pshufw %1, %1, 0x0 %endmacro %macro SPLATB_REG_SSE2 2-3 - movd %1, %2 
+ movd %1, %2d punpcklbw %1, %1 pshuflw %1, %1, 0x0 punpcklqdq %1, %1 %endmacro %macro SPLATB_REG_SSSE3 3 - movd %1, %2 + movd %1, %2d pshufb %1, %3 %endmacro -- cgit v1.2.3
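
A note on the VP3 DC-add routines touched above: the deleted inline-asm ff_vp3_idct_dc_add_mmx2 and its new yasm replacement vp3_idct_dc_add_mmx2 implement the same DC-only shortcut. When only the DC coefficient of a block is nonzero, the full 8x8 IDCT collapses to adding one rounded constant, (block[0] + 15) >> 5, to every destination pixel with clamping to [0,255]; the paddusb/psubusb pair in the asm is how a possibly negative constant is applied using unsigned saturating byte arithmetic. A minimal scalar sketch of that behaviour, assuming 8-bit pixels and an 8x8 block (vp3_idct_dc_add_c and clip_uint8 are illustrative names, not part of this patch):

    #include <stdint.h>

    /* Saturate an int to the 0..255 range of an 8-bit pixel. */
    static uint8_t clip_uint8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* DC-only IDCT + add: mirrors the rounding in the asm above
     * ("add r2, 15" then "sar r2, 5", i.e. (block[0] + 15) >> 5),
     * then adds the constant to all 64 pixels with clamping. */
    static void vp3_idct_dc_add_c(uint8_t *dest, int linesize,
                                  const int16_t *block)
    {
        int dc = (block[0] + 15) >> 5;
        for (int y = 0; y < 8; y++) {
            for (int x = 0; x < 8; x++)
                dest[x] = clip_uint8(dest[x] + dc);
            dest += linesize;
        }
    }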