diff options
Diffstat (limited to 'src/filters/transform/MPCVideoDec/ffmpeg')
89 files changed, 5875 insertions, 4487 deletions
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/Makefile b/src/filters/transform/MPCVideoDec/ffmpeg/Makefile index 666048770..9dda0709c 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/Makefile +++ b/src/filters/transform/MPCVideoDec/ffmpeg/Makefile @@ -20,6 +20,7 @@ OUT_DIRS = ../../../../../bin/obj/Release_x64/libavcodec_gcc/ \ ../../../../../bin/obj/Release_x64/libavcodec_gcc/libavcodec/x86 \
../../../../../bin/obj/Release_x64/libavcodec_gcc/libavcore \
../../../../../bin/obj/Release_x64/libavcodec_gcc/libavutil \
+ ../../../../../bin/obj/Release_x64/libavcodec_gcc/libavutil/x86 \
../../../../../bin/obj/Release_x64/libavcodec_gcc/libswscale \
$(SLIB_DIR)
else
@@ -29,6 +30,7 @@ OUT_DIRS = ../../../../../bin/obj/Release_Win32/libavcodec_gcc/ \ ../../../../../bin/obj/Release_Win32/libavcodec_gcc/libavcodec/x86 \
../../../../../bin/obj/Release_Win32/libavcodec_gcc/libavcore \
../../../../../bin/obj/Release_Win32/libavcodec_gcc/libavutil \
+ ../../../../../bin/obj/Release_Win32/libavcodec_gcc/libavutil/x86 \
../../../../../bin/obj/Release_Win32/libavcodec_gcc/libswscale \
$(SLIB_DIR)
endif
@@ -36,6 +38,8 @@ endif CFLAGS+= -I. -I.. -I$(LAVC_DIR) -I$(LAVCORE_DIR) -I$(LAVU_DIR) -I$(LSWS_DIR) -I$(ZLIB_DIR) -I$(PNG_DIR) \
-DHAVE_AV_CONFIG_H -D_ISOC99_SOURCE -D_POSIX_C_SOURCE=200112 -std=gnu99
+YASMFLAGS+= -Pconfig.asm
+
SRCS_C=\
$(LAVC_DIR)/aac_ac3_parser.c \
$(LAVC_DIR)/ac3.c \
@@ -139,13 +143,14 @@ SRCS_C=\ $(LAVC_DIR)/amr_float/interf_dec.c \
$(LAVC_DIR)/amr_float/sp_dec.c \
\
- $(LAVC_DIR)/x86/cpuid.c \
$(LAVC_DIR)/x86/dsputil_mmx.c \
$(LAVC_DIR)/x86/fdct_mmx.c \
$(LAVC_DIR)/x86/fft.c \
$(LAVC_DIR)/x86/fft_3dn.c \
$(LAVC_DIR)/x86/fft_3dn2.c \
$(LAVC_DIR)/x86/fft_sse.c \
+ $(LAVC_DIR)/x86/h264_intrapred_init.c \
+ $(LAVC_DIR)/x86/h264dsp_mmx.c \
$(LAVC_DIR)/x86/idct_mmx.c \
$(LAVC_DIR)/x86/idct_mmx_xvid.c \
$(LAVC_DIR)/x86/idct_sse2_xvid.c \
@@ -153,15 +158,16 @@ SRCS_C=\ $(LAVC_DIR)/x86/mpegvideo_mmx.c \
$(LAVC_DIR)/x86/simple_idct_mmx.c \
$(LAVC_DIR)/x86/vc1dsp_mmx.c \
- $(LAVC_DIR)/x86/vp3dsp_mmx.c \
- $(LAVC_DIR)/x86/vp3dsp_sse2.c \
$(LAVC_DIR)/x86/vp56dsp_init.c \
$(LAVC_DIR)/x86/vp8dsp-init.c \
\
$(LAVCORE_DIR)/avcore_utils.c \
$(LAVCORE_DIR)/imgutils.c \
\
+ $(LAVU_DIR)/avstring.c \
+ $(LAVU_DIR)/cpu.c \
$(LAVU_DIR)/crc.c \
+ $(LAVU_DIR)/eval.c \
$(LAVU_DIR)/intfloat_readwrite.c \
$(LAVU_DIR)/inverse.c \
$(LAVU_DIR)/log.c \
@@ -173,6 +179,8 @@ SRCS_C=\ $(LAVU_DIR)/random_seed.c \
$(LAVU_DIR)/rational.c \
\
+ $(LAVU_DIR)/x86/cpu.c \
+\
$(LSWS_DIR)/rgb2rgb.c \
$(LSWS_DIR)/swscale.c \
$(LSWS_DIR)/sww32thread.c \
@@ -181,11 +189,13 @@ SRCS_C=\ SRCS_YASM=\
$(LAVC_DIR)/x86/dsputil_yasm.asm \
$(LAVC_DIR)/x86/fft_mmx.asm \
- $(LAVC_DIR)/x86/h264_deblock_sse2.asm \
- $(LAVC_DIR)/x86/h264_idct_sse2.asm \
+ $(LAVC_DIR)/x86/h264_chromamc.asm \
+ $(LAVC_DIR)/x86/h264_deblock.asm \
+ $(LAVC_DIR)/x86/h264_idct.asm \
$(LAVC_DIR)/x86/h264_intrapred.asm \
- $(LAVC_DIR)/x86/h264_weight_sse2.asm \
+ $(LAVC_DIR)/x86/h264_weight.asm \
$(LAVC_DIR)/x86/vc1dsp_yasm.asm \
+ $(LAVC_DIR)/x86/vp3dsp.asm \
$(LAVC_DIR)/x86/vp56dsp.asm \
$(LAVC_DIR)/x86/vp8dsp.asm \
$(LAVC_DIR)/x86/x86util.asm
@@ -222,5 +232,6 @@ clean: $(OUT_DIR)$(LAVC_DIR)/amr_float/*.o $(OUT_DIR)$(LAVC_DIR)/amr_float/*.d \
$(OUT_DIR)$(LAVCORE_DIR)/*.o $(OUT_DIR)$(LAVCORE_DIR)/*.d \
$(OUT_DIR)$(LAVU_DIR)/*.o $(OUT_DIR)$(LAVU_DIR)/*.d \
+ $(OUT_DIR)$(LAVU_DIR)/x86/*.o $(OUT_DIR)$(LAVU_DIR)/x86/*.d \
$(OUT_DIR)$(LSWS_DIR)/*.o $(OUT_DIR)$(LSWS_DIR)/*.d \
$(ZLIB_DIR)/*.o $(ZLIB_DIR)/*.d $(PNG_DIR)/*.o $(SLIB)
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/Makefile_2010 b/src/filters/transform/MPCVideoDec/ffmpeg/Makefile_2010 index 27b4f57f1..d0dc4b5ec 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/Makefile_2010 +++ b/src/filters/transform/MPCVideoDec/ffmpeg/Makefile_2010 @@ -20,6 +20,7 @@ OUT_DIRS = ../../../../../bin10/obj/Release_x64/libavcodec_gcc/ \ ../../../../../bin10/obj/Release_x64/libavcodec_gcc/libavcodec/x86 \
../../../../../bin10/obj/Release_x64/libavcodec_gcc/libavcore \
../../../../../bin10/obj/Release_x64/libavcodec_gcc/libavutil \
+ ../../../../../bin10/obj/Release_x64/libavcodec_gcc/libavutil/x86 \
../../../../../bin10/obj/Release_x64/libavcodec_gcc/libswscale \
$(SLIB_DIR)
else
@@ -29,6 +30,7 @@ OUT_DIRS = ../../../../../bin10/obj/Release_Win32/libavcodec_gcc/ \ ../../../../../bin10/obj/Release_Win32/libavcodec_gcc/libavcodec/x86 \
../../../../../bin10/obj/Release_Win32/libavcodec_gcc/libavcore \
../../../../../bin10/obj/Release_Win32/libavcodec_gcc/libavutil \
+ ../../../../../bin10/obj/Release_Win32/libavcodec_gcc/libavutil/x86 \
../../../../../bin10/obj/Release_Win32/libavcodec_gcc/libswscale \
$(SLIB_DIR)
endif
@@ -36,6 +38,8 @@ endif CFLAGS+= -I. -I.. -I$(LAVC_DIR) -I$(LAVCORE_DIR) -I$(LAVU_DIR) -I$(LSWS_DIR) -I$(ZLIB_DIR) -I$(PNG_DIR) \
-DHAVE_AV_CONFIG_H -D_ISOC99_SOURCE -D_POSIX_C_SOURCE=200112 -std=gnu99
+YASMFLAGS+= -Pconfig.asm
+
SRCS_C=\
$(LAVC_DIR)/aac_ac3_parser.c \
$(LAVC_DIR)/ac3.c \
@@ -139,13 +143,14 @@ SRCS_C=\ $(LAVC_DIR)/amr_float/interf_dec.c \
$(LAVC_DIR)/amr_float/sp_dec.c \
\
- $(LAVC_DIR)/x86/cpuid.c \
$(LAVC_DIR)/x86/dsputil_mmx.c \
$(LAVC_DIR)/x86/fdct_mmx.c \
$(LAVC_DIR)/x86/fft.c \
$(LAVC_DIR)/x86/fft_3dn.c \
$(LAVC_DIR)/x86/fft_3dn2.c \
$(LAVC_DIR)/x86/fft_sse.c \
+ $(LAVC_DIR)/x86/h264_intrapred_init.c \
+ $(LAVC_DIR)/x86/h264dsp_mmx.c \
$(LAVC_DIR)/x86/idct_mmx.c \
$(LAVC_DIR)/x86/idct_mmx_xvid.c \
$(LAVC_DIR)/x86/idct_sse2_xvid.c \
@@ -153,15 +158,16 @@ SRCS_C=\ $(LAVC_DIR)/x86/mpegvideo_mmx.c \
$(LAVC_DIR)/x86/simple_idct_mmx.c \
$(LAVC_DIR)/x86/vc1dsp_mmx.c \
- $(LAVC_DIR)/x86/vp3dsp_mmx.c \
- $(LAVC_DIR)/x86/vp3dsp_sse2.c \
$(LAVC_DIR)/x86/vp56dsp_init.c \
$(LAVC_DIR)/x86/vp8dsp-init.c \
\
$(LAVCORE_DIR)/avcore_utils.c \
$(LAVCORE_DIR)/imgutils.c \
\
+ $(LAVU_DIR)/avstring.c \
+ $(LAVU_DIR)/cpu.c \
$(LAVU_DIR)/crc.c \
+ $(LAVU_DIR)/eval.c \
$(LAVU_DIR)/intfloat_readwrite.c \
$(LAVU_DIR)/inverse.c \
$(LAVU_DIR)/log.c \
@@ -173,6 +179,8 @@ SRCS_C=\ $(LAVU_DIR)/random_seed.c \
$(LAVU_DIR)/rational.c \
\
+ $(LAVU_DIR)/x86/cpu.c \
+\
$(LSWS_DIR)/rgb2rgb.c \
$(LSWS_DIR)/swscale.c \
$(LSWS_DIR)/sww32thread.c \
@@ -181,11 +189,13 @@ SRCS_C=\ SRCS_YASM=\
$(LAVC_DIR)/x86/dsputil_yasm.asm \
$(LAVC_DIR)/x86/fft_mmx.asm \
- $(LAVC_DIR)/x86/h264_deblock_sse2.asm \
- $(LAVC_DIR)/x86/h264_idct_sse2.asm \
+ $(LAVC_DIR)/x86/h264_chromamc.asm \
+ $(LAVC_DIR)/x86/h264_deblock.asm \
+ $(LAVC_DIR)/x86/h264_idct.asm \
$(LAVC_DIR)/x86/h264_intrapred.asm \
- $(LAVC_DIR)/x86/h264_weight_sse2.asm \
+ $(LAVC_DIR)/x86/h264_weight.asm \
$(LAVC_DIR)/x86/vc1dsp_yasm.asm \
+ $(LAVC_DIR)/x86/vp3dsp.asm \
$(LAVC_DIR)/x86/vp56dsp.asm \
$(LAVC_DIR)/x86/vp8dsp.asm \
$(LAVC_DIR)/x86/x86util.asm
@@ -222,5 +232,6 @@ clean: $(OUT_DIR)$(LAVC_DIR)/amr_float/*.o $(OUT_DIR)$(LAVC_DIR)/amr_float/*.d \
$(OUT_DIR)$(LAVCORE_DIR)/*.o $(OUT_DIR)$(LAVCORE_DIR)/*.d \
$(OUT_DIR)$(LAVU_DIR)/*.o $(OUT_DIR)$(LAVU_DIR)/*.d \
+ $(OUT_DIR)$(LAVU_DIR)/x86/*.o $(OUT_DIR)$(LAVU_DIR)/x86/*.d \
$(OUT_DIR)$(LSWS_DIR)/*.o $(OUT_DIR)$(LSWS_DIR)/*.d \
$(ZLIB_DIR)/*.o $(ZLIB_DIR)/*.d $(PNG_DIR)/*.o $(SLIB)
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/config.h b/src/filters/transform/MPCVideoDec/ffmpeg/config.h index 507cca37f..5ca6a5486 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/config.h +++ b/src/filters/transform/MPCVideoDec/ffmpeg/config.h @@ -14,8 +14,6 @@ #ifdef ARCH_X86_64
#define HAVE_FAST_64BIT 1
- #define HAVE_CMOV 1
- #define HAVE_FAST_CMOV 1
#define HAVE_STRUCT_TIMESPEC 1
#else
#define ARCH_X86_32 1
@@ -36,8 +34,6 @@ #define ARCH_X86_32 0
#define ARCH_X86_64 0
#define HAVE_FAST_64BIT 0
- #define HAVE_CMOV 0
- #define HAVE_FAST_CMOV 0
#define restrict
#define __asm__ __asm
@@ -50,6 +46,7 @@ #define FFMPEG_CONFIGURATION "ffdshow custom"
#define FFMPEG_LICENSE "GPL version 2 or later"
+
#define CC_TYPE "gcc"
#define CC_VERSION __VERSION__
@@ -57,9 +54,9 @@ // MPC custom code for linking with MSVC
#if defined(__GNUC__) && ARCH_X86_64
-#define EXTERN_PREFIX ""
+ #define EXTERN_PREFIX ""
#else
-#define EXTERN_PREFIX "_"
+ #define EXTERN_PREFIX "_"
#endif
#define EXTERN_ASM _
@@ -96,9 +93,11 @@ #define HAVE_ALTIVEC_H 0
#define HAVE_BIGENDIAN 0
#define HAVE_BSWAP 1
+#define HAVE_CMOV 1
#define HAVE_EBP_AVAILABLE 1
#define HAVE_EBX_AVAILABLE 1
#define HAVE_FAST_CLZ 0
+#define HAVE_FAST_CMOV 1
#define HAVE_FAST_UNALIGNED 1
#define HAVE_LOCAL_ALIGNED_16 1
#define HAVE_LOCAL_ALIGNED_8 1
@@ -112,36 +111,34 @@ #ifdef __GNUC__
#define HAVE_ATTRIBUTE_PACKED 1
#define HAVE_ATTRIBUTE_MAY_ALIAS 1
+
+ #define HAVE_EXP2 1
+ #define HAVE_EXP2F 1
+ #define HAVE_LLRINT 1
+ #define HAVE_LOG2 1
+ #define HAVE_LOG2F 1
+ #define HAVE_LRINT 1
+ #define HAVE_LRINTF 1
+ #define HAVE_ROUND 1
+ #define HAVE_ROUNDF 1
+ #define HAVE_TRUNCF 1
#else
#define HAVE_ATTRIBUTE_PACKED 0
#define HAVE_ATTRIBUTE_MAY_ALIAS 0
#define EMULATE_FAST_INT
-#endif
-#ifdef __GNUC__
-#define HAVE_EXP2 1
-#define HAVE_EXP2F 1
-#define HAVE_LLRINT 1
-#define HAVE_LOG2 1
-#define HAVE_LOG2F 1
-#define HAVE_LRINT 1
-#define HAVE_LRINTF 1
-#define HAVE_ROUND 1
-#define HAVE_ROUNDF 1
-#define HAVE_TRUNCF 1
-#else
-#define HAVE_EXP2 1
-#define HAVE_EXP2F 1
-#define HAVE_LLRINT 0
-#define HAVE_LOG2 1
-#define HAVE_LOG2F 1
-#define HAVE_LRINT 0
-#define HAVE_LRINTF 0
-#define HAVE_ROUND 0
-#define HAVE_ROUNDF 1
-#define HAVE_TRUNCF 1
-#define rint(x) (int)(x+0.5)
-#define cbrtf(x) pow((float)x, (float)1.0/3)
+ #define HAVE_EXP2 1
+ #define HAVE_EXP2F 1
+ #define HAVE_LLRINT 0
+ #define HAVE_LOG2 1
+ #define HAVE_LOG2F 1
+ #define HAVE_LRINT 0
+ #define HAVE_LRINTF 0
+ #define HAVE_ROUND 0
+ #define HAVE_ROUNDF 1
+ #define HAVE_TRUNCF 1
+ #define rint(x) (int)(x+0.5)
+ #define cbrtf(x) pow((float)x, (float)1.0/3)
#endif
#define CONFIG_DCT 0
@@ -149,8 +146,8 @@ #define CONFIG_GPL 1
#define CONFIG_GRAY 1
-#define CONFIG_H264PRED 1
#define CONFIG_H264DSP 1
+#define CONFIG_H264PRED 1
#define CONFIG_HARDCODED_TABLES 0
#define CONFIG_HUFFMAN 0
#define CONFIG_LIBAMR_NB 1
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/custom_code.txt b/src/filters/transform/MPCVideoDec/ffmpeg/custom_code.txt index d0bb2b9a1..564cc184f 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/custom_code.txt +++ b/src/filters/transform/MPCVideoDec/ffmpeg/custom_code.txt @@ -8,6 +8,7 @@ The following files have MPC-specific custom code (compared to ffdshow): * libavcodec/bitstream.c
* libavcodec/CompilatorVersion.c
* libavcodec/dsputil.c
+* libavcodec/dxva.h
* libavcodec/h264.c
* libavcodec/mpc_helper.c
* libavcodec/mpeg12.c
@@ -15,3 +16,4 @@ The following files have MPC-specific custom code (compared to ffdshow): * libavcore/avcore_utils.c (renamed from utils.c to avoid conflicts in MSVC2010)
* libavutil/internal.h
* libavutil/log.h
+* libswscale
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec.def b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec.def index 6ee8d8d24..782a16c50 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec.def +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec.def @@ -6,7 +6,9 @@ EXPORTS avcodec_alloc_frame
avcodec_close
avcodec_decode_video
- avcodec_decode_audio
+ avcodec_decode_video2
+ avcodec_decode_audio2
+ avcodec_decode_audio3
avcodec_default_get_buffer
avcodec_default_reget_buffer
avcodec_default_release_buffer
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec.vcproj b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec.vcproj index b10994323..757205029 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec.vcproj +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec.vcproj @@ -1,7 +1,7 @@ <?xml version="1.0" encoding="Windows-1252"?>
<VisualStudioProject
ProjectType="Visual C++"
- Version="9,00"
+ Version="9.00"
Name="libavcodec"
OwnerKey="{EAF909A5-FA59-4C3D-9431-0FCC20D5BCF9}"
ProjectGUID="{5CAF881C-9349-4EE2-9697-982C10795033}"
@@ -44,7 +44,6 @@ />
<Tool
Name="VCCLCompilerTool"
- AdditionalOptions="/MP"
Optimization="0"
AdditionalIncludeDirectories="..\..\..\..\thirdparty\zlib;.;libswscale;..\..\..\..\..\include"
PreprocessorDefinitions="WIN32;_CRT_SECURE_NO_DEPRECATE;_CRT_NON_CONFORMING_SWPRINTFS;_CRT_NONSTDC_NO_WARNINGS;_DEBUG;_WINDOWS;HAVE_AV_CONFIG_H;H264_MERGE_TESTING"
@@ -108,7 +107,6 @@ />
<Tool
Name="VCCLCompilerTool"
- AdditionalOptions="/MP"
Optimization="0"
AdditionalIncludeDirectories="..\..\..\..\thirdparty\zlib;.;libswscale;..\..\..\..\..\include"
PreprocessorDefinitions="_WIN64;_CRT_SECURE_NO_DEPRECATE;_CRT_NON_CONFORMING_SWPRINTFS;_CRT_NONSTDC_NO_WARNINGS;_DEBUG;_WINDOWS;HAVE_AV_CONFIG_H"
@@ -175,7 +173,6 @@ />
<Tool
Name="VCCLCompilerTool"
- AdditionalOptions="/MP"
InlineFunctionExpansion="2"
EnableIntrinsicFunctions="true"
FavorSizeOrSpeed="1"
@@ -191,7 +188,7 @@ AssemblerListingLocation="$(IntDir)\"
ObjectFile="$(IntDir)\"
ProgramDataBaseFileName="$(IntDir)\"
- WarningLevel="1"
+ WarningLevel="0"
/>
<Tool
Name="VCManagedResourceCompilerTool"
@@ -248,7 +245,6 @@ />
<Tool
Name="VCCLCompilerTool"
- AdditionalOptions="/MP"
Optimization="3"
InlineFunctionExpansion="2"
EnableIntrinsicFunctions="true"
@@ -1199,6 +1195,10 @@ >
</File>
<File
+ RelativePath=".\libavutil\avstring.c"
+ >
+ </File>
+ <File
RelativePath=".\libavutil\avstring.h"
>
</File>
@@ -1215,6 +1215,14 @@ >
</File>
<File
+ RelativePath=".\libavutil\cpu.c"
+ >
+ </File>
+ <File
+ RelativePath=".\libavutil\cpu.h"
+ >
+ </File>
+ <File
RelativePath=".\libavutil\crc.c"
>
</File>
@@ -1231,6 +1239,10 @@ >
</File>
<File
+ RelativePath=".\libavutil\eval.c"
+ >
+ </File>
+ <File
RelativePath=".\libavutil\eval.h"
>
</File>
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec.vcxproj b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec.vcxproj index e190e548e..b0fa70df9 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec.vcxproj +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec.vcxproj @@ -133,6 +133,7 @@ <ClInclude Include="libavutil\avutil.h" />
<ClInclude Include="libavutil\bswap.h" />
<ClInclude Include="libavutil\common.h" />
+ <ClInclude Include="libavutil\cpu.h" />
<ClInclude Include="libavutil\crc.h" />
<ClInclude Include="libavutil\crc_data.h" />
<ClInclude Include="libavutil\error.h" />
@@ -275,7 +276,10 @@ <ClCompile Include="libavcodec\xiph.c" />
<ClCompile Include="libavcore\imgutils.c" />
<ClCompile Include="libavcore\avcore_utils.c" />
+ <ClCompile Include="libavutil\avstring.c" />
+ <ClCompile Include="libavutil\cpu.c" />
<ClCompile Include="libavutil\crc.c" />
+ <ClCompile Include="libavutil\eval.c" />
<ClCompile Include="libavutil\intfloat_readwrite.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec.vcxproj.filters b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec.vcxproj.filters index d23ef9256..2dd323b4b 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec.vcxproj.filters +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec.vcxproj.filters @@ -51,6 +51,9 @@ <ClInclude Include="libavutil\common.h">
<Filter>libavutil</Filter>
</ClInclude>
+ <ClInclude Include="libavutil\cpu.h">
+ <Filter>libavutil</Filter>
+ </ClInclude>
<ClInclude Include="libavutil\crc.h">
<Filter>libavutil</Filter>
</ClInclude>
@@ -459,9 +462,18 @@ <ClCompile Include="libswscale\yuv2rgb_template.c">
<Filter>libswscale</Filter>
</ClCompile>
+ <ClCompile Include="libavutil\avstring.c">
+ <Filter>libavutil</Filter>
+ </ClCompile>
+ <ClCompile Include="libavutil\cpu.c">
+ <Filter>libavutil</Filter>
+ </ClCompile>
<ClCompile Include="libavutil\crc.c">
<Filter>libavutil</Filter>
</ClCompile>
+ <ClCompile Include="libavutil\eval.c">
+ <Filter>libavutil</Filter>
+ </ClCompile>
<ClCompile Include="libavutil\lfg.c">
<Filter>libavutil</Filter>
</ClCompile>
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/ac3dec.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/ac3dec.c index 126424440..5992715b8 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/ac3dec.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/ac3dec.c @@ -1311,8 +1311,10 @@ static int decode_audio_block(AC3DecodeContext *s, int blk) * Decode a single AC-3 frame. */ static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size, - const uint8_t *buf, int buf_size) + AVPacket *avpkt) { + const uint8_t *buf = avpkt->data; + int buf_size = avpkt->size; AC3DecodeContext *s = avctx->priv_data; int16_t *out_samples = (int16_t *)data; int blk, ch, err; diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/adpcm.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/adpcm.c index 6fda2f8bf..6ecd98e95 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/adpcm.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/adpcm.c @@ -365,8 +365,10 @@ static void xa_decode(short *out, const unsigned char *in, static int adpcm_decode_frame(AVCodecContext *avctx,
void *data, int *data_size,
- const uint8_t *buf, int buf_size)
+ AVPacket *avpkt)
{
+ const uint8_t *buf = avpkt->data;
+ int buf_size = avpkt->size;
ADPCMContext *c = avctx->priv_data;
ADPCMChannelStatus *cs;
int n, m, channel, i;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/audioconvert.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/audioconvert.h index 81b6cded3..349065edc 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/audioconvert.h +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/audioconvert.h @@ -29,6 +29,7 @@ */ +#include "libavutil/cpu.h" #include "avcodec.h" @@ -60,6 +61,11 @@ enum SampleFormat avcodec_get_sample_fmt(const char* name); const char *avcodec_get_channel_name(int channel_id); /** + * @return channel layout that matches name, 0 if no match + */ +int64_t avcodec_get_channel_layout(const char *name); + +/** * Return description of channel layout */ void avcodec_get_channel_layout_string(char *buf, int buf_size, int nb_channels, int64_t channel_layout); @@ -88,7 +94,7 @@ typedef struct AVAudioConvert AVAudioConvert; * @param in_fmt Input sample format * @param in_channels Number of input channels * @param[in] matrix Channel mixing matrix (of dimension in_channel*out_channels). Set to NULL to ignore. - * @param flags See FF_MM_xx + * @param flags See AV_CPU_FLAG_xx * @return NULL on error */ AVAudioConvert *av_audio_convert_alloc(enum SampleFormat out_fmt, int out_channels, diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/avcodec.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/avcodec.h index a841de108..c5f35eda2 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/avcodec.h +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/avcodec.h @@ -42,10 +42,11 @@ #include <errno.h>
#include "libavutil/avutil.h"
+#include "libavutil/cpu.h"
#define LIBAVCODEC_VERSION_MAJOR 52
-#define LIBAVCODEC_VERSION_MINOR 85
-#define LIBAVCODEC_VERSION_MICRO 1
+#define LIBAVCODEC_VERSION_MINOR 87
+#define LIBAVCODEC_VERSION_MICRO 5
#define LIBAVCODEC_VERSION_INT AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
LIBAVCODEC_VERSION_MINOR, \
@@ -57,6 +58,17 @@ #define LIBAVCODEC_IDENT "Lavc" AV_STRINGIFY(LIBAVCODEC_VERSION)
+/**
+ * Those FF_API_* defines are not part of public API.
+ * They may change, break or disappear at any time.
+ */
+#ifndef FF_API_PALETTE_CONTROL
+#define FF_API_PALETTE_CONTROL (LIBAVCODEC_VERSION_MAJOR < 54)
+#endif
+#ifndef FF_API_MM_FLAGS
+#define FF_API_MM_FLAGS (LIBAVCODEC_VERSION_MAJOR < 53)
+#endif
+
#define AV_NOPTS_VALUE INT64_C(0x8000000000000000)
#define AV_TIME_BASE 1000000
static const AVRational AV_TIME_BASE_Q={1, AV_TIME_BASE};
@@ -1399,27 +1411,25 @@ typedef struct AVCodecContext { * result into program crash.)
*/
unsigned dsp_mask;
-#define FF_MM_FORCE 0x80000000 /* Force usage of selected flags (OR) */
- /* lower 16 bits - CPU features */
-#define FF_MM_MMX 0x0001 ///< standard MMX
-#define FF_MM_3DNOW 0x0004 ///< AMD 3DNOW
-#if LIBAVCODEC_VERSION_MAJOR < 53
-#define FF_MM_MMXEXT 0x0002 ///< SSE integer functions or AMD MMX ext
+
+#if FF_API_MM_FLAGS
+#define FF_MM_FORCE AV_CPU_FLAG_FORCE
+#define FF_MM_MMX AV_CPU_FLAG_MMX
+#define FF_MM_3DNOW AV_CPU_FLAG_3DNOW
+#define FF_MM_MMXEXT AV_CPU_FLAG_MMX2
+#define FF_MM_MMX2 AV_CPU_FLAG_MMX2
+#define FF_MM_SSE AV_CPU_FLAG_SSE
+#define FF_MM_SSE2 AV_CPU_FLAG_SSE2
+#define FF_MM_SSE2SLOW AV_CPU_FLAG_SSE2SLOW
+#define FF_MM_3DNOWEXT AV_CPU_FLAG_3DNOWEXT
+#define FF_MM_SSE3 AV_CPU_FLAG_SSE3
+#define FF_MM_SSE3SLOW AV_CPU_FLAG_SSE3SLOW
+#define FF_MM_SSSE3 AV_CPU_FLAG_SSSE3
+#define FF_MM_SSE4 AV_CPU_FLAG_SSE4
+#define FF_MM_SSE42 AV_CPU_FLAG_SSE42
+#define FF_MM_IWMMXT AV_CPU_FLAG_IWMMXT
+#define FF_MM_ALTIVEC AV_CPU_FLAG_ALTIVEC
#endif
-#define FF_MM_MMX2 0x0002 ///< SSE integer functions or AMD MMX ext
-#define FF_MM_SSE 0x0008 ///< SSE functions
-#define FF_MM_SSE2 0x0010 ///< PIV SSE2 functions
-#define FF_MM_SSE2SLOW 0x40000000 ///< SSE2 supported, but usually not faster
- ///< than regular MMX/SSE (e.g. Core1)
-#define FF_MM_3DNOWEXT 0x0020 ///< AMD 3DNowExt
-#define FF_MM_SSE3 0x0040 ///< Prescott SSE3 functions
-#define FF_MM_SSE3SLOW 0x20000000 ///< SSE3 supported, but usually not faster
- ///< than regular MMX/SSE (e.g. Core1)
-#define FF_MM_SSSE3 0x0080 ///< Conroe SSSE3 functions
-#define FF_MM_SSE4 0x0100 ///< Penryn SSE4.1 functions
-#define FF_MM_SSE42 0x0200 ///< Nehalem SSE4.2 functions
-#define FF_MM_IWMMXT 0x0100 ///< XScale IWMMXT
-#define FF_MM_ALTIVEC 0x0001 ///< standard AltiVec
/**
* bits per sample/pixel from the demuxer (needed for huffyuv).
@@ -1776,12 +1786,14 @@ typedef struct AVCodecContext { */
int lmax;
+#if FF_API_PALETTE_CONTROL
/**
* palette control structure
* - encoding: ??? (no palette-enabled encoder yet)
* - decoding: Set by user.
*/
struct AVPaletteControl *palctrl;
+#endif
/**
* noise reduction strength
@@ -2616,8 +2628,7 @@ typedef struct AVCodec { int (*init)(AVCodecContext *);
int (*encode)(AVCodecContext *, uint8_t *buf, int buf_size, void *data);
int (*close)(AVCodecContext *);
- int (*decode)(AVCodecContext *, void *outdata, int *outdata_size,
- const uint8_t *buf, int buf_size);
+ int (*decode)(AVCodecContext *, void *outdata, int *outdata_size, AVPacket *avpkt);
/**
* Codec capabilities.
* see CODEC_CAP_*
@@ -2834,59 +2845,91 @@ void avcodec_get_encoder_info(AVCodecContext *avctx,int *xvid_build,int *divx_ve */
FF_EXPORT int avcodec_open(AVCodecContext *avctx, AVCodec *codec);
+#if LIBAVCODEC_VERSION_MAJOR < 53
/**
- * @deprecated Use avcodec_decode_audio2() instead.
+ * Decode an audio frame from buf into samples.
+ * Wrapper function which calls avcodec_decode_audio3.
+ *
+ * @deprecated Use avcodec_decode_audio3 instead.
+ * @param avctx the codec context
+ * @param[out] samples the output buffer
+ * @param[in,out] frame_size_ptr the output buffer size in bytes
+ * @param[in] buf the input buffer
+ * @param[in] buf_size the input buffer size in bytes
+ * @return On error a negative value is returned, otherwise the number of bytes
+ * used or zero if no frame could be decompressed.
*/
-attribute_deprecated int avcodec_decode_audio(AVCodecContext *avctx, int16_t *samples,
+FF_EXPORT int avcodec_decode_audio2(AVCodecContext *avctx, int16_t *samples,
int *frame_size_ptr,
const uint8_t *buf, int buf_size);
+#endif
/**
- * Decodes an audio frame from buf into samples.
- * The avcodec_decode_audio2() function decodes an audio frame from the input
- * buffer buf of size buf_size. To decode it, it makes use of the
- * audio codec which was coupled with avctx using avcodec_open(). The
- * resulting decoded frame is stored in output buffer samples. If no frame
- * could be decompressed, frame_size_ptr is zero. Otherwise, it is the
+ * Decode the audio frame of size avpkt->size from avpkt->data into samples.
+ * Some decoders may support multiple frames in a single AVPacket, such
+ * decoders would then just decode the first frame. In this case,
+ * avcodec_decode_audio3 has to be called again with an AVPacket that contains
+ * the remaining data in order to decode the second frame etc.
+ * If no frame
+ * could be outputted, frame_size_ptr is zero. Otherwise, it is the
* decompressed frame size in bytes.
*
* @warning You must set frame_size_ptr to the allocated size of the
- * output buffer before calling avcodec_decode_audio2().
+ * output buffer before calling avcodec_decode_audio3().
*
* @warning The input buffer must be FF_INPUT_BUFFER_PADDING_SIZE larger than
* the actual read bytes because some optimized bitstream readers read 32 or 64
* bits at once and could read over the end.
*
- * @warning The end of the input buffer buf should be set to 0 to ensure that
+ * @warning The end of the input buffer avpkt->data should be set to 0 to ensure that
* no overreading happens for damaged MPEG streams.
*
- * @note You might have to align the input buffer buf and output buffer
+ * @note You might have to align the input buffer avpkt->data and output buffer
* samples. The alignment requirements depend on the CPU: On some CPUs it isn't
* necessary at all, on others it won't work at all if not aligned and on others
- * it will work but it will have an impact on performance. In practice, the
- * bitstream should have 4 byte alignment at minimum and all sample data should
- * be 16 byte aligned unless the CPU doesn't need it (AltiVec and SSE do). If
- * the linesize is not a multiple of 16 then there's no sense in aligning the
- * start of the buffer to 16.
+ * it will work but it will have an impact on performance.
+ *
+ * In practice, avpkt->data should have 4 byte alignment at minimum and
+ * samples should be 16 byte aligned unless the CPU doesn't need it
+ * (AltiVec and SSE do).
*
* @param avctx the codec context
- * @param[out] samples the output buffer
+ * @param[out] samples the output buffer, sample type in avctx->sample_fmt
* @param[in,out] frame_size_ptr the output buffer size in bytes
+ * @param[in] avpkt The input AVPacket containing the input buffer.
+ * You can create such packet with av_init_packet() and by then setting
+ * data and size, some decoders might in addition need other fields.
+ * All decoders are designed to use the least fields possible though.
+ * @return On error a negative value is returned, otherwise the number of bytes
+ * used or zero if no frame data was decompressed (used) from the input AVPacket.
+ */
+int avcodec_decode_audio3(AVCodecContext *avctx, int16_t *samples,
+ int *frame_size_ptr,
+ AVPacket *avpkt);
+
+#if LIBAVCODEC_VERSION_MAJOR < 53
+/**
+ * Decode a video frame from buf into picture.
+ * Wrapper function which calls avcodec_decode_video2.
+ *
+ * @deprecated Use avcodec_decode_video2 instead.
+ * @param avctx the codec context
+ * @param[out] picture The AVFrame in which the decoded video frame will be stored.
* @param[in] buf the input buffer
- * @param[in] buf_size the input buffer size in bytes
+ * @param[in] buf_size the size of the input buffer in bytes
+ * @param[in,out] got_picture_ptr Zero if no frame could be decompressed, otherwise, it is nonzero.
* @return On error a negative value is returned, otherwise the number of bytes
* used or zero if no frame could be decompressed.
*/
-FF_EXPORT int avcodec_decode_audio2(AVCodecContext *avctx, int16_t *samples,
- int *frame_size_ptr,
+FF_EXPORT int avcodec_decode_video(AVCodecContext *avctx, AVFrame *picture,
+ int *got_picture_ptr,
const uint8_t *buf, int buf_size);
+#endif
/**
- * Decodes a video frame from buf into picture.
- * The avcodec_decode_video() function decodes a video frame from the input
- * buffer buf of size buf_size. To decode it, it makes use of the
- * video codec which was coupled with avctx using avcodec_open(). The
- * resulting decoded frame is stored in picture.
+ * Decode the video frame of size avpkt->size from avpkt->data into picture.
+ * Some decoders may support multiple frames in a single AVPacket, such
+ * decoders would then just decode the first frame.
*
* @warning The input buffer must be FF_INPUT_BUFFER_PADDING_SIZE larger than
* the actual read bytes because some optimized bitstream readers read 32 or 64
@@ -2895,29 +2938,37 @@ FF_EXPORT int avcodec_decode_audio2(AVCodecContext *avctx, int16_t *samples, * @warning The end of the input buffer buf should be set to 0 to ensure that
* no overreading happens for damaged MPEG streams.
*
- * @note You might have to align the input buffer buf and output buffer
- * samples. The alignment requirements depend on the CPU: on some CPUs it isn't
+ * @note You might have to align the input buffer avpkt->data.
+ * The alignment requirements depend on the CPU: on some CPUs it isn't
* necessary at all, on others it won't work at all if not aligned and on others
- * it will work but it will have an impact on performance. In practice, the
- * bitstream should have 4 byte alignment at minimum and all sample data should
- * be 16 byte aligned unless the CPU doesn't need it (AltiVec and SSE do). If
- * the linesize is not a multiple of 16 then there's no sense in aligning the
- * start of the buffer to 16.
+ * it will work but it will have an impact on performance.
+ *
+ * In practice, avpkt->data should have 4 byte alignment at minimum.
*
* @note Some codecs have a delay between input and output, these need to be
- * feeded with buf=NULL, buf_size=0 at the end to return the remaining frames.
+ * fed with avpkt->data=NULL, avpkt->size=0 at the end to return the remaining frames.
*
* @param avctx the codec context
* @param[out] picture The AVFrame in which the decoded video frame will be stored.
- * @param[in] buf the input buffer
- * @param[in] buf_size the size of the input buffer in bytes
+ * Use avcodec_alloc_frame to get an AVFrame, the codec will
+ * allocate memory for the actual bitmap.
+ * with default get/release_buffer(), the decoder frees/reuses the bitmap as it sees fit.
+ * with overridden get/release_buffer() (needs CODEC_CAP_DR1) the user decides into what buffer the decoder
+ * decodes and the decoder tells the user once it does not need the data anymore,
+ * the user app can at this point free/reuse/keep the memory as it sees fit.
+ *
+ * @param[in] avpkt The input AVpacket containing the input buffer.
+ * You can create such packet with av_init_packet() and by then setting
+ * data and size, some decoders might in addition need other fields like
+ * flags&AV_PKT_FLAG_KEY. All decoders are designed to use the least
+ * fields possible.
* @param[in,out] got_picture_ptr Zero if no frame could be decompressed, otherwise, it is nonzero.
* @return On error a negative value is returned, otherwise the number of bytes
* used or zero if no frame could be decompressed.
*/
-FF_EXPORT int avcodec_decode_video(AVCodecContext *avctx, AVFrame *picture,
+int avcodec_decode_video2(AVCodecContext *avctx, AVFrame *picture,
int *got_picture_ptr,
- const uint8_t *buf, int buf_size);
+ AVPacket *avpkt);
int avcodec_parse_frame(AVCodecContext *avctx, uint8_t **pdata,
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/dsputil.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/dsputil.c index e4a4a7ad6..3d6b46d75 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/dsputil.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/dsputil.c @@ -27,6 +27,7 @@ * DSP utils
*/
+#include "libavcore/imgutils.h"
#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
@@ -121,6 +122,9 @@ void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_s int j;
j = src_scantable[i];
st->permutated[i] = permutation[j];
+#if ARCH_PPC
+ st->inverse[j] = i;
+#endif
}
end=-1;
@@ -1158,7 +1162,7 @@ CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\ CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
-CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
+av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
@@ -1172,6 +1176,9 @@ PIXOP2(put, op_put) #undef op_avg
#undef op_put
+#define put_no_rnd_pixels8_c put_pixels8_c
+#define put_no_rnd_pixels16_c put_pixels16_c
+
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
@@ -1754,10 +1761,6 @@ static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dst }\
}\
\
-static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## pixels8_c(dst, src, stride, 8);\
-}\
-\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t half[64];\
put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
@@ -1936,9 +1939,6 @@ static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\ put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
-static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## pixels16_c(dst, src, stride, 16);\
-}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t half[256];\
@@ -2133,6 +2133,13 @@ QPEL_MC(0, avg_ , _ , op_avg) #undef op_put
#undef op_put_no_rnd
+#define put_qpel8_mc00_c ff_put_pixels8x8_c
+#define avg_qpel8_mc00_c ff_avg_pixels8x8_c
+#define put_qpel16_mc00_c ff_put_pixels16x16_c
+#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
+#define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
+#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
+
#if 1
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
@@ -2399,7 +2406,7 @@ static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t }\
#define H264_MC(OPNAME, SIZE) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
+static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
@@ -2557,6 +2564,11 @@ H264_MC(avg_, 16) #undef op2_put
#endif
+#define put_h264_qpel8_mc00_c ff_put_pixels8x8_c
+#define avg_h264_qpel8_mc00_c ff_avg_pixels8x8_c
+#define put_h264_qpel16_mc00_c ff_put_pixels16x16_c
+#define avg_h264_qpel16_mc00_c ff_avg_pixels16x16_c
+
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
int i;
@@ -2575,31 +2587,18 @@ static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int }
}
-#if CONFIG_CAVS_DECODER
-/* AVS specific */
-void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
+void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
put_pixels8_c(dst, src, stride, 8);
}
-void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
+void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
avg_pixels8_c(dst, src, stride, 8);
}
-void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
+void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
put_pixels16_c(dst, src, stride, 16);
}
-void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
+void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
avg_pixels16_c(dst, src, stride, 16);
}
-#endif /* CONFIG_CAVS_DECODER */
-
-#if CONFIG_VC1_DECODER
-/* VC-1 specific */
-void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
- put_pixels8_c(dst, src, stride, 8);
-}
-void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
- avg_pixels8_c(dst, src, stride, 8);
-}
-#endif /* CONFIG_VC1_DECODER */
#if CONFIG_RV40_DECODER
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
@@ -2645,10 +2644,6 @@ static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int }
}
-static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
- put_pixels8_c(dst, src, stride, 8);
-}
-
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
uint8_t half[64];
wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
@@ -4352,7 +4347,7 @@ av_cold void attribute_align_arg dsputil_init(DSPContext* c, AVCodecContext *avc c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
#endif
- c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
+ c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
@@ -4456,7 +4451,7 @@ av_cold void attribute_align_arg dsputil_init(DSPContext* c, AVCodecContext *avc c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
#endif
- c->shrink[0]= ff_img_copy_plane;
+ c->shrink[0]= av_image_copy_plane;
c->shrink[1]= ff_shrink22;
c->shrink[2]= ff_shrink44;
c->shrink[3]= ff_shrink88;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/dsputil.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/dsputil.h index cfd1b7f33..8c1499165 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/dsputil.h +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/dsputil.h @@ -82,6 +82,11 @@ extern const uint8_t ff_zigzag248_direct[64]; extern uint32_t ff_squareTbl[512];
extern uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP];
+void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride);
+void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride);
+void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride);
+void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride);
+
/* VP3 DSP functions */
void ff_vp3_idct_c(DCTELEM *block/* align 16*/);
void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
@@ -91,22 +96,15 @@ void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM void ff_vp3_v_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
void ff_vp3_h_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
-/* VP6 DSP functions */
-void ff_vp6_filter_diag4_c(uint8_t *dst, uint8_t *src, int stride,
- const int16_t *h_weights, const int16_t *v_weights);
-
-/* CAVS functions */
-void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride);
-void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride);
-void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride);
-void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride);
-
-/* VC1 functions */
-void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd);
-void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd);
-
/* 1/2^n downscaling functions from imgconvert.c */
+#if LIBAVCODEC_VERSION_MAJOR < 53
+/**
+ * @deprecated Use av_image_copy_plane() instead.
+ */
+attribute_deprecated
void ff_img_copy_plane(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
+#endif
+
void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
@@ -181,6 +179,10 @@ typedef struct ScanTable{ const uint8_t *scantable;
uint8_t permutated[64];
uint8_t raster_end[64];
+#if ARCH_PPC
+ /** Used by dct_quantize_altivec to find last-non-zero */
+ DECLARE_ALIGNED(16, uint8_t, inverse)[64];
+#endif
} ScanTable;
void ff_init_scantable(uint8_t *, ScanTable *st, const uint8_t *src_scantable);
@@ -598,11 +600,15 @@ static inline int get_penalty_factor(int lambda, int lambda2, int type){ */
#define emms_c()
-/* should be defined by architectures supporting
- one or more MultiMedia extension */
-int mm_support(void);
-
+void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx);
+void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx);
+void dsputil_init_bfin(DSPContext* c, AVCodecContext *avctx);
+void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx);
+void dsputil_init_mmi(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx);
+void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx);
+void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx);
+void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx);
void ff_dsputil_init_dwt(DSPContext *c);
void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
@@ -627,9 +633,19 @@ static inline void emms(void) #define emms_c() emms()
-#else
+#elif ARCH_ARM
+
+#if HAVE_NEON
+# define STRIDE_ALIGN 16
+#endif
+
+#elif ARCH_PPC
+
+#define STRIDE_ALIGN 16
+
+#elif HAVE_MMI
-#define mm_support() 0
+#define STRIDE_ALIGN 16
#endif
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/flvdec.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/flvdec.c index 32f595a27..8f05945cb 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/flvdec.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/flvdec.c @@ -83,7 +83,7 @@ int ff_flv_decode_picture_header(MpegEncContext *s) width = height = 0; break; } - if(av_check_image_size(width, height, 0, s->avctx)) + if(av_image_check_size(width, height, 0, s->avctx)) return -1; s->width = width; s->height = height; diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h263.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h263.h index d8b7abe4f..6c05565c7 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h263.h +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h263.h @@ -70,7 +70,7 @@ av_const int ff_h263_aspect_to_info(AVRational aspect); int ff_h263_decode_init(AVCodecContext *avctx); int ff_h263_decode_frame(AVCodecContext *avctx, void *data, int *data_size, - const uint8_t *buf, int buf_size); + AVPacket *avpkt); int ff_h263_decode_end(AVCodecContext *avctx); void h263_encode_mb(MpegEncContext *s, DCTELEM block[6][64], diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h263dec.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h263dec.c index 503c3b6ba..92beb1bd3 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h263dec.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h263dec.c @@ -25,6 +25,7 @@ * H.263 decoder.
*/
+#include "libavutil/cpu.h"
#include "internal.h"
#include "avcodec.h"
#include "dsputil.h"
@@ -321,8 +322,10 @@ static int decode_slice(MpegEncContext *s){ int ff_h263_decode_frame(AVCodecContext *avctx,
void *data, int *data_size,
- const uint8_t *buf, int buf_size)
+ AVPacket *avpkt)
{
+ const uint8_t *buf = avpkt->data;
+ int buf_size = avpkt->size;
MpegEncContext *s = avctx->priv_data;
int ret;
AVFrame *pict = data;
@@ -542,7 +545,7 @@ retry: #endif
#if HAVE_MMX
- if(s->codec_id == CODEC_ID_MPEG4 && s->xvid_build>=0 && avctx->idct_algo == FF_IDCT_AUTO && (mm_support() & FF_MM_MMX)){
+ if (s->codec_id == CODEC_ID_MPEG4 && s->xvid_build>=0 && avctx->idct_algo == FF_IDCT_AUTO && (av_get_cpu_flags() & AV_CPU_FLAG_MMX)) {
avctx->idct_algo= FF_IDCT_XVIDMMX;
avctx->coded_width= 0; // force reinit
// dsputil_init(&s->dsp, avctx);
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264.c index bcc6e6b8f..7ab3e6311 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264.c @@ -1250,14 +1250,9 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){ chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
if(is_h264){
- idct_add = h->h264dsp.h264_idct_add;
- idct_dc_add = h->h264dsp.h264_idct_dc_add;
- for(i=16; i<16+8; i++){
- if(h->non_zero_count_cache[ scan8[i] ])
- idct_add (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
- else if(h->mb[i*16])
- idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
- }
+ h->h264dsp.h264_idct_add8(dest, block_offset,
+ h->mb, uvlinesize,
+ h->non_zero_count_cache);
}else{
for(i=16; i<16+8; i++){
if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
@@ -2891,8 +2886,10 @@ static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){ static int decode_frame(AVCodecContext *avctx,
void *data, int *data_size,
- const uint8_t *buf, int buf_size)
+ AVPacket *avpkt)
{
+ const uint8_t *buf = avpkt->data;
+ int buf_size = avpkt->size;
H264Context *h = avctx->priv_data;
MpegEncContext *s = &h->s;
AVFrame *pict = data;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264.h index d87f9d01a..64db7072a 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264.h +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264.h @@ -607,12 +607,12 @@ typedef struct H264Context{ int sp_for_switch_flag;
int slice_qs_delta;
int slice_qp_delta;
- unsigned int first_mb_in_slice;
- int bit_offset_to_slice_data;
- int raw_slice_type;
- int64_t outputed_rtstart;
- void* dxva_slice_long;
- int ref_pic_flag;
+ unsigned int first_mb_in_slice;
+ int bit_offset_to_slice_data;
+ int raw_slice_type;
+ int64_t outputed_rtstart;
+ void* dxva_slice_long;
+ int ref_pic_flag;
// <== End patch MPC
}H264Context;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264_ps.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264_ps.c index 40b71ec24..96f99ab49 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264_ps.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/h264_ps.c @@ -344,7 +344,7 @@ int ff_h264_decode_seq_parameter_set(H264Context *h){ sps->mb_width = get_ue_golomb(&s->gb) + 1;
sps->mb_height= get_ue_golomb(&s->gb) + 1;
if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
- av_check_image_size(16*sps->mb_width, 16*sps->mb_height, 0, h->s.avctx)){
+ av_image_check_size(16*sps->mb_width, 16*sps->mb_height, 0, h->s.avctx)){
av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
goto fail;
}
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/imgconvert.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/imgconvert.c index f6ea7a7f1..eb2cdc376 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/imgconvert.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/imgconvert.c @@ -37,7 +37,7 @@ #include "libavutil/pixdesc.h"
#include "libavcore/imgutils.h"
-#if HAVE_MMX
+#if HAVE_MMX && HAVE_YASM
#include "x86/dsputil_mmx.h"
#endif
@@ -751,13 +751,13 @@ int ff_set_systematic_pal(uint32_t pal[256], enum PixelFormat pix_fmt){ #if LIBAVCODEC_VERSION_MAJOR < 53
int ff_fill_linesize(AVPicture *picture, enum PixelFormat pix_fmt, int width)
{
- return av_fill_image_linesizes(picture->linesize, pix_fmt, width);
+ return av_image_fill_linesizes(picture->linesize, pix_fmt, width);
}
int ff_fill_pointer(AVPicture *picture, uint8_t *ptr, enum PixelFormat pix_fmt,
int height)
{
- return av_fill_image_pointers(picture->data, pix_fmt, height, ptr, picture->linesize);
+ return av_image_fill_pointers(picture->data, pix_fmt, height, ptr, picture->linesize);
}
#endif
@@ -813,64 +813,33 @@ static int avg_bits_per_pixel(enum PixelFormat pix_fmt) return bits;
}
+#if LIBAVCODEC_VERSION_MAJOR < 53
void ff_img_copy_plane(uint8_t *dst, int dst_wrap,
const uint8_t *src, int src_wrap,
int width, int height)
{
- if((!dst) || (!src))
- return;
- for(;height > 0; height--) {
- memcpy(dst, src, width);
- dst += dst_wrap;
- src += src_wrap;
- }
+ av_image_copy_plane(dst, dst_wrap, src, src_wrap, width, height);
}
-#if LIBAVCODEC_VERSION_MAJOR < 53
int ff_get_plane_bytewidth(enum PixelFormat pix_fmt, int width, int plane)
{
- return av_get_image_linesize(pix_fmt, width, plane);
+ return av_image_get_linesize(pix_fmt, width, plane);
}
-#endif
void av_picture_data_copy(uint8_t *dst_data[4], int dst_linesize[4],
uint8_t *src_data[4], int src_linesize[4],
enum PixelFormat pix_fmt, int width, int height)
{
- int i;
- const PixFmtInfo *pf = &pix_fmt_info[pix_fmt];
- const AVPixFmtDescriptor *desc = &av_pix_fmt_descriptors[pix_fmt];
-
- switch(pf->pixel_type) {
- case FF_PIXEL_PACKED:
- case FF_PIXEL_PLANAR:
- for(i = 0; i < pf->nb_channels; i++) {
- int h;
- int bwidth = av_get_image_linesize(pix_fmt, width, i);
- h = height;
- if (i == 1 || i == 2) {
- h= -((-height)>>desc->log2_chroma_h);
- }
- ff_img_copy_plane(dst_data[i], dst_linesize[i],
- src_data[i], src_linesize[i],
- bwidth, h);
- }
- break;
- case FF_PIXEL_PALETTE:
- ff_img_copy_plane(dst_data[0], dst_linesize[0],
- src_data[0], src_linesize[0],
- width, height);
- /* copy the palette */
- memcpy(dst_data[1], src_data[1], 4*256);
- break;
- }
+ av_image_copy(dst_data, dst_linesize, src_data, src_linesize,
+ pix_fmt, width, height);
}
+#endif
void av_picture_copy(AVPicture *dst, const AVPicture *src,
enum PixelFormat pix_fmt, int width, int height)
{
- av_picture_data_copy(dst->data, dst->linesize, src->data,
- src->linesize, pix_fmt, width, height);
+ av_image_copy(dst->data, dst->linesize, src->data,
+ src->linesize, pix_fmt, width, height);
}
/* 2x2 -> 1x1 */
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/libamr.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/libamr.c index 695fc4ca7..1c0ba07a8 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/libamr.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/libamr.c @@ -137,8 +137,10 @@ static av_cold int amr_nb_decode_close(AVCodecContext *avctx) }
static int amr_nb_decode_frame(AVCodecContext * avctx, void *data,
- int *data_size, const uint8_t * buf, int buf_size)
+ int *data_size, AVPacket *avpkt)
{
+ const uint8_t *buf = avpkt->data;
+ int buf_size = avpkt->size;
AMRContext *s = avctx->priv_data;
const uint8_t *amrData = buf;
static const uint8_t block_size[16] = { 12, 13, 15, 17, 19, 20, 26, 31, 5, 0, 0, 0, 0, 0, 0, 0 };
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mjpegdec.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mjpegdec.c index fcb4f2011..8095eff9f 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mjpegdec.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mjpegdec.c @@ -219,7 +219,7 @@ int ff_mjpeg_decode_sof(MJpegDecodeContext *s) height= s->height;
av_log(s->avctx, AV_LOG_DEBUG, "sof0: picture: %dx%d\n", width, height);
- if(av_check_image_size(width, height, 0, s->avctx))
+ if(av_image_check_size(width, height, 0, s->avctx))
return -1;
nb_components = get_bits(&s->gb, 8);
@@ -1205,8 +1205,10 @@ found: int ff_mjpeg_decode_frame(AVCodecContext *avctx,
void *data, int *data_size,
- const uint8_t *buf, int buf_size)
+ AVPacket *avpkt)
{
+ const uint8_t *buf = avpkt->data;
+ int buf_size = avpkt->size;
MJpegDecodeContext *s = avctx->priv_data;
const uint8_t *buf_end, *buf_ptr;
int start_code;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mjpegdec.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mjpegdec.h index 5a9da5902..bbf734b56 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mjpegdec.h +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mjpegdec.h @@ -111,7 +111,7 @@ int ff_mjpeg_decode_init(AVCodecContext *avctx); int ff_mjpeg_decode_end(AVCodecContext *avctx); int ff_mjpeg_decode_frame(AVCodecContext *avctx, void *data, int *data_size, - const uint8_t *buf, int buf_size); + AVPacket *avpkt); int ff_mjpeg_decode_dqt(MJpegDecodeContext *s); int ff_mjpeg_decode_dht(MJpegDecodeContext *s); int ff_mjpeg_decode_sof(MJpegDecodeContext *s); diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mlpdec.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mlpdec.c index e392d971b..46fc32891 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mlpdec.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mlpdec.c @@ -943,8 +943,10 @@ static int output_data(MLPDecodeContext *m, unsigned int substr, * otherwise the number of bytes consumed. */ static int read_access_unit(AVCodecContext *avctx, void* data, int *data_size, - const uint8_t *buf, int buf_size) + AVPacket *avpkt) { + const uint8_t *buf = avpkt->data; + int buf_size = avpkt->size; MLPDecodeContext *m = avctx->priv_data; GetBitContext gb; unsigned int length, substr; diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mpeg12.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mpeg12.c index 93268052a..6aab625c8 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mpeg12.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/mpeg12.c @@ -2235,8 +2235,10 @@ static int decode_chunks(AVCodecContext *avctx, /* handle buffering and image synchronisation */
static int mpeg_decode_frame(AVCodecContext *avctx,
void *data, int *data_size,
- const uint8_t *buf, int buf_size)
+ AVPacket *avpkt)
{
+ const uint8_t *buf = avpkt->data;
+ int buf_size = avpkt->size;
Mpeg1Context *s = avctx->priv_data;
AVFrame *picture = data;
MpegEncContext *s2 = &s->mpeg_enc_ctx;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/nellymoserdec.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/nellymoserdec.c index 1f0a51ff9..6729c13fd 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/nellymoserdec.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/nellymoserdec.c @@ -154,7 +154,9 @@ static av_cold int decode_init(AVCodecContext * avctx) { static int decode_tag(AVCodecContext * avctx, void *data, int *data_size, - const uint8_t * buf, int buf_size) { + AVPacket *avpkt) { + const uint8_t *buf = avpkt->data; + int buf_size = avpkt->size; NellyMoserDecodeContext *s = avctx->priv_data; int blocks, i; int16_t* samples; diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv10.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv10.c index 829c9b3b1..df3b664e7 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv10.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv10.c @@ -370,7 +370,7 @@ static int rv20_decode_picture_header(MpegEncContext *s) }
if(new_w != s->width || new_h != s->height){
av_log(s->avctx, AV_LOG_DEBUG, "attempting to change resolution to %dx%d\n", new_w, new_h);
- if (av_check_image_size(new_w, new_h, 0, s->avctx) < 0)
+ if (av_image_check_size(new_w, new_h, 0, s->avctx) < 0)
return -1;
MPV_common_end(s);
avcodec_set_dimensions(s->avctx, new_w, new_h);
@@ -645,8 +645,10 @@ static int get_slice_offset(AVCodecContext *avctx, const uint8_t *buf, int n) static int rv10_decode_frame(AVCodecContext *avctx,
void *data, int *data_size,
- const uint8_t *buf, int buf_size)
+ AVPacket *avpkt)
{
+ const uint8_t *buf = avpkt->data;
+ int buf_size = avpkt->size;
MpegEncContext *s = avctx->priv_data;
int i;
AVFrame *pict = data;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv34.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv34.c index ce92c7850..b586aa0b8 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv34.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv34.c @@ -1410,8 +1410,10 @@ static int get_slice_offset(AVCodecContext *avctx, const uint8_t *buf, int n) int ff_rv34_decode_frame(AVCodecContext *avctx, void *data, int *data_size, - const uint8_t *buf, int buf_size) + AVPacket *avpkt) { + const uint8_t *buf = avpkt->data; + int buf_size = avpkt->size; RV34DecContext *r = avctx->priv_data; MpegEncContext *s = &r->s; AVFrame *pict = data; diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv34.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv34.h index 3d25af2b1..24a27ce48 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv34.h +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv34.h @@ -124,7 +124,7 @@ typedef struct RV34DecContext{ */ int ff_rv34_get_start_offset(GetBitContext *gb, int blocks); int ff_rv34_decode_init(AVCodecContext *avctx); -int ff_rv34_decode_frame(AVCodecContext *avctx, void *data, int *data_size, const uint8_t *buf, int buf_size); +int ff_rv34_decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPacket *avpkt); int ff_rv34_decode_end(AVCodecContext *avctx); #endif /* AVCODEC_RV34_H */ diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv40.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv40.c index 13ba5b6ee..157169196 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv40.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/rv40.c @@ -144,7 +144,7 @@ static int rv40_parse_slice_header(RV34DecContext *r, GetBitContext *gb, SliceIn si->pts = get_bits(gb, 13); if(!si->type || !get_bits1(gb)) rv40_parse_picture_size(gb, &w, &h); - if(av_check_image_size(w, h, 0, r->s.avctx) < 0) + if(av_image_check_size(w, h, 0, r->s.avctx) < 0) return -1; si->width = w; si->height = h; diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/sp5xdec.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/sp5xdec.c index a63d52259..9db7d32ed 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/sp5xdec.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/sp5xdec.c @@ -32,11 +32,11 @@ static int sp5x_decode_frame(AVCodecContext *avctx,
void *data, int *data_size,
- const uint8_t *buf, int buf_size)
+ AVPacket *avpkt)
{
-#if 0
- MJpegDecodeContext *s = avctx->priv_data;
-#endif
+ const uint8_t *buf = avpkt->data;
+ int buf_size = avpkt->size;
+ AVPacket avpkt_recoded;
const int qscale = 5;
const uint8_t *buf_ptr;
uint8_t *recoded;
@@ -47,7 +47,6 @@ static int sp5x_decode_frame(AVCodecContext *avctx, buf_ptr = buf;
-#if 1
recoded = av_mallocz(buf_size + 1024);
if (!recoded)
return -1;
@@ -88,102 +87,13 @@ static int sp5x_decode_frame(AVCodecContext *avctx, recoded[j++] = 0xD9;
avctx->flags &= ~CODEC_FLAG_EMU_EDGE;
- i = ff_mjpeg_decode_frame(avctx, data, data_size, recoded, j);
+ av_init_packet(&avpkt_recoded);
+ avpkt_recoded.data = recoded;
+ avpkt_recoded.size = j;
+ i = ff_mjpeg_decode_frame(avctx, data, data_size, &avpkt_recoded);
av_free(recoded);
-#else
- /* SOF */
- s->bits = 8;
- s->width = avctx->coded_width;
- s->height = avctx->coded_height;
- s->nb_components = 3;
- s->component_id[0] = 0;
- s->h_count[0] = 2;
- s->v_count[0] = 2;
- s->quant_index[0] = 0;
- s->component_id[1] = 1;
- s->h_count[1] = 1;
- s->v_count[1] = 1;
- s->quant_index[1] = 1;
- s->component_id[2] = 2;
- s->h_count[2] = 1;
- s->v_count[2] = 1;
- s->quant_index[2] = 1;
- s->h_max = 2;
- s->v_max = 2;
-
- s->qscale_table = av_mallocz((s->width+15)/16);
- avctx->pix_fmt = s->cs_itu601 ? PIX_FMT_YUV420P : PIX_FMT_YUVJ420;
- s->interlaced = 0;
-
- s->picture.reference = 0;
- if (avctx->get_buffer(avctx, &s->picture) < 0)
- {
- av_log(avctx, AV_LOG_ERROR, "get_buffer() failed\n");
- return -1;
- }
-
- s->picture.pict_type = FF_I_TYPE;
- s->picture.key_frame = 1;
-
- for (i = 0; i < 3; i++)
- s->linesize[i] = s->picture.linesize[i] << s->interlaced;
-
- /* DQT */
- for (i = 0; i < 64; i++)
- {
- j = s->scantable.permutated[i];
- s->quant_matrixes[0][j] = sp5x_quant_table[(qscale * 2) + i];
- }
- s->qscale[0] = FFMAX(
- s->quant_matrixes[0][s->scantable.permutated[1]],
- s->quant_matrixes[0][s->scantable.permutated[8]]) >> 1;
-
- for (i = 0; i < 64; i++)
- {
- j = s->scantable.permutated[i];
- s->quant_matrixes[1][j] = sp5x_quant_table[(qscale * 2) + 1 + i];
- }
- s->qscale[1] = FFMAX(
- s->quant_matrixes[1][s->scantable.permutated[1]],
- s->quant_matrixes[1][s->scantable.permutated[8]]) >> 1;
-
- /* DHT */
-
- /* SOS */
- s->comp_index[0] = 0;
- s->nb_blocks[0] = s->h_count[0] * s->v_count[0];
- s->h_scount[0] = s->h_count[0];
- s->v_scount[0] = s->v_count[0];
- s->dc_index[0] = 0;
- s->ac_index[0] = 0;
-
- s->comp_index[1] = 1;
- s->nb_blocks[1] = s->h_count[1] * s->v_count[1];
- s->h_scount[1] = s->h_count[1];
- s->v_scount[1] = s->v_count[1];
- s->dc_index[1] = 1;
- s->ac_index[1] = 1;
-
- s->comp_index[2] = 2;
- s->nb_blocks[2] = s->h_count[2] * s->v_count[2];
- s->h_scount[2] = s->h_count[2];
- s->v_scount[2] = s->v_count[2];
- s->dc_index[2] = 1;
- s->ac_index[2] = 1;
-
- for (i = 0; i < 3; i++)
- s->last_dc[i] = 1024;
-
- s->mb_width = (s->width * s->h_max * 8 -1) / (s->h_max * 8);
- s->mb_height = (s->height * s->v_max * 8 -1) / (s->v_max * 8);
-
- init_get_bits(&s->gb, buf+14, (buf_size-14)*8);
-
- return mjpeg_decode_scan(s);
-#endif
-
return i;
}
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/svq1dec.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/svq1dec.c index 74fede36a..2df76316d 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/svq1dec.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/svq1dec.c @@ -642,8 +642,10 @@ static int svq1_decode_frame_header (GetBitContext *bitbuf,MpegEncContext *s) { static int svq1_decode_frame(AVCodecContext *avctx,
void *data, int *data_size,
- const uint8_t *buf, int buf_size)
+ AVPacket *avpkt)
{
+ const uint8_t *buf = avpkt->data;
+ int buf_size = avpkt->size;
MpegEncContext *s=avctx->priv_data;
uint8_t *current, *previous;
int result, i, x, y, width, height;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/svq3.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/svq3.c index 752400fa5..d7fe4aa5c 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/svq3.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/svq3.c @@ -917,8 +917,10 @@ static av_cold int svq3_decode_init(AVCodecContext *avctx) static int svq3_decode_frame(AVCodecContext *avctx,
void *data, int *data_size,
- const uint8_t *buf, int buf_size)
+ AVPacket *avpkt)
{
+ const uint8_t *buf = avpkt->data;
+ int buf_size = avpkt->size;
MpegEncContext *const s = avctx->priv_data;
H264Context *const h = avctx->priv_data;
int m, mb_type;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/utils.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/utils.c index 240ae68f1..aad1d9521 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/utils.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/utils.c @@ -212,7 +212,7 @@ void avcodec_align_dimensions(AVCodecContext *s, int *width, int *height){ #if LIBAVCODEC_VERSION_MAJOR < 53
int avcodec_check_dimensions(void *av_log_ctx, unsigned int w, unsigned int h){
- return av_check_image_size(w, h, 0, av_log_ctx);
+ return av_image_check_size(w, h, 0, av_log_ctx);
}
#endif
@@ -232,7 +232,7 @@ int avcodec_default_get_buffer(AVCodecContext *s, AVFrame *pic){ return -1;
}
- if(av_check_image_size(w, h, 0, s))
+ if(av_image_check_size(w, h, 0, s))
return -1;
if(s->internal_buffer==NULL){
@@ -280,7 +280,7 @@ int avcodec_default_get_buffer(AVCodecContext *s, AVFrame *pic){ do {
// NOTE: do not align linesizes individually, this breaks e.g. assumptions
// that linesize[0] == 2*linesize[1] in the MPEG-encoder for 4:2:2
- av_fill_image_linesizes(picture.linesize, s->pix_fmt, w);
+ av_image_fill_linesizes(picture.linesize, s->pix_fmt, w);
// increase alignment of w for next try (rhs gives the lowest bit set in w)
w += w & ~(w-1);
@@ -290,7 +290,7 @@ int avcodec_default_get_buffer(AVCodecContext *s, AVFrame *pic){ }
} while (unaligned);
- tmpsize = av_fill_image_pointers(picture.data, s->pix_fmt, h, NULL, picture.linesize);
+ tmpsize = av_image_fill_pointers(picture.data, s->pix_fmt, h, NULL, picture.linesize);
if (tmpsize < 0)
return -1;
@@ -489,7 +489,7 @@ int attribute_align_arg avcodec_open(AVCodecContext *avctx, AVCodec *codec) #define SANE_NB_CHANNELS 128U
if (((avctx->coded_width || avctx->coded_height)
- && av_check_image_size(avctx->coded_width, avctx->coded_height, 0, avctx))
+ && av_image_check_size(avctx->coded_width, avctx->coded_height, 0, avctx))
|| avctx->channels > SANE_NB_CHANNELS) {
ret = AVERROR(EINVAL);
goto free_and_end;
@@ -498,14 +498,13 @@ int attribute_align_arg avcodec_open(AVCodecContext *avctx, AVCodec *codec) avctx->codec = codec;
avctx->codec_id = codec->id; /* ffdshow custom code */
avctx->frame_number = 0;
- if(avctx->codec->init){
- if(avctx->codec_type == AVMEDIA_TYPE_VIDEO &&
- avctx->codec->max_lowres < avctx->lowres){
- av_log(avctx, AV_LOG_ERROR, "The maximum value for lowres supported by the decoder is %d\n",
- avctx->codec->max_lowres);
- goto free_and_end;
- }
+ if (avctx->codec->max_lowres < avctx->lowres) {
+ av_log(avctx, AV_LOG_ERROR, "The maximum value for lowres supported by the decoder is %d\n",
+ avctx->codec->max_lowres);
+ goto free_and_end;
+ }
+ if(avctx->codec->init){
ret = avctx->codec->init(avctx);
if (ret < 0) {
goto free_and_end;
@@ -548,7 +547,7 @@ int attribute_align_arg avcodec_encode_video(AVCodecContext *avctx, uint8_t *buf av_log(avctx, AV_LOG_ERROR, "buffer smaller than minimum size\n");
return -1;
}
- if(av_check_image_size(avctx->width, avctx->height, 0, avctx))
+ if(av_image_check_size(avctx->width, avctx->height, 0, avctx))
return -1;
if((avctx->codec->capabilities & CODEC_CAP_DELAY) || pict){
int ret = avctx->codec->encode(avctx, buf, buf_size, pict);
@@ -560,18 +559,34 @@ int attribute_align_arg avcodec_encode_video(AVCodecContext *avctx, uint8_t *buf return 0;
}
+#if LIBAVCODEC_VERSION_MAJOR < 53
int attribute_align_arg avcodec_decode_video(AVCodecContext *avctx, AVFrame *picture,
int *got_picture_ptr,
const uint8_t *buf, int buf_size)
{
+ AVPacket avpkt;
+ av_init_packet(&avpkt);
+ avpkt.data = buf;
+ avpkt.size = buf_size;
+ // HACK for CorePNG to decode as normal PNG by default
+ avpkt.flags = AV_PKT_FLAG_KEY;
+
+ return avcodec_decode_video2(avctx, picture, got_picture_ptr, &avpkt);
+}
+#endif
+
+int attribute_align_arg avcodec_decode_video2(AVCodecContext *avctx, AVFrame *picture,
+ int *got_picture_ptr,
+ AVPacket *avpkt)
+{
int ret;
*got_picture_ptr= 0;
- if((avctx->coded_width||avctx->coded_height) && av_check_image_size(avctx->coded_width, avctx->coded_height, 0, avctx))
+ if((avctx->coded_width||avctx->coded_height) && av_image_check_size(avctx->coded_width, avctx->coded_height, 0, avctx))
return -1;
- if((avctx->codec->capabilities & CODEC_CAP_DELAY) || buf_size){
+ if((avctx->codec->capabilities & CODEC_CAP_DELAY) || avpkt->size){
ret = avctx->codec->decode(avctx, picture, got_picture_ptr,
- buf, buf_size);
+ avpkt);
emms_c(); //needed to avoid an emms_c() call before every return;
@@ -583,13 +598,27 @@ int attribute_align_arg avcodec_decode_video(AVCodecContext *avctx, AVFrame *pic return ret;
}
+#if LIBAVCODEC_VERSION_MAJOR < 53
int attribute_align_arg avcodec_decode_audio2(AVCodecContext *avctx, int16_t *samples,
int *frame_size_ptr,
const uint8_t *buf, int buf_size)
{
+ AVPacket avpkt;
+ av_init_packet(&avpkt);
+ avpkt.data = buf;
+ avpkt.size = buf_size;
+
+ return avcodec_decode_audio3(avctx, samples, frame_size_ptr, &avpkt);
+}
+#endif
+
+int attribute_align_arg avcodec_decode_audio3(AVCodecContext *avctx, int16_t *samples,
+ int *frame_size_ptr,
+ AVPacket *avpkt)
+{
int ret;
- if((avctx->codec->capabilities & CODEC_CAP_DELAY) || buf_size){
+ if((avctx->codec->capabilities & CODEC_CAP_DELAY) || avpkt->size){
//FIXME remove the check below _after_ ensuring that all audio check that the available space is enough
if(*frame_size_ptr < AVCODEC_MAX_AUDIO_FRAME_SIZE){
av_log(avctx, AV_LOG_ERROR, "buffer smaller than AVCODEC_MAX_AUDIO_FRAME_SIZE\n");
@@ -601,8 +630,7 @@ int attribute_align_arg avcodec_decode_audio2(AVCodecContext *avctx, int16_t *sa return -1;
}
- ret = avctx->codec->decode(avctx, samples, frame_size_ptr,
- buf, buf_size);
+ ret = avctx->codec->decode(avctx, samples, frame_size_ptr, avpkt);
avctx->frame_number++;
}else{
ret= 0;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vc1dec.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vc1dec.c index 47cd5e811..fe960473f 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vc1dec.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vc1dec.c @@ -3135,8 +3135,10 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx) */
static int vc1_decode_frame(AVCodecContext *avctx,
void *data, int *data_size,
- const uint8_t *buf, int buf_size)
+ AVPacket *avpkt)
{
+ const uint8_t *buf = avpkt->data;
+ int buf_size = avpkt->size;
VC1Context *v = avctx->priv_data;
MpegEncContext *s = &v->s;
AVFrame *pict = data;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vc1dsp.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vc1dsp.c index 8634bef69..aab169479 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vc1dsp.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vc1dsp.c @@ -630,7 +630,7 @@ av_cold void ff_vc1dsp_init(DSPContext* dsp, AVCodecContext *avctx) { dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_c; dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_c; - dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_c; + dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_pixels8x8_c; dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_c; dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_c; dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_c; @@ -647,7 +647,7 @@ av_cold void ff_vc1dsp_init(DSPContext* dsp, AVCodecContext *avctx) { dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_c; dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_c; - dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_vc1_mspel_mc00_c; + dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_pixels8x8_c; dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_c; dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_c; dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_c; diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp3.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp3.c index 4d0e2a4b0..437ecffa3 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp3.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp3.c @@ -1329,12 +1329,11 @@ static void vp3_draw_horiz_band(Vp3DecodeContext *s, int y) return;
h= y - s->last_slice_end;
+ s->last_slice_end= y;
y -= h;
if (!s->flipped_image) {
- if (y == 0)
- h -= s->height - s->avctx->height; // account for non-mod16
- y = s->height - y - h;
+ y = s->avctx->height - y - h;
}
cy = y >> s->chroma_y_shift;
@@ -1345,7 +1344,6 @@ static void vp3_draw_horiz_band(Vp3DecodeContext *s, int y) emms_c();
s->avctx->draw_horiz_band(s->avctx, &s->current_frame, offset, y, 3, h);
- s->last_slice_end= y + h;
}
/*
@@ -1516,7 +1514,7 @@ static void render_slice(Vp3DecodeContext *s, int slice) * dispatch (slice - 1);
*/
- vp3_draw_horiz_band(s, FFMIN(64*slice + 64-16, s->height-16));
+ vp3_draw_horiz_band(s, FFMIN((32 << s->chroma_y_shift) * (slice + 1) -16, s->height-16));
}
/*
@@ -1737,8 +1735,10 @@ static int64_t theora_granule_frame(Vp3DecodeContext *s,int64_t granulepos) */
static int vp3_decode_frame(AVCodecContext *avctx,
void *data, int *data_size,
- const uint8_t *buf, int buf_size)
+ AVPacket *avpkt)
{
+ const uint8_t *buf = avpkt->data;
+ int buf_size = avpkt->size;
Vp3DecodeContext *s = avctx->priv_data;
GetBitContext gb;
static int counter = 0;
@@ -1868,7 +1868,7 @@ static int vp3_decode_frame(AVCodecContext *avctx, int row = (s->height >> (3+(i && s->chroma_y_shift))) - 1;
apply_loop_filter(s, i, row, row+1);
}
- vp3_draw_horiz_band(s, s->height);
+ vp3_draw_horiz_band(s, s->avctx->height);
/* MPC Custom code begin */
#if 0
@@ -2009,7 +2009,7 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb) Vp3DecodeContext *s = avctx->priv_data;
int visible_width, visible_height, colorspace;
int offset_x = 0, offset_y = 0;
- AVRational fps;
+ AVRational fps, aspect;
s->theora = get_bits_long(gb, 24);
av_log(avctx, AV_LOG_DEBUG, "Theora bitstream version %X\n", s->theora);
@@ -2025,7 +2025,7 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb) visible_width = s->width = get_bits(gb, 16) << 4;
visible_height = s->height = get_bits(gb, 16) << 4;
- if(av_check_image_size(s->width, s->height, 0, avctx)){
+ if(av_image_check_size(s->width, s->height, 0, avctx)){
av_log(avctx, AV_LOG_ERROR, "Invalid dimensions (%dx%d)\n", s->width, s->height);
s->width= s->height= 0;
return -1;
@@ -2046,8 +2046,13 @@ static int theora_decode_header(AVCodecContext *avctx, GetBitContext *gb) fps.den, fps.num, 1<<30);
}
- avctx->sample_aspect_ratio.num = get_bits_long(gb, 24);
- avctx->sample_aspect_ratio.den = get_bits_long(gb, 24);
+ aspect.num = get_bits_long(gb, 24);
+ aspect.den = get_bits_long(gb, 24);
+ if (aspect.num && aspect.den) {
+ av_reduce(&avctx->sample_aspect_ratio.num,
+ &avctx->sample_aspect_ratio.den,
+ aspect.num, aspect.den, 1<<30);
+ }
if (s->theora < 0x030200)
s->keyframe_frequency_force=1<<get_bits(gb, 5); /* keyframe frequency force */ /* ffdshow custom code */
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.c index 3f5569eb8..4b937a49e 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.c @@ -482,11 +482,12 @@ static int vp56_size_changed(AVCodecContext *avctx) } int ff_vp56_decode_frame(AVCodecContext *avctx, void *data, int *data_size, - const uint8_t *buf, int buf_size) + AVPacket *avpkt) { + const uint8_t *buf = avpkt->data; VP56Context *s = avctx->priv_data; AVFrame *const p = s->framep[VP56_FRAME_CURRENT]; - int remaining_buf_size = buf_size; + int remaining_buf_size = avpkt->size; int is_alpha, av_uninit(alpha_offset); if (s->has_alpha) { @@ -635,7 +636,7 @@ int ff_vp56_decode_frame(AVCodecContext *avctx, void *data, int *data_size, *(AVFrame*)data = *p; *data_size = sizeof(AVFrame); - return buf_size; + return avpkt->size; } av_cold void ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha) diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.h index f9500cfb5..da6b1b64b 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.h +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.h @@ -174,7 +174,7 @@ void ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha); int ff_vp56_free(AVCodecContext *avctx); void ff_vp56_init_dequant(VP56Context *s, int quantizer); int ff_vp56_decode_frame(AVCodecContext *avctx, void *data, int *data_size, - const uint8_t *buf, int buf_size); + AVPacket *avpkt); /** diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56dsp.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56dsp.c index d67604b01..0fe9e3e55 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56dsp.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56dsp.c @@ -84,7 +84,7 @@ void ff_vp56dsp_init(VP56DSPContext *s, enum CodecID codec) s->edge_filter_ver = vp6_edge_filter_ver; if (CONFIG_VP6_DECODER) { - s->vp6_filter_diag4= ff_vp6_filter_diag4_c; + s->vp6_filter_diag4 = ff_vp6_filter_diag4_c; } } diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8.c index d8d7cdaa2..de97489a8 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8.c @@ -223,7 +223,7 @@ static void vp8_decode_flush(AVCodecContext *avctx) static int update_dimensions(VP8Context *s, int width, int height) { - if (av_check_image_size(width, height, 0, s->avctx)) + if (av_image_check_size(width, height, 0, s->avctx)) return AVERROR_INVALIDDATA; vp8_decode_flush(s->avctx); @@ -1471,14 +1471,14 @@ static void filter_mb_row_simple(VP8Context *s, int mb_y) } static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size, - const uint8_t *buf, int buf_size) + AVPacket *avpkt) { VP8Context *s = avctx->priv_data; int ret, mb_x, mb_y, i, y, referenced; enum AVDiscard skip_thresh; AVFrame *av_uninit(curframe); - if ((ret = decode_frame_header(s, buf, buf_size)) < 0) + if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0) return ret; referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT @@ -1644,7 +1644,7 @@ skip_decode: *data_size = sizeof(AVFrame); } - return buf_size; + return avpkt->size; } static av_cold int vp8_decode_init(AVCodecContext *avctx) diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/config.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/config.asm new file mode 100644 index 000000000..8efc2c533 --- /dev/null +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/config.asm @@ -0,0 +1 @@ +%define ARCH_X86 diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_h264_template_mmx.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_h264_template_mmx.c deleted file mode 100644 index ff359230c..000000000 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_h264_template_mmx.c +++ /dev/null @@ -1,304 +0,0 @@ -/*
- * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
- * Loren Merritt
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * MMX optimized version of (put|avg)_h264_chroma_mc8.
- * H264_CHROMA_MC8_TMPL must be defined to the desired function name
- * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg
- * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
- */
-static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
-{
- DECLARE_ALIGNED(8, uint64_t, AA);
- DECLARE_ALIGNED(8, uint64_t, DD);
- int i;
-
- if(y==0 && x==0) {
- /* no filter needed */
- H264_CHROMA_MC8_MV0(dst, src, stride, h);
- return;
- }
-
- assert(x<8 && y<8 && x>=0 && y>=0);
-
- if(y==0 || x==0)
- {
- /* 1 dimensional filter only */
- const int dxy = x ? 1 : stride;
-
- __asm__ volatile(
- "movd %0, %%mm5\n\t"
- "movq %1, %%mm4\n\t"
- "movq %2, %%mm6\n\t" /* mm6 = rnd >> 3 */
- "punpcklwd %%mm5, %%mm5\n\t"
- "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
- "pxor %%mm7, %%mm7\n\t"
- "psubw %%mm5, %%mm4\n\t" /* mm4 = A = 8-x */
- :: "rm"(x+y), "m"(ff_pw_8), "m"(*(rnd_reg+1)));
-
- for(i=0; i<h; i++) {
- __asm__ volatile(
- /* mm0 = src[0..7], mm1 = src[1..8] */
- "movq %0, %%mm0\n\t"
- "movq %1, %%mm2\n\t"
- :: "m"(src[0]), "m"(src[dxy]));
-
- __asm__ volatile(
- /* [mm0,mm1] = A * src[0..7] */
- /* [mm2,mm3] = B * src[1..8] */
- "movq %%mm0, %%mm1\n\t"
- "movq %%mm2, %%mm3\n\t"
- "punpcklbw %%mm7, %%mm0\n\t"
- "punpckhbw %%mm7, %%mm1\n\t"
- "punpcklbw %%mm7, %%mm2\n\t"
- "punpckhbw %%mm7, %%mm3\n\t"
- "pmullw %%mm4, %%mm0\n\t"
- "pmullw %%mm4, %%mm1\n\t"
- "pmullw %%mm5, %%mm2\n\t"
- "pmullw %%mm5, %%mm3\n\t"
-
- /* dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3 */
- "paddw %%mm6, %%mm0\n\t"
- "paddw %%mm6, %%mm1\n\t"
- "paddw %%mm2, %%mm0\n\t"
- "paddw %%mm3, %%mm1\n\t"
- "psrlw $3, %%mm0\n\t"
- "psrlw $3, %%mm1\n\t"
- "packuswb %%mm1, %%mm0\n\t"
- H264_CHROMA_OP(%0, %%mm0)
- "movq %%mm0, %0\n\t"
- : "=m" (dst[0]));
-
- src += stride;
- dst += stride;
- }
- return;
- }
-
- /* general case, bilinear */
- __asm__ volatile("movd %2, %%mm4\n\t"
- "movd %3, %%mm6\n\t"
- "punpcklwd %%mm4, %%mm4\n\t"
- "punpcklwd %%mm6, %%mm6\n\t"
- "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
- "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */
- "movq %%mm4, %%mm5\n\t"
- "pmullw %%mm6, %%mm4\n\t" /* mm4 = x * y */
- "psllw $3, %%mm5\n\t"
- "psllw $3, %%mm6\n\t"
- "movq %%mm5, %%mm7\n\t"
- "paddw %%mm6, %%mm7\n\t"
- "movq %%mm4, %1\n\t" /* DD = x * y */
- "psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x - xy */
- "psubw %%mm4, %%mm6\n\t" /* mm6 = C = 8y - xy */
- "paddw %4, %%mm4\n\t"
- "psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy - (8x+8y) + 64 */
- "pxor %%mm7, %%mm7\n\t"
- "movq %%mm4, %0\n\t"
- : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));
-
- __asm__ volatile(
- /* mm0 = src[0..7], mm1 = src[1..8] */
- "movq %0, %%mm0\n\t"
- "movq %1, %%mm1\n\t"
- : : "m" (src[0]), "m" (src[1]));
-
- for(i=0; i<h; i++) {
- src += stride;
-
- __asm__ volatile(
- /* mm2 = A * src[0..3] + B * src[1..4] */
- /* mm3 = A * src[4..7] + B * src[5..8] */
- "movq %%mm0, %%mm2\n\t"
- "movq %%mm1, %%mm3\n\t"
- "punpckhbw %%mm7, %%mm0\n\t"
- "punpcklbw %%mm7, %%mm1\n\t"
- "punpcklbw %%mm7, %%mm2\n\t"
- "punpckhbw %%mm7, %%mm3\n\t"
- "pmullw %0, %%mm0\n\t"
- "pmullw %0, %%mm2\n\t"
- "pmullw %%mm5, %%mm1\n\t"
- "pmullw %%mm5, %%mm3\n\t"
- "paddw %%mm1, %%mm2\n\t"
- "paddw %%mm0, %%mm3\n\t"
- : : "m" (AA));
-
- __asm__ volatile(
- /* [mm2,mm3] += C * src[0..7] */
- "movq %0, %%mm0\n\t"
- "movq %%mm0, %%mm1\n\t"
- "punpcklbw %%mm7, %%mm0\n\t"
- "punpckhbw %%mm7, %%mm1\n\t"
- "pmullw %%mm6, %%mm0\n\t"
- "pmullw %%mm6, %%mm1\n\t"
- "paddw %%mm0, %%mm2\n\t"
- "paddw %%mm1, %%mm3\n\t"
- : : "m" (src[0]));
-
- __asm__ volatile(
- /* [mm2,mm3] += D * src[1..8] */
- "movq %1, %%mm1\n\t"
- "movq %%mm1, %%mm0\n\t"
- "movq %%mm1, %%mm4\n\t"
- "punpcklbw %%mm7, %%mm0\n\t"
- "punpckhbw %%mm7, %%mm4\n\t"
- "pmullw %2, %%mm0\n\t"
- "pmullw %2, %%mm4\n\t"
- "paddw %%mm0, %%mm2\n\t"
- "paddw %%mm4, %%mm3\n\t"
- "movq %0, %%mm0\n\t"
- : : "m" (src[0]), "m" (src[1]), "m" (DD));
-
- __asm__ volatile(
- /* dst[0..7] = ([mm2,mm3] + rnd) >> 6 */
- "paddw %1, %%mm2\n\t"
- "paddw %1, %%mm3\n\t"
- "psrlw $6, %%mm2\n\t"
- "psrlw $6, %%mm3\n\t"
- "packuswb %%mm3, %%mm2\n\t"
- H264_CHROMA_OP(%0, %%mm2)
- "movq %%mm2, %0\n\t"
- : "=m" (dst[0]) : "m" (*rnd_reg));
- dst+= stride;
- }
-}
-
-static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
-{
- __asm__ volatile(
- "pxor %%mm7, %%mm7 \n\t"
- "movd %5, %%mm2 \n\t"
- "movd %6, %%mm3 \n\t"
- "movq "MANGLE(ff_pw_8)", %%mm4\n\t"
- "movq "MANGLE(ff_pw_8)", %%mm5\n\t"
- "punpcklwd %%mm2, %%mm2 \n\t"
- "punpcklwd %%mm3, %%mm3 \n\t"
- "punpcklwd %%mm2, %%mm2 \n\t"
- "punpcklwd %%mm3, %%mm3 \n\t"
- "psubw %%mm2, %%mm4 \n\t"
- "psubw %%mm3, %%mm5 \n\t"
-
- "movd (%1), %%mm0 \n\t"
- "movd 1(%1), %%mm6 \n\t"
- "add %3, %1 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm6 \n\t"
- "pmullw %%mm4, %%mm0 \n\t"
- "pmullw %%mm2, %%mm6 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
-
- "1: \n\t"
- "movd (%1), %%mm0 \n\t"
- "movd 1(%1), %%mm1 \n\t"
- "add %3, %1 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "pmullw %%mm4, %%mm0 \n\t"
- "pmullw %%mm2, %%mm1 \n\t"
- "paddw %%mm0, %%mm1 \n\t"
- "movq %%mm1, %%mm0 \n\t"
- "pmullw %%mm5, %%mm6 \n\t"
- "pmullw %%mm3, %%mm1 \n\t"
- "paddw %4, %%mm6 \n\t"
- "paddw %%mm6, %%mm1 \n\t"
- "psrlw $6, %%mm1 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- H264_CHROMA_OP4((%0), %%mm1, %%mm6)
- "movd %%mm1, (%0) \n\t"
- "add %3, %0 \n\t"
- "movd (%1), %%mm6 \n\t"
- "movd 1(%1), %%mm1 \n\t"
- "add %3, %1 \n\t"
- "punpcklbw %%mm7, %%mm6 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "pmullw %%mm4, %%mm6 \n\t"
- "pmullw %%mm2, %%mm1 \n\t"
- "paddw %%mm6, %%mm1 \n\t"
- "movq %%mm1, %%mm6 \n\t"
- "pmullw %%mm5, %%mm0 \n\t"
- "pmullw %%mm3, %%mm1 \n\t"
- "paddw %4, %%mm0 \n\t"
- "paddw %%mm0, %%mm1 \n\t"
- "psrlw $6, %%mm1 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- H264_CHROMA_OP4((%0), %%mm1, %%mm0)
- "movd %%mm1, (%0) \n\t"
- "add %3, %0 \n\t"
- "sub $2, %2 \n\t"
- "jnz 1b \n\t"
- : "+r"(dst), "+r"(src), "+r"(h)
- : "r"((x86_reg)stride), "m"(*rnd_reg), "m"(x), "m"(y)
- );
-}
-
-#ifdef H264_CHROMA_MC2_TMPL
-static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
- int tmp = ((1<<16)-1)*x + 8;
- int CD= tmp*y;
- int AB= (tmp<<3) - CD;
- __asm__ volatile(
- /* mm5 = {A,B,A,B} */
- /* mm6 = {C,D,C,D} */
- "movd %0, %%mm5\n\t"
- "movd %1, %%mm6\n\t"
- "punpckldq %%mm5, %%mm5\n\t"
- "punpckldq %%mm6, %%mm6\n\t"
- "pxor %%mm7, %%mm7\n\t"
- /* mm0 = src[0,1,1,2] */
- "movd %2, %%mm2\n\t"
- "punpcklbw %%mm7, %%mm2\n\t"
- "pshufw $0x94, %%mm2, %%mm2\n\t"
- :: "r"(AB), "r"(CD), "m"(src[0]));
-
-
- __asm__ volatile(
- "1:\n\t"
- "add %4, %1\n\t"
- /* mm1 = A * src[0,1] + B * src[1,2] */
- "movq %%mm2, %%mm1\n\t"
- "pmaddwd %%mm5, %%mm1\n\t"
- /* mm0 = src[0,1,1,2] */
- "movd (%1), %%mm0\n\t"
- "punpcklbw %%mm7, %%mm0\n\t"
- "pshufw $0x94, %%mm0, %%mm0\n\t"
- /* mm1 += C * src[0,1] + D * src[1,2] */
- "movq %%mm0, %%mm2\n\t"
- "pmaddwd %%mm6, %%mm0\n\t"
- "paddw %3, %%mm1\n\t"
- "paddw %%mm0, %%mm1\n\t"
- /* dst[0,1] = pack((mm1 + 32) >> 6) */
- "psrlw $6, %%mm1\n\t"
- "packssdw %%mm7, %%mm1\n\t"
- "packuswb %%mm7, %%mm1\n\t"
- H264_CHROMA_OP4((%0), %%mm1, %%mm3)
- "movd %%mm1, %%esi\n\t"
- "movw %%si, (%0)\n\t"
- "add %4, %0\n\t"
- "sub $1, %2\n\t"
- "jnz 1b\n\t"
- : "+r" (dst), "+r"(src), "+r"(h)
- : "m" (ff_pw_32), "r"((x86_reg)stride)
- : "%esi");
-
-}
-#endif
-
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_h264_template_ssse3.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_h264_template_ssse3.c deleted file mode 100644 index 0eceb74f2..000000000 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_h264_template_ssse3.c +++ /dev/null @@ -1,208 +0,0 @@ -/* - * Copyright (c) 2008 Loren Merritt - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * SSSE3 optimized version of (put|avg)_h264_chroma_mc8. - * H264_CHROMA_MC8_TMPL must be defined to the desired function name - * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function - * AVG_OP must be defined to empty for put and the identify for avg - */ -static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd) -{ - if(y==0 && x==0) { - /* no filter needed */ - H264_CHROMA_MC8_MV0(dst, src, stride, h); - return; - } - - assert(x<8 && y<8 && x>=0 && y>=0); - - if(y==0 || x==0) - { - /* 1 dimensional filter only */ - __asm__ volatile( - "movd %0, %%xmm7 \n\t" - "movq %1, %%xmm6 \n\t" - "pshuflw $0, %%xmm7, %%xmm7 \n\t" - "movlhps %%xmm6, %%xmm6 \n\t" - "movlhps %%xmm7, %%xmm7 \n\t" - :: "r"(255*(x+y)+8), "m"(*(rnd?&ff_pw_4.a:&ff_pw_3)) - ); - - if(x) { - __asm__ volatile( - "1: \n\t" - "movq (%1), %%xmm0 \n\t" - "movq 1(%1), %%xmm1 \n\t" - "movq (%1,%3), %%xmm2 \n\t" - "movq 1(%1,%3), %%xmm3 \n\t" - "punpcklbw %%xmm1, %%xmm0 \n\t" - "punpcklbw %%xmm3, %%xmm2 \n\t" - "pmaddubsw %%xmm7, %%xmm0 \n\t" - "pmaddubsw %%xmm7, %%xmm2 \n\t" - AVG_OP("movq (%0), %%xmm4 \n\t") - AVG_OP("movhps (%0,%3), %%xmm4 \n\t") - "paddw %%xmm6, %%xmm0 \n\t" - "paddw %%xmm6, %%xmm2 \n\t" - "psrlw $3, %%xmm0 \n\t" - "psrlw $3, %%xmm2 \n\t" - "packuswb %%xmm2, %%xmm0 \n\t" - AVG_OP("pavgb %%xmm4, %%xmm0 \n\t") - "movq %%xmm0, (%0) \n\t" - "movhps %%xmm0, (%0,%3) \n\t" - "sub $2, %2 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%0,%3,2), %0 \n\t" - "jg 1b \n\t" - :"+r"(dst), "+r"(src), "+r"(h) - :"r"((x86_reg)stride) - ); - } else { - __asm__ volatile( - "1: \n\t" - "movq (%1), %%xmm0 \n\t" - "movq (%1,%3), %%xmm1 \n\t" - "movdqa %%xmm1, %%xmm2 \n\t" - "movq (%1,%3,2), %%xmm3 \n\t" - "punpcklbw %%xmm1, %%xmm0 \n\t" - "punpcklbw %%xmm3, %%xmm2 \n\t" - "pmaddubsw %%xmm7, %%xmm0 \n\t" - "pmaddubsw %%xmm7, %%xmm2 \n\t" - AVG_OP("movq (%0), %%xmm4 \n\t") - AVG_OP("movhps (%0,%3), %%xmm4 \n\t") - "paddw %%xmm6, %%xmm0 \n\t" - "paddw %%xmm6, %%xmm2 \n\t" - "psrlw $3, %%xmm0 \n\t" - "psrlw $3, %%xmm2 \n\t" - "packuswb %%xmm2, %%xmm0 \n\t" - AVG_OP("pavgb %%xmm4, %%xmm0 \n\t") - "movq %%xmm0, (%0) \n\t" - "movhps %%xmm0, (%0,%3) \n\t" - "sub $2, %2 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%0,%3,2), %0 \n\t" - "jg 1b \n\t" - :"+r"(dst), "+r"(src), "+r"(h) - :"r"((x86_reg)stride) - ); - } - return; - } - - /* general case, bilinear */ - __asm__ volatile( - "movd %0, %%xmm7 \n\t" - "movd %1, %%xmm6 \n\t" - "movdqa %2, %%xmm5 \n\t" - "pshuflw $0, %%xmm7, %%xmm7 \n\t" - "pshuflw $0, %%xmm6, %%xmm6 \n\t" - "movlhps %%xmm7, %%xmm7 \n\t" - "movlhps %%xmm6, %%xmm6 \n\t" - :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(*(rnd?&ff_pw_32:&ff_pw_28)) - ); - - __asm__ volatile( - "movq (%1), %%xmm0 \n\t" - "movq 1(%1), %%xmm1 \n\t" - "punpcklbw %%xmm1, %%xmm0 \n\t" - "add %3, %1 \n\t" - "1: \n\t" - "movq (%1), %%xmm1 \n\t" - "movq 1(%1), %%xmm2 \n\t" - "movq (%1,%3), %%xmm3 \n\t" - "movq 1(%1,%3), %%xmm4 \n\t" - "lea (%1,%3,2), %1 \n\t" - "punpcklbw %%xmm2, %%xmm1 \n\t" - "punpcklbw %%xmm4, %%xmm3 \n\t" - "movdqa %%xmm1, %%xmm2 \n\t" - "movdqa %%xmm3, %%xmm4 \n\t" - "pmaddubsw %%xmm7, %%xmm0 \n\t" - "pmaddubsw %%xmm6, %%xmm1 \n\t" - "pmaddubsw %%xmm7, %%xmm2 \n\t" - "pmaddubsw %%xmm6, %%xmm3 \n\t" - "paddw %%xmm5, %%xmm0 \n\t" - "paddw %%xmm5, %%xmm2 \n\t" - "paddw %%xmm0, %%xmm1 \n\t" - "paddw %%xmm2, %%xmm3 \n\t" - "movdqa %%xmm4, %%xmm0 \n\t" - "psrlw $6, %%xmm1 \n\t" - "psrlw $6, %%xmm3 \n\t" - AVG_OP("movq (%0), %%xmm2 \n\t") - AVG_OP("movhps (%0,%3), %%xmm2 \n\t") - "packuswb %%xmm3, %%xmm1 \n\t" - AVG_OP("pavgb %%xmm2, %%xmm1 \n\t") - "movq %%xmm1, (%0)\n\t" - "movhps %%xmm1, (%0,%3)\n\t" - "sub $2, %2 \n\t" - "lea (%0,%3,2), %0 \n\t" - "jg 1b \n\t" - :"+r"(dst), "+r"(src), "+r"(h) - :"r"((x86_reg)stride) - ); -} - -static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y) -{ - __asm__ volatile( - "movd %0, %%mm7 \n\t" - "movd %1, %%mm6 \n\t" - "movq %2, %%mm5 \n\t" - "pshufw $0, %%mm7, %%mm7 \n\t" - "pshufw $0, %%mm6, %%mm6 \n\t" - :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(ff_pw_32) - ); - - __asm__ volatile( - "movd (%1), %%mm0 \n\t" - "punpcklbw 1(%1), %%mm0 \n\t" - "add %3, %1 \n\t" - "1: \n\t" - "movd (%1), %%mm1 \n\t" - "movd (%1,%3), %%mm3 \n\t" - "punpcklbw 1(%1), %%mm1 \n\t" - "punpcklbw 1(%1,%3), %%mm3 \n\t" - "lea (%1,%3,2), %1 \n\t" - "movq %%mm1, %%mm2 \n\t" - "movq %%mm3, %%mm4 \n\t" - "pmaddubsw %%mm7, %%mm0 \n\t" - "pmaddubsw %%mm6, %%mm1 \n\t" - "pmaddubsw %%mm7, %%mm2 \n\t" - "pmaddubsw %%mm6, %%mm3 \n\t" - "paddw %%mm5, %%mm0 \n\t" - "paddw %%mm5, %%mm2 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "paddw %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm0 \n\t" - "psrlw $6, %%mm1 \n\t" - "psrlw $6, %%mm3 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - "packuswb %%mm3, %%mm3 \n\t" - AVG_OP("pavgb (%0), %%mm1 \n\t") - AVG_OP("pavgb (%0,%3), %%mm3 \n\t") - "movd %%mm1, (%0)\n\t" - "movd %%mm3, (%0,%3)\n\t" - "sub $2, %2 \n\t" - "lea (%0,%3,2), %0 \n\t" - "jg 1b \n\t" - :"+r"(dst), "+r"(src), "+r"(h) - :"r"((x86_reg)stride) - ); -} - diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.c index c4939ec65..995df0564 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.c @@ -22,14 +22,13 @@ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*/
+#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "dsputil_mmx.h"
-#include "vp3dsp_mmx.h"
-#include "vp3dsp_sse2.h"
#include "idct_xvid.h"
//#undef NDEBUG
@@ -62,6 +61,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL; DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
+DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0 ) = {0x0000000000000000ULL, 0x0000000000000000ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3 ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4 ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
@@ -70,7 +70,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL; DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
+DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1 ) = {0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL};
@@ -228,7 +228,7 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; /***********************************/
/* standard MMX */
-void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
+void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
const DCTELEM *p;
uint8_t *pix;
@@ -304,7 +304,7 @@ DECLARE_ASM_CONST(8, uint8_t, ff_vector128)[8] = "movq %%mm3, (%0, %3, 2) \n\t"\
"movq %%mm4, (%0, %1) \n\t"
-void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
+void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
x86_reg line_skip = line_size;
x86_reg line_skip3;
@@ -320,7 +320,7 @@ void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int li :"memory");
}
-void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
+void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
const DCTELEM *p;
uint8_t *pix;
@@ -728,35 +728,6 @@ static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){ }
}
-static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
- __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
- "movd %4, %%mm0 \n\t"
- "movd %5, %%mm1 \n\t"
- "movd %6, %%mm2 \n\t"
- "movd %7, %%mm3 \n\t"
- "punpcklbw %%mm1, %%mm0 \n\t"
- "punpcklbw %%mm3, %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "punpcklwd %%mm2, %%mm0 \n\t"
- "punpckhwd %%mm2, %%mm1 \n\t"
- "movd %%mm0, %0 \n\t"
- "punpckhdq %%mm0, %%mm0 \n\t"
- "movd %%mm0, %1 \n\t"
- "movd %%mm1, %2 \n\t"
- "punpckhdq %%mm1, %%mm1 \n\t"
- "movd %%mm1, %3 \n\t"
-
- : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
- "=m" (*(uint32_t*)(dst + 1*dst_stride)),
- "=m" (*(uint32_t*)(dst + 2*dst_stride)),
- "=m" (*(uint32_t*)(dst + 3*dst_stride))
- : "m" (*(uint32_t*)(src + 0*src_stride)),
- "m" (*(uint32_t*)(src + 1*src_stride)),
- "m" (*(uint32_t*)(src + 2*src_stride)),
- "m" (*(uint32_t*)(src + 3*src_stride))
- );
-}
-
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
const int strength= ff_h263_loop_filter_strength[qscale];
@@ -1820,8 +1791,59 @@ PREFETCH(prefetch_mmx2, prefetcht0) PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH
-#include "h264dsp_mmx.c"
-#include "rv40dsp_mmx.c"
+#include "h264_qpel_mmx.c"
+
+void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_put_vc1_chroma_mc8_mmx_nornd (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_vc1_chroma_mc8_mmx2_nornd (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_rv40_chroma_mc8_mmx2 (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_vc1_chroma_mc8_3dnow_nornd(uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_rv40_chroma_mc8_3dnow (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+
+void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_rv40_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_rv40_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+
+void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+
+void ff_put_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_put_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+
+void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
+ int stride, int h, int x, int y);
+
/* CAVS specific */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
@@ -1851,43 +1873,43 @@ void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, in static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
ff_mmx_idct (block);
- put_pixels_clamped_mmx(block, dest, line_size);
+ ff_put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
ff_mmx_idct (block);
- add_pixels_clamped_mmx(block, dest, line_size);
+ ff_add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
ff_mmxext_idct (block);
- put_pixels_clamped_mmx(block, dest, line_size);
+ ff_put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
ff_mmxext_idct (block);
- add_pixels_clamped_mmx(block, dest, line_size);
+ ff_add_pixels_clamped_mmx(block, dest, line_size);
}
#endif
static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
{
ff_idct_xvid_mmx (block);
- put_pixels_clamped_mmx(block, dest, line_size);
+ ff_put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
{
ff_idct_xvid_mmx (block);
- add_pixels_clamped_mmx(block, dest, line_size);
+ ff_add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
ff_idct_xvid_mmx2 (block);
- put_pixels_clamped_mmx(block, dest, line_size);
+ ff_put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
ff_idct_xvid_mmx2 (block);
- add_pixels_clamped_mmx(block, dest, line_size);
+ ff_add_pixels_clamped_mmx(block, dest, line_size);
}
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
@@ -2376,6 +2398,19 @@ static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ );
}
+void ff_vp3_idct_mmx(int16_t *input_data);
+void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
+void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
+
+void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
+
+void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
+void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
+
+void ff_vp3_idct_sse2(int16_t *input_data);
+void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
+void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
+
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
@@ -2387,20 +2422,8 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, co void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
-void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
-void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
-void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
-void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
-void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
-
-#if HAVE_YASM && ARCH_X86_32
-void ff_x264_deblock_v8_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
-static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
-{
- ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
- ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
-}
-#elif !HAVE_YASM
+
+#if !HAVE_YASM
#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
@@ -2500,10 +2523,10 @@ float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
- int mm_flags = mm_support();
+ int mm_flags = av_get_cpu_flags();
if (avctx->dsp_mask) {
- if (avctx->dsp_mask & FF_MM_FORCE)
+ if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
mm_flags |= (avctx->dsp_mask & 0xffff);
else
mm_flags &= ~(avctx->dsp_mask & 0xffff);
@@ -2511,20 +2534,20 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) #if 0
av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
- if (mm_flags & FF_MM_MMX)
+ if (mm_flags & AV_CPU_FLAG_MMX)
av_log(avctx, AV_LOG_INFO, " mmx");
- if (mm_flags & FF_MM_MMX2)
+ if (mm_flags & AV_CPU_FLAG_MMX2)
av_log(avctx, AV_LOG_INFO, " mmx2");
- if (mm_flags & FF_MM_3DNOW)
+ if (mm_flags & AV_CPU_FLAG_3DNOW)
av_log(avctx, AV_LOG_INFO, " 3dnow");
- if (mm_flags & FF_MM_SSE)
+ if (mm_flags & AV_CPU_FLAG_SSE)
av_log(avctx, AV_LOG_INFO, " sse");
- if (mm_flags & FF_MM_SSE2)
+ if (mm_flags & AV_CPU_FLAG_SSE2)
av_log(avctx, AV_LOG_INFO, " sse2");
av_log(avctx, AV_LOG_INFO, "\n");
#endif
- if (mm_flags & FF_MM_MMX) {
+ if (mm_flags & AV_CPU_FLAG_MMX) {
const int idct_algo= avctx->idct_algo;
if(avctx->lowres==0){
@@ -2535,7 +2558,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
#if CONFIG_GPL
}else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
- if(mm_flags & FF_MM_MMX2){
+ if(mm_flags & AV_CPU_FLAG_MMX2){
c->idct_put= ff_libmpeg2mmx2_idct_put;
c->idct_add= ff_libmpeg2mmx2_idct_add;
c->idct = ff_mmxext_idct;
@@ -2547,8 +2570,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
#endif
}else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
- idct_algo==FF_IDCT_VP3){
- if(mm_flags & FF_MM_SSE2){
+ idct_algo==FF_IDCT_VP3 && HAVE_YASM){
+ if(mm_flags & AV_CPU_FLAG_SSE2){
c->idct_put= ff_vp3_idct_put_sse2;
c->idct_add= ff_vp3_idct_add_sse2;
c->idct = ff_vp3_idct_sse2;
@@ -2562,12 +2585,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) }else if(idct_algo==FF_IDCT_CAVS){
c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
}else if(idct_algo==FF_IDCT_XVIDMMX){
- if(mm_flags & FF_MM_SSE2){
+ if(mm_flags & AV_CPU_FLAG_SSE2){
c->idct_put= ff_idct_xvid_sse2_put;
c->idct_add= ff_idct_xvid_sse2_add;
c->idct = ff_idct_xvid_sse2;
c->idct_permutation_type= FF_SSE2_IDCT_PERM;
- }else if(mm_flags & FF_MM_MMX2){
+ }else if(mm_flags & AV_CPU_FLAG_MMX2){
c->idct_put= ff_idct_xvid_mmx2_put;
c->idct_add= ff_idct_xvid_mmx2_add;
c->idct = ff_idct_xvid_mmx2;
@@ -2579,12 +2602,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) }
}
- c->put_pixels_clamped = put_pixels_clamped_mmx;
- c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
- c->add_pixels_clamped = add_pixels_clamped_mmx;
+ c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
+ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
+ c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
c->clear_block = clear_block_mmx;
c->clear_blocks = clear_blocks_mmx;
- if (mm_flags & FF_MM_SSE){
+ if (mm_flags & AV_CPU_FLAG_SSE){
c->clear_block = clear_block_sse;
c->clear_blocks = clear_blocks_sse;
}
@@ -2615,14 +2638,17 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->h263_v_loop_filter= h263_v_loop_filter_mmx;
c->h263_h_loop_filter= h263_h_loop_filter_mmx;
}
- c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_rnd;
- c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
- c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_vc1_chroma_mc8_mmx_nornd;
- c->put_rv40_chroma_pixels_tab[0]= put_rv40_chroma_mc8_mmx;
- c->put_rv40_chroma_pixels_tab[1]= put_rv40_chroma_mc4_mmx;
+#if HAVE_YASM
+ c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd;
+ c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx;
+ c->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_mmx_nornd;
+
+ c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx;
+ c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx;
+#endif
- if (mm_flags & FF_MM_MMX2) {
+ if (mm_flags & AV_CPU_FLAG_MMX2) {
c->prefetch = prefetch_mmx2;
c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
@@ -2647,12 +2673,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
- if (CONFIG_VP3_DECODER) {
+ if (CONFIG_VP3_DECODER && HAVE_YASM) {
c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
}
}
- if (CONFIG_VP3_DECODER) {
+ if (CONFIG_VP3_DECODER && HAVE_YASM) {
c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
}
@@ -2699,21 +2725,21 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
- c->avg_rv40_chroma_pixels_tab[0]= avg_rv40_chroma_mc8_mmx2;
- c->avg_rv40_chroma_pixels_tab[1]= avg_rv40_chroma_mc4_mmx2;
+#if HAVE_YASM
+ c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_mmx2;
+ c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_mmx2;
- c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_vc1_chroma_mc8_mmx2_nornd;
+ c->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_mmx2_nornd;
- c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd;
- c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
- c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
- c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
+ c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
+ c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
+ c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
+ c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
-#if HAVE_YASM
c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
#endif
#if HAVE_7REGS && HAVE_TEN_OPERANDS
- if( mm_flags&FF_MM_3DNOW )
+ if( mm_flags&AV_CPU_FLAG_3DNOW )
c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif
@@ -2721,7 +2747,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) ff_vc1dsp_init_mmx(c, avctx);
c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
- } else if (mm_flags & FF_MM_3DNOW) {
+ } else if (mm_flags & AV_CPU_FLAG_3DNOW) {
c->prefetch = prefetch_3dnow;
c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
@@ -2772,11 +2798,15 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
- c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd;
- c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
+#if HAVE_YASM
+ c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
+ c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
+
+ c->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_3dnow_nornd;
- c->avg_rv40_chroma_pixels_tab[0]= avg_rv40_chroma_mc8_3dnow;
- c->avg_rv40_chroma_pixels_tab[1]= avg_rv40_chroma_mc4_3dnow;
+ c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_3dnow;
+ c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_3dnow;
+#endif
}
@@ -2785,13 +2815,13 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
- if((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW)){
+ if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){
// these functions are slower than mmx on AMD, but faster on Intel
c->put_pixels_tab[0][0] = put_pixels16_sse2;
c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
H264_QPEL_FUNCS(0, 0, sse2);
}
- if(mm_flags & FF_MM_SSE2){
+ if(mm_flags & AV_CPU_FLAG_SSE2){
H264_QPEL_FUNCS(0, 1, sse2);
H264_QPEL_FUNCS(0, 2, sse2);
H264_QPEL_FUNCS(0, 3, sse2);
@@ -2806,7 +2836,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) H264_QPEL_FUNCS(3, 3, sse2);
}
#if HAVE_SSSE3
- if(mm_flags & FF_MM_SSSE3){
+ if(mm_flags & AV_CPU_FLAG_SSSE3){
H264_QPEL_FUNCS(1, 0, ssse3);
H264_QPEL_FUNCS(1, 1, ssse3);
H264_QPEL_FUNCS(1, 2, ssse3);
@@ -2819,16 +2849,16 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) H264_QPEL_FUNCS(3, 1, ssse3);
H264_QPEL_FUNCS(3, 2, ssse3);
H264_QPEL_FUNCS(3, 3, ssse3);
- c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_vc1_chroma_mc8_ssse3_nornd;
- c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_vc1_chroma_mc8_ssse3_nornd;
- c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd;
- c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd;
- c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3;
- c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3;
c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
#if HAVE_YASM
+ c->put_no_rnd_vc1_chroma_pixels_tab[0]= ff_put_vc1_chroma_mc8_ssse3_nornd;
+ c->avg_no_rnd_vc1_chroma_pixels_tab[0]= ff_avg_vc1_chroma_mc8_ssse3_nornd;
+ c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;
+ c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd;
+ c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3;
+ c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3;
c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
- if (mm_flags & FF_MM_SSE4) // not really sse4, just slow on Conroe
+ if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
#endif
}
@@ -2838,7 +2868,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) * todo: test if it still causes crashes
*/
#if ARCH_X86_32
- if(mm_flags & FF_MM_3DNOW){
+ if(mm_flags & AV_CPU_FLAG_3DNOW){
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
c->vector_fmul = vector_fmul_3dnow;
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
@@ -2846,14 +2876,14 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
}
}
- if(mm_flags & FF_MM_3DNOWEXT){
+ if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
c->vector_fmul_window = vector_fmul_window_3dnow2;
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
}
}
- if(mm_flags & FF_MM_SSE){
+ if(mm_flags & AV_CPU_FLAG_SSE){
c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
c->ac3_downmix = ac3_downmix_sse;
c->vector_fmul = vector_fmul_sse;
@@ -2869,9 +2899,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) #endif
#endif
}
- if(mm_flags & FF_MM_3DNOW)
+ if(mm_flags & AV_CPU_FLAG_3DNOW)
c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
- if(mm_flags & FF_MM_SSE2){
+ if(mm_flags & AV_CPU_FLAG_SSE2){
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
c->float_to_int16 = float_to_int16_sse2;
c->float_to_int16_interleave = float_to_int16_interleave_sse2;
@@ -2883,92 +2913,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) dsputilenc_init_mmx(c, avctx);
}
-#if CONFIG_H264DSP
-void ff_h264dsp_init_x86(H264DSPContext *c)
-{
- int mm_flags = mm_support();
-
- if (mm_flags & FF_MM_MMX) {
- c->h264_idct_dc_add=
- c->h264_idct_add= ff_h264_idct_add_mmx;
- c->h264_idct8_dc_add=
- c->h264_idct8_add= ff_h264_idct8_add_mmx;
-
- c->h264_idct_add16 = ff_h264_idct_add16_mmx;
- c->h264_idct8_add4 = ff_h264_idct8_add4_mmx;
- c->h264_idct_add8 = ff_h264_idct_add8_mmx;
- c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;
-
- if (mm_flags & FF_MM_MMX2) {
- c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
- c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
- c->h264_idct_add16 = ff_h264_idct_add16_mmx2;
- c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2;
- c->h264_idct_add8 = ff_h264_idct_add8_mmx2;
- c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
-
- c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
- c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
- c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
- c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
- c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
- c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
- c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
-
- c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
- c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
- c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
- c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
- c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
- c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
- c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
- c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
-
- c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
- c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
- c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
- c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
- c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
- c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
- c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
- c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
- }
- if(mm_flags & FF_MM_SSE2){
- c->h264_idct8_add = ff_h264_idct8_add_sse2;
- c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
- }
-
-#if HAVE_YASM
- if (mm_flags & FF_MM_MMX2){
-#if ARCH_X86_32
- c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
- c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
-#endif
- if( mm_flags&FF_MM_SSE2 ){
- c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2;
- c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
-#if ARCH_X86_64 || !defined(__ICC) || __ICC > 1110
- c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2;
- c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2;
- c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2;
- c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2;
-#endif
-#if CONFIG_GPL
- c->h264_idct_add16 = ff_h264_idct_add16_sse2;
- c->h264_idct_add8 = ff_h264_idct_add8_sse2;
- c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2;
-#endif
- }
- if ( mm_flags&FF_MM_SSSE3 ){
- c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
- c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
- }
- }
-#endif
- }
-}
-#endif /* CONFIG_H264DSP */
-
const char* avcodec_get_current_idct_mmx(AVCodecContext *avctx,DSPContext *c)
{
if (c->idct_put==ff_idct_xvid_mmx_put)
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.h index 33dafed1f..58256fd40 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.h +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.h @@ -57,7 +57,7 @@ extern const uint64_t ff_pb_7; extern const uint64_t ff_pb_1F; extern const uint64_t ff_pb_3F; extern const uint64_t ff_pb_81; -extern const uint64_t ff_pb_A1; +extern const xmm_reg ff_pb_A1; extern const xmm_reg ff_pb_F8; extern const uint64_t ff_pb_FC; extern const xmm_reg ff_pb_FE; @@ -94,6 +94,35 @@ extern const double ff_pd_2[2]; SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\ SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */ +static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ + __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ... + "movd %4, %%mm0 \n\t" + "movd %5, %%mm1 \n\t" + "movd %6, %%mm2 \n\t" + "movd %7, %%mm3 \n\t" + "punpcklbw %%mm1, %%mm0 \n\t" + "punpcklbw %%mm3, %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "punpcklwd %%mm2, %%mm0 \n\t" + "punpckhwd %%mm2, %%mm1 \n\t" + "movd %%mm0, %0 \n\t" + "punpckhdq %%mm0, %%mm0 \n\t" + "movd %%mm0, %1 \n\t" + "movd %%mm1, %2 \n\t" + "punpckhdq %%mm1, %%mm1 \n\t" + "movd %%mm1, %3 \n\t" + + : "=m" (*(uint32_t*)(dst + 0*dst_stride)), + "=m" (*(uint32_t*)(dst + 1*dst_stride)), + "=m" (*(uint32_t*)(dst + 2*dst_stride)), + "=m" (*(uint32_t*)(dst + 3*dst_stride)) + : "m" (*(uint32_t*)(src + 0*src_stride)), + "m" (*(uint32_t*)(src + 1*src_stride)), + "m" (*(uint32_t*)(src + 2*src_stride)), + "m" (*(uint32_t*)(src + 3*src_stride)) + ); +} + // e,f,g,h can be memory // out: a,d,t,c #define TRANSPOSE8x4(a,b,c,d,e,f,g,h,t)\ @@ -158,9 +187,9 @@ extern const double ff_pd_2[2]; void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx); void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx); -void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); -void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); -void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); +void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); +void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); +void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size); void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride); diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/fft.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/fft.c index eb5c65ecb..771b1e664 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/fft.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/fft.c @@ -16,25 +16,26 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "libavutil/cpu.h" #include "libavcodec/dsputil.h" #include "fft.h" av_cold void ff_fft_init_mmx(FFTContext *s) { #if HAVE_YASM - int has_vectors = mm_support(); - if (has_vectors & FF_MM_SSE && HAVE_SSE) { + int has_vectors = av_get_cpu_flags(); + if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) { /* SSE for P3/P4/K8 */ s->imdct_calc = ff_imdct_calc_sse; s->imdct_half = ff_imdct_half_sse; s->fft_permute = ff_fft_permute_sse; s->fft_calc = ff_fft_calc_sse; - } else if (has_vectors & FF_MM_3DNOWEXT && HAVE_AMD3DNOWEXT) { + } else if (has_vectors & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) { /* 3DNowEx for K7 */ s->imdct_calc = ff_imdct_calc_3dn2; s->imdct_half = ff_imdct_half_3dn2; s->fft_calc = ff_fft_calc_3dn2; - } else if (has_vectors & FF_MM_3DNOW && HAVE_AMD3DNOW) { + } else if (has_vectors & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) { /* 3DNow! for K6-2/3 */ s->imdct_calc = ff_imdct_calc_3dn; s->imdct_half = ff_imdct_half_3dn; @@ -46,8 +47,8 @@ av_cold void ff_fft_init_mmx(FFTContext *s) #if CONFIG_DCT av_cold void ff_dct_init_mmx(DCTContext *s) { - int has_vectors = mm_support(); - if (has_vectors & FF_MM_SSE && HAVE_SSE) + int has_vectors = av_get_cpu_flags(); + if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) s->dct32 = ff_dct32_float_sse; } #endif diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/fft_mmx.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/fft_mmx.asm index 31176d6c9..b75ec0cc5 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/fft_mmx.asm +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/fft_mmx.asm @@ -532,20 +532,15 @@ INIT_XMM unpckhps xmm0, xmm2 %endmacro -%macro PREROTATEW 3 ;addr1, addr2, xmm - movlps %1, %3 - movhps %2, %3 -%endmacro - %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5 movaps xmm6, [%4+%1*2] movaps %2, [%4+%1*2+0x10] movaps %3, xmm6 movaps xmm7, %2 - mulps xmm6, [%5+%1*1] - mulps %2, [%6+%1*1] - mulps %3, [%6+%1*1] - mulps xmm7, [%5+%1*1] + mulps xmm6, [%5+%1] + mulps %2, [%6+%1] + mulps %3, [%6+%1] + mulps xmm7, [%5+%1] subps %2, xmm6 addps %3, xmm7 %endmacro @@ -576,8 +571,6 @@ cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample %define rrevtab r10 %define rtcos r11 %define rtsin r12 - push r10 - push r11 push r12 push r13 push r14 @@ -620,21 +613,25 @@ cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample PREROTATER r4, r3, r2, rtcos, rtsin %ifdef ARCH_X86_64 - movzx r5, word [rrevtab+r4*1-4] - movzx r6, word [rrevtab+r4*1-2] - movzx r13, word [rrevtab+r3*1] - movzx r14, word [rrevtab+r3*1+2] - PREROTATEW [r1+r5 *8], [r1+r6 *8], xmm0 - PREROTATEW [r1+r13*8], [r1+r14*8], xmm1 + movzx r5, word [rrevtab+r4-4] + movzx r6, word [rrevtab+r4-2] + movzx r13, word [rrevtab+r3] + movzx r14, word [rrevtab+r3+2] + movlps [r1+r5 *8], xmm0 + movhps [r1+r6 *8], xmm0 + movlps [r1+r13*8], xmm1 + movhps [r1+r14*8], xmm1 add r4, 4 %else mov r6, [esp] - movzx r5, word [r6+r4*1-4] - movzx r4, word [r6+r4*1-2] - PREROTATEW [r1+r5*8], [r1+r4*8], xmm0 - movzx r5, word [r6+r3*1] - movzx r4, word [r6+r3*1+2] - PREROTATEW [r1+r5*8], [r1+r4*8], xmm1 + movzx r5, word [r6+r4-4] + movzx r4, word [r6+r4-2] + movlps [r1+r5*8], xmm0 + movhps [r1+r4*8], xmm0 + movzx r5, word [r6+r3] + movzx r4, word [r6+r3+2] + movlps [r1+r5*8], xmm1 + movhps [r1+r4*8], xmm1 %endif sub r3, 4 jns .pre @@ -663,8 +660,6 @@ cglobal imdct_half_sse, 3,7,8; FFTContext *s, FFTSample *output, const FFTSample pop r14 pop r13 pop r12 - pop r11 - pop r10 %else add esp, 12 %endif diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_chromamc.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_chromamc.asm new file mode 100644 index 000000000..6df82cc52 --- /dev/null +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_chromamc.asm @@ -0,0 +1,671 @@ +;****************************************************************************** +;* MMX/SSSE3-optimized functions for H264 chroma MC +;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>, +;* 2005-2008 Loren Merritt +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA + +rnd_rv40_2d_tbl: times 4 dw 0 + times 4 dw 16 + times 4 dw 32 + times 4 dw 16 + times 4 dw 32 + times 4 dw 28 + times 4 dw 32 + times 4 dw 28 + times 4 dw 0 + times 4 dw 32 + times 4 dw 16 + times 4 dw 32 + times 4 dw 32 + times 4 dw 28 + times 4 dw 32 + times 4 dw 28 +rnd_rv40_1d_tbl: times 4 dw 0 + times 4 dw 2 + times 4 dw 4 + times 4 dw 2 + times 4 dw 4 + times 4 dw 3 + times 4 dw 4 + times 4 dw 3 + times 4 dw 0 + times 4 dw 4 + times 4 dw 2 + times 4 dw 4 + times 4 dw 4 + times 4 dw 3 + times 4 dw 4 + times 4 dw 3 + +cextern pw_3 +cextern pw_4 +cextern pw_8 +cextern pw_28 +cextern pw_32 +cextern pw_64 + +SECTION .text + +%macro mv0_pixels_mc8 0 + lea r4, [r2*2 ] +.next4rows + movq mm0, [r1 ] + movq mm1, [r1+r2] + CHROMAMC_AVG mm0, [r0 ] + CHROMAMC_AVG mm1, [r0+r2] + movq [r0 ], mm0 + movq [r0+r2], mm1 + add r0, r4 + add r1, r4 + movq mm0, [r1 ] + movq mm1, [r1+r2] + CHROMAMC_AVG mm0, [r0 ] + CHROMAMC_AVG mm1, [r0+r2] + add r1, r4 + movq [r0 ], mm0 + movq [r0+r2], mm1 + add r0, r4 + sub r3d, 4 + jne .next4rows +%endmacro + +%macro chroma_mc8_mmx_func 3 +; put/avg_h264_chroma_mc8_mmx_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/, +; int stride, int h, int mx, int my) +cglobal %1_%2_chroma_mc8_%3, 6, 7, 0 +%ifdef ARCH_X86_64 + movsxd r2, r2d +%endif + mov r6d, r5d + or r6d, r4d + jne .at_least_one_non_zero + ; mx == 0 AND my == 0 - no filter needed + mv0_pixels_mc8 + REP_RET + +.at_least_one_non_zero +%ifidn %2, rv40 +%ifdef PIC +%define rnd_1d_rv40 r11 +%define rnd_2d_rv40 r11 +%else ; no-PIC +%define rnd_1d_rv40 rnd_rv40_1d_tbl +%define rnd_2d_rv40 rnd_rv40_2d_tbl +%endif +%ifdef ARCH_X86_64 + mov r10, r5 + and r10, 6 ; &~1 for mx/my=[0,7] + lea r10, [r10*4+r4] + sar r10d, 1 +%define rnd_bias r10 +%define dest_reg r0 +%else ; x86-32 + mov r0, r5 + and r0, 6 ; &~1 for mx/my=[0,7] + lea r0, [r0*4+r4] + sar r0d, 1 +%define rnd_bias r0 +%define dest_reg r5 +%endif +%else ; vc1, h264 +%define rnd_bias 0 +%define dest_reg r0 +%endif + + test r5d, r5d + mov r6, 1 + je .my_is_zero + test r4d, r4d + mov r6, r2 ; dxy = x ? 1 : stride + jne .both_non_zero +.my_is_zero + ; mx == 0 XOR my == 0 - 1 dimensional filter only + or r4d, r5d ; x + y + +%ifidn %2, rv40 +%ifdef PIC + lea r11, [rnd_rv40_1d_tbl] +%endif +%ifndef ARCH_X86_64 + mov r5, r0m +%endif +%endif + + movd m5, r4d + movq m4, [pw_8] + movq m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3 + punpcklwd m5, m5 + punpckldq m5, m5 ; mm5 = B = x + pxor m7, m7 + psubw m4, m5 ; mm4 = A = 8-x + +.next1drow + movq m0, [r1 ] ; mm0 = src[0..7] + movq m2, [r1+r6] ; mm1 = src[1..8] + + movq m1, m0 + movq m3, m2 + punpcklbw m0, m7 + punpckhbw m1, m7 + punpcklbw m2, m7 + punpckhbw m3, m7 + pmullw m0, m4 ; [mm0,mm1] = A * src[0..7] + pmullw m1, m4 + pmullw m2, m5 ; [mm2,mm3] = B * src[1..8] + pmullw m3, m5 + + paddw m0, m6 + paddw m1, m6 + paddw m0, m2 + paddw m1, m3 + psrlw m0, 3 + psrlw m1, 3 + packuswb m0, m1 + CHROMAMC_AVG m0, [dest_reg] + movq [dest_reg], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3 + + add dest_reg, r2 + add r1, r2 + dec r3d + jne .next1drow + REP_RET + +.both_non_zero ; general case, bilinear + movd m4, r4d ; x + movd m6, r5d ; y +%ifidn %2, rv40 +%ifdef PIC + lea r11, [rnd_rv40_2d_tbl] +%endif +%ifndef ARCH_X86_64 + mov r5, r0m +%endif +%endif + mov r6, rsp ; backup stack pointer + and rsp, ~(mmsize-1) ; align stack + sub rsp, 16 ; AA and DD + + punpcklwd m4, m4 + punpcklwd m6, m6 + punpckldq m4, m4 ; mm4 = x words + punpckldq m6, m6 ; mm6 = y words + movq m5, m4 + pmullw m4, m6 ; mm4 = x * y + psllw m5, 3 + psllw m6, 3 + movq m7, m5 + paddw m7, m6 + movq [rsp+8], m4 ; DD = x * y + psubw m5, m4 ; mm5 = B = 8x - xy + psubw m6, m4 ; mm6 = C = 8y - xy + paddw m4, [pw_64] + psubw m4, m7 ; mm4 = A = xy - (8x+8y) + 64 + pxor m7, m7 + movq [rsp ], m4 + + movq m0, [r1 ] ; mm0 = src[0..7] + movq m1, [r1+1] ; mm1 = src[1..8] +.next2drow + add r1, r2 + + movq m2, m0 + movq m3, m1 + punpckhbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpckhbw m3, m7 + pmullw m0, [rsp] + pmullw m2, [rsp] + pmullw m1, m5 + pmullw m3, m5 + paddw m2, m1 ; mm2 = A * src[0..3] + B * src[1..4] + paddw m3, m0 ; mm3 = A * src[4..7] + B * src[5..8] + + movq m0, [r1] + movq m1, m0 + punpcklbw m0, m7 + punpckhbw m1, m7 + pmullw m0, m6 + pmullw m1, m6 + paddw m2, m0 + paddw m3, m1 ; [mm2,mm3] += C * src[0..7] + + movq m1, [r1+1] + movq m0, m1 + movq m4, m1 + punpcklbw m0, m7 + punpckhbw m4, m7 + pmullw m0, [rsp+8] + pmullw m4, [rsp+8] + paddw m2, m0 + paddw m3, m4 ; [mm2,mm3] += D * src[1..8] + movq m0, [r1] + + paddw m2, [rnd_2d_%2+rnd_bias*8] + paddw m3, [rnd_2d_%2+rnd_bias*8] + psrlw m2, 6 + psrlw m3, 6 + packuswb m2, m3 + CHROMAMC_AVG m2, [dest_reg] + movq [dest_reg], m2 ; dst[0..7] = ([mm2,mm3] + rnd) >> 6 + + add dest_reg, r2 + dec r3d + jne .next2drow + mov rsp, r6 ; restore stack pointer + RET +%endmacro + +%macro chroma_mc4_mmx_func 3 +cglobal %1_%2_chroma_mc4_%3, 6, 6, 0 +%ifdef ARCH_X86_64 + movsxd r2, r2d +%endif + pxor m7, m7 + movd m2, r4d ; x + movd m3, r5d ; y + movq m4, [pw_8] + movq m5, [pw_8] + punpcklwd m2, m2 + punpcklwd m3, m3 + punpcklwd m2, m2 + punpcklwd m3, m3 + psubw m4, m2 + psubw m5, m3 + +%ifidn %2, rv40 +%ifdef PIC + lea r11, [rnd_rv40_2d_tbl] +%define rnd_2d_rv40 r11 +%else +%define rnd_2d_rv40 rnd_rv40_2d_tbl +%endif + and r5, 6 ; &~1 for mx/my=[0,7] + lea r5, [r5*4+r4] + sar r5d, 1 +%define rnd_bias r5 +%else ; vc1, h264 +%define rnd_bias 0 +%endif + + movd m0, [r1 ] + movd m6, [r1+1] + add r1, r2 + punpcklbw m0, m7 + punpcklbw m6, m7 + pmullw m0, m4 + pmullw m6, m2 + paddw m6, m0 + +.next2rows + movd m0, [r1 ] + movd m1, [r1+1] + add r1, r2 + punpcklbw m0, m7 + punpcklbw m1, m7 + pmullw m0, m4 + pmullw m1, m2 + paddw m1, m0 + movq m0, m1 + + pmullw m6, m5 + pmullw m1, m3 + paddw m6, [rnd_2d_%2+rnd_bias*8] + paddw m1, m6 + psrlw m1, 6 + packuswb m1, m1 + CHROMAMC_AVG4 m1, m6, [r0] + movd [r0], m1 + add r0, r2 + + movd m6, [r1 ] + movd m1, [r1+1] + add r1, r2 + punpcklbw m6, m7 + punpcklbw m1, m7 + pmullw m6, m4 + pmullw m1, m2 + paddw m1, m6 + movq m6, m1 + pmullw m0, m5 + pmullw m1, m3 + paddw m0, [rnd_2d_%2+rnd_bias*8] + paddw m1, m0 + psrlw m1, 6 + packuswb m1, m1 + CHROMAMC_AVG4 m1, m0, [r0] + movd [r0], m1 + add r0, r2 + sub r3d, 2 + jnz .next2rows + REP_RET +%endmacro + +%macro chroma_mc2_mmx_func 3 +cglobal %1_%2_chroma_mc2_%3, 6, 7, 0 +%ifdef ARCH_X86_64 + movsxd r2, r2d +%endif + + mov r6d, r4d + shl r4d, 16 + sub r4d, r6d + add r4d, 8 + imul r5d, r4d ; x*y<<16 | y*(8-x) + shl r4d, 3 + sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y) + + movd m5, r4d + movd m6, r5d + punpckldq m5, m5 ; mm5 = {A,B,A,B} + punpckldq m6, m6 ; mm6 = {C,D,C,D} + pxor m7, m7 + movd m2, [r1] + punpcklbw m2, m7 + pshufw m2, m2, 0x94 ; mm0 = src[0,1,1,2] + +.nextrow + add r1, r2 + movq m1, m2 + pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2] + movd m0, [r1] + punpcklbw m0, m7 + pshufw m0, m0, 0x94 ; mm0 = src[0,1,1,2] + movq m2, m0 + pmaddwd m0, m6 + paddw m1, [rnd_2d_%2] + paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2] + psrlw m1, 6 + packssdw m1, m7 + packuswb m1, m7 + CHROMAMC_AVG4 m1, m3, [r0] + movd r5d, m1 + mov [r0], r5w + add r0, r2 + sub r3d, 1 + jnz .nextrow + REP_RET +%endmacro + +%define rnd_1d_h264 pw_4 +%define rnd_2d_h264 pw_32 +%define rnd_1d_vc1 pw_3 +%define rnd_2d_vc1 pw_28 + +%macro NOTHING 2-3 +%endmacro +%macro DIRECT_AVG 2 + PAVG %1, %2 +%endmacro +%macro COPY_AVG 3 + movd %2, %3 + PAVG %1, %2 +%endmacro + +INIT_MMX +%define CHROMAMC_AVG NOTHING +%define CHROMAMC_AVG4 NOTHING +chroma_mc8_mmx_func put, h264, mmx_rnd +chroma_mc8_mmx_func put, vc1, mmx_nornd +chroma_mc8_mmx_func put, rv40, mmx +chroma_mc4_mmx_func put, h264, mmx +chroma_mc4_mmx_func put, rv40, mmx +chroma_mc2_mmx_func put, h264, mmx2 + +%define CHROMAMC_AVG DIRECT_AVG +%define CHROMAMC_AVG4 COPY_AVG +%define PAVG pavgb +chroma_mc8_mmx_func avg, h264, mmx2_rnd +chroma_mc8_mmx_func avg, vc1, mmx2_nornd +chroma_mc8_mmx_func avg, rv40, mmx2 +chroma_mc4_mmx_func avg, h264, mmx2 +chroma_mc4_mmx_func avg, rv40, mmx2 +chroma_mc2_mmx_func avg, h264, mmx2 + +%define PAVG pavgusb +chroma_mc8_mmx_func avg, h264, 3dnow_rnd +chroma_mc8_mmx_func avg, vc1, 3dnow_nornd +chroma_mc8_mmx_func avg, rv40, 3dnow +chroma_mc4_mmx_func avg, h264, 3dnow +chroma_mc4_mmx_func avg, rv40, 3dnow + +%macro chroma_mc8_ssse3_func 3 +cglobal %1_%2_chroma_mc8_%3, 6, 7, 8 +%ifdef ARCH_X86_64 + movsxd r2, r2d +%endif + mov r6d, r5d + or r6d, r4d + jne .at_least_one_non_zero + ; mx == 0 AND my == 0 - no filter needed + mv0_pixels_mc8 + REP_RET + +.at_least_one_non_zero + test r5d, r5d + je .my_is_zero + test r4d, r4d + je .mx_is_zero + + ; general case, bilinear + mov r6d, r4d + shl r4d, 8 + sub r4, r6 + add r4, 8 ; x*288+8 = x<<8 | (8-x) + mov r6, 8 + sub r6d, r5d + imul r6, r4 ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x) + imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x) + + movd m7, r6d + movd m6, r4d + movdqa m5, [rnd_2d_%2] + pshuflw m7, m7, 0 + pshuflw m6, m6, 0 + movlhps m7, m7 + movlhps m6, m6 + + movq m0, [r1 ] + movq m1, [r1 +1] + punpcklbw m0, m1 + add r1, r2 +.next2rows + movq m1, [r1 ] + movq m2, [r1 +1] + movq m3, [r1+r2 ] + movq m4, [r1+r2+1] + lea r1, [r1+r2*2] + punpcklbw m1, m2 + punpcklbw m3, m4 + movdqa m2, m1 + movdqa m4, m3 + pmaddubsw m0, m7 + pmaddubsw m1, m6 + pmaddubsw m2, m7 + pmaddubsw m3, m6 + paddw m0, m5 + paddw m2, m5 + paddw m1, m0 + paddw m3, m2 + movdqa m0, m4 + psrlw m1, 6 + psrlw m3, 6 +%ifidn %1, avg + movq m2, [r0 ] + movhps m2, [r0+r2] +%endif + packuswb m1, m3 + CHROMAMC_AVG m1, m2 + movq [r0 ], m1 + movhps [r0+r2], m1 + sub r3d, 2 + lea r0, [r0+r2*2] + jg .next2rows + REP_RET + +.my_is_zero + mov r5d, r4d + shl r4d, 8 + add r4, 8 + sub r4, r5 ; 255*x+8 = x<<8 | (8-x) + movd m7, r4d + movq m6, [rnd_1d_%2] + pshuflw m7, m7, 0 + movlhps m6, m6 + movlhps m7, m7 + +.next2xrows + movq m0, [r1 ] + movq m1, [r1 +1] + movq m2, [r1+r2 ] + movq m3, [r1+r2+1] + punpcklbw m0, m1 + punpcklbw m2, m3 + pmaddubsw m0, m7 + pmaddubsw m2, m7 +%ifidn %1, avg + movq m4, [r0 ] + movhps m4, [r0+r2] +%endif + paddw m0, m6 + paddw m2, m6 + psrlw m0, 3 + psrlw m2, 3 + packuswb m0, m2 + CHROMAMC_AVG m0, m4 + movq [r0 ], m0 + movhps [r0+r2], m0 + sub r3d, 2 + lea r0, [r0+r2*2] + lea r1, [r1+r2*2] + jg .next2xrows + REP_RET + +.mx_is_zero + mov r4d, r5d + shl r5d, 8 + add r5, 8 + sub r5, r4 ; 255*y+8 = y<<8 | (8-y) + movd m7, r5d + movq m6, [rnd_1d_%2] + pshuflw m7, m7, 0 + movlhps m6, m6 + movlhps m7, m7 + +.next2yrows + movq m0, [r1 ] + movq m1, [r1+r2 ] + movdqa m2, m1 + movq m3, [r1+r2*2] + punpcklbw m0, m1 + punpcklbw m2, m3 + pmaddubsw m0, m7 + pmaddubsw m2, m7 +%ifidn %1, avg + movq m4, [r0 ] + movhps m4, [r0+r2] +%endif + paddw m0, m6 + paddw m2, m6 + psrlw m0, 3 + psrlw m2, 3 + packuswb m0, m2 + CHROMAMC_AVG m0, m4 + movq [r0 ], m0 + movhps [r0+r2], m0 + sub r3d, 2 + lea r0, [r0+r2*2] + lea r1, [r1+r2*2] + jg .next2yrows + REP_RET +%endmacro + +%macro chroma_mc4_ssse3_func 3 +cglobal %1_%2_chroma_mc4_%3, 6, 7, 0 +%ifdef ARCH_X86_64 + movsxd r2, r2d +%endif + mov r6, r4 + shl r4d, 8 + sub r4d, r6d + add r4d, 8 ; x*288+8 + mov r6, 8 + sub r6d, r5d + imul r6d, r4d ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x) + imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x) + + movd m7, r6d + movd m6, r4d + movq m5, [pw_32] + pshufw m7, m7, 0 + pshufw m6, m6, 0 + + movd m0, [r1 ] + punpcklbw m0, [r1 +1] + add r1, r2 +.next2rows + movd m1, [r1 ] + movd m3, [r1+r2 ] + punpcklbw m1, [r1 +1] + punpcklbw m3, [r1+r2+1] + lea r1, [r1+r2*2] + movq m2, m1 + movq m4, m3 + pmaddubsw m0, m7 + pmaddubsw m1, m6 + pmaddubsw m2, m7 + pmaddubsw m3, m6 + paddw m0, m5 + paddw m2, m5 + paddw m1, m0 + paddw m3, m2 + movq m0, m4 + psrlw m1, 6 + psrlw m3, 6 + packuswb m1, m1 + packuswb m3, m3 + CHROMAMC_AVG m1, [r0 ] + CHROMAMC_AVG m3, [r0+r2] + movd [r0 ], m1 + movd [r0+r2], m3 + sub r3d, 2 + lea r0, [r0+r2*2] + jg .next2rows + REP_RET +%endmacro + +%define CHROMAMC_AVG NOTHING +INIT_XMM +chroma_mc8_ssse3_func put, h264, ssse3_rnd +chroma_mc8_ssse3_func put, vc1, ssse3_nornd +INIT_MMX +chroma_mc4_ssse3_func put, h264, ssse3 + +%define CHROMAMC_AVG DIRECT_AVG +%define PAVG pavgb +INIT_XMM +chroma_mc8_ssse3_func avg, h264, ssse3_rnd +chroma_mc8_ssse3_func avg, vc1, ssse3_nornd +INIT_MMX +chroma_mc4_ssse3_func avg, h264, ssse3 diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock_sse2.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock.asm index a9e6dea3d..fb9cacfd1 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock_sse2.asm +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock.asm @@ -4,6 +4,7 @@ ;* Copyright (C) 2005-2008 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> +;* Jason Garrett-Glaser <darkshikari@gmail.com> ;* ;* This file is part of FFmpeg. ;* @@ -23,12 +24,14 @@ ;****************************************************************************** %include "x86inc.asm" +%include "x86util.asm" SECTION_RODATA -pb_00: times 16 db 0x00 -pb_01: times 16 db 0x01 -pb_03: times 16 db 0x03 -pb_a1: times 16 db 0xa1 + +cextern pb_0 +cextern pb_1 +cextern pb_3 +cextern pb_A1 SECTION .text @@ -104,7 +107,7 @@ SECTION .text movd %8, m5 %endmacro -%macro SBUTTERFLY 4 +%macro SBUTTERFLY3 4 movq %4, %2 punpckl%1 %2, %3 punpckh%1 %4, %3 @@ -120,19 +123,19 @@ SECTION .text movq m4, %5 movq m5, %6 movq m6, %7 - SBUTTERFLY bw, m0, m1, m7 - SBUTTERFLY bw, m2, m3, m1 - SBUTTERFLY bw, m4, m5, m3 + SBUTTERFLY3 bw, m0, m1, m7 + SBUTTERFLY3 bw, m2, m3, m1 + SBUTTERFLY3 bw, m4, m5, m3 movq [%9+0x10], m1 - SBUTTERFLY bw, m6, %8, m5 - SBUTTERFLY wd, m0, m2, m1 - SBUTTERFLY wd, m4, m6, m2 + SBUTTERFLY3 bw, m6, %8, m5 + SBUTTERFLY3 wd, m0, m2, m1 + SBUTTERFLY3 wd, m4, m6, m2 punpckhdq m0, m4 movq [%9+0x00], m0 - SBUTTERFLY wd, m7, [%9+0x10], m6 - SBUTTERFLY wd, m3, m5, m4 - SBUTTERFLY dq, m7, m3, m0 - SBUTTERFLY dq, m1, m2, m5 + SBUTTERFLY3 wd, m7, [%9+0x10], m6 + SBUTTERFLY3 wd, m3, m5, m4 + SBUTTERFLY3 dq, m7, m3, m0 + SBUTTERFLY3 dq, m1, m2, m5 punpckldq m6, m4 movq [%9+0x10], m1 movq [%9+0x20], m5 @@ -151,25 +154,25 @@ SECTION .text movq m4, %5 movq m5, %6 movq m6, %7 - SBUTTERFLY bw, m0, m1, m7 - SBUTTERFLY bw, m2, m3, m1 - SBUTTERFLY bw, m4, m5, m3 - SBUTTERFLY bw, m6, %8, m5 + SBUTTERFLY3 bw, m0, m1, m7 + SBUTTERFLY3 bw, m2, m3, m1 + SBUTTERFLY3 bw, m4, m5, m3 + SBUTTERFLY3 bw, m6, %8, m5 movq %9, m3 - SBUTTERFLY wd, m0, m2, m3 - SBUTTERFLY wd, m4, m6, m2 - SBUTTERFLY wd, m7, m1, m6 + SBUTTERFLY3 wd, m0, m2, m3 + SBUTTERFLY3 wd, m4, m6, m2 + SBUTTERFLY3 wd, m7, m1, m6 movq %11, m2 movq m2, %9 - SBUTTERFLY wd, m2, m5, m1 - SBUTTERFLY dq, m0, m4, m5 - SBUTTERFLY dq, m7, m2, m4 + SBUTTERFLY3 wd, m2, m5, m1 + SBUTTERFLY3 dq, m0, m4, m5 + SBUTTERFLY3 dq, m7, m2, m4 movq %9, m0 movq %10, m5 movq %13, m7 movq %14, m4 - SBUTTERFLY dq, m3, %11, m0 - SBUTTERFLY dq, m6, m1, m5 + SBUTTERFLY3 dq, m3, %11, m0 + SBUTTERFLY3 dq, m6, m1, m5 movq %11, m3 movq %12, m0 movq %15, m6 @@ -235,19 +238,19 @@ SECTION .text ; clobbers: m0,3-6 %macro DEBLOCK_P0_Q0 0 mova m5, m1 - pxor m5, m2 ; p0^q0 - pand m5, [pb_01] ; (p0^q0)&1 + pxor m5, m2 ; p0^q0 + pand m5, [pb_1] ; (p0^q0)&1 pcmpeqb m4, m4 pxor m3, m4 - pavgb m3, m0 ; (p1 - q1 + 256)>>1 - pavgb m3, [pb_03] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 + pavgb m3, m0 ; (p1 - q1 + 256)>>1 + pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 pxor m4, m1 - pavgb m4, m2 ; (q0 - p0 + 256)>>1 + pavgb m4, m2 ; (q0 - p0 + 256)>>1 pavgb m3, m5 - paddusb m3, m4 ; d+128+33 - mova m6, [pb_a1] + paddusb m3, m4 ; d+128+33 + mova m6, [pb_A1] psubusb m6, m3 - psubusb m3, [pb_a1] + psubusb m3, [pb_A1] pminub m6, m7 pminub m3, m7 psubusb m1, m6 @@ -263,10 +266,10 @@ SECTION .text %macro LUMA_Q1 6 mova %6, m1 pavgb %6, m2 - pavgb %2, %6 ; avg(p2,avg(p0,q0)) + pavgb %2, %6 ; avg(p2,avg(p0,q0)) pxor %6, %3 - pand %6, [pb_01] ; (p2^avg(p0,q0))&1 - psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 + pand %6, [pb_1] ; (p2^avg(p0,q0))&1 + psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 mova %6, %1 psubusb %6, %5 paddusb %5, %1 @@ -495,6 +498,8 @@ cglobal x264_deblock_h_luma_%1, 0,5 RET %endmacro ; DEBLOCK_LUMA +INIT_MMX +DEBLOCK_LUMA mmxext, v8, 8 INIT_XMM DEBLOCK_LUMA sse2, v, 16 @@ -517,9 +522,9 @@ DEBLOCK_LUMA sse2, v, 16 mova t3, t2 mova t4, t2 psrlw t2, 1 - pavgb t2, mpb_00 + pavgb t2, mpb_0 pxor t2, t0 - pand t2, mpb_01 + pand t2, mpb_1 psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4; mova t1, p2 @@ -528,21 +533,21 @@ DEBLOCK_LUMA sse2, v, 16 psubb t2, q1 paddb t3, t3 psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1 - pand t2, mpb_01 + pand t2, mpb_1 psubb t1, t2 pavgb t1, p1 pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2 psrlw t3, 2 - pavgb t3, mpb_00 + pavgb t3, mpb_0 pxor t3, t1 - pand t3, mpb_01 + pand t3, mpb_1 psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8 mova t3, p0 mova t2, p0 pxor t3, q1 pavgb t2, q1 - pand t3, mpb_01 + pand t3, mpb_1 psubb t2, t3 pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4 @@ -562,9 +567,9 @@ DEBLOCK_LUMA sse2, v, 16 paddb t2, t2 paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0 psrlw t2, 2 - pavgb t2, mpb_00 + pavgb t2, mpb_0 pxor t2, t1 - pand t2, mpb_01 + pand t2, mpb_1 psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8 pxor t0, p1 @@ -603,8 +608,8 @@ DEBLOCK_LUMA sse2, v, 16 %define mask0 m12 %define mask1p m13 %define mask1q [rsp-24] - %define mpb_00 m14 - %define mpb_01 m15 + %define mpb_0 m14 + %define mpb_1 m15 %else %define spill(x) [esp+16*x+((stack_offset+4)&15)] %define p2 [r4+r1] @@ -614,8 +619,8 @@ DEBLOCK_LUMA sse2, v, 16 %define mask0 spill(2) %define mask1p spill(3) %define mask1q spill(4) - %define mpb_00 [pb_00] - %define mpb_01 [pb_01] + %define mpb_0 [pb_0] + %define mpb_1 [pb_1] %endif ;----------------------------------------------------------------------------- @@ -638,12 +643,12 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 mova q0, [r0] mova q1, [r0+r1] %ifdef ARCH_X86_64 - pxor mpb_00, mpb_00 - mova mpb_01, [pb_01] + pxor mpb_0, mpb_0 + mova mpb_1, [pb_1] LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 SWAP 7, 12 ; m12=mask0 - pavgb t5, mpb_00 - pavgb t5, mpb_01 ; alpha/4+1 + pavgb t5, mpb_0 + pavgb t5, mpb_1 ; alpha/4+1 movdqa p2, [r4+r1] movdqa q2, [r0+2*r1] DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1 @@ -658,8 +663,8 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 mova m4, t5 mova mask0, m7 - pavgb m4, [pb_00] - pavgb m4, [pb_01] ; alpha/4+1 + pavgb m4, [pb_0] + pavgb m4, [pb_1] ; alpha/4+1 DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 pand m6, mask0 DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 @@ -759,3 +764,126 @@ DEBLOCK_LUMA_INTRA sse2, v INIT_MMX DEBLOCK_LUMA_INTRA mmxext, v8 %endif + + + +INIT_MMX + +%macro CHROMA_V_START 0 + dec r2d ; alpha-1 + dec r3d ; beta-1 + mov t5, r0 + sub t5, r1 + sub t5, r1 +%endmacro + +%macro CHROMA_H_START 0 + dec r2d + dec r3d + sub r0, 2 + lea t6, [r1*3] + mov t5, r0 + add r0, t6 +%endmacro + +%define t5 r5 +%define t6 r6 + +;----------------------------------------------------------------------------- +; void x264_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +cglobal x264_deblock_v_chroma_mmxext, 5,6 + CHROMA_V_START + movq m0, [t5] + movq m1, [t5+r1] + movq m2, [r0] + movq m3, [r0+r1] + call x264_chroma_inter_body_mmxext + movq [t5+r1], m1 + movq [r0], m2 + RET + +;----------------------------------------------------------------------------- +; void x264_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +;----------------------------------------------------------------------------- +cglobal x264_deblock_h_chroma_mmxext, 5,7 +%ifdef ARCH_X86_64 + %define buf0 [rsp-24] + %define buf1 [rsp-16] +%else + %define buf0 r0m + %define buf1 r2m +%endif + CHROMA_H_START + TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) + movq buf0, m0 + movq buf1, m3 + call x264_chroma_inter_body_mmxext + movq m0, buf0 + movq m3, buf1 + TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + RET + +ALIGN 16 +x264_chroma_inter_body_mmxext: + LOAD_MASK r2d, r3d + movd m6, [r4] ; tc0 + punpcklbw m6, m6 + pand m7, m6 + DEBLOCK_P0_Q0 + ret + + + +; in: %1=p0 %2=p1 %3=q1 +; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 +%macro CHROMA_INTRA_P0 3 + movq m4, %1 + pxor m4, %3 + pand m4, [pb_1] ; m4 = (p0^q1)&1 + pavgb %1, %3 + psubusb %1, m4 + pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) +%endmacro + +%define t5 r4 +%define t6 r5 + +;----------------------------------------------------------------------------- +; void x264_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal x264_deblock_v_chroma_intra_mmxext, 4,5 + CHROMA_V_START + movq m0, [t5] + movq m1, [t5+r1] + movq m2, [r0] + movq m3, [r0+r1] + call x264_chroma_intra_body_mmxext + movq [t5+r1], m1 + movq [r0], m2 + RET + +;----------------------------------------------------------------------------- +; void x264_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) +;----------------------------------------------------------------------------- +cglobal x264_deblock_h_chroma_intra_mmxext, 4,6 + CHROMA_H_START + TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) + call x264_chroma_intra_body_mmxext + TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + RET + +ALIGN 16 +x264_chroma_intra_body_mmxext: + LOAD_MASK r2d, r3d + movq m5, m1 + movq m6, m2 + CHROMA_INTRA_P0 m1, m0, m3 + CHROMA_INTRA_P0 m2, m3, m0 + psubb m1, m5 + psubb m2, m6 + pand m1, m7 + pand m2, m7 + paddb m1, m5 + paddb m2, m6 + ret diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_idct.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_idct.asm new file mode 100644 index 000000000..3311ab559 --- /dev/null +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_idct.asm @@ -0,0 +1,865 @@ +;***************************************************************************** +;* MMX/SSE2-optimized H.264 iDCT +;***************************************************************************** +;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt +;* Copyright (C) 2003-2008 x264 project +;* +;* Authors: Laurent Aimar <fenrir@via.ecp.fr> +;* Loren Merritt <lorenm@u.washington.edu> +;* Holger Lubitz <hal@duncan.ol.sub.de> +;* Min Chen <chenm001.163.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;***************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA + +; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split +scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8 + db 6+1*8, 7+1*8, 6+2*8, 7+2*8 + db 4+3*8, 5+3*8, 4+4*8, 5+4*8 + db 6+3*8, 7+3*8, 6+4*8, 7+4*8 + db 1+1*8, 2+1*8 + db 1+2*8, 2+2*8 + db 1+4*8, 2+4*8 + db 1+5*8, 2+5*8 +%ifdef PIC +%define scan8 r11 +%else +%define scan8 scan8_mem +%endif + +cextern pw_32 + +SECTION .text + +; %1=uint8_t *dst, %2=int16_t *block, %3=int stride +%macro IDCT4_ADD 3 + ; Load dct coeffs + movq m0, [%2] + movq m1, [%2+8] + movq m2, [%2+16] + movq m3, [%2+24] + + IDCT4_1D 0, 1, 2, 3, 4, 5 + mova m6, [pw_32] + TRANSPOSE4x4W 0, 1, 2, 3, 4 + paddw m0, m6 + IDCT4_1D 0, 1, 2, 3, 4, 5 + pxor m7, m7 + + STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3 + lea %1, [%1+%3*2] + STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3 +%endmacro + +INIT_MMX +; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride) +cglobal h264_idct_add_mmx, 3, 3, 0 + IDCT4_ADD r0, r1, r2 + RET + +%macro IDCT8_1D 2 + mova m4, m5 + mova m0, m1 + psraw m4, 1 + psraw m1, 1 + paddw m4, m5 + paddw m1, m0 + paddw m4, m7 + paddw m1, m5 + psubw m4, m0 + paddw m1, m3 + + psubw m0, m3 + psubw m5, m3 + paddw m0, m7 + psubw m5, m7 + psraw m3, 1 + psraw m7, 1 + psubw m0, m3 + psubw m5, m7 + + mova m3, m4 + mova m7, m1 + psraw m1, 2 + psraw m3, 2 + paddw m3, m0 + psraw m0, 2 + paddw m1, m5 + psraw m5, 2 + psubw m0, m4 + psubw m7, m5 + + mova m4, m2 + mova m5, m6 + psraw m4, 1 + psraw m6, 1 + psubw m4, m5 + paddw m6, m2 + + mova m2, %1 + mova m5, %2 + SUMSUB_BA m5, m2 + SUMSUB_BA m6, m5 + SUMSUB_BA m4, m2 + SUMSUB_BA m7, m6 + SUMSUB_BA m0, m4 + SUMSUB_BA m3, m2 + SUMSUB_BA m1, m5 + SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567 +%endmacro + +%macro IDCT8_1D_FULL 1 + mova m7, [%1+112] + mova m6, [%1+ 96] + mova m5, [%1+ 80] + mova m3, [%1+ 48] + mova m2, [%1+ 32] + mova m1, [%1+ 16] + IDCT8_1D [%1], [%1+ 64] +%endmacro + +; %1=int16_t *block, %2=int16_t *dstblock +%macro IDCT8_ADD_MMX_START 2 + IDCT8_1D_FULL %1 + mova [%1], m7 + TRANSPOSE4x4W 0, 1, 2, 3, 7 + mova m7, [%1] + mova [%2 ], m0 + mova [%2+16], m1 + mova [%2+32], m2 + mova [%2+48], m3 + TRANSPOSE4x4W 4, 5, 6, 7, 3 + mova [%2+ 8], m4 + mova [%2+24], m5 + mova [%2+40], m6 + mova [%2+56], m7 +%endmacro + +; %1=uint8_t *dst, %2=int16_t *block, %3=int stride +%macro IDCT8_ADD_MMX_END 3 + IDCT8_1D_FULL %2 + mova [%2 ], m5 + mova [%2+16], m6 + mova [%2+32], m7 + + pxor m7, m7 + STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3 + lea %1, [%1+%3*2] + STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3 + mova m0, [%2 ] + mova m1, [%2+16] + mova m2, [%2+32] + lea %1, [%1+%3*2] + STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3 + lea %1, [%1+%3*2] + STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3 +%endmacro + +INIT_MMX +; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) +cglobal h264_idct8_add_mmx, 3, 4, 0 + %assign pad 128+4-(stack_offset&7) + SUB rsp, pad + + add word [r1], 32 + IDCT8_ADD_MMX_START r1 , rsp + IDCT8_ADD_MMX_START r1+8, rsp+64 + lea r3, [r0+4] + IDCT8_ADD_MMX_END r0 , rsp, r2 + IDCT8_ADD_MMX_END r3 , rsp+8, r2 + + ADD rsp, pad + RET + +; %1=uint8_t *dst, %2=int16_t *block, %3=int stride +%macro IDCT8_ADD_SSE 4 + IDCT8_1D_FULL %2 +%ifdef ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16] +%endif + paddw m0, [pw_32] + +%ifndef ARCH_X86_64 + mova [%2 ], m0 + mova [%2+16], m4 + IDCT8_1D [%2], [%2+ 16] + mova [%2 ], m6 + mova [%2+16], m7 +%else + SWAP 0, 8 + SWAP 4, 9 + IDCT8_1D m8, m9 + SWAP 6, 8 + SWAP 7, 9 +%endif + + pxor m7, m7 + lea %4, [%3*3] + STORE_DIFF m0, m6, m7, [%1 ] + STORE_DIFF m1, m6, m7, [%1+%3 ] + STORE_DIFF m2, m6, m7, [%1+%3*2] + STORE_DIFF m3, m6, m7, [%1+%4 ] +%ifndef ARCH_X86_64 + mova m0, [%2 ] + mova m1, [%2+16] +%else + SWAP 0, 8 + SWAP 1, 9 +%endif + lea %1, [%1+%3*4] + STORE_DIFF m4, m6, m7, [%1 ] + STORE_DIFF m5, m6, m7, [%1+%3 ] + STORE_DIFF m0, m6, m7, [%1+%3*2] + STORE_DIFF m1, m6, m7, [%1+%4 ] +%endmacro + +INIT_XMM +; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride) +cglobal h264_idct8_add_sse2, 3, 4, 10 + IDCT8_ADD_SSE r0, r1, r2, r3 + RET + +%macro DC_ADD_MMX2_INIT 2-3 +%if %0 == 2 + movsx %1, word [%1] + add %1, 32 + sar %1, 6 + movd m0, %1 + lea %1, [%2*3] +%else + add %3, 32 + sar %3, 6 + movd m0, %3 + lea %3, [%2*3] +%endif + pshufw m0, m0, 0 + pxor m1, m1 + psubw m1, m0 + packuswb m0, m0 + packuswb m1, m1 +%endmacro + +%macro DC_ADD_MMX2_OP 3-4 + %1 m2, [%2 ] + %1 m3, [%2+%3 ] + %1 m4, [%2+%3*2] + %1 m5, [%2+%4 ] + paddusb m2, m0 + paddusb m3, m0 + paddusb m4, m0 + paddusb m5, m0 + psubusb m2, m1 + psubusb m3, m1 + psubusb m4, m1 + psubusb m5, m1 + %1 [%2 ], m2 + %1 [%2+%3 ], m3 + %1 [%2+%3*2], m4 + %1 [%2+%4 ], m5 +%endmacro + +INIT_MMX +; ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) +cglobal h264_idct_dc_add_mmx2, 3, 3, 0 + DC_ADD_MMX2_INIT r1, r2 + DC_ADD_MMX2_OP movh, r0, r2, r1 + RET + +; ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) +cglobal h264_idct8_dc_add_mmx2, 3, 3, 0 + DC_ADD_MMX2_INIT r1, r2 + DC_ADD_MMX2_OP mova, r0, r2, r1 + lea r0, [r0+r2*4] + DC_ADD_MMX2_OP mova, r0, r2, r1 + RET + +; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct_add16_mmx, 5, 7, 0 + xor r5, r5 +%ifdef PIC + lea r11, [scan8_mem] +%endif +.nextblock + movzx r6, byte [scan8+r5] + movzx r6, byte [r4+r6] + test r6, r6 + jz .skipblock + mov r6d, dword [r1+r5*4] + lea r6, [r0+r6] + IDCT4_ADD r6, r2, r3 +.skipblock + inc r5 + add r2, 32 + cmp r5, 16 + jl .nextblock + REP_RET + +; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct8_add4_mmx, 5, 7, 0 + %assign pad 128+4-(stack_offset&7) + SUB rsp, pad + + xor r5, r5 +%ifdef PIC + lea r11, [scan8_mem] +%endif +.nextblock + movzx r6, byte [scan8+r5] + movzx r6, byte [r4+r6] + test r6, r6 + jz .skipblock + mov r6d, dword [r1+r5*4] + lea r6, [r0+r6] + add word [r2], 32 + IDCT8_ADD_MMX_START r2 , rsp + IDCT8_ADD_MMX_START r2+8, rsp+64 + IDCT8_ADD_MMX_END r6 , rsp, r3 + mov r6d, dword [r1+r5*4] + lea r6, [r0+r6+4] + IDCT8_ADD_MMX_END r6 , rsp+8, r3 +.skipblock + add r5, 4 + add r2, 128 + cmp r5, 16 + jl .nextblock + ADD rsp, pad + RET + +; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct_add16_mmx2, 5, 7, 0 + xor r5, r5 +%ifdef PIC + lea r11, [scan8_mem] +%endif +.nextblock + movzx r6, byte [scan8+r5] + movzx r6, byte [r4+r6] + test r6, r6 + jz .skipblock + cmp r6, 1 + jnz .no_dc + movsx r6, word [r2] + test r6, r6 + jz .no_dc + DC_ADD_MMX2_INIT r2, r3, r6 +%ifdef ARCH_X86_64 +%define dst_reg r10 +%define dst_regd r10d +%else +%define dst_reg r1 +%define dst_regd r1d +%endif + mov dst_regd, dword [r1+r5*4] + lea dst_reg, [r0+dst_reg] + DC_ADD_MMX2_OP movh, dst_reg, r3, r6 +%ifndef ARCH_X86_64 + mov r1, r1m +%endif + inc r5 + add r2, 32 + cmp r5, 16 + jl .nextblock + REP_RET +.no_dc + mov r6d, dword [r1+r5*4] + lea r6, [r0+r6] + IDCT4_ADD r6, r2, r3 +.skipblock + inc r5 + add r2, 32 + cmp r5, 16 + jl .nextblock + REP_RET + +; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct_add16intra_mmx, 5, 7, 0 + xor r5, r5 +%ifdef PIC + lea r11, [scan8_mem] +%endif +.nextblock + movzx r6, byte [scan8+r5] + movzx r6, byte [r4+r6] + or r6w, word [r2] + test r6, r6 + jz .skipblock + mov r6d, dword [r1+r5*4] + lea r6, [r0+r6] + IDCT4_ADD r6, r2, r3 +.skipblock + inc r5 + add r2, 32 + cmp r5, 16 + jl .nextblock + REP_RET + +; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct_add16intra_mmx2, 5, 7, 0 + xor r5, r5 +%ifdef PIC + lea r11, [scan8_mem] +%endif +.nextblock + movzx r6, byte [scan8+r5] + movzx r6, byte [r4+r6] + test r6, r6 + jz .try_dc + mov r6d, dword [r1+r5*4] + lea r6, [r0+r6] + IDCT4_ADD r6, r2, r3 + inc r5 + add r2, 32 + cmp r5, 16 + jl .nextblock + REP_RET +.try_dc + movsx r6, word [r2] + test r6, r6 + jz .skipblock + DC_ADD_MMX2_INIT r2, r3, r6 +%ifdef ARCH_X86_64 +%define dst_reg r10 +%define dst_regd r10d +%else +%define dst_reg r1 +%define dst_regd r1d +%endif + mov dst_regd, dword [r1+r5*4] + lea dst_reg, [r0+dst_reg] + DC_ADD_MMX2_OP movh, dst_reg, r3, r6 +%ifndef ARCH_X86_64 + mov r1, r1m +%endif +.skipblock + inc r5 + add r2, 32 + cmp r5, 16 + jl .nextblock + REP_RET + +; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct8_add4_mmx2, 5, 7, 0 + %assign pad 128+4-(stack_offset&7) + SUB rsp, pad + + xor r5, r5 +%ifdef PIC + lea r11, [scan8_mem] +%endif +.nextblock + movzx r6, byte [scan8+r5] + movzx r6, byte [r4+r6] + test r6, r6 + jz .skipblock + cmp r6, 1 + jnz .no_dc + movsx r6, word [r2] + test r6, r6 + jz .no_dc + DC_ADD_MMX2_INIT r2, r3, r6 +%ifdef ARCH_X86_64 +%define dst_reg r10 +%define dst_regd r10d +%else +%define dst_reg r1 +%define dst_regd r1d +%endif + mov dst_regd, dword [r1+r5*4] + lea dst_reg, [r0+dst_reg] + DC_ADD_MMX2_OP mova, dst_reg, r3, r6 + lea dst_reg, [dst_reg+r3*4] + DC_ADD_MMX2_OP mova, dst_reg, r3, r6 +%ifndef ARCH_X86_64 + mov r1, r1m +%endif + add r5, 4 + add r2, 128 + cmp r5, 16 + jl .nextblock + + ADD rsp, pad + RET +.no_dc + mov r6d, dword [r1+r5*4] + lea r6, [r0+r6] + add word [r2], 32 + IDCT8_ADD_MMX_START r2 , rsp + IDCT8_ADD_MMX_START r2+8, rsp+64 + IDCT8_ADD_MMX_END r6 , rsp, r3 + mov r6d, dword [r1+r5*4] + lea r6, [r0+r6+4] + IDCT8_ADD_MMX_END r6 , rsp+8, r3 +.skipblock + add r5, 4 + add r2, 128 + cmp r5, 16 + jl .nextblock + + ADD rsp, pad + RET + +INIT_XMM +; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct8_add4_sse2, 5, 7, 10 + xor r5, r5 +%ifdef PIC + lea r11, [scan8_mem] +%endif +.nextblock + movzx r6, byte [scan8+r5] + movzx r6, byte [r4+r6] + test r6, r6 + jz .skipblock + cmp r6, 1 + jnz .no_dc + movsx r6, word [r2] + test r6, r6 + jz .no_dc +INIT_MMX + DC_ADD_MMX2_INIT r2, r3, r6 +%ifdef ARCH_X86_64 +%define dst_reg r10 +%define dst_regd r10d +%else +%define dst_reg r1 +%define dst_regd r1d +%endif + mov dst_regd, dword [r1+r5*4] + lea dst_reg, [r0+dst_reg] + DC_ADD_MMX2_OP mova, dst_reg, r3, r6 + lea dst_reg, [dst_reg+r3*4] + DC_ADD_MMX2_OP mova, dst_reg, r3, r6 +%ifndef ARCH_X86_64 + mov r1, r1m +%endif + add r5, 4 + add r2, 128 + cmp r5, 16 + jl .nextblock + REP_RET +.no_dc +INIT_XMM + mov dst_regd, dword [r1+r5*4] + lea dst_reg, [r0+dst_reg] + IDCT8_ADD_SSE dst_reg, r2, r3, r6 +%ifndef ARCH_X86_64 + mov r1, r1m +%endif +.skipblock + add r5, 4 + add r2, 128 + cmp r5, 16 + jl .nextblock + REP_RET + +INIT_MMX +h264_idct_add8_mmx_plane: +.nextblock + movzx r6, byte [scan8+r5] + movzx r6, byte [r4+r6] + or r6w, word [r2] + test r6, r6 + jz .skipblock +%ifdef ARCH_X86_64 + mov r0d, dword [r1+r5*4] + add r0, [r10] +%else + mov r0, r1m ; XXX r1m here is actually r0m of the calling func + mov r0, [r0] + add r0, dword [r1+r5*4] +%endif + IDCT4_ADD r0, r2, r3 +.skipblock + inc r5 + add r2, 32 + test r5, 3 + jnz .nextblock + rep ret + +; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct_add8_mmx, 5, 7, 0 + mov r5, 16 + add r2, 512 +%ifdef PIC + lea r11, [scan8_mem] +%endif +%ifdef ARCH_X86_64 + mov r10, r0 +%endif + call h264_idct_add8_mmx_plane +%ifdef ARCH_X86_64 + add r10, gprsize +%else + add r0mp, gprsize +%endif + call h264_idct_add8_mmx_plane + RET + +h264_idct_add8_mmx2_plane +.nextblock + movzx r6, byte [scan8+r5] + movzx r6, byte [r4+r6] + test r6, r6 + jz .try_dc +%ifdef ARCH_X86_64 + mov r0d, dword [r1+r5*4] + add r0, [r10] +%else + mov r0, r1m ; XXX r1m here is actually r0m of the calling func + mov r0, [r0] + add r0, dword [r1+r5*4] +%endif + IDCT4_ADD r0, r2, r3 + inc r5 + add r2, 32 + test r5, 3 + jnz .nextblock + rep ret +.try_dc + movsx r6, word [r2] + test r6, r6 + jz .skipblock + DC_ADD_MMX2_INIT r2, r3, r6 +%ifdef ARCH_X86_64 + mov r0d, dword [r1+r5*4] + add r0, [r10] +%else + mov r0, r1m ; XXX r1m here is actually r0m of the calling func + mov r0, [r0] + add r0, dword [r1+r5*4] +%endif + DC_ADD_MMX2_OP movh, r0, r3, r6 +.skipblock + inc r5 + add r2, 32 + test r5, 3 + jnz .nextblock + rep ret + +; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct_add8_mmx2, 5, 7, 0 + mov r5, 16 + add r2, 512 +%ifdef ARCH_X86_64 + mov r10, r0 +%endif +%ifdef PIC + lea r11, [scan8_mem] +%endif + call h264_idct_add8_mmx2_plane +%ifdef ARCH_X86_64 + add r10, gprsize +%else + add r0mp, gprsize +%endif + call h264_idct_add8_mmx2_plane + RET + +INIT_MMX +; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered +h264_idct_dc_add8_mmx2: + movd m0, [r2 ] ; 0 0 X D + punpcklwd m0, [r2+32] ; x X d D + paddsw m0, [pw_32] + psraw m0, 6 + punpcklwd m0, m0 ; d d D D + pxor m1, m1 ; 0 0 0 0 + psubw m1, m0 ; -d-d-D-D + packuswb m0, m1 ; -d-d-D-D d d D D + pshufw m1, m0, 0xFA ; -d-d-d-d-D-D-D-D + punpcklwd m0, m0 ; d d d d D D D D + lea r6, [r3*3] + DC_ADD_MMX2_OP movq, r0, r3, r6 + ret + +ALIGN 16 +INIT_XMM +; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride +x264_add8x4_idct_sse2: + movq m0, [r2+ 0] + movq m1, [r2+ 8] + movq m2, [r2+16] + movq m3, [r2+24] + movhps m0, [r2+32] + movhps m1, [r2+40] + movhps m2, [r2+48] + movhps m3, [r2+56] + IDCT4_1D 0,1,2,3,4,5 + TRANSPOSE2x4x4W 0,1,2,3,4 + paddw m0, [pw_32] + IDCT4_1D 0,1,2,3,4,5 + pxor m7, m7 + STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3 + lea r0, [r0+r3*2] + STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3 + ret + +%macro add16_sse2_cycle 2 + movzx r0, word [r4+%2] + test r0, r0 + jz .cycle%1end + mov r0d, dword [r1+%1*8] +%ifdef ARCH_X86_64 + add r0, r10 +%else + add r0, r0m +%endif + call x264_add8x4_idct_sse2 +.cycle%1end +%if %1 < 7 + add r2, 64 +%endif +%endmacro + +; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct_add16_sse2, 5, 5, 8 +%ifdef ARCH_X86_64 + mov r10, r0 +%endif + ; unrolling of the loop leads to an average performance gain of + ; 20-25% + add16_sse2_cycle 0, 0xc + add16_sse2_cycle 1, 0x14 + add16_sse2_cycle 2, 0xe + add16_sse2_cycle 3, 0x16 + add16_sse2_cycle 4, 0x1c + add16_sse2_cycle 5, 0x24 + add16_sse2_cycle 6, 0x1e + add16_sse2_cycle 7, 0x26 + RET + +; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct_add16intra_sse2, 5, 7, 8 + xor r5, r5 +%ifdef ARCH_X86_64 + mov r10, r0 +%endif +%ifdef PIC + lea r11, [scan8_mem] +%endif +.next2blocks + movzx r0, byte [scan8+r5] + movzx r0, word [r4+r0] + test r0, r0 + jz .try_dc + mov r0d, dword [r1+r5*4] +%ifdef ARCH_X86_64 + add r0, r10 +%else + add r0, r0m +%endif + call x264_add8x4_idct_sse2 + add r5, 2 + add r2, 64 + cmp r5, 16 + jl .next2blocks + REP_RET +.try_dc + movsx r0, word [r2 ] + or r0w, word [r2+32] + jz .skip2blocks + mov r0d, dword [r1+r5*4] +%ifdef ARCH_X86_64 + add r0, r10 +%else + add r0, r0m +%endif + call h264_idct_dc_add8_mmx2 +.skip2blocks + add r5, 2 + add r2, 64 + cmp r5, 16 + jl .next2blocks + REP_RET + +h264_idct_add8_sse2_plane: +.next2blocks + movzx r0, byte [scan8+r5] + movzx r0, word [r4+r0] + test r0, r0 + jz .try_dc +%ifdef ARCH_X86_64 + mov r0d, dword [r1+r5*4] + add r0, [r10] +%else + mov r0, r1m ; XXX r1m here is actually r0m of the calling func + mov r0, [r0] + add r0, dword [r1+r5*4] +%endif + call x264_add8x4_idct_sse2 + add r5, 2 + add r2, 64 + test r5, 3 + jnz .next2blocks + rep ret +.try_dc + movsx r0, word [r2 ] + or r0w, word [r2+32] + jz .skip2blocks +%ifdef ARCH_X86_64 + mov r0d, dword [r1+r5*4] + add r0, [r10] +%else + mov r0, r1m ; XXX r1m here is actually r0m of the calling func + mov r0, [r0] + add r0, dword [r1+r5*4] +%endif + call h264_idct_dc_add8_mmx2 +.skip2blocks + add r5, 2 + add r2, 64 + test r5, 3 + jnz .next2blocks + rep ret + +; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, +; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) +cglobal h264_idct_add8_sse2, 5, 7, 8 + mov r5, 16 + add r2, 512 +%ifdef PIC + lea r11, [scan8_mem] +%endif +%ifdef ARCH_X86_64 + mov r10, r0 +%endif + call h264_idct_add8_sse2_plane +%ifdef ARCH_X86_64 + add r10, gprsize +%else + add r0mp, gprsize +%endif + call h264_idct_add8_sse2_plane + RET diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_idct_sse2.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_idct_sse2.asm deleted file mode 100644 index 86c1e66c7..000000000 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_idct_sse2.asm +++ /dev/null @@ -1,54 +0,0 @@ -;***************************************************************************** -;* SSE2-optimized H.264 iDCT -;***************************************************************************** -;* Copyright (C) 2003-2008 x264 project -;* -;* Authors: Laurent Aimar <fenrir@via.ecp.fr> -;* Loren Merritt <lorenm@u.washington.edu> -;* Holger Lubitz <hal@duncan.ol.sub.de> -;* Min Chen <chenm001.163.com> -;* -;* This program is free software; you can redistribute it and/or modify -;* it under the terms of the GNU General Public License as published by -;* the Free Software Foundation; either version 2 of the License, or -;* (at your option) any later version. -;* -;* This program is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -;* GNU General Public License for more details. -;* -;* You should have received a copy of the GNU General Public License -;* along with this program; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. -;***************************************************************************** - -%include "x86inc.asm" -%include "x86util.asm" - -SECTION_RODATA -pw_32: times 8 dw 32 - -SECTION .text - -INIT_XMM -cglobal x264_add8x4_idct_sse2, 3,3,8 - movq m0, [r1+ 0] - movq m1, [r1+ 8] - movq m2, [r1+16] - movq m3, [r1+24] - movhps m0, [r1+32] - movhps m1, [r1+40] - movhps m2, [r1+48] - movhps m3, [r1+56] - IDCT4_1D 0,1,2,3,4,5 - TRANSPOSE2x4x4W 0,1,2,3,4 - paddw m0, [pw_32] - IDCT4_1D 0,1,2,3,4,5 - pxor m7, m7 - STORE_DIFF m0, m4, m7, [r0] - STORE_DIFF m1, m4, m7, [r0+r2] - lea r0, [r0+r2*2] - STORE_DIFF m2, m4, m7, [r0] - STORE_DIFF m3, m4, m7, [r0+r2] - RET diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_intrapred_init.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_intrapred_init.c new file mode 100644 index 000000000..e01a17bd6 --- /dev/null +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_intrapred_init.c @@ -0,0 +1,103 @@ +/*
+ * Copyright (c) 2010 Jason Garrett-Glaser
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavcodec/h264pred.h"
+
+void ff_pred16x16_vertical_mmx (uint8_t *src, int stride);
+void ff_pred16x16_vertical_sse (uint8_t *src, int stride);
+void ff_pred16x16_horizontal_mmx (uint8_t *src, int stride);
+void ff_pred16x16_horizontal_mmxext(uint8_t *src, int stride);
+void ff_pred16x16_horizontal_ssse3 (uint8_t *src, int stride);
+void ff_pred16x16_dc_mmxext (uint8_t *src, int stride);
+void ff_pred16x16_dc_sse2 (uint8_t *src, int stride);
+void ff_pred16x16_dc_ssse3 (uint8_t *src, int stride);
+void ff_pred16x16_tm_vp8_mmx (uint8_t *src, int stride);
+void ff_pred16x16_tm_vp8_mmxext (uint8_t *src, int stride);
+void ff_pred16x16_tm_vp8_sse2 (uint8_t *src, int stride);
+void ff_pred8x8_dc_rv40_mmxext (uint8_t *src, int stride);
+void ff_pred8x8_vertical_mmx (uint8_t *src, int stride);
+void ff_pred8x8_horizontal_mmx (uint8_t *src, int stride);
+void ff_pred8x8_horizontal_mmxext (uint8_t *src, int stride);
+void ff_pred8x8_horizontal_ssse3 (uint8_t *src, int stride);
+void ff_pred8x8_tm_vp8_mmx (uint8_t *src, int stride);
+void ff_pred8x8_tm_vp8_mmxext (uint8_t *src, int stride);
+void ff_pred8x8_tm_vp8_sse2 (uint8_t *src, int stride);
+void ff_pred8x8_tm_vp8_ssse3 (uint8_t *src, int stride);
+void ff_pred4x4_dc_mmxext (uint8_t *src, const uint8_t *topright, int stride);
+void ff_pred4x4_tm_vp8_mmx (uint8_t *src, const uint8_t *topright, int stride);
+void ff_pred4x4_tm_vp8_mmxext (uint8_t *src, const uint8_t *topright, int stride);
+void ff_pred4x4_tm_vp8_ssse3 (uint8_t *src, const uint8_t *topright, int stride);
+void ff_pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride);
+
+void ff_h264_pred_init_x86(H264PredContext *h, int codec_id)
+{
+ int mm_flags = av_get_cpu_flags();
+
+#if HAVE_YASM
+ if (mm_flags & AV_CPU_FLAG_MMX) {
+ h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_mmx;
+ h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_mmx;
+ h->pred8x8 [VERT_PRED8x8] = ff_pred8x8_vertical_mmx;
+ h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_mmx;
+ if (codec_id == CODEC_ID_VP8) {
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_tm_vp8_mmx;
+ h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_mmx;
+ h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_mmx;
+ }
+ }
+
+ if (mm_flags & AV_CPU_FLAG_MMX2) {
+ h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_mmxext;
+ h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_mmxext;
+ h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_mmxext;
+ h->pred4x4 [DC_PRED ] = ff_pred4x4_dc_mmxext;
+ if (codec_id == CODEC_ID_VP8) {
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_tm_vp8_mmxext;
+ h->pred8x8 [DC_PRED8x8 ] = ff_pred8x8_dc_rv40_mmxext;
+ h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_mmxext;
+ h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_mmxext;
+ h->pred4x4 [VERT_PRED ] = ff_pred4x4_vertical_vp8_mmxext;
+ }
+ }
+
+ if (mm_flags & AV_CPU_FLAG_SSE) {
+ h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_sse;
+ }
+
+ if (mm_flags & AV_CPU_FLAG_SSE2) {
+ h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_sse2;
+ if (codec_id == CODEC_ID_VP8) {
+ h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_tm_vp8_sse2;
+ h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_sse2;
+ }
+ }
+
+ if (mm_flags & AV_CPU_FLAG_SSSE3) {
+ h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_ssse3;
+ h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_ssse3;
+ h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_ssse3;
+ if (codec_id == CODEC_ID_VP8) {
+ h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_ssse3;
+ h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_ssse3;
+ }
+ }
+#endif
+}
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_qpel_mmx.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_qpel_mmx.c new file mode 100644 index 000000000..e94ed0935 --- /dev/null +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_qpel_mmx.c @@ -0,0 +1,1209 @@ +/*
+ * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "dsputil_mmx.h"
+
+/***********************************/
+/* motion compensation */
+
+#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
+ "mov"#q" "#C", "#T" \n\t"\
+ "mov"#d" (%0), "#F" \n\t"\
+ "paddw "#D", "#T" \n\t"\
+ "psllw $2, "#T" \n\t"\
+ "psubw "#B", "#T" \n\t"\
+ "psubw "#E", "#T" \n\t"\
+ "punpcklbw "#Z", "#F" \n\t"\
+ "pmullw %4, "#T" \n\t"\
+ "paddw %5, "#A" \n\t"\
+ "add %2, %0 \n\t"\
+ "paddw "#F", "#A" \n\t"\
+ "paddw "#A", "#T" \n\t"\
+ "psraw $5, "#T" \n\t"\
+ "packuswb "#T", "#T" \n\t"\
+ OP(T, (%1), A, d)\
+ "add %3, %1 \n\t"
+
+#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
+ "mov"#q" "#C", "#T" \n\t"\
+ "mov"#d" (%0), "#F" \n\t"\
+ "paddw "#D", "#T" \n\t"\
+ "psllw $2, "#T" \n\t"\
+ "paddw %4, "#A" \n\t"\
+ "psubw "#B", "#T" \n\t"\
+ "psubw "#E", "#T" \n\t"\
+ "punpcklbw "#Z", "#F" \n\t"\
+ "pmullw %3, "#T" \n\t"\
+ "paddw "#F", "#A" \n\t"\
+ "add %2, %0 \n\t"\
+ "paddw "#A", "#T" \n\t"\
+ "mov"#q" "#T", "#OF"(%1) \n\t"
+
+#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
+#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
+#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
+#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
+
+
+#define QPEL_H264(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ int h=4;\
+\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\
+ "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
+ "1: \n\t"\
+ "movd -1(%0), %%mm1 \n\t"\
+ "movd (%0), %%mm2 \n\t"\
+ "movd 1(%0), %%mm3 \n\t"\
+ "movd 2(%0), %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "paddw %%mm0, %%mm1 \n\t"\
+ "paddw %%mm3, %%mm2 \n\t"\
+ "movd -2(%0), %%mm0 \n\t"\
+ "movd 3(%0), %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "paddw %%mm3, %%mm0 \n\t"\
+ "psllw $2, %%mm2 \n\t"\
+ "psubw %%mm1, %%mm2 \n\t"\
+ "pmullw %%mm4, %%mm2 \n\t"\
+ "paddw %%mm5, %%mm0 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "packuswb %%mm0, %%mm0 \n\t"\
+ OP(%%mm0, (%1),%%mm6, d)\
+ "add %3, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(src), "+c"(dst), "+g"(h)\
+ : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
+ : "memory"\
+ );\
+}\
+static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+ int h=4;\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movq %0, %%mm4 \n\t"\
+ "movq %1, %%mm5 \n\t"\
+ :: "m"(ff_pw_5), "m"(ff_pw_16)\
+ );\
+ do{\
+ __asm__ volatile(\
+ "movd -1(%0), %%mm1 \n\t"\
+ "movd (%0), %%mm2 \n\t"\
+ "movd 1(%0), %%mm3 \n\t"\
+ "movd 2(%0), %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "paddw %%mm0, %%mm1 \n\t"\
+ "paddw %%mm3, %%mm2 \n\t"\
+ "movd -2(%0), %%mm0 \n\t"\
+ "movd 3(%0), %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "paddw %%mm3, %%mm0 \n\t"\
+ "psllw $2, %%mm2 \n\t"\
+ "psubw %%mm1, %%mm2 \n\t"\
+ "pmullw %%mm4, %%mm2 \n\t"\
+ "paddw %%mm5, %%mm0 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "movd (%2), %%mm3 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "packuswb %%mm0, %%mm0 \n\t"\
+ PAVGB" %%mm3, %%mm0 \n\t"\
+ OP(%%mm0, (%1),%%mm6, d)\
+ "add %4, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "add %3, %2 \n\t"\
+ : "+a"(src), "+c"(dst), "+d"(src2)\
+ : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
+ : "memory"\
+ );\
+ }while(--h);\
+}\
+static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ src -= 2*srcStride;\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movd (%0), %%mm0 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm1 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm2 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm3 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm4 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
+ QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+ QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+ QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+ \
+ : "+a"(src), "+c"(dst)\
+ : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+}\
+static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+ int h=4;\
+ int w=3;\
+ src -= 2*srcStride+2;\
+ while(w--){\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movd (%0), %%mm0 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm1 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm2 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm3 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm4 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
+ QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
+ QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
+ QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
+ \
+ : "+a"(src)\
+ : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ tmp += 4;\
+ src += 4 - 9*srcStride;\
+ }\
+ tmp -= 3*4;\
+ __asm__ volatile(\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t"\
+ "paddw 10(%0), %%mm0 \n\t"\
+ "movq 2(%0), %%mm1 \n\t"\
+ "paddw 8(%0), %%mm1 \n\t"\
+ "movq 4(%0), %%mm2 \n\t"\
+ "paddw 6(%0), %%mm2 \n\t"\
+ "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\
+ "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\
+ "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\
+ "paddsw %%mm2, %%mm0 \n\t"\
+ "psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\
+ "paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\
+ "psraw $6, %%mm0 \n\t"\
+ "packuswb %%mm0, %%mm0 \n\t"\
+ OP(%%mm0, (%1),%%mm7, d)\
+ "add $24, %0 \n\t"\
+ "add %3, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(tmp), "+c"(dst), "+g"(h)\
+ : "S"((x86_reg)dstStride)\
+ : "memory"\
+ );\
+}\
+\
+static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ int h=8;\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t"\
+ "movq 1(%0), %%mm2 \n\t"\
+ "movq %%mm0, %%mm1 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpckhbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "paddw %%mm3, %%mm1 \n\t"\
+ "psllw $2, %%mm0 \n\t"\
+ "psllw $2, %%mm1 \n\t"\
+ "movq -1(%0), %%mm2 \n\t"\
+ "movq 2(%0), %%mm4 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "movq %%mm4, %%mm5 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ "punpckhbw %%mm7, %%mm5 \n\t"\
+ "paddw %%mm4, %%mm2 \n\t"\
+ "paddw %%mm3, %%mm5 \n\t"\
+ "psubw %%mm2, %%mm0 \n\t"\
+ "psubw %%mm5, %%mm1 \n\t"\
+ "pmullw %%mm6, %%mm0 \n\t"\
+ "pmullw %%mm6, %%mm1 \n\t"\
+ "movd -2(%0), %%mm2 \n\t"\
+ "movd 7(%0), %%mm5 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm5 \n\t"\
+ "paddw %%mm3, %%mm2 \n\t"\
+ "paddw %%mm5, %%mm4 \n\t"\
+ "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
+ "paddw %%mm5, %%mm2 \n\t"\
+ "paddw %%mm5, %%mm4 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "paddw %%mm4, %%mm1 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "psraw $5, %%mm1 \n\t"\
+ "packuswb %%mm1, %%mm0 \n\t"\
+ OP(%%mm0, (%1),%%mm5, q)\
+ "add %3, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(src), "+c"(dst), "+g"(h)\
+ : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
+ : "memory"\
+ );\
+}\
+\
+static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+ int h=8;\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movq %0, %%mm6 \n\t"\
+ :: "m"(ff_pw_5)\
+ );\
+ do{\
+ __asm__ volatile(\
+ "movq (%0), %%mm0 \n\t"\
+ "movq 1(%0), %%mm2 \n\t"\
+ "movq %%mm0, %%mm1 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpckhbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "paddw %%mm3, %%mm1 \n\t"\
+ "psllw $2, %%mm0 \n\t"\
+ "psllw $2, %%mm1 \n\t"\
+ "movq -1(%0), %%mm2 \n\t"\
+ "movq 2(%0), %%mm4 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "movq %%mm4, %%mm5 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ "punpckhbw %%mm7, %%mm5 \n\t"\
+ "paddw %%mm4, %%mm2 \n\t"\
+ "paddw %%mm3, %%mm5 \n\t"\
+ "psubw %%mm2, %%mm0 \n\t"\
+ "psubw %%mm5, %%mm1 \n\t"\
+ "pmullw %%mm6, %%mm0 \n\t"\
+ "pmullw %%mm6, %%mm1 \n\t"\
+ "movd -2(%0), %%mm2 \n\t"\
+ "movd 7(%0), %%mm5 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm5 \n\t"\
+ "paddw %%mm3, %%mm2 \n\t"\
+ "paddw %%mm5, %%mm4 \n\t"\
+ "movq %5, %%mm5 \n\t"\
+ "paddw %%mm5, %%mm2 \n\t"\
+ "paddw %%mm5, %%mm4 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "paddw %%mm4, %%mm1 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "psraw $5, %%mm1 \n\t"\
+ "movq (%2), %%mm4 \n\t"\
+ "packuswb %%mm1, %%mm0 \n\t"\
+ PAVGB" %%mm4, %%mm0 \n\t"\
+ OP(%%mm0, (%1),%%mm5, q)\
+ "add %4, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "add %3, %2 \n\t"\
+ : "+a"(src), "+c"(dst), "+d"(src2)\
+ : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
+ "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ }while(--h);\
+}\
+\
+static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ int w= 2;\
+ src -= 2*srcStride;\
+ \
+ while(w--){\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movd (%0), %%mm0 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm1 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm2 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm3 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm4 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
+ QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+ QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+ QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+ QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
+ QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
+ QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
+ QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+ \
+ : "+a"(src), "+c"(dst)\
+ : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ if(h==16){\
+ __asm__ volatile(\
+ QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+ QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+ QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
+ QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
+ QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
+ QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
+ QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
+ QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
+ \
+ : "+a"(src), "+c"(dst)\
+ : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ }\
+ src += 4-(h+5)*srcStride;\
+ dst += 4-h*dstStride;\
+ }\
+}\
+static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
+ int w = (size+8)>>2;\
+ src -= 2*srcStride+2;\
+ while(w--){\
+ __asm__ volatile(\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movd (%0), %%mm0 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm1 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm2 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm3 \n\t"\
+ "add %2, %0 \n\t"\
+ "movd (%0), %%mm4 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpcklbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpcklbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
+ QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
+ QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
+ QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
+ QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
+ QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
+ QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
+ QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
+ : "+a"(src)\
+ : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ if(size==16){\
+ __asm__ volatile(\
+ QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
+ QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
+ QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
+ QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
+ QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
+ QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
+ QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
+ QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
+ : "+a"(src)\
+ : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ }\
+ tmp += 4;\
+ src += 4 - (size+5)*srcStride;\
+ }\
+}\
+static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
+ int w = size>>4;\
+ do{\
+ int h = size;\
+ __asm__ volatile(\
+ "1: \n\t"\
+ "movq (%0), %%mm0 \n\t"\
+ "movq 8(%0), %%mm3 \n\t"\
+ "movq 2(%0), %%mm1 \n\t"\
+ "movq 10(%0), %%mm4 \n\t"\
+ "paddw %%mm4, %%mm0 \n\t"\
+ "paddw %%mm3, %%mm1 \n\t"\
+ "paddw 18(%0), %%mm3 \n\t"\
+ "paddw 16(%0), %%mm4 \n\t"\
+ "movq 4(%0), %%mm2 \n\t"\
+ "movq 12(%0), %%mm5 \n\t"\
+ "paddw 6(%0), %%mm2 \n\t"\
+ "paddw 14(%0), %%mm5 \n\t"\
+ "psubw %%mm1, %%mm0 \n\t"\
+ "psubw %%mm4, %%mm3 \n\t"\
+ "psraw $2, %%mm0 \n\t"\
+ "psraw $2, %%mm3 \n\t"\
+ "psubw %%mm1, %%mm0 \n\t"\
+ "psubw %%mm4, %%mm3 \n\t"\
+ "paddsw %%mm2, %%mm0 \n\t"\
+ "paddsw %%mm5, %%mm3 \n\t"\
+ "psraw $2, %%mm0 \n\t"\
+ "psraw $2, %%mm3 \n\t"\
+ "paddw %%mm2, %%mm0 \n\t"\
+ "paddw %%mm5, %%mm3 \n\t"\
+ "psraw $6, %%mm0 \n\t"\
+ "psraw $6, %%mm3 \n\t"\
+ "packuswb %%mm3, %%mm0 \n\t"\
+ OP(%%mm0, (%1),%%mm7, q)\
+ "add $48, %0 \n\t"\
+ "add %3, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(tmp), "+c"(dst), "+g"(h)\
+ : "S"((x86_reg)dstStride)\
+ : "memory"\
+ );\
+ tmp += 8 - size*24;\
+ dst += 8 - size*dstStride;\
+ }while(w--);\
+}\
+\
+static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
+}\
+static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
+ OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}\
+\
+static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+ src += 8*srcStride;\
+ dst += 8*dstStride;\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+}\
+\
+static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+ src += 8*dstStride;\
+ dst += 8*dstStride;\
+ src2 += 8*src2Stride;\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+}\
+\
+static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
+ put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
+ OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
+}\
+static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
+}\
+\
+static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
+}\
+\
+static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
+{\
+ __asm__ volatile(\
+ "movq (%1), %%mm0 \n\t"\
+ "movq 24(%1), %%mm1 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "psraw $5, %%mm1 \n\t"\
+ "packuswb %%mm0, %%mm0 \n\t"\
+ "packuswb %%mm1, %%mm1 \n\t"\
+ PAVGB" (%0), %%mm0 \n\t"\
+ PAVGB" (%0,%3), %%mm1 \n\t"\
+ OP(%%mm0, (%2), %%mm4, d)\
+ OP(%%mm1, (%2,%4), %%mm5, d)\
+ "lea (%0,%3,2), %0 \n\t"\
+ "lea (%2,%4,2), %2 \n\t"\
+ "movq 48(%1), %%mm0 \n\t"\
+ "movq 72(%1), %%mm1 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "psraw $5, %%mm1 \n\t"\
+ "packuswb %%mm0, %%mm0 \n\t"\
+ "packuswb %%mm1, %%mm1 \n\t"\
+ PAVGB" (%0), %%mm0 \n\t"\
+ PAVGB" (%0,%3), %%mm1 \n\t"\
+ OP(%%mm0, (%2), %%mm4, d)\
+ OP(%%mm1, (%2,%4), %%mm5, d)\
+ :"+a"(src8), "+c"(src16), "+d"(dst)\
+ :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
+ :"memory");\
+}\
+static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
+{\
+ do{\
+ __asm__ volatile(\
+ "movq (%1), %%mm0 \n\t"\
+ "movq 8(%1), %%mm1 \n\t"\
+ "movq 48(%1), %%mm2 \n\t"\
+ "movq 8+48(%1), %%mm3 \n\t"\
+ "psraw $5, %%mm0 \n\t"\
+ "psraw $5, %%mm1 \n\t"\
+ "psraw $5, %%mm2 \n\t"\
+ "psraw $5, %%mm3 \n\t"\
+ "packuswb %%mm1, %%mm0 \n\t"\
+ "packuswb %%mm3, %%mm2 \n\t"\
+ PAVGB" (%0), %%mm0 \n\t"\
+ PAVGB" (%0,%3), %%mm2 \n\t"\
+ OP(%%mm0, (%2), %%mm5, q)\
+ OP(%%mm2, (%2,%4), %%mm5, q)\
+ ::"a"(src8), "c"(src16), "d"(dst),\
+ "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
+ :"memory");\
+ src8 += 2L*src8Stride;\
+ src16 += 48;\
+ dst += 2L*dstStride;\
+ }while(h-=2);\
+}\
+static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
+{\
+ OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
+ OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
+}\
+
+
+#if ARCH_X86_64
+#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+ int h=16;\
+ __asm__ volatile(\
+ "pxor %%xmm15, %%xmm15 \n\t"\
+ "movdqa %6, %%xmm14 \n\t"\
+ "movdqa %7, %%xmm13 \n\t"\
+ "1: \n\t"\
+ "lddqu 6(%0), %%xmm1 \n\t"\
+ "lddqu -2(%0), %%xmm7 \n\t"\
+ "movdqa %%xmm1, %%xmm0 \n\t"\
+ "punpckhbw %%xmm15, %%xmm1 \n\t"\
+ "punpcklbw %%xmm15, %%xmm0 \n\t"\
+ "punpcklbw %%xmm15, %%xmm7 \n\t"\
+ "movdqa %%xmm1, %%xmm2 \n\t"\
+ "movdqa %%xmm0, %%xmm6 \n\t"\
+ "movdqa %%xmm1, %%xmm3 \n\t"\
+ "movdqa %%xmm0, %%xmm8 \n\t"\
+ "movdqa %%xmm1, %%xmm4 \n\t"\
+ "movdqa %%xmm0, %%xmm9 \n\t"\
+ "movdqa %%xmm0, %%xmm12 \n\t"\
+ "movdqa %%xmm1, %%xmm11 \n\t"\
+ "palignr $10,%%xmm0, %%xmm11\n\t"\
+ "palignr $10,%%xmm7, %%xmm12\n\t"\
+ "palignr $2, %%xmm0, %%xmm4 \n\t"\
+ "palignr $2, %%xmm7, %%xmm9 \n\t"\
+ "palignr $4, %%xmm0, %%xmm3 \n\t"\
+ "palignr $4, %%xmm7, %%xmm8 \n\t"\
+ "palignr $6, %%xmm0, %%xmm2 \n\t"\
+ "palignr $6, %%xmm7, %%xmm6 \n\t"\
+ "paddw %%xmm0 ,%%xmm11 \n\t"\
+ "palignr $8, %%xmm0, %%xmm1 \n\t"\
+ "palignr $8, %%xmm7, %%xmm0 \n\t"\
+ "paddw %%xmm12,%%xmm7 \n\t"\
+ "paddw %%xmm3, %%xmm2 \n\t"\
+ "paddw %%xmm8, %%xmm6 \n\t"\
+ "paddw %%xmm4, %%xmm1 \n\t"\
+ "paddw %%xmm9, %%xmm0 \n\t"\
+ "psllw $2, %%xmm2 \n\t"\
+ "psllw $2, %%xmm6 \n\t"\
+ "psubw %%xmm1, %%xmm2 \n\t"\
+ "psubw %%xmm0, %%xmm6 \n\t"\
+ "paddw %%xmm13,%%xmm11 \n\t"\
+ "paddw %%xmm13,%%xmm7 \n\t"\
+ "pmullw %%xmm14,%%xmm2 \n\t"\
+ "pmullw %%xmm14,%%xmm6 \n\t"\
+ "lddqu (%2), %%xmm3 \n\t"\
+ "paddw %%xmm11,%%xmm2 \n\t"\
+ "paddw %%xmm7, %%xmm6 \n\t"\
+ "psraw $5, %%xmm2 \n\t"\
+ "psraw $5, %%xmm6 \n\t"\
+ "packuswb %%xmm2,%%xmm6 \n\t"\
+ "pavgb %%xmm3, %%xmm6 \n\t"\
+ OP(%%xmm6, (%1), %%xmm4, dqa)\
+ "add %5, %0 \n\t"\
+ "add %5, %1 \n\t"\
+ "add %4, %2 \n\t"\
+ "decl %3 \n\t"\
+ "jg 1b \n\t"\
+ : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
+ : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
+ "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+}
+#else // ARCH_X86_64
+#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+ src += 8*dstStride;\
+ dst += 8*dstStride;\
+ src2 += 8*src2Stride;\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
+ OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+}
+#endif // ARCH_X86_64
+
+#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+ int h=8;\
+ __asm__ volatile(\
+ "pxor %%xmm7, %%xmm7 \n\t"\
+ "movdqa %0, %%xmm6 \n\t"\
+ :: "m"(ff_pw_5)\
+ );\
+ do{\
+ __asm__ volatile(\
+ "lddqu -2(%0), %%xmm1 \n\t"\
+ "movdqa %%xmm1, %%xmm0 \n\t"\
+ "punpckhbw %%xmm7, %%xmm1 \n\t"\
+ "punpcklbw %%xmm7, %%xmm0 \n\t"\
+ "movdqa %%xmm1, %%xmm2 \n\t"\
+ "movdqa %%xmm1, %%xmm3 \n\t"\
+ "movdqa %%xmm1, %%xmm4 \n\t"\
+ "movdqa %%xmm1, %%xmm5 \n\t"\
+ "palignr $2, %%xmm0, %%xmm4 \n\t"\
+ "palignr $4, %%xmm0, %%xmm3 \n\t"\
+ "palignr $6, %%xmm0, %%xmm2 \n\t"\
+ "palignr $8, %%xmm0, %%xmm1 \n\t"\
+ "palignr $10,%%xmm0, %%xmm5 \n\t"\
+ "paddw %%xmm5, %%xmm0 \n\t"\
+ "paddw %%xmm3, %%xmm2 \n\t"\
+ "paddw %%xmm4, %%xmm1 \n\t"\
+ "psllw $2, %%xmm2 \n\t"\
+ "movq (%2), %%xmm3 \n\t"\
+ "psubw %%xmm1, %%xmm2 \n\t"\
+ "paddw %5, %%xmm0 \n\t"\
+ "pmullw %%xmm6, %%xmm2 \n\t"\
+ "paddw %%xmm0, %%xmm2 \n\t"\
+ "psraw $5, %%xmm2 \n\t"\
+ "packuswb %%xmm2, %%xmm2 \n\t"\
+ "pavgb %%xmm3, %%xmm2 \n\t"\
+ OP(%%xmm2, (%1), %%xmm4, q)\
+ "add %4, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "add %3, %2 \n\t"\
+ : "+a"(src), "+c"(dst), "+d"(src2)\
+ : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
+ "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ }while(--h);\
+}\
+QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+\
+static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ int h=8;\
+ __asm__ volatile(\
+ "pxor %%xmm7, %%xmm7 \n\t"\
+ "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
+ "1: \n\t"\
+ "lddqu -2(%0), %%xmm1 \n\t"\
+ "movdqa %%xmm1, %%xmm0 \n\t"\
+ "punpckhbw %%xmm7, %%xmm1 \n\t"\
+ "punpcklbw %%xmm7, %%xmm0 \n\t"\
+ "movdqa %%xmm1, %%xmm2 \n\t"\
+ "movdqa %%xmm1, %%xmm3 \n\t"\
+ "movdqa %%xmm1, %%xmm4 \n\t"\
+ "movdqa %%xmm1, %%xmm5 \n\t"\
+ "palignr $2, %%xmm0, %%xmm4 \n\t"\
+ "palignr $4, %%xmm0, %%xmm3 \n\t"\
+ "palignr $6, %%xmm0, %%xmm2 \n\t"\
+ "palignr $8, %%xmm0, %%xmm1 \n\t"\
+ "palignr $10,%%xmm0, %%xmm5 \n\t"\
+ "paddw %%xmm5, %%xmm0 \n\t"\
+ "paddw %%xmm3, %%xmm2 \n\t"\
+ "paddw %%xmm4, %%xmm1 \n\t"\
+ "psllw $2, %%xmm2 \n\t"\
+ "psubw %%xmm1, %%xmm2 \n\t"\
+ "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\
+ "pmullw %%xmm6, %%xmm2 \n\t"\
+ "paddw %%xmm0, %%xmm2 \n\t"\
+ "psraw $5, %%xmm2 \n\t"\
+ "packuswb %%xmm2, %%xmm2 \n\t"\
+ OP(%%xmm2, (%1), %%xmm4, q)\
+ "add %3, %0 \n\t"\
+ "add %4, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(src), "+c"(dst), "+g"(h)\
+ : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
+ : "memory"\
+ );\
+}\
+static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+ src += 8*srcStride;\
+ dst += 8*dstStride;\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
+ OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+}\
+
+#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+ src -= 2*srcStride;\
+ \
+ __asm__ volatile(\
+ "pxor %%xmm7, %%xmm7 \n\t"\
+ "movq (%0), %%xmm0 \n\t"\
+ "add %2, %0 \n\t"\
+ "movq (%0), %%xmm1 \n\t"\
+ "add %2, %0 \n\t"\
+ "movq (%0), %%xmm2 \n\t"\
+ "add %2, %0 \n\t"\
+ "movq (%0), %%xmm3 \n\t"\
+ "add %2, %0 \n\t"\
+ "movq (%0), %%xmm4 \n\t"\
+ "add %2, %0 \n\t"\
+ "punpcklbw %%xmm7, %%xmm0 \n\t"\
+ "punpcklbw %%xmm7, %%xmm1 \n\t"\
+ "punpcklbw %%xmm7, %%xmm2 \n\t"\
+ "punpcklbw %%xmm7, %%xmm3 \n\t"\
+ "punpcklbw %%xmm7, %%xmm4 \n\t"\
+ QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
+ QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
+ QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
+ QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
+ QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
+ QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
+ QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
+ QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
+ \
+ : "+a"(src), "+c"(dst)\
+ : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ if(h==16){\
+ __asm__ volatile(\
+ QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
+ QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
+ QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
+ QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
+ QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
+ QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
+ QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
+ QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
+ \
+ : "+a"(src), "+c"(dst)\
+ : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+ : "memory"\
+ );\
+ }\
+}\
+static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
+}\
+static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
+ OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}
+
+static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
+ int w = (size+8)>>3;
+ src -= 2*srcStride+2;
+ while(w--){
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7 \n\t"
+ "movq (%0), %%xmm0 \n\t"
+ "add %2, %0 \n\t"
+ "movq (%0), %%xmm1 \n\t"
+ "add %2, %0 \n\t"
+ "movq (%0), %%xmm2 \n\t"
+ "add %2, %0 \n\t"
+ "movq (%0), %%xmm3 \n\t"
+ "add %2, %0 \n\t"
+ "movq (%0), %%xmm4 \n\t"
+ "add %2, %0 \n\t"
+ "punpcklbw %%xmm7, %%xmm0 \n\t"
+ "punpcklbw %%xmm7, %%xmm1 \n\t"
+ "punpcklbw %%xmm7, %%xmm2 \n\t"
+ "punpcklbw %%xmm7, %%xmm3 \n\t"
+ "punpcklbw %%xmm7, %%xmm4 \n\t"
+ QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
+ QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
+ QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
+ QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
+ QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
+ QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
+ QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
+ QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
+ : "+a"(src)
+ : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
+ : "memory"
+ );
+ if(size==16){
+ __asm__ volatile(
+ QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48)
+ QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48)
+ QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
+ QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
+ QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
+ QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
+ QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
+ QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
+ : "+a"(src)
+ : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
+ : "memory"
+ );
+ }
+ tmp += 8;
+ src += 8 - (size+5)*srcStride;
+ }
+}
+
+#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
+static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
+ int h = size;\
+ if(size == 16){\
+ __asm__ volatile(\
+ "1: \n\t"\
+ "movdqa 32(%0), %%xmm4 \n\t"\
+ "movdqa 16(%0), %%xmm5 \n\t"\
+ "movdqa (%0), %%xmm7 \n\t"\
+ "movdqa %%xmm4, %%xmm3 \n\t"\
+ "movdqa %%xmm4, %%xmm2 \n\t"\
+ "movdqa %%xmm4, %%xmm1 \n\t"\
+ "movdqa %%xmm4, %%xmm0 \n\t"\
+ "palignr $10, %%xmm5, %%xmm0 \n\t"\
+ "palignr $8, %%xmm5, %%xmm1 \n\t"\
+ "palignr $6, %%xmm5, %%xmm2 \n\t"\
+ "palignr $4, %%xmm5, %%xmm3 \n\t"\
+ "palignr $2, %%xmm5, %%xmm4 \n\t"\
+ "paddw %%xmm5, %%xmm0 \n\t"\
+ "paddw %%xmm4, %%xmm1 \n\t"\
+ "paddw %%xmm3, %%xmm2 \n\t"\
+ "movdqa %%xmm5, %%xmm6 \n\t"\
+ "movdqa %%xmm5, %%xmm4 \n\t"\
+ "movdqa %%xmm5, %%xmm3 \n\t"\
+ "palignr $8, %%xmm7, %%xmm4 \n\t"\
+ "palignr $2, %%xmm7, %%xmm6 \n\t"\
+ "palignr $10, %%xmm7, %%xmm3 \n\t"\
+ "paddw %%xmm6, %%xmm4 \n\t"\
+ "movdqa %%xmm5, %%xmm6 \n\t"\
+ "palignr $6, %%xmm7, %%xmm5 \n\t"\
+ "palignr $4, %%xmm7, %%xmm6 \n\t"\
+ "paddw %%xmm7, %%xmm3 \n\t"\
+ "paddw %%xmm6, %%xmm5 \n\t"\
+ \
+ "psubw %%xmm1, %%xmm0 \n\t"\
+ "psubw %%xmm4, %%xmm3 \n\t"\
+ "psraw $2, %%xmm0 \n\t"\
+ "psraw $2, %%xmm3 \n\t"\
+ "psubw %%xmm1, %%xmm0 \n\t"\
+ "psubw %%xmm4, %%xmm3 \n\t"\
+ "paddw %%xmm2, %%xmm0 \n\t"\
+ "paddw %%xmm5, %%xmm3 \n\t"\
+ "psraw $2, %%xmm0 \n\t"\
+ "psraw $2, %%xmm3 \n\t"\
+ "paddw %%xmm2, %%xmm0 \n\t"\
+ "paddw %%xmm5, %%xmm3 \n\t"\
+ "psraw $6, %%xmm0 \n\t"\
+ "psraw $6, %%xmm3 \n\t"\
+ "packuswb %%xmm0, %%xmm3 \n\t"\
+ OP(%%xmm3, (%1), %%xmm7, dqa)\
+ "add $48, %0 \n\t"\
+ "add %3, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(tmp), "+c"(dst), "+g"(h)\
+ : "S"((x86_reg)dstStride)\
+ : "memory"\
+ );\
+ }else{\
+ __asm__ volatile(\
+ "1: \n\t"\
+ "movdqa 16(%0), %%xmm1 \n\t"\
+ "movdqa (%0), %%xmm0 \n\t"\
+ "movdqa %%xmm1, %%xmm2 \n\t"\
+ "movdqa %%xmm1, %%xmm3 \n\t"\
+ "movdqa %%xmm1, %%xmm4 \n\t"\
+ "movdqa %%xmm1, %%xmm5 \n\t"\
+ "palignr $10, %%xmm0, %%xmm5 \n\t"\
+ "palignr $8, %%xmm0, %%xmm4 \n\t"\
+ "palignr $6, %%xmm0, %%xmm3 \n\t"\
+ "palignr $4, %%xmm0, %%xmm2 \n\t"\
+ "palignr $2, %%xmm0, %%xmm1 \n\t"\
+ "paddw %%xmm5, %%xmm0 \n\t"\
+ "paddw %%xmm4, %%xmm1 \n\t"\
+ "paddw %%xmm3, %%xmm2 \n\t"\
+ "psubw %%xmm1, %%xmm0 \n\t"\
+ "psraw $2, %%xmm0 \n\t"\
+ "psubw %%xmm1, %%xmm0 \n\t"\
+ "paddw %%xmm2, %%xmm0 \n\t"\
+ "psraw $2, %%xmm0 \n\t"\
+ "paddw %%xmm2, %%xmm0 \n\t"\
+ "psraw $6, %%xmm0 \n\t"\
+ "packuswb %%xmm0, %%xmm0 \n\t"\
+ OP(%%xmm0, (%1), %%xmm7, q)\
+ "add $48, %0 \n\t"\
+ "add %3, %1 \n\t"\
+ "decl %2 \n\t"\
+ " jnz 1b \n\t"\
+ : "+a"(tmp), "+c"(dst), "+g"(h)\
+ : "S"((x86_reg)dstStride)\
+ : "memory"\
+ );\
+ }\
+}
+
+#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
+ put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
+ OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
+}\
+static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
+}\
+static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
+}\
+
+#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
+#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
+#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
+#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
+#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
+#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
+#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
+#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2
+
+#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
+#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
+#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
+#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
+#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
+#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
+#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
+#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2
+
+#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
+#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
+#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
+#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2
+
+#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
+#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
+#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
+#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2
+
+#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
+#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
+
+#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
+H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
+
+static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
+ put_pixels16_sse2(dst, src, stride, 16);
+}
+static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
+ avg_pixels16_sse2(dst, src, stride, 16);
+}
+#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
+#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
+
+#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
+}\
+
+#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
+}\
+
+#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+ OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+ OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
+}\
+
+#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
+ put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
+ OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+ uint8_t * const halfHV= temp;\
+ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+ assert(((int)temp & 7) == 0);\
+ put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+ uint8_t * const halfHV= temp;\
+ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+ assert(((int)temp & 7) == 0);\
+ put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+ uint8_t * const halfHV= temp;\
+ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+ assert(((int)temp & 7) == 0);\
+ put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+ OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
+}\
+\
+static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+ DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
+ uint8_t * const halfHV= temp;\
+ int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+ assert(((int)temp & 7) == 0);\
+ put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
+ OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
+}\
+
+#define H264_MC_4816(MMX)\
+H264_MC(put_, 4, MMX, 8)\
+H264_MC(put_, 8, MMX, 8)\
+H264_MC(put_, 16,MMX, 8)\
+H264_MC(avg_, 4, MMX, 8)\
+H264_MC(avg_, 8, MMX, 8)\
+H264_MC(avg_, 16,MMX, 8)\
+
+#define H264_MC_816(QPEL, XMM)\
+QPEL(put_, 8, XMM, 16)\
+QPEL(put_, 16,XMM, 16)\
+QPEL(avg_, 8, XMM, 16)\
+QPEL(avg_, 16,XMM, 16)\
+
+
+#define AVG_3DNOW_OP(a,b,temp, size) \
+"mov" #size " " #b ", " #temp " \n\t"\
+"pavgusb " #temp ", " #a " \n\t"\
+"mov" #size " " #a ", " #b " \n\t"
+#define AVG_MMX2_OP(a,b,temp, size) \
+"mov" #size " " #b ", " #temp " \n\t"\
+"pavgb " #temp ", " #a " \n\t"\
+"mov" #size " " #a ", " #b " \n\t"
+
+#define PAVGB "pavgusb"
+QPEL_H264(put_, PUT_OP, 3dnow)
+QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
+#undef PAVGB
+#define PAVGB "pavgb"
+QPEL_H264(put_, PUT_OP, mmx2)
+QPEL_H264(avg_, AVG_MMX2_OP, mmx2)
+QPEL_H264_V_XMM(put_, PUT_OP, sse2)
+QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2)
+QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
+QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2)
+#if HAVE_SSSE3
+QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
+QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3)
+QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3)
+QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3)
+QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
+QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3)
+#endif
+#undef PAVGB
+
+H264_MC_4816(3dnow)
+H264_MC_4816(mmx2)
+H264_MC_816(H264_MC_V, sse2)
+H264_MC_816(H264_MC_HV, sse2)
+#if HAVE_SSSE3
+H264_MC_816(H264_MC_H, ssse3)
+H264_MC_816(H264_MC_HV, ssse3)
+#endif
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_weight.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_weight.asm new file mode 100644 index 000000000..53aa21047 --- /dev/null +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_weight.asm @@ -0,0 +1,375 @@ +;***************************************************************************** +;* SSE2-optimized weighted prediction code +;***************************************************************************** +;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt +;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" + +SECTION .text + +;----------------------------------------------------------------------------- +; biweight pred: +; +; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride, +; int log2_denom, int weightd, int weights, +; int offset); +; and +; void h264_weight_16x16_sse2(uint8_t *dst, int stride, +; int log2_denom, int weight, +; int offset); +;----------------------------------------------------------------------------- + +%macro WEIGHT_SETUP 0 + add r4, r4 + inc r4 + movd m3, r3d + movd m5, r4d + movd m6, r2d + pslld m5, m6 + psrld m5, 1 +%if mmsize == 16 + pshuflw m3, m3, 0 + pshuflw m5, m5, 0 + punpcklqdq m3, m3 + punpcklqdq m5, m5 +%else + pshufw m3, m3, 0 + pshufw m5, m5, 0 +%endif + pxor m7, m7 +%endmacro + +%macro WEIGHT_OP 2 + movh m0, [r0+%1] + movh m1, [r0+%2] + punpcklbw m0, m7 + punpcklbw m1, m7 + pmullw m0, m3 + pmullw m1, m3 + paddsw m0, m5 + paddsw m1, m5 + psraw m0, m6 + psraw m1, m6 + packuswb m0, m1 +%endmacro + +%macro WEIGHT_FUNC_DBL_MM 1 +cglobal h264_weight_16x%1_mmx2, 5, 5, 0 + WEIGHT_SETUP + mov r2, %1 +%if %1 == 16 +.nextrow + WEIGHT_OP 0, 4 + mova [r0 ], m0 + WEIGHT_OP 8, 12 + mova [r0+8], m0 + add r0, r1 + dec r2 + jnz .nextrow + REP_RET +%else + jmp mangle(ff_h264_weight_16x16_mmx2.nextrow) +%endif +%endmacro + +INIT_MMX +WEIGHT_FUNC_DBL_MM 16 +WEIGHT_FUNC_DBL_MM 8 + +%macro WEIGHT_FUNC_MM 4 +cglobal h264_weight_%1x%2_%4, 7, 7, %3 + WEIGHT_SETUP + mov r2, %2 +%if %2 == 16 +.nextrow + WEIGHT_OP 0, mmsize/2 + mova [r0], m0 + add r0, r1 + dec r2 + jnz .nextrow + REP_RET +%else + jmp mangle(ff_h264_weight_%1x16_%4.nextrow) +%endif +%endmacro + +INIT_MMX +WEIGHT_FUNC_MM 8, 16, 0, mmx2 +WEIGHT_FUNC_MM 8, 8, 0, mmx2 +WEIGHT_FUNC_MM 8, 4, 0, mmx2 +INIT_XMM +WEIGHT_FUNC_MM 16, 16, 8, sse2 +WEIGHT_FUNC_MM 16, 8, 8, sse2 + +%macro WEIGHT_FUNC_HALF_MM 5 +cglobal h264_weight_%1x%2_%5, 5, 5, %4 + WEIGHT_SETUP + mov r2, %2/2 + lea r3, [r1*2] +%if %2 == mmsize +.nextrow + WEIGHT_OP 0, r1 + movh [r0], m0 +%if mmsize == 16 + movhps [r0+r1], m0 +%else + psrlq m0, 32 + movh [r0+r1], m0 +%endif + add r0, r3 + dec r2 + jnz .nextrow + REP_RET +%else + jmp mangle(ff_h264_weight_%1x%3_%5.nextrow) +%endif +%endmacro + +INIT_MMX +WEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 +WEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 +WEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 +INIT_XMM +WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 +WEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 +WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 + +%macro BIWEIGHT_SETUP 0 + add r6, 1 + or r6, 1 + add r3, 1 + movd m3, r4d + movd m4, r5d + movd m5, r6d + movd m6, r3d + pslld m5, m6 + psrld m5, 1 +%if mmsize == 16 + pshuflw m3, m3, 0 + pshuflw m4, m4, 0 + pshuflw m5, m5, 0 + punpcklqdq m3, m3 + punpcklqdq m4, m4 + punpcklqdq m5, m5 +%else + pshufw m3, m3, 0 + pshufw m4, m4, 0 + pshufw m5, m5, 0 +%endif + pxor m7, m7 +%endmacro + +%macro BIWEIGHT_STEPA 3 + movh m%1, [r0+%3] + movh m%2, [r1+%3] + punpcklbw m%1, m7 + punpcklbw m%2, m7 + pmullw m%1, m3 + pmullw m%2, m4 + paddsw m%1, m%2 +%endmacro + +%macro BIWEIGHT_STEPB 0 + paddsw m0, m5 + paddsw m1, m5 + psraw m0, m6 + psraw m1, m6 + packuswb m0, m1 +%endmacro + +%macro BIWEIGHT_FUNC_DBL_MM 1 +cglobal h264_biweight_16x%1_mmx2, 7, 7, 0 + BIWEIGHT_SETUP + mov r3, %1 +%if %1 == 16 +.nextrow + BIWEIGHT_STEPA 0, 1, 0 + BIWEIGHT_STEPA 1, 2, 4 + BIWEIGHT_STEPB + mova [r0], m0 + BIWEIGHT_STEPA 0, 1, 8 + BIWEIGHT_STEPA 1, 2, 12 + BIWEIGHT_STEPB + mova [r0+8], m0 + add r0, r2 + add r1, r2 + dec r3 + jnz .nextrow + REP_RET +%else + jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow) +%endif +%endmacro + +INIT_MMX +BIWEIGHT_FUNC_DBL_MM 16 +BIWEIGHT_FUNC_DBL_MM 8 + +%macro BIWEIGHT_FUNC_MM 4 +cglobal h264_biweight_%1x%2_%4, 7, 7, %3 + BIWEIGHT_SETUP + mov r3, %2 +%if %2 == 16 +.nextrow + BIWEIGHT_STEPA 0, 1, 0 + BIWEIGHT_STEPA 1, 2, mmsize/2 + BIWEIGHT_STEPB + mova [r0], m0 + add r0, r2 + add r1, r2 + dec r3 + jnz .nextrow + REP_RET +%else + jmp mangle(ff_h264_biweight_%1x16_%4.nextrow) +%endif +%endmacro + +INIT_MMX +BIWEIGHT_FUNC_MM 8, 16, 0, mmx2 +BIWEIGHT_FUNC_MM 8, 8, 0, mmx2 +BIWEIGHT_FUNC_MM 8, 4, 0, mmx2 +INIT_XMM +BIWEIGHT_FUNC_MM 16, 16, 8, sse2 +BIWEIGHT_FUNC_MM 16, 8, 8, sse2 + +%macro BIWEIGHT_FUNC_HALF_MM 5 +cglobal h264_biweight_%1x%2_%5, 7, 7, %4 + BIWEIGHT_SETUP + mov r3, %2/2 + lea r4, [r2*2] +%if %2 == mmsize +.nextrow + BIWEIGHT_STEPA 0, 1, 0 + BIWEIGHT_STEPA 1, 2, r2 + BIWEIGHT_STEPB + movh [r0], m0 +%if mmsize == 16 + movhps [r0+r2], m0 +%else + psrlq m0, 32 + movh [r0+r2], m0 +%endif + add r0, r4 + add r1, r4 + dec r3 + jnz .nextrow + REP_RET +%else + jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow) +%endif +%endmacro + +INIT_MMX +BIWEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 +BIWEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 +BIWEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 +INIT_XMM +BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 +BIWEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 +BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 + +%macro BIWEIGHT_SSSE3_SETUP 0 + add r6, 1 + or r6, 1 + add r3, 1 + movd m4, r4d + movd m0, r5d + movd m5, r6d + movd m6, r3d + pslld m5, m6 + psrld m5, 1 + punpcklbw m4, m0 + pshuflw m4, m4, 0 + pshuflw m5, m5, 0 + punpcklqdq m4, m4 + punpcklqdq m5, m5 +%endmacro + +%macro BIWEIGHT_SSSE3_OP 0 + pmaddubsw m0, m4 + pmaddubsw m2, m4 + paddsw m0, m5 + paddsw m2, m5 + psraw m0, m6 + psraw m2, m6 + packuswb m0, m2 +%endmacro + +%macro BIWEIGHT_SSSE3_16 1 +cglobal h264_biweight_16x%1_ssse3, 7, 7, 8 + BIWEIGHT_SSSE3_SETUP + mov r3, %1 + +%if %1 == 16 +.nextrow + movh m0, [r0] + movh m2, [r0+8] + movh m3, [r1+8] + punpcklbw m0, [r1] + punpcklbw m2, m3 + BIWEIGHT_SSSE3_OP + mova [r0], m0 + add r0, r2 + add r1, r2 + dec r3 + jnz .nextrow + REP_RET +%else + jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow) +%endif +%endmacro + +INIT_XMM +BIWEIGHT_SSSE3_16 16 +BIWEIGHT_SSSE3_16 8 + +%macro BIWEIGHT_SSSE3_8 1 +cglobal h264_biweight_8x%1_ssse3, 7, 7, 8 + BIWEIGHT_SSSE3_SETUP + mov r3, %1/2 + lea r4, [r2*2] + +%if %1 == 16 +.nextrow + movh m0, [r0] + movh m1, [r1] + movh m2, [r0+r2] + movh m3, [r1+r2] + punpcklbw m0, m1 + punpcklbw m2, m3 + BIWEIGHT_SSSE3_OP + movh [r0], m0 + movhps [r0+r2], m0 + add r0, r4 + add r1, r4 + dec r3 + jnz .nextrow + REP_RET +%else + jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow) +%endif +%endmacro + +INIT_XMM +BIWEIGHT_SSSE3_8 16 +BIWEIGHT_SSSE3_8 8 +BIWEIGHT_SSSE3_8 4 diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_weight_sse2.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_weight_sse2.asm deleted file mode 100644 index 8667f0690..000000000 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_weight_sse2.asm +++ /dev/null @@ -1,170 +0,0 @@ -;***************************************************************************** -;* SSE2-optimized weighted prediction code -;***************************************************************************** -;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt -;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com> -;* -;* This file is part of FFmpeg. -;* -;* FFmpeg is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* FFmpeg is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with FFmpeg; if not, write to the Free Software -;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -%include "x86inc.asm" - -SECTION .text -INIT_XMM - -;----------------------------------------------------------------------------- -; biweight pred: -; -; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride, -; int log2_denom, int weightd, int weights, -; int offset); -;----------------------------------------------------------------------------- - -%macro BIWEIGHT_SSE2_SETUP 0 - add r6, 1 - or r6, 1 - add r3, 1 - movd m3, r4 - movd m4, r5 - movd m5, r6 - movd m6, r3 - pslld m5, m6 - psrld m5, 1 - pshuflw m3, m3, 0 - pshuflw m4, m4, 0 - pshuflw m5, m5, 0 - punpcklqdq m3, m3 - punpcklqdq m4, m4 - punpcklqdq m5, m5 - pxor m7, m7 -%endmacro - -%macro BIWEIGHT_SSE2_STEPA 3 - movh m%1, [r0+%3] - movh m%2, [r1+%3] - punpcklbw m%1, m7 - punpcklbw m%2, m7 - pmullw m%1, m3 - pmullw m%2, m4 - paddsw m%1, m%2 -%endmacro - -%macro BIWEIGHT_SSE2_STEPB 0 - paddsw m0, m5 - paddsw m1, m5 - psraw m0, m6 - psraw m1, m6 - packuswb m0, m1 -%endmacro - -cglobal h264_biweight_16x16_sse2, 7, 7, 8 - BIWEIGHT_SSE2_SETUP - mov r3, 16 - -.nextrow - BIWEIGHT_SSE2_STEPA 0, 1, 0 - BIWEIGHT_SSE2_STEPA 1, 2, 8 - BIWEIGHT_SSE2_STEPB - mova [r0], m0 - add r0, r2 - add r1, r2 - dec r3 - jnz .nextrow - REP_RET - -cglobal h264_biweight_8x8_sse2, 7, 7, 8 - BIWEIGHT_SSE2_SETUP - mov r3, 4 - lea r4, [r2*2] - -.nextrow - BIWEIGHT_SSE2_STEPA 0, 1, 0 - BIWEIGHT_SSE2_STEPA 1, 2, r2 - BIWEIGHT_SSE2_STEPB - movh [r0], m0 - movhps [r0+r2], m0 - add r0, r4 - add r1, r4 - dec r3 - jnz .nextrow - REP_RET - -%macro BIWEIGHT_SSSE3_SETUP 0 - add r6, 1 - or r6, 1 - add r3, 1 - movd m4, r4 - movd m0, r5 - movd m5, r6 - movd m6, r3 - pslld m5, m6 - psrld m5, 1 - punpcklbw m4, m0 - pshuflw m4, m4, 0 - pshuflw m5, m5, 0 - punpcklqdq m4, m4 - punpcklqdq m5, m5 -%endmacro - -%macro BIWEIGHT_SSSE3_OP 0 - pmaddubsw m0, m4 - pmaddubsw m2, m4 - paddsw m0, m5 - paddsw m2, m5 - psraw m0, m6 - psraw m2, m6 - packuswb m0, m2 -%endmacro - -cglobal h264_biweight_16x16_ssse3, 7, 7, 8 - BIWEIGHT_SSSE3_SETUP - mov r3, 16 - -.nextrow - movh m0, [r0] - movh m2, [r0+8] - movh m3, [r1+8] - punpcklbw m0, [r1] - punpcklbw m2, m3 - BIWEIGHT_SSSE3_OP - mova [r0], m0 - add r0, r2 - add r1, r2 - dec r3 - jnz .nextrow - REP_RET - -cglobal h264_biweight_8x8_ssse3, 7, 7, 8 - BIWEIGHT_SSSE3_SETUP - mov r3, 4 - lea r4, [r2*2] - -.nextrow - movh m0, [r0] - movh m1, [r1] - movh m2, [r0+r2] - movh m3, [r1+r2] - punpcklbw m0, m1 - punpcklbw m2, m3 - BIWEIGHT_SSSE3_OP - movh [r0], m0 - movhps [r0+r2], m0 - add r0, r4 - add r1, r4 - dec r3 - jnz .nextrow - REP_RET diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264dsp_mmx.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264dsp_mmx.c index 4b2e54603..efd8b78f1 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264dsp_mmx.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264dsp_mmx.c @@ -18,8 +18,10 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include "libavutil/cpu.h"
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/h264dsp.h"
#include "dsputil_mmx.h"
-#include "libavcodec/h264pred.h"
DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL;
DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL;
@@ -27,772 +29,41 @@ DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL; /***********************************/
/* IDCT */
-#define SUMSUB_BADC( a, b, c, d ) \
- "paddw "#b", "#a" \n\t"\
- "paddw "#d", "#c" \n\t"\
- "paddw "#b", "#b" \n\t"\
- "paddw "#d", "#d" \n\t"\
- "psubw "#a", "#b" \n\t"\
- "psubw "#c", "#d" \n\t"
-
-#define SUMSUBD2_AB( a, b, t ) \
- "movq "#b", "#t" \n\t"\
- "psraw $1 , "#b" \n\t"\
- "paddw "#a", "#b" \n\t"\
- "psraw $1 , "#a" \n\t"\
- "psubw "#t", "#a" \n\t"
-
-#define IDCT4_1D( s02, s13, d02, d13, t ) \
- SUMSUB_BA ( s02, d02 )\
- SUMSUBD2_AB( s13, d13, t )\
- SUMSUB_BADC( d13, s02, s13, d02 )
-
-#define STORE_DIFF_4P( p, t, z ) \
- "psraw $6, "#p" \n\t"\
- "movd (%0), "#t" \n\t"\
- "punpcklbw "#z", "#t" \n\t"\
- "paddsw "#t", "#p" \n\t"\
- "packuswb "#z", "#p" \n\t"\
- "movd "#p", (%0) \n\t"
-
-static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
-{
- /* Load dct coeffs */
- __asm__ volatile(
- "movq (%0), %%mm0 \n\t"
- "movq 8(%0), %%mm1 \n\t"
- "movq 16(%0), %%mm2 \n\t"
- "movq 24(%0), %%mm3 \n\t"
- :: "r"(block) );
-
- __asm__ volatile(
- /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */
- IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
-
- "movq %0, %%mm6 \n\t"
- /* in: 1,4,0,2 out: 1,2,3,0 */
- TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )
-
- "paddw %%mm6, %%mm3 \n\t"
-
- /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */
- IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
-
- "pxor %%mm7, %%mm7 \n\t"
- :: "m"(ff_pw_32));
-
- __asm__ volatile(
- STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
- "add %1, %0 \n\t"
- STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
- "add %1, %0 \n\t"
- STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
- "add %1, %0 \n\t"
- STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
- : "+r"(dst)
- : "r" ((x86_reg)stride)
- );
-}
-
-static inline void h264_idct8_1d(int16_t *block)
-{
- __asm__ volatile(
- "movq 112(%0), %%mm7 \n\t"
- "movq 80(%0), %%mm0 \n\t"
- "movq 48(%0), %%mm3 \n\t"
- "movq 16(%0), %%mm5 \n\t"
-
- "movq %%mm0, %%mm4 \n\t"
- "movq %%mm5, %%mm1 \n\t"
- "psraw $1, %%mm4 \n\t"
- "psraw $1, %%mm1 \n\t"
- "paddw %%mm0, %%mm4 \n\t"
- "paddw %%mm5, %%mm1 \n\t"
- "paddw %%mm7, %%mm4 \n\t"
- "paddw %%mm0, %%mm1 \n\t"
- "psubw %%mm5, %%mm4 \n\t"
- "paddw %%mm3, %%mm1 \n\t"
-
- "psubw %%mm3, %%mm5 \n\t"
- "psubw %%mm3, %%mm0 \n\t"
- "paddw %%mm7, %%mm5 \n\t"
- "psubw %%mm7, %%mm0 \n\t"
- "psraw $1, %%mm3 \n\t"
- "psraw $1, %%mm7 \n\t"
- "psubw %%mm3, %%mm5 \n\t"
- "psubw %%mm7, %%mm0 \n\t"
-
- "movq %%mm4, %%mm3 \n\t"
- "movq %%mm1, %%mm7 \n\t"
- "psraw $2, %%mm1 \n\t"
- "psraw $2, %%mm3 \n\t"
- "paddw %%mm5, %%mm3 \n\t"
- "psraw $2, %%mm5 \n\t"
- "paddw %%mm0, %%mm1 \n\t"
- "psraw $2, %%mm0 \n\t"
- "psubw %%mm4, %%mm5 \n\t"
- "psubw %%mm0, %%mm7 \n\t"
-
- "movq 32(%0), %%mm2 \n\t"
- "movq 96(%0), %%mm6 \n\t"
- "movq %%mm2, %%mm4 \n\t"
- "movq %%mm6, %%mm0 \n\t"
- "psraw $1, %%mm4 \n\t"
- "psraw $1, %%mm6 \n\t"
- "psubw %%mm0, %%mm4 \n\t"
- "paddw %%mm2, %%mm6 \n\t"
-
- "movq (%0), %%mm2 \n\t"
- "movq 64(%0), %%mm0 \n\t"
- SUMSUB_BA( %%mm0, %%mm2 )
- SUMSUB_BA( %%mm6, %%mm0 )
- SUMSUB_BA( %%mm4, %%mm2 )
- SUMSUB_BA( %%mm7, %%mm6 )
- SUMSUB_BA( %%mm5, %%mm4 )
- SUMSUB_BA( %%mm3, %%mm2 )
- SUMSUB_BA( %%mm1, %%mm0 )
- :: "r"(block)
- );
-}
-
-static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
-{
- int i;
- DECLARE_ALIGNED(8, int16_t, b2)[64];
-
- block[0] += 32;
-
- for(i=0; i<2; i++){
- DECLARE_ALIGNED(8, uint64_t, tmp);
-
- h264_idct8_1d(block+4*i);
-
- __asm__ volatile(
- "movq %%mm7, %0 \n\t"
- TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
- "movq %%mm0, 8(%1) \n\t"
- "movq %%mm6, 24(%1) \n\t"
- "movq %%mm7, 40(%1) \n\t"
- "movq %%mm4, 56(%1) \n\t"
- "movq %0, %%mm7 \n\t"
- TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
- "movq %%mm7, (%1) \n\t"
- "movq %%mm1, 16(%1) \n\t"
- "movq %%mm0, 32(%1) \n\t"
- "movq %%mm3, 48(%1) \n\t"
- : "=m"(tmp)
- : "r"(b2+32*i)
- : "memory"
- );
- }
-
- for(i=0; i<2; i++){
- h264_idct8_1d(b2+4*i);
-
- __asm__ volatile(
- "psraw $6, %%mm7 \n\t"
- "psraw $6, %%mm6 \n\t"
- "psraw $6, %%mm5 \n\t"
- "psraw $6, %%mm4 \n\t"
- "psraw $6, %%mm3 \n\t"
- "psraw $6, %%mm2 \n\t"
- "psraw $6, %%mm1 \n\t"
- "psraw $6, %%mm0 \n\t"
-
- "movq %%mm7, (%0) \n\t"
- "movq %%mm5, 16(%0) \n\t"
- "movq %%mm3, 32(%0) \n\t"
- "movq %%mm1, 48(%0) \n\t"
- "movq %%mm0, 64(%0) \n\t"
- "movq %%mm2, 80(%0) \n\t"
- "movq %%mm4, 96(%0) \n\t"
- "movq %%mm6, 112(%0) \n\t"
- :: "r"(b2+4*i)
- : "memory"
- );
- }
-
- add_pixels_clamped_mmx(b2, dst, stride);
-}
-
-#define STORE_DIFF_8P( p, d, t, z )\
- "movq "#d", "#t" \n"\
- "psraw $6, "#p" \n"\
- "punpcklbw "#z", "#t" \n"\
- "paddsw "#t", "#p" \n"\
- "packuswb "#p", "#p" \n"\
- "movq "#p", "#d" \n"
-
-#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\
- "movdqa "#c", "#a" \n"\
- "movdqa "#g", "#e" \n"\
- "psraw $1, "#c" \n"\
- "psraw $1, "#g" \n"\
- "psubw "#e", "#c" \n"\
- "paddw "#a", "#g" \n"\
- "movdqa "#b", "#e" \n"\
- "psraw $1, "#e" \n"\
- "paddw "#b", "#e" \n"\
- "paddw "#d", "#e" \n"\
- "paddw "#f", "#e" \n"\
- "movdqa "#f", "#a" \n"\
- "psraw $1, "#a" \n"\
- "paddw "#f", "#a" \n"\
- "paddw "#h", "#a" \n"\
- "psubw "#b", "#a" \n"\
- "psubw "#d", "#b" \n"\
- "psubw "#d", "#f" \n"\
- "paddw "#h", "#b" \n"\
- "psubw "#h", "#f" \n"\
- "psraw $1, "#d" \n"\
- "psraw $1, "#h" \n"\
- "psubw "#d", "#b" \n"\
- "psubw "#h", "#f" \n"\
- "movdqa "#e", "#d" \n"\
- "movdqa "#a", "#h" \n"\
- "psraw $2, "#d" \n"\
- "psraw $2, "#h" \n"\
- "paddw "#f", "#d" \n"\
- "paddw "#b", "#h" \n"\
- "psraw $2, "#f" \n"\
- "psraw $2, "#b" \n"\
- "psubw "#f", "#e" \n"\
- "psubw "#a", "#b" \n"\
- "movdqa 0x00(%1), "#a" \n"\
- "movdqa 0x40(%1), "#f" \n"\
- SUMSUB_BA(f, a)\
- SUMSUB_BA(g, f)\
- SUMSUB_BA(c, a)\
- SUMSUB_BA(e, g)\
- SUMSUB_BA(b, c)\
- SUMSUB_BA(h, a)\
- SUMSUB_BA(d, f)
-
-static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
-{
- __asm__ volatile(
- "movdqa 0x10(%1), %%xmm1 \n"
- "movdqa 0x20(%1), %%xmm2 \n"
- "movdqa 0x30(%1), %%xmm3 \n"
- "movdqa 0x50(%1), %%xmm5 \n"
- "movdqa 0x60(%1), %%xmm6 \n"
- "movdqa 0x70(%1), %%xmm7 \n"
- H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
- TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
- "paddw %4, %%xmm4 \n"
- "movdqa %%xmm4, 0x00(%1) \n"
- "movdqa %%xmm2, 0x40(%1) \n"
- H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
- "movdqa %%xmm6, 0x60(%1) \n"
- "movdqa %%xmm7, 0x70(%1) \n"
- "pxor %%xmm7, %%xmm7 \n"
- STORE_DIFF_8P(%%xmm2, (%0), %%xmm6, %%xmm7)
- STORE_DIFF_8P(%%xmm0, (%0,%2), %%xmm6, %%xmm7)
- STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
- STORE_DIFF_8P(%%xmm3, (%0,%3), %%xmm6, %%xmm7)
- "lea (%0,%2,4), %0 \n"
- STORE_DIFF_8P(%%xmm5, (%0), %%xmm6, %%xmm7)
- STORE_DIFF_8P(%%xmm4, (%0,%2), %%xmm6, %%xmm7)
- "movdqa 0x60(%1), %%xmm0 \n"
- "movdqa 0x70(%1), %%xmm1 \n"
- STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
- STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7)
- :"+r"(dst)
- :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32)
- );
-}
-
-static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
-{
- int dc = (block[0] + 32) >> 6;
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "pshufw $0, %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "psubw %%mm0, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- ::"r"(dc)
- );
- __asm__ volatile(
- "movd %0, %%mm2 \n\t"
- "movd %1, %%mm3 \n\t"
- "movd %2, %%mm4 \n\t"
- "movd %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movd %%mm2, %0 \n\t"
- "movd %%mm3, %1 \n\t"
- "movd %%mm4, %2 \n\t"
- "movd %%mm5, %3 \n\t"
- :"+m"(*(uint32_t*)(dst+0*stride)),
- "+m"(*(uint32_t*)(dst+1*stride)),
- "+m"(*(uint32_t*)(dst+2*stride)),
- "+m"(*(uint32_t*)(dst+3*stride))
- );
-}
-
-static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
-{
- int dc = (block[0] + 32) >> 6;
- int y;
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "pshufw $0, %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "psubw %%mm0, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- ::"r"(dc)
- );
- for(y=2; y--; dst += 4*stride){
- __asm__ volatile(
- "movq %0, %%mm2 \n\t"
- "movq %1, %%mm3 \n\t"
- "movq %2, %%mm4 \n\t"
- "movq %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movq %%mm2, %0 \n\t"
- "movq %%mm3, %1 \n\t"
- "movq %%mm4, %2 \n\t"
- "movq %%mm5, %3 \n\t"
- :"+m"(*(uint64_t*)(dst+0*stride)),
- "+m"(*(uint64_t*)(dst+1*stride)),
- "+m"(*(uint64_t*)(dst+2*stride)),
- "+m"(*(uint64_t*)(dst+3*stride))
- );
- }
-}
-
-//FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split
-static const uint8_t scan8[16 + 2*4]={
- 4+1*8, 5+1*8, 4+2*8, 5+2*8,
- 6+1*8, 7+1*8, 6+2*8, 7+2*8,
- 4+3*8, 5+3*8, 4+4*8, 5+4*8,
- 6+3*8, 7+3*8, 6+4*8, 7+4*8,
- 1+1*8, 2+1*8,
- 1+2*8, 2+2*8,
- 1+4*8, 2+4*8,
- 1+5*8, 2+5*8,
-};
-
-static void ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
- int i;
- for(i=0; i<16; i++){
- if(nnzc[ scan8[i] ])
- ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
- }
-}
-
-static void ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
- int i;
- for(i=0; i<16; i+=4){
- if(nnzc[ scan8[i] ])
- ff_h264_idct8_add_mmx(dst + block_offset[i], block + i*16, stride);
- }
-}
-
-
-static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
- int i;
- for(i=0; i<16; i++){
- int nnz = nnzc[ scan8[i] ];
- if(nnz){
- if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
- else ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride);
- }
- }
-}
-
-static void ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
- int i;
- for(i=0; i<16; i++){
- if(nnzc[ scan8[i] ] || block[i*16])
- ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
- }
-}
-
-static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
- int i;
- for(i=0; i<16; i++){
- if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride);
- else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
- }
-}
-
-static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
- int i;
- for(i=0; i<16; i+=4){
- int nnz = nnzc[ scan8[i] ];
- if(nnz){
- if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
- else ff_h264_idct8_add_mmx (dst + block_offset[i], block + i*16, stride);
- }
- }
-}
-
-static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
- int i;
- for(i=0; i<16; i+=4){
- int nnz = nnzc[ scan8[i] ];
- if(nnz){
- if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
- else ff_h264_idct8_add_sse2 (dst + block_offset[i], block + i*16, stride);
- }
- }
-}
-
-static void ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
- int i;
- for(i=16; i<16+8; i++){
- if(nnzc[ scan8[i] ] || block[i*16])
- ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
- }
-}
-
-static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
- int i;
- for(i=16; i<16+8; i++){
- if(nnzc[ scan8[i] ])
- ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
- else if(block[i*16])
- ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
- }
-}
-
-#if CONFIG_GPL && HAVE_YASM
-static void ff_h264_idct_dc_add8_mmx2(uint8_t *dst, int16_t *block, int stride)
-{
- __asm__ volatile(
- "movd %0, %%mm0 \n\t" // 0 0 X D
- "punpcklwd %1, %%mm0 \n\t" // x X d D
- "paddsw %2, %%mm0 \n\t"
- "psraw $6, %%mm0 \n\t"
- "punpcklwd %%mm0, %%mm0 \n\t" // d d D D
- "pxor %%mm1, %%mm1 \n\t" // 0 0 0 0
- "psubw %%mm0, %%mm1 \n\t" // -d-d-D-D
- "packuswb %%mm1, %%mm0 \n\t" // -d-d-D-D d d D D
- "pshufw $0xFA, %%mm0, %%mm1 \n\t" // -d-d-d-d-D-D-D-D
- "punpcklwd %%mm0, %%mm0 \n\t" // d d d d D D D D
- ::"m"(block[ 0]),
- "m"(block[16]),
- "m"(ff_pw_32)
- );
- __asm__ volatile(
- "movq %0, %%mm2 \n\t"
- "movq %1, %%mm3 \n\t"
- "movq %2, %%mm4 \n\t"
- "movq %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movq %%mm2, %0 \n\t"
- "movq %%mm3, %1 \n\t"
- "movq %%mm4, %2 \n\t"
- "movq %%mm5, %3 \n\t"
- :"+m"(*(uint64_t*)(dst+0*stride)),
- "+m"(*(uint64_t*)(dst+1*stride)),
- "+m"(*(uint64_t*)(dst+2*stride)),
- "+m"(*(uint64_t*)(dst+3*stride))
- );
-}
-
-extern void ff_x264_add8x4_idct_sse2(uint8_t *dst, int16_t *block, int stride);
-
-static void ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
- int i;
- for(i=0; i<16; i+=2)
- if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
- ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride);
-}
-
-static void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
- int i;
- for(i=0; i<16; i+=2){
- if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
- ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride);
- else if(block[i*16]|block[i*16+16])
- ff_h264_idct_dc_add8_mmx2(dst + block_offset[i], block + i*16, stride);
- }
-}
-
-static void ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
- int i;
- for(i=16; i<16+8; i+=2){
- if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
- ff_x264_add8x4_idct_sse2 (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
- else if(block[i*16]|block[i*16+16])
- ff_h264_idct_dc_add8_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
- }
-}
-#endif
+void ff_h264_idct_add_mmx (uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_add_mmx (uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_add_sse2 (uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct_dc_add_mmx2 (uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride);
+
+void ff_h264_idct_add16_mmx (uint8_t *dst, const int *block_offset,
+ DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
+void ff_h264_idct8_add4_mmx (uint8_t *dst, const int *block_offset,
+ DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
+void ff_h264_idct_add16_mmx2 (uint8_t *dst, const int *block_offset,
+ DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
+void ff_h264_idct_add16intra_mmx (uint8_t *dst, const int *block_offset,
+ DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
+void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
+ DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
+void ff_h264_idct8_add4_mmx2 (uint8_t *dst, const int *block_offset,
+ DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
+void ff_h264_idct8_add4_sse2 (uint8_t *dst, const int *block_offset,
+ DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
+void ff_h264_idct_add8_mmx (uint8_t **dest, const int *block_offset,
+ DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
+void ff_h264_idct_add8_mmx2 (uint8_t **dest, const int *block_offset,
+ DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
+
+void ff_h264_idct_add16_sse2 (uint8_t *dst, const int *block_offset, DCTELEM *block,
+ int stride, const uint8_t nnzc[6*8]);
+void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block,
+ int stride, const uint8_t nnzc[6*8]);
+void ff_h264_idct_add8_sse2 (uint8_t **dest, const int *block_offset, DCTELEM *block,
+ int stride, const uint8_t nnzc[6*8]);
/***********************************/
/* deblocking */
-// out: o = |x-y|>a
-// clobbers: t
-#define DIFF_GT_MMX(x,y,a,o,t)\
- "movq "#y", "#t" \n\t"\
- "movq "#x", "#o" \n\t"\
- "psubusb "#x", "#t" \n\t"\
- "psubusb "#y", "#o" \n\t"\
- "por "#t", "#o" \n\t"\
- "psubusb "#a", "#o" \n\t"
-
-// out: o = |x-y|>a
-// clobbers: t
-#define DIFF_GT2_MMX(x,y,a,o,t)\
- "movq "#y", "#t" \n\t"\
- "movq "#x", "#o" \n\t"\
- "psubusb "#x", "#t" \n\t"\
- "psubusb "#y", "#o" \n\t"\
- "psubusb "#a", "#t" \n\t"\
- "psubusb "#a", "#o" \n\t"\
- "pcmpeqb "#t", "#o" \n\t"\
-
-// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1
-// out: mm5=beta-1, mm7=mask
-// clobbers: mm4,mm6
-#define H264_DEBLOCK_MASK(alpha1, beta1) \
- "pshufw $0, "#alpha1", %%mm4 \n\t"\
- "pshufw $0, "#beta1 ", %%mm5 \n\t"\
- "packuswb %%mm4, %%mm4 \n\t"\
- "packuswb %%mm5, %%mm5 \n\t"\
- DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\
- DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\
- "por %%mm4, %%mm7 \n\t"\
- DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\
- "por %%mm4, %%mm7 \n\t"\
- "pxor %%mm6, %%mm6 \n\t"\
- "pcmpeqb %%mm6, %%mm7 \n\t"
-
-// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
-// out: mm1=p0' mm2=q0'
-// clobbers: mm0,3-6
-#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
- "movq %%mm1 , %%mm5 \n\t"\
- "pxor %%mm2 , %%mm5 \n\t" /* p0^q0*/\
- "pand "#pb_01" , %%mm5 \n\t" /* (p0^q0)&1*/\
- "pcmpeqb %%mm4 , %%mm4 \n\t"\
- "pxor %%mm4 , %%mm3 \n\t"\
- "pavgb %%mm0 , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\
- "pavgb "MANGLE(ff_pb_3)" , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\
- "pxor %%mm1 , %%mm4 \n\t"\
- "pavgb %%mm2 , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\
- "pavgb %%mm5 , %%mm3 \n\t"\
- "paddusb %%mm4 , %%mm3 \n\t" /* d+128+33*/\
- "movq "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\
- "psubusb %%mm3 , %%mm6 \n\t"\
- "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\
- "pminub %%mm7 , %%mm6 \n\t"\
- "pminub %%mm7 , %%mm3 \n\t"\
- "psubusb %%mm6 , %%mm1 \n\t"\
- "psubusb %%mm3 , %%mm2 \n\t"\
- "paddusb %%mm3 , %%mm1 \n\t"\
- "paddusb %%mm6 , %%mm2 \n\t"
-
-// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=ff_bone
-// out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
-// clobbers: q2, tmp, tc0
-#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
- "movq %%mm1, "#tmp" \n\t"\
- "pavgb %%mm2, "#tmp" \n\t"\
- "pavgb "#tmp", "#q2" \n\t" /* avg(p2,avg(p0,q0)) */\
- "pxor "q2addr", "#tmp" \n\t"\
- "pand %9, "#tmp" \n\t" /* (p2^avg(p0,q0))&1 */\
- "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
- "movq "#p1", "#tmp" \n\t"\
- "psubusb "#tc0", "#tmp" \n\t"\
- "paddusb "#p1", "#tc0" \n\t"\
- "pmaxub "#tmp", "#q2" \n\t"\
- "pminub "#tc0", "#q2" \n\t"\
- "movq "#q2", "q1addr" \n\t"
-
-static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
-{
- DECLARE_ALIGNED(8, uint64_t, tmp0)[2];
-
- __asm__ volatile(
- "movq (%2,%4), %%mm0 \n\t" //p1
- "movq (%2,%4,2), %%mm1 \n\t" //p0
- "movq (%3), %%mm2 \n\t" //q0
- "movq (%3,%4), %%mm3 \n\t" //q1
- H264_DEBLOCK_MASK(%7, %8)
-
- "movd %6, %%mm4 \n\t"
- "punpcklbw %%mm4, %%mm4 \n\t"
- "punpcklwd %%mm4, %%mm4 \n\t"
- "pcmpeqb %%mm3, %%mm3 \n\t"
- "movq %%mm4, %%mm6 \n\t"
- "pcmpgtb %%mm3, %%mm4 \n\t"
- "movq %%mm6, %1 \n\t"
- "pand %%mm4, %%mm7 \n\t"
- "movq %%mm7, %0 \n\t"
-
- /* filter p1 */
- "movq (%2), %%mm3 \n\t" //p2
- DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1
- "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta
- "pand %1, %%mm7 \n\t" // mask & tc0
- "movq %%mm7, %%mm4 \n\t"
- "psubb %%mm6, %%mm7 \n\t"
- "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0
- H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%2)", "(%2,%4)", %%mm6, %%mm4)
-
- /* filter q1 */
- "movq (%3,%4,2), %%mm4 \n\t" //q2
- DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1
- "pand %0, %%mm6 \n\t"
- "movq %1, %%mm5 \n\t" // can be merged with the and below but is slower then
- "pand %%mm6, %%mm5 \n\t"
- "psubb %%mm6, %%mm7 \n\t"
- "movq (%3,%4), %%mm3 \n\t"
- H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%3,%4,2)", "(%3,%4)", %%mm5, %%mm6)
-
- /* filter p0, q0 */
- H264_DEBLOCK_P0_Q0(%9, unused)
- "movq %%mm1, (%2,%4,2) \n\t"
- "movq %%mm2, (%3) \n\t"
-
- : "=m"(tmp0[0]), "=m"(tmp0[1])
- : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride),
- "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
- "m"(ff_bone)
- );
-}
-
-static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
- if((tc0[0] & tc0[1]) >= 0)
- h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
- if((tc0[2] & tc0[3]) >= 0)
- h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);
-}
-static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
- //FIXME: could cut some load/stores by merging transpose with filter
- // also, it only needs to transpose 6x8
- DECLARE_ALIGNED(8, uint8_t, trans)[8*8];
- int i;
- for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
- if((tc0[0] & tc0[1]) < 0)
- continue;
- transpose4x4(trans, pix-4, 8, stride);
- transpose4x4(trans +4*8, pix, 8, stride);
- transpose4x4(trans+4, pix-4+4*stride, 8, stride);
- transpose4x4(trans+4+4*8, pix +4*stride, 8, stride);
- h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);
- transpose4x4(pix-2, trans +2*8, stride, 8);
- transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8);
- }
-}
-
-static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
-{
- __asm__ volatile(
- "movq (%0), %%mm0 \n\t" //p1
- "movq (%0,%2), %%mm1 \n\t" //p0
- "movq (%1), %%mm2 \n\t" //q0
- "movq (%1,%2), %%mm3 \n\t" //q1
- H264_DEBLOCK_MASK(%4, %5)
- "movd %3, %%mm6 \n\t"
- "punpcklbw %%mm6, %%mm6 \n\t"
- "pand %%mm6, %%mm7 \n\t" // mm7 = tc&mask
- H264_DEBLOCK_P0_Q0(%6, %7)
- "movq %%mm1, (%0,%2) \n\t"
- "movq %%mm2, (%1) \n\t"
-
- :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
- "r"(*(uint32_t*)tc0),
- "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F)
- );
-}
-
-static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
- h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);
-}
-
-static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
-{
- //FIXME: could cut some load/stores by merging transpose with filter
- DECLARE_ALIGNED(8, uint8_t, trans)[8*4];
- transpose4x4(trans, pix-2, 8, stride);
- transpose4x4(trans+4, pix-2+4*stride, 8, stride);
- h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
- transpose4x4(pix-2, trans, stride, 8);
- transpose4x4(pix-2+4*stride, trans+4, stride, 8);
-}
-
-// p0 = (p0 + q1 + 2*p1 + 2) >> 2
-#define H264_FILTER_CHROMA4(p0, p1, q1, one) \
- "movq "#p0", %%mm4 \n\t"\
- "pxor "#q1", %%mm4 \n\t"\
- "pand "#one", %%mm4 \n\t" /* mm4 = (p0^q1)&1 */\
- "pavgb "#q1", "#p0" \n\t"\
- "psubusb %%mm4, "#p0" \n\t"\
- "pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\
-
-static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
-{
- __asm__ volatile(
- "movq (%0), %%mm0 \n\t"
- "movq (%0,%2), %%mm1 \n\t"
- "movq (%1), %%mm2 \n\t"
- "movq (%1,%2), %%mm3 \n\t"
- H264_DEBLOCK_MASK(%3, %4)
- "movq %%mm1, %%mm5 \n\t"
- "movq %%mm2, %%mm6 \n\t"
- H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0'
- H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0'
- "psubb %%mm5, %%mm1 \n\t"
- "psubb %%mm6, %%mm2 \n\t"
- "pand %%mm7, %%mm1 \n\t"
- "pand %%mm7, %%mm2 \n\t"
- "paddb %%mm5, %%mm1 \n\t"
- "paddb %%mm6, %%mm2 \n\t"
- "movq %%mm1, (%0,%2) \n\t"
- "movq %%mm2, (%1) \n\t"
- :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
- "m"(alpha1), "m"(beta1), "m"(ff_bone)
- );
-}
-
-static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
-{
- h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);
-}
-
-static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
-{
- //FIXME: could cut some load/stores by merging transpose with filter
- DECLARE_ALIGNED(8, uint8_t, trans)[8*4];
- transpose4x4(trans, pix-2, 8, stride);
- transpose4x4(trans+4, pix-2+4*stride, 8, stride);
- h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
- transpose4x4(pix-2, trans, stride, 8);
- transpose4x4(pix-2+4*stride, trans+4, stride, 8);
-}
-
static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
int dir;
@@ -917,1507 +188,162 @@ static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40] );
}
-/***********************************/
-/* motion compensation */
-
-#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
- "mov"#q" "#C", "#T" \n\t"\
- "mov"#d" (%0), "#F" \n\t"\
- "paddw "#D", "#T" \n\t"\
- "psllw $2, "#T" \n\t"\
- "psubw "#B", "#T" \n\t"\
- "psubw "#E", "#T" \n\t"\
- "punpcklbw "#Z", "#F" \n\t"\
- "pmullw %4, "#T" \n\t"\
- "paddw %5, "#A" \n\t"\
- "add %2, %0 \n\t"\
- "paddw "#F", "#A" \n\t"\
- "paddw "#A", "#T" \n\t"\
- "psraw $5, "#T" \n\t"\
- "packuswb "#T", "#T" \n\t"\
- OP(T, (%1), A, d)\
- "add %3, %1 \n\t"
-
-#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
- "mov"#q" "#C", "#T" \n\t"\
- "mov"#d" (%0), "#F" \n\t"\
- "paddw "#D", "#T" \n\t"\
- "psllw $2, "#T" \n\t"\
- "paddw %4, "#A" \n\t"\
- "psubw "#B", "#T" \n\t"\
- "psubw "#E", "#T" \n\t"\
- "punpcklbw "#Z", "#F" \n\t"\
- "pmullw %3, "#T" \n\t"\
- "paddw "#F", "#A" \n\t"\
- "add %2, %0 \n\t"\
- "paddw "#A", "#T" \n\t"\
- "mov"#q" "#T", "#OF"(%1) \n\t"
-
-#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
-#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
-#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
-#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
-
-
-#define QPEL_H264(OPNAME, OP, MMX)\
-static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- int h=4;\
-\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\
- "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
- "1: \n\t"\
- "movd -1(%0), %%mm1 \n\t"\
- "movd (%0), %%mm2 \n\t"\
- "movd 1(%0), %%mm3 \n\t"\
- "movd 2(%0), %%mm0 \n\t"\
- "punpcklbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpcklbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "paddw %%mm0, %%mm1 \n\t"\
- "paddw %%mm3, %%mm2 \n\t"\
- "movd -2(%0), %%mm0 \n\t"\
- "movd 3(%0), %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpcklbw %%mm7, %%mm3 \n\t"\
- "paddw %%mm3, %%mm0 \n\t"\
- "psllw $2, %%mm2 \n\t"\
- "psubw %%mm1, %%mm2 \n\t"\
- "pmullw %%mm4, %%mm2 \n\t"\
- "paddw %%mm5, %%mm0 \n\t"\
- "paddw %%mm2, %%mm0 \n\t"\
- "psraw $5, %%mm0 \n\t"\
- "packuswb %%mm0, %%mm0 \n\t"\
- OP(%%mm0, (%1),%%mm6, d)\
- "add %3, %0 \n\t"\
- "add %4, %1 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- : "+a"(src), "+c"(dst), "+g"(h)\
- : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
- : "memory"\
- );\
-}\
-static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
- int h=4;\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "movq %0, %%mm4 \n\t"\
- "movq %1, %%mm5 \n\t"\
- :: "m"(ff_pw_5), "m"(ff_pw_16)\
- );\
- do{\
- __asm__ volatile(\
- "movd -1(%0), %%mm1 \n\t"\
- "movd (%0), %%mm2 \n\t"\
- "movd 1(%0), %%mm3 \n\t"\
- "movd 2(%0), %%mm0 \n\t"\
- "punpcklbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpcklbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "paddw %%mm0, %%mm1 \n\t"\
- "paddw %%mm3, %%mm2 \n\t"\
- "movd -2(%0), %%mm0 \n\t"\
- "movd 3(%0), %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpcklbw %%mm7, %%mm3 \n\t"\
- "paddw %%mm3, %%mm0 \n\t"\
- "psllw $2, %%mm2 \n\t"\
- "psubw %%mm1, %%mm2 \n\t"\
- "pmullw %%mm4, %%mm2 \n\t"\
- "paddw %%mm5, %%mm0 \n\t"\
- "paddw %%mm2, %%mm0 \n\t"\
- "movd (%2), %%mm3 \n\t"\
- "psraw $5, %%mm0 \n\t"\
- "packuswb %%mm0, %%mm0 \n\t"\
- PAVGB" %%mm3, %%mm0 \n\t"\
- OP(%%mm0, (%1),%%mm6, d)\
- "add %4, %0 \n\t"\
- "add %4, %1 \n\t"\
- "add %3, %2 \n\t"\
- : "+a"(src), "+c"(dst), "+d"(src2)\
- : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
- : "memory"\
- );\
- }while(--h);\
-}\
-static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- src -= 2*srcStride;\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "movd (%0), %%mm0 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm1 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm2 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm3 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm4 \n\t"\
- "add %2, %0 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpcklbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpcklbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm4 \n\t"\
- QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
- QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
- QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
- QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
- \
- : "+a"(src), "+c"(dst)\
- : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
- : "memory"\
- );\
-}\
-static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- int h=4;\
- int w=3;\
- src -= 2*srcStride+2;\
- while(w--){\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "movd (%0), %%mm0 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm1 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm2 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm3 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm4 \n\t"\
- "add %2, %0 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpcklbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpcklbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm4 \n\t"\
- QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
- QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
- QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
- QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
- \
- : "+a"(src)\
- : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
- : "memory"\
- );\
- tmp += 4;\
- src += 4 - 9*srcStride;\
- }\
- tmp -= 3*4;\
- __asm__ volatile(\
- "1: \n\t"\
- "movq (%0), %%mm0 \n\t"\
- "paddw 10(%0), %%mm0 \n\t"\
- "movq 2(%0), %%mm1 \n\t"\
- "paddw 8(%0), %%mm1 \n\t"\
- "movq 4(%0), %%mm2 \n\t"\
- "paddw 6(%0), %%mm2 \n\t"\
- "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\
- "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\
- "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\
- "paddsw %%mm2, %%mm0 \n\t"\
- "psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\
- "paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\
- "psraw $6, %%mm0 \n\t"\
- "packuswb %%mm0, %%mm0 \n\t"\
- OP(%%mm0, (%1),%%mm7, d)\
- "add $24, %0 \n\t"\
- "add %3, %1 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- : "+a"(tmp), "+c"(dst), "+g"(h)\
- : "S"((x86_reg)dstStride)\
- : "memory"\
- );\
-}\
-\
-static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- int h=8;\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\
- "1: \n\t"\
- "movq (%0), %%mm0 \n\t"\
- "movq 1(%0), %%mm2 \n\t"\
- "movq %%mm0, %%mm1 \n\t"\
- "movq %%mm2, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpckhbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpckhbw %%mm7, %%mm3 \n\t"\
- "paddw %%mm2, %%mm0 \n\t"\
- "paddw %%mm3, %%mm1 \n\t"\
- "psllw $2, %%mm0 \n\t"\
- "psllw $2, %%mm1 \n\t"\
- "movq -1(%0), %%mm2 \n\t"\
- "movq 2(%0), %%mm4 \n\t"\
- "movq %%mm2, %%mm3 \n\t"\
- "movq %%mm4, %%mm5 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpckhbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm4 \n\t"\
- "punpckhbw %%mm7, %%mm5 \n\t"\
- "paddw %%mm4, %%mm2 \n\t"\
- "paddw %%mm3, %%mm5 \n\t"\
- "psubw %%mm2, %%mm0 \n\t"\
- "psubw %%mm5, %%mm1 \n\t"\
- "pmullw %%mm6, %%mm0 \n\t"\
- "pmullw %%mm6, %%mm1 \n\t"\
- "movd -2(%0), %%mm2 \n\t"\
- "movd 7(%0), %%mm5 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpcklbw %%mm7, %%mm5 \n\t"\
- "paddw %%mm3, %%mm2 \n\t"\
- "paddw %%mm5, %%mm4 \n\t"\
- "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\
- "paddw %%mm5, %%mm2 \n\t"\
- "paddw %%mm5, %%mm4 \n\t"\
- "paddw %%mm2, %%mm0 \n\t"\
- "paddw %%mm4, %%mm1 \n\t"\
- "psraw $5, %%mm0 \n\t"\
- "psraw $5, %%mm1 \n\t"\
- "packuswb %%mm1, %%mm0 \n\t"\
- OP(%%mm0, (%1),%%mm5, q)\
- "add %3, %0 \n\t"\
- "add %4, %1 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- : "+a"(src), "+c"(dst), "+g"(h)\
- : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
- : "memory"\
- );\
-}\
-\
-static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
- int h=8;\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "movq %0, %%mm6 \n\t"\
- :: "m"(ff_pw_5)\
- );\
- do{\
- __asm__ volatile(\
- "movq (%0), %%mm0 \n\t"\
- "movq 1(%0), %%mm2 \n\t"\
- "movq %%mm0, %%mm1 \n\t"\
- "movq %%mm2, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpckhbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpckhbw %%mm7, %%mm3 \n\t"\
- "paddw %%mm2, %%mm0 \n\t"\
- "paddw %%mm3, %%mm1 \n\t"\
- "psllw $2, %%mm0 \n\t"\
- "psllw $2, %%mm1 \n\t"\
- "movq -1(%0), %%mm2 \n\t"\
- "movq 2(%0), %%mm4 \n\t"\
- "movq %%mm2, %%mm3 \n\t"\
- "movq %%mm4, %%mm5 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpckhbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm4 \n\t"\
- "punpckhbw %%mm7, %%mm5 \n\t"\
- "paddw %%mm4, %%mm2 \n\t"\
- "paddw %%mm3, %%mm5 \n\t"\
- "psubw %%mm2, %%mm0 \n\t"\
- "psubw %%mm5, %%mm1 \n\t"\
- "pmullw %%mm6, %%mm0 \n\t"\
- "pmullw %%mm6, %%mm1 \n\t"\
- "movd -2(%0), %%mm2 \n\t"\
- "movd 7(%0), %%mm5 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpcklbw %%mm7, %%mm5 \n\t"\
- "paddw %%mm3, %%mm2 \n\t"\
- "paddw %%mm5, %%mm4 \n\t"\
- "movq %5, %%mm5 \n\t"\
- "paddw %%mm5, %%mm2 \n\t"\
- "paddw %%mm5, %%mm4 \n\t"\
- "paddw %%mm2, %%mm0 \n\t"\
- "paddw %%mm4, %%mm1 \n\t"\
- "psraw $5, %%mm0 \n\t"\
- "psraw $5, %%mm1 \n\t"\
- "movq (%2), %%mm4 \n\t"\
- "packuswb %%mm1, %%mm0 \n\t"\
- PAVGB" %%mm4, %%mm0 \n\t"\
- OP(%%mm0, (%1),%%mm5, q)\
- "add %4, %0 \n\t"\
- "add %4, %1 \n\t"\
- "add %3, %2 \n\t"\
- : "+a"(src), "+c"(dst), "+d"(src2)\
- : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
- "m"(ff_pw_16)\
- : "memory"\
- );\
- }while(--h);\
-}\
-\
-static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
- int w= 2;\
- src -= 2*srcStride;\
- \
- while(w--){\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "movd (%0), %%mm0 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm1 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm2 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm3 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm4 \n\t"\
- "add %2, %0 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpcklbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpcklbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm4 \n\t"\
- QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
- QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
- QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
- QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
- QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
- QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
- QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
- QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
- \
- : "+a"(src), "+c"(dst)\
- : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
- : "memory"\
- );\
- if(h==16){\
- __asm__ volatile(\
- QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
- QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
- QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
- QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
- QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
- QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
- QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
- QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
- \
- : "+a"(src), "+c"(dst)\
- : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
- : "memory"\
- );\
- }\
- src += 4-(h+5)*srcStride;\
- dst += 4-h*dstStride;\
- }\
-}\
-static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
- int w = (size+8)>>2;\
- src -= 2*srcStride+2;\
- while(w--){\
- __asm__ volatile(\
- "pxor %%mm7, %%mm7 \n\t"\
- "movd (%0), %%mm0 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm1 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm2 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm3 \n\t"\
- "add %2, %0 \n\t"\
- "movd (%0), %%mm4 \n\t"\
- "add %2, %0 \n\t"\
- "punpcklbw %%mm7, %%mm0 \n\t"\
- "punpcklbw %%mm7, %%mm1 \n\t"\
- "punpcklbw %%mm7, %%mm2 \n\t"\
- "punpcklbw %%mm7, %%mm3 \n\t"\
- "punpcklbw %%mm7, %%mm4 \n\t"\
- QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
- QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
- QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
- QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
- QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
- QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
- QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
- QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
- : "+a"(src)\
- : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
- : "memory"\
- );\
- if(size==16){\
- __asm__ volatile(\
- QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
- QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
- QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
- QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
- QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
- QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
- QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
- QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
- : "+a"(src)\
- : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
- : "memory"\
- );\
- }\
- tmp += 4;\
- src += 4 - (size+5)*srcStride;\
- }\
-}\
-static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
- int w = size>>4;\
- do{\
- int h = size;\
- __asm__ volatile(\
- "1: \n\t"\
- "movq (%0), %%mm0 \n\t"\
- "movq 8(%0), %%mm3 \n\t"\
- "movq 2(%0), %%mm1 \n\t"\
- "movq 10(%0), %%mm4 \n\t"\
- "paddw %%mm4, %%mm0 \n\t"\
- "paddw %%mm3, %%mm1 \n\t"\
- "paddw 18(%0), %%mm3 \n\t"\
- "paddw 16(%0), %%mm4 \n\t"\
- "movq 4(%0), %%mm2 \n\t"\
- "movq 12(%0), %%mm5 \n\t"\
- "paddw 6(%0), %%mm2 \n\t"\
- "paddw 14(%0), %%mm5 \n\t"\
- "psubw %%mm1, %%mm0 \n\t"\
- "psubw %%mm4, %%mm3 \n\t"\
- "psraw $2, %%mm0 \n\t"\
- "psraw $2, %%mm3 \n\t"\
- "psubw %%mm1, %%mm0 \n\t"\
- "psubw %%mm4, %%mm3 \n\t"\
- "paddsw %%mm2, %%mm0 \n\t"\
- "paddsw %%mm5, %%mm3 \n\t"\
- "psraw $2, %%mm0 \n\t"\
- "psraw $2, %%mm3 \n\t"\
- "paddw %%mm2, %%mm0 \n\t"\
- "paddw %%mm5, %%mm3 \n\t"\
- "psraw $6, %%mm0 \n\t"\
- "psraw $6, %%mm3 \n\t"\
- "packuswb %%mm3, %%mm0 \n\t"\
- OP(%%mm0, (%1),%%mm7, q)\
- "add $48, %0 \n\t"\
- "add %3, %1 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- : "+a"(tmp), "+c"(dst), "+g"(h)\
- : "S"((x86_reg)dstStride)\
- : "memory"\
- );\
- tmp += 8 - size*24;\
- dst += 8 - size*dstStride;\
- }while(w--);\
-}\
-\
-static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
-}\
-static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
- OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
-}\
-\
-static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
- OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
- src += 8*srcStride;\
- dst += 8*dstStride;\
- OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
- OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
-}\
-\
-static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
- OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
- OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
- src += 8*dstStride;\
- dst += 8*dstStride;\
- src2 += 8*src2Stride;\
- OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
- OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
-}\
-\
-static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
- put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
- OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
-}\
-static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
-}\
-\
-static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
-}\
-\
-static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
-{\
- __asm__ volatile(\
- "movq (%1), %%mm0 \n\t"\
- "movq 24(%1), %%mm1 \n\t"\
- "psraw $5, %%mm0 \n\t"\
- "psraw $5, %%mm1 \n\t"\
- "packuswb %%mm0, %%mm0 \n\t"\
- "packuswb %%mm1, %%mm1 \n\t"\
- PAVGB" (%0), %%mm0 \n\t"\
- PAVGB" (%0,%3), %%mm1 \n\t"\
- OP(%%mm0, (%2), %%mm4, d)\
- OP(%%mm1, (%2,%4), %%mm5, d)\
- "lea (%0,%3,2), %0 \n\t"\
- "lea (%2,%4,2), %2 \n\t"\
- "movq 48(%1), %%mm0 \n\t"\
- "movq 72(%1), %%mm1 \n\t"\
- "psraw $5, %%mm0 \n\t"\
- "psraw $5, %%mm1 \n\t"\
- "packuswb %%mm0, %%mm0 \n\t"\
- "packuswb %%mm1, %%mm1 \n\t"\
- PAVGB" (%0), %%mm0 \n\t"\
- PAVGB" (%0,%3), %%mm1 \n\t"\
- OP(%%mm0, (%2), %%mm4, d)\
- OP(%%mm1, (%2,%4), %%mm5, d)\
- :"+a"(src8), "+c"(src16), "+d"(dst)\
- :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
- :"memory");\
-}\
-static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
-{\
- do{\
- __asm__ volatile(\
- "movq (%1), %%mm0 \n\t"\
- "movq 8(%1), %%mm1 \n\t"\
- "movq 48(%1), %%mm2 \n\t"\
- "movq 8+48(%1), %%mm3 \n\t"\
- "psraw $5, %%mm0 \n\t"\
- "psraw $5, %%mm1 \n\t"\
- "psraw $5, %%mm2 \n\t"\
- "psraw $5, %%mm3 \n\t"\
- "packuswb %%mm1, %%mm0 \n\t"\
- "packuswb %%mm3, %%mm2 \n\t"\
- PAVGB" (%0), %%mm0 \n\t"\
- PAVGB" (%0,%3), %%mm2 \n\t"\
- OP(%%mm0, (%2), %%mm5, q)\
- OP(%%mm2, (%2,%4), %%mm5, q)\
- ::"a"(src8), "c"(src16), "d"(dst),\
- "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
- :"memory");\
- src8 += 2L*src8Stride;\
- src16 += 48;\
- dst += 2L*dstStride;\
- }while(h-=2);\
-}\
-static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
-{\
- OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
- OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
-}\
-
-
-#if ARCH_X86_64
-#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
-static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
- int h=16;\
- __asm__ volatile(\
- "pxor %%xmm15, %%xmm15 \n\t"\
- "movdqa %6, %%xmm14 \n\t"\
- "movdqa %7, %%xmm13 \n\t"\
- "1: \n\t"\
- "lddqu 6(%0), %%xmm1 \n\t"\
- "lddqu -2(%0), %%xmm7 \n\t"\
- "movdqa %%xmm1, %%xmm0 \n\t"\
- "punpckhbw %%xmm15, %%xmm1 \n\t"\
- "punpcklbw %%xmm15, %%xmm0 \n\t"\
- "punpcklbw %%xmm15, %%xmm7 \n\t"\
- "movdqa %%xmm1, %%xmm2 \n\t"\
- "movdqa %%xmm0, %%xmm6 \n\t"\
- "movdqa %%xmm1, %%xmm3 \n\t"\
- "movdqa %%xmm0, %%xmm8 \n\t"\
- "movdqa %%xmm1, %%xmm4 \n\t"\
- "movdqa %%xmm0, %%xmm9 \n\t"\
- "movdqa %%xmm0, %%xmm12 \n\t"\
- "movdqa %%xmm1, %%xmm11 \n\t"\
- "palignr $10,%%xmm0, %%xmm11\n\t"\
- "palignr $10,%%xmm7, %%xmm12\n\t"\
- "palignr $2, %%xmm0, %%xmm4 \n\t"\
- "palignr $2, %%xmm7, %%xmm9 \n\t"\
- "palignr $4, %%xmm0, %%xmm3 \n\t"\
- "palignr $4, %%xmm7, %%xmm8 \n\t"\
- "palignr $6, %%xmm0, %%xmm2 \n\t"\
- "palignr $6, %%xmm7, %%xmm6 \n\t"\
- "paddw %%xmm0 ,%%xmm11 \n\t"\
- "palignr $8, %%xmm0, %%xmm1 \n\t"\
- "palignr $8, %%xmm7, %%xmm0 \n\t"\
- "paddw %%xmm12,%%xmm7 \n\t"\
- "paddw %%xmm3, %%xmm2 \n\t"\
- "paddw %%xmm8, %%xmm6 \n\t"\
- "paddw %%xmm4, %%xmm1 \n\t"\
- "paddw %%xmm9, %%xmm0 \n\t"\
- "psllw $2, %%xmm2 \n\t"\
- "psllw $2, %%xmm6 \n\t"\
- "psubw %%xmm1, %%xmm2 \n\t"\
- "psubw %%xmm0, %%xmm6 \n\t"\
- "paddw %%xmm13,%%xmm11 \n\t"\
- "paddw %%xmm13,%%xmm7 \n\t"\
- "pmullw %%xmm14,%%xmm2 \n\t"\
- "pmullw %%xmm14,%%xmm6 \n\t"\
- "lddqu (%2), %%xmm3 \n\t"\
- "paddw %%xmm11,%%xmm2 \n\t"\
- "paddw %%xmm7, %%xmm6 \n\t"\
- "psraw $5, %%xmm2 \n\t"\
- "psraw $5, %%xmm6 \n\t"\
- "packuswb %%xmm2,%%xmm6 \n\t"\
- "pavgb %%xmm3, %%xmm6 \n\t"\
- OP(%%xmm6, (%1), %%xmm4, dqa)\
- "add %5, %0 \n\t"\
- "add %5, %1 \n\t"\
- "add %4, %2 \n\t"\
- "decl %3 \n\t"\
- "jg 1b \n\t"\
- : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
- : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
- "m"(ff_pw_5), "m"(ff_pw_16)\
- : "memory"\
- );\
-}
-#else // ARCH_X86_64
-#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
-static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
- OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
- OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
- src += 8*dstStride;\
- dst += 8*dstStride;\
- src2 += 8*src2Stride;\
- OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
- OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
-}
-#endif // ARCH_X86_64
-
-#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
-static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
- int h=8;\
- __asm__ volatile(\
- "pxor %%xmm7, %%xmm7 \n\t"\
- "movdqa %0, %%xmm6 \n\t"\
- :: "m"(ff_pw_5)\
- );\
- do{\
- __asm__ volatile(\
- "lddqu -2(%0), %%xmm1 \n\t"\
- "movdqa %%xmm1, %%xmm0 \n\t"\
- "punpckhbw %%xmm7, %%xmm1 \n\t"\
- "punpcklbw %%xmm7, %%xmm0 \n\t"\
- "movdqa %%xmm1, %%xmm2 \n\t"\
- "movdqa %%xmm1, %%xmm3 \n\t"\
- "movdqa %%xmm1, %%xmm4 \n\t"\
- "movdqa %%xmm1, %%xmm5 \n\t"\
- "palignr $2, %%xmm0, %%xmm4 \n\t"\
- "palignr $4, %%xmm0, %%xmm3 \n\t"\
- "palignr $6, %%xmm0, %%xmm2 \n\t"\
- "palignr $8, %%xmm0, %%xmm1 \n\t"\
- "palignr $10,%%xmm0, %%xmm5 \n\t"\
- "paddw %%xmm5, %%xmm0 \n\t"\
- "paddw %%xmm3, %%xmm2 \n\t"\
- "paddw %%xmm4, %%xmm1 \n\t"\
- "psllw $2, %%xmm2 \n\t"\
- "movq (%2), %%xmm3 \n\t"\
- "psubw %%xmm1, %%xmm2 \n\t"\
- "paddw %5, %%xmm0 \n\t"\
- "pmullw %%xmm6, %%xmm2 \n\t"\
- "paddw %%xmm0, %%xmm2 \n\t"\
- "psraw $5, %%xmm2 \n\t"\
- "packuswb %%xmm2, %%xmm2 \n\t"\
- "pavgb %%xmm3, %%xmm2 \n\t"\
- OP(%%xmm2, (%1), %%xmm4, q)\
- "add %4, %0 \n\t"\
- "add %4, %1 \n\t"\
- "add %3, %2 \n\t"\
- : "+a"(src), "+c"(dst), "+d"(src2)\
- : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
- "m"(ff_pw_16)\
- : "memory"\
- );\
- }while(--h);\
-}\
-QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
-\
-static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- int h=8;\
- __asm__ volatile(\
- "pxor %%xmm7, %%xmm7 \n\t"\
- "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\
- "1: \n\t"\
- "lddqu -2(%0), %%xmm1 \n\t"\
- "movdqa %%xmm1, %%xmm0 \n\t"\
- "punpckhbw %%xmm7, %%xmm1 \n\t"\
- "punpcklbw %%xmm7, %%xmm0 \n\t"\
- "movdqa %%xmm1, %%xmm2 \n\t"\
- "movdqa %%xmm1, %%xmm3 \n\t"\
- "movdqa %%xmm1, %%xmm4 \n\t"\
- "movdqa %%xmm1, %%xmm5 \n\t"\
- "palignr $2, %%xmm0, %%xmm4 \n\t"\
- "palignr $4, %%xmm0, %%xmm3 \n\t"\
- "palignr $6, %%xmm0, %%xmm2 \n\t"\
- "palignr $8, %%xmm0, %%xmm1 \n\t"\
- "palignr $10,%%xmm0, %%xmm5 \n\t"\
- "paddw %%xmm5, %%xmm0 \n\t"\
- "paddw %%xmm3, %%xmm2 \n\t"\
- "paddw %%xmm4, %%xmm1 \n\t"\
- "psllw $2, %%xmm2 \n\t"\
- "psubw %%xmm1, %%xmm2 \n\t"\
- "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\
- "pmullw %%xmm6, %%xmm2 \n\t"\
- "paddw %%xmm0, %%xmm2 \n\t"\
- "psraw $5, %%xmm2 \n\t"\
- "packuswb %%xmm2, %%xmm2 \n\t"\
- OP(%%xmm2, (%1), %%xmm4, q)\
- "add %3, %0 \n\t"\
- "add %4, %1 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- : "+a"(src), "+c"(dst), "+g"(h)\
- : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\
- : "memory"\
- );\
-}\
-static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
- OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
- src += 8*srcStride;\
- dst += 8*dstStride;\
- OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
- OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
-}\
-
-#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
-static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
- src -= 2*srcStride;\
- \
- __asm__ volatile(\
- "pxor %%xmm7, %%xmm7 \n\t"\
- "movq (%0), %%xmm0 \n\t"\
- "add %2, %0 \n\t"\
- "movq (%0), %%xmm1 \n\t"\
- "add %2, %0 \n\t"\
- "movq (%0), %%xmm2 \n\t"\
- "add %2, %0 \n\t"\
- "movq (%0), %%xmm3 \n\t"\
- "add %2, %0 \n\t"\
- "movq (%0), %%xmm4 \n\t"\
- "add %2, %0 \n\t"\
- "punpcklbw %%xmm7, %%xmm0 \n\t"\
- "punpcklbw %%xmm7, %%xmm1 \n\t"\
- "punpcklbw %%xmm7, %%xmm2 \n\t"\
- "punpcklbw %%xmm7, %%xmm3 \n\t"\
- "punpcklbw %%xmm7, %%xmm4 \n\t"\
- QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
- QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
- QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
- QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
- QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
- QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
- QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
- QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
- \
- : "+a"(src), "+c"(dst)\
- : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
- : "memory"\
- );\
- if(h==16){\
- __asm__ volatile(\
- QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
- QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
- QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
- QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
- QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
- QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
- QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
- QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
- \
- : "+a"(src), "+c"(dst)\
- : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
- : "memory"\
- );\
- }\
-}\
-static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
-}\
-static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
- OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
- OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
-}
-
-static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
- int w = (size+8)>>3;
- src -= 2*srcStride+2;
- while(w--){
- __asm__ volatile(
- "pxor %%xmm7, %%xmm7 \n\t"
- "movq (%0), %%xmm0 \n\t"
- "add %2, %0 \n\t"
- "movq (%0), %%xmm1 \n\t"
- "add %2, %0 \n\t"
- "movq (%0), %%xmm2 \n\t"
- "add %2, %0 \n\t"
- "movq (%0), %%xmm3 \n\t"
- "add %2, %0 \n\t"
- "movq (%0), %%xmm4 \n\t"
- "add %2, %0 \n\t"
- "punpcklbw %%xmm7, %%xmm0 \n\t"
- "punpcklbw %%xmm7, %%xmm1 \n\t"
- "punpcklbw %%xmm7, %%xmm2 \n\t"
- "punpcklbw %%xmm7, %%xmm3 \n\t"
- "punpcklbw %%xmm7, %%xmm4 \n\t"
- QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
- QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
- QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
- QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
- QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
- QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
- QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
- QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
- : "+a"(src)
- : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
- : "memory"
- );
- if(size==16){
- __asm__ volatile(
- QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48)
- QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48)
- QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
- QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
- QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
- QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
- QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
- QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
- : "+a"(src)
- : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
- : "memory"
- );
- }
- tmp += 8;
- src += 8 - (size+5)*srcStride;
- }
-}
-
-#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
-static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
- int h = size;\
- if(size == 16){\
- __asm__ volatile(\
- "1: \n\t"\
- "movdqa 32(%0), %%xmm4 \n\t"\
- "movdqa 16(%0), %%xmm5 \n\t"\
- "movdqa (%0), %%xmm7 \n\t"\
- "movdqa %%xmm4, %%xmm3 \n\t"\
- "movdqa %%xmm4, %%xmm2 \n\t"\
- "movdqa %%xmm4, %%xmm1 \n\t"\
- "movdqa %%xmm4, %%xmm0 \n\t"\
- "palignr $10, %%xmm5, %%xmm0 \n\t"\
- "palignr $8, %%xmm5, %%xmm1 \n\t"\
- "palignr $6, %%xmm5, %%xmm2 \n\t"\
- "palignr $4, %%xmm5, %%xmm3 \n\t"\
- "palignr $2, %%xmm5, %%xmm4 \n\t"\
- "paddw %%xmm5, %%xmm0 \n\t"\
- "paddw %%xmm4, %%xmm1 \n\t"\
- "paddw %%xmm3, %%xmm2 \n\t"\
- "movdqa %%xmm5, %%xmm6 \n\t"\
- "movdqa %%xmm5, %%xmm4 \n\t"\
- "movdqa %%xmm5, %%xmm3 \n\t"\
- "palignr $8, %%xmm7, %%xmm4 \n\t"\
- "palignr $2, %%xmm7, %%xmm6 \n\t"\
- "palignr $10, %%xmm7, %%xmm3 \n\t"\
- "paddw %%xmm6, %%xmm4 \n\t"\
- "movdqa %%xmm5, %%xmm6 \n\t"\
- "palignr $6, %%xmm7, %%xmm5 \n\t"\
- "palignr $4, %%xmm7, %%xmm6 \n\t"\
- "paddw %%xmm7, %%xmm3 \n\t"\
- "paddw %%xmm6, %%xmm5 \n\t"\
- \
- "psubw %%xmm1, %%xmm0 \n\t"\
- "psubw %%xmm4, %%xmm3 \n\t"\
- "psraw $2, %%xmm0 \n\t"\
- "psraw $2, %%xmm3 \n\t"\
- "psubw %%xmm1, %%xmm0 \n\t"\
- "psubw %%xmm4, %%xmm3 \n\t"\
- "paddw %%xmm2, %%xmm0 \n\t"\
- "paddw %%xmm5, %%xmm3 \n\t"\
- "psraw $2, %%xmm0 \n\t"\
- "psraw $2, %%xmm3 \n\t"\
- "paddw %%xmm2, %%xmm0 \n\t"\
- "paddw %%xmm5, %%xmm3 \n\t"\
- "psraw $6, %%xmm0 \n\t"\
- "psraw $6, %%xmm3 \n\t"\
- "packuswb %%xmm0, %%xmm3 \n\t"\
- OP(%%xmm3, (%1), %%xmm7, dqa)\
- "add $48, %0 \n\t"\
- "add %3, %1 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- : "+a"(tmp), "+c"(dst), "+g"(h)\
- : "S"((x86_reg)dstStride)\
- : "memory"\
- );\
- }else{\
- __asm__ volatile(\
- "1: \n\t"\
- "movdqa 16(%0), %%xmm1 \n\t"\
- "movdqa (%0), %%xmm0 \n\t"\
- "movdqa %%xmm1, %%xmm2 \n\t"\
- "movdqa %%xmm1, %%xmm3 \n\t"\
- "movdqa %%xmm1, %%xmm4 \n\t"\
- "movdqa %%xmm1, %%xmm5 \n\t"\
- "palignr $10, %%xmm0, %%xmm5 \n\t"\
- "palignr $8, %%xmm0, %%xmm4 \n\t"\
- "palignr $6, %%xmm0, %%xmm3 \n\t"\
- "palignr $4, %%xmm0, %%xmm2 \n\t"\
- "palignr $2, %%xmm0, %%xmm1 \n\t"\
- "paddw %%xmm5, %%xmm0 \n\t"\
- "paddw %%xmm4, %%xmm1 \n\t"\
- "paddw %%xmm3, %%xmm2 \n\t"\
- "psubw %%xmm1, %%xmm0 \n\t"\
- "psraw $2, %%xmm0 \n\t"\
- "psubw %%xmm1, %%xmm0 \n\t"\
- "paddw %%xmm2, %%xmm0 \n\t"\
- "psraw $2, %%xmm0 \n\t"\
- "paddw %%xmm2, %%xmm0 \n\t"\
- "psraw $6, %%xmm0 \n\t"\
- "packuswb %%xmm0, %%xmm0 \n\t"\
- OP(%%xmm0, (%1), %%xmm7, q)\
- "add $48, %0 \n\t"\
- "add %3, %1 \n\t"\
- "decl %2 \n\t"\
- " jnz 1b \n\t"\
- : "+a"(tmp), "+c"(dst), "+g"(h)\
- : "S"((x86_reg)dstStride)\
- : "memory"\
- );\
- }\
-}
-
-#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
-static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
- put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
- OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
-}\
-static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
-}\
-static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
-}\
-
-#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
-#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
-#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
-#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
-#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
-#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
-#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
-#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2
-
-#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
-#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
-#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
-#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
-#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
-#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
-#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
-#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2
-
-#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
-#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
-#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
-#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2
-
-#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
-#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
-#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
-#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2
-
-#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
-#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
-
-#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
-H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
-H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
-H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
-H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
-
-static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
- put_pixels16_sse2(dst, src, stride, 16);
-}
-static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
- avg_pixels16_sse2(dst, src, stride, 16);
-}
-#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
-#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
-
-#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
-}\
-
-#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
-}\
-
-#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
- put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
- OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
- put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
- OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
-}\
-
-#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
- put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
- OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
- put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
- OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
- put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
- OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
- put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
- OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
- OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
- uint8_t * const halfHV= temp;\
- int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- assert(((int)temp & 7) == 0);\
- put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
- OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
- uint8_t * const halfHV= temp;\
- int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- assert(((int)temp & 7) == 0);\
- put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
- OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
- uint8_t * const halfHV= temp;\
- int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- assert(((int)temp & 7) == 0);\
- put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
- OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
-}\
-\
-static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
- uint8_t * const halfHV= temp;\
- int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
- assert(((int)temp & 7) == 0);\
- put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
- OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
-}\
-
-#define H264_MC_4816(MMX)\
-H264_MC(put_, 4, MMX, 8)\
-H264_MC(put_, 8, MMX, 8)\
-H264_MC(put_, 16,MMX, 8)\
-H264_MC(avg_, 4, MMX, 8)\
-H264_MC(avg_, 8, MMX, 8)\
-H264_MC(avg_, 16,MMX, 8)\
-
-#define H264_MC_816(QPEL, XMM)\
-QPEL(put_, 8, XMM, 16)\
-QPEL(put_, 16,XMM, 16)\
-QPEL(avg_, 8, XMM, 16)\
-QPEL(avg_, 16,XMM, 16)\
-
-
-#define AVG_3DNOW_OP(a,b,temp, size) \
-"mov" #size " " #b ", " #temp " \n\t"\
-"pavgusb " #temp ", " #a " \n\t"\
-"mov" #size " " #a ", " #b " \n\t"
-#define AVG_MMX2_OP(a,b,temp, size) \
-"mov" #size " " #b ", " #temp " \n\t"\
-"pavgb " #temp ", " #a " \n\t"\
-"mov" #size " " #a ", " #b " \n\t"
-
-#define PAVGB "pavgusb"
-QPEL_H264(put_, PUT_OP, 3dnow)
-QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
-#undef PAVGB
-#define PAVGB "pavgb"
-QPEL_H264(put_, PUT_OP, mmx2)
-QPEL_H264(avg_, AVG_MMX2_OP, mmx2)
-QPEL_H264_V_XMM(put_, PUT_OP, sse2)
-QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2)
-QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
-QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2)
-#if HAVE_SSSE3
-QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
-QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3)
-QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3)
-QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3)
-QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
-QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3)
-#endif
-#undef PAVGB
-
-H264_MC_4816(3dnow)
-H264_MC_4816(mmx2)
-H264_MC_816(H264_MC_V, sse2)
-H264_MC_816(H264_MC_HV, sse2)
-#if HAVE_SSSE3
-H264_MC_816(H264_MC_H, ssse3)
-H264_MC_816(H264_MC_HV, ssse3)
-#endif
-
-/* rnd interleaved with rnd div 8, use p+1 to access rnd div 8 */
-DECLARE_ALIGNED(8, static const uint64_t, h264_rnd_reg)[4] = {
- 0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL
-};
-
-#define H264_CHROMA_OP(S,D)
-#define H264_CHROMA_OP4(S,D,T)
-#define H264_CHROMA_MC8_TMPL put_h264_chroma_generic_mc8_mmx
-#define H264_CHROMA_MC4_TMPL put_h264_chroma_generic_mc4_mmx
-#define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
-#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
-#include "dsputil_h264_template_mmx.c"
-
-static void put_h264_chroma_mc8_mmx_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
- put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg);
-}
-static void put_vc1_chroma_mc8_mmx_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
- put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg+2);
-}
-static void put_h264_chroma_mc4_mmx(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
- put_h264_chroma_generic_mc4_mmx(dst, src, stride, h, x, y, h264_rnd_reg);
-}
-
-#undef H264_CHROMA_OP
-#undef H264_CHROMA_OP4
-#undef H264_CHROMA_MC8_TMPL
-#undef H264_CHROMA_MC4_TMPL
-#undef H264_CHROMA_MC2_TMPL
-#undef H264_CHROMA_MC8_MV0
+#define LF_FUNC(DIR, TYPE, OPT) \
+void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
+ int alpha, int beta, int8_t *tc0);
+#define LF_IFUNC(DIR, TYPE, OPT) \
+void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
+ int alpha, int beta);
-#define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
-#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
- "pavgb " #T ", " #D " \n\t"
-#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_mmx2
-#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_mmx2
-#define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2
-#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
-#include "dsputil_h264_template_mmx.c"
-static void avg_h264_chroma_mc8_mmx2_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
- avg_h264_chroma_generic_mc8_mmx2(dst, src, stride, h, x, y, h264_rnd_reg);
-}
-static void avg_vc1_chroma_mc8_mmx2_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
- avg_h264_chroma_generic_mc8_mmx2(dst, src, stride, h, x, y, h264_rnd_reg+2);
-}
-static void avg_h264_chroma_mc4_mmx2(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
- avg_h264_chroma_generic_mc4_mmx2(dst, src, stride, h, x, y, h264_rnd_reg);
-}
-#undef H264_CHROMA_OP
-#undef H264_CHROMA_OP4
-#undef H264_CHROMA_MC8_TMPL
-#undef H264_CHROMA_MC4_TMPL
-#undef H264_CHROMA_MC2_TMPL
-#undef H264_CHROMA_MC8_MV0
+LF_FUNC (h, chroma, mmxext)
+LF_IFUNC(h, chroma_intra, mmxext)
+LF_FUNC (v, chroma, mmxext)
+LF_IFUNC(v, chroma_intra, mmxext)
-#define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t"
-#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
- "pavgusb " #T ", " #D " \n\t"
-#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_3dnow
-#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_3dnow
-#define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow
-#include "dsputil_h264_template_mmx.c"
-static void avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
- avg_h264_chroma_generic_mc8_3dnow(dst, src, stride, h, x, y, h264_rnd_reg);
-}
-static void avg_h264_chroma_mc4_3dnow(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+LF_FUNC (h, luma, mmxext)
+LF_IFUNC(h, luma_intra, mmxext)
+#if HAVE_YASM && ARCH_X86_32
+LF_FUNC (v8, luma, mmxext)
+static void ff_x264_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
- avg_h264_chroma_generic_mc4_3dnow(dst, src, stride, h, x, y, h264_rnd_reg);
-}
-#undef H264_CHROMA_OP
-#undef H264_CHROMA_OP4
-#undef H264_CHROMA_MC8_TMPL
-#undef H264_CHROMA_MC4_TMPL
-#undef H264_CHROMA_MC8_MV0
-
-#if HAVE_SSSE3
-#define AVG_OP(X)
-#undef H264_CHROMA_MC8_TMPL
-#undef H264_CHROMA_MC4_TMPL
-#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3
-#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3
-#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
-#include "dsputil_h264_template_ssse3.c"
-static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
- put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
-}
-static void put_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
- put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 0);
-}
-
-#undef AVG_OP
-#undef H264_CHROMA_MC8_TMPL
-#undef H264_CHROMA_MC4_TMPL
-#undef H264_CHROMA_MC8_MV0
-#define AVG_OP(X) X
-#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3
-#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3
-#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
-#include "dsputil_h264_template_ssse3.c"
-static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
-{
- avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
+ if((tc0[0] & tc0[1]) >= 0)
+ ff_x264_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0);
+ if((tc0[2] & tc0[3]) >= 0)
+ ff_x264_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2);
}
-static void avg_vc1_chroma_mc8_ssse3_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+LF_IFUNC(v8, luma_intra, mmxext)
+static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
{
- avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 0);
+ ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
+ ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
}
-#undef AVG_OP
-#undef H264_CHROMA_MC8_TMPL
-#undef H264_CHROMA_MC4_TMPL
-#undef H264_CHROMA_MC8_MV0
#endif
+LF_FUNC (h, luma, sse2)
+LF_IFUNC(h, luma_intra, sse2)
+LF_FUNC (v, luma, sse2)
+LF_IFUNC(v, luma_intra, sse2)
+
/***********************************/
/* weighted prediction */
-static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
+#define H264_WEIGHT(W, H, OPT) \
+void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
+ int stride, int log2_denom, int weight, int offset);
+
+#define H264_BIWEIGHT(W, H, OPT) \
+void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
+ uint8_t *src, int stride, int log2_denom, int weightd, \
+ int weights, int offset);
+
+#define H264_BIWEIGHT_MMX(W,H) \
+H264_WEIGHT (W, H, mmx2) \
+H264_BIWEIGHT(W, H, mmx2)
+
+#define H264_BIWEIGHT_MMX_SSE(W,H) \
+H264_BIWEIGHT_MMX(W, H) \
+H264_WEIGHT (W, H, sse2) \
+H264_BIWEIGHT (W, H, sse2) \
+H264_BIWEIGHT (W, H, ssse3)
+
+H264_BIWEIGHT_MMX_SSE(16, 16)
+H264_BIWEIGHT_MMX_SSE(16, 8)
+H264_BIWEIGHT_MMX_SSE( 8, 16)
+H264_BIWEIGHT_MMX_SSE( 8, 8)
+H264_BIWEIGHT_MMX_SSE( 8, 4)
+H264_BIWEIGHT_MMX ( 4, 8)
+H264_BIWEIGHT_MMX ( 4, 4)
+H264_BIWEIGHT_MMX ( 4, 2)
+
+void ff_h264dsp_init_x86(H264DSPContext *c)
{
- int x, y;
- offset <<= log2_denom;
- offset += (1 << log2_denom) >> 1;
- __asm__ volatile(
- "movd %0, %%mm4 \n\t"
- "movd %1, %%mm5 \n\t"
- "movd %2, %%mm6 \n\t"
- "pshufw $0, %%mm4, %%mm4 \n\t"
- "pshufw $0, %%mm5, %%mm5 \n\t"
- "pxor %%mm7, %%mm7 \n\t"
- :: "g"(weight), "g"(offset), "g"(log2_denom)
- );
- for(y=0; y<h; y+=2){
- for(x=0; x<w; x+=4){
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "movd %1, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "pmullw %%mm4, %%mm0 \n\t"
- "pmullw %%mm4, %%mm1 \n\t"
- "paddsw %%mm5, %%mm0 \n\t"
- "paddsw %%mm5, %%mm1 \n\t"
- "psraw %%mm6, %%mm0 \n\t"
- "psraw %%mm6, %%mm1 \n\t"
- "packuswb %%mm7, %%mm0 \n\t"
- "packuswb %%mm7, %%mm1 \n\t"
- "movd %%mm0, %0 \n\t"
- "movd %%mm1, %1 \n\t"
- : "+m"(*(uint32_t*)(dst+x)),
- "+m"(*(uint32_t*)(dst+x+stride))
- );
- }
- dst += 2*stride;
- }
-}
+ int mm_flags = av_get_cpu_flags();
-static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
-{
- int x, y;
- offset = ((offset + 1) | 1) << log2_denom;
- __asm__ volatile(
- "movd %0, %%mm3 \n\t"
- "movd %1, %%mm4 \n\t"
- "movd %2, %%mm5 \n\t"
- "movd %3, %%mm6 \n\t"
- "pshufw $0, %%mm3, %%mm3 \n\t"
- "pshufw $0, %%mm4, %%mm4 \n\t"
- "pshufw $0, %%mm5, %%mm5 \n\t"
- "pxor %%mm7, %%mm7 \n\t"
- :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
- );
- for(y=0; y<h; y++){
- for(x=0; x<w; x+=4){
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "movd %1, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "pmullw %%mm3, %%mm0 \n\t"
- "pmullw %%mm4, %%mm1 \n\t"
- "paddsw %%mm1, %%mm0 \n\t"
- "paddsw %%mm5, %%mm0 \n\t"
- "psraw %%mm6, %%mm0 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "movd %%mm0, %0 \n\t"
- : "+m"(*(uint32_t*)(dst+x))
- : "m"(*(uint32_t*)(src+x))
- );
- }
- src += stride;
- dst += stride;
+ if (mm_flags & AV_CPU_FLAG_MMX2) {
+ c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
}
-}
-
-#define H264_WEIGHT(W,H) \
-static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
- ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
-} \
-static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
- ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
-}
-
-H264_WEIGHT(16,16)
-H264_WEIGHT(16, 8)
-H264_WEIGHT( 8,16)
-H264_WEIGHT( 8, 8)
-H264_WEIGHT( 8, 4)
-H264_WEIGHT( 4, 8)
-H264_WEIGHT( 4, 4)
-H264_WEIGHT( 4, 2)
-
-void ff_h264_biweight_8x8_sse2(uint8_t *dst, uint8_t *src, int stride,
- int log2_denom, int weightd, int weights,
- int offset);
-
-void ff_h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
- int log2_denom, int weightd, int weights,
- int offset);
-
-void ff_h264_biweight_8x8_ssse3(uint8_t *dst, uint8_t *src, int stride,
- int log2_denom, int weightd, int weights,
- int offset);
-
-void ff_h264_biweight_16x16_ssse3(uint8_t *dst, uint8_t *src, int stride,
- int log2_denom, int weightd, int weights,
- int offset);
-
-void ff_pred16x16_vertical_mmx (uint8_t *src, int stride);
-void ff_pred16x16_vertical_sse (uint8_t *src, int stride);
-void ff_pred16x16_horizontal_mmx (uint8_t *src, int stride);
-void ff_pred16x16_horizontal_mmxext(uint8_t *src, int stride);
-void ff_pred16x16_horizontal_ssse3 (uint8_t *src, int stride);
-void ff_pred16x16_dc_mmxext (uint8_t *src, int stride);
-void ff_pred16x16_dc_sse2 (uint8_t *src, int stride);
-void ff_pred16x16_dc_ssse3 (uint8_t *src, int stride);
-void ff_pred16x16_tm_vp8_mmx (uint8_t *src, int stride);
-void ff_pred16x16_tm_vp8_mmxext (uint8_t *src, int stride);
-void ff_pred16x16_tm_vp8_sse2 (uint8_t *src, int stride);
-void ff_pred8x8_dc_rv40_mmxext (uint8_t *src, int stride);
-void ff_pred8x8_vertical_mmx (uint8_t *src, int stride);
-void ff_pred8x8_horizontal_mmx (uint8_t *src, int stride);
-void ff_pred8x8_horizontal_mmxext (uint8_t *src, int stride);
-void ff_pred8x8_horizontal_ssse3 (uint8_t *src, int stride);
-void ff_pred8x8_tm_vp8_mmx (uint8_t *src, int stride);
-void ff_pred8x8_tm_vp8_mmxext (uint8_t *src, int stride);
-void ff_pred8x8_tm_vp8_sse2 (uint8_t *src, int stride);
-void ff_pred8x8_tm_vp8_ssse3 (uint8_t *src, int stride);
-void ff_pred4x4_dc_mmxext (uint8_t *src, const uint8_t *topright, int stride);
-void ff_pred4x4_tm_vp8_mmx (uint8_t *src, const uint8_t *topright, int stride);
-void ff_pred4x4_tm_vp8_mmxext (uint8_t *src, const uint8_t *topright, int stride);
-void ff_pred4x4_tm_vp8_ssse3 (uint8_t *src, const uint8_t *topright, int stride);
-void ff_pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride);
-
-#if CONFIG_H264PRED
-void ff_h264_pred_init_x86(H264PredContext *h, int codec_id)
-{
- int mm_flags = mm_support();
-
#if HAVE_YASM
- if (mm_flags & FF_MM_MMX) {
- h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_mmx;
- h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_mmx;
- h->pred8x8 [VERT_PRED8x8] = ff_pred8x8_vertical_mmx;
- h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_mmx;
- if (codec_id == CODEC_ID_VP8) {
- h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_tm_vp8_mmx;
- h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_mmx;
- h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_mmx;
- }
- }
-
- if (mm_flags & FF_MM_MMX2) {
- h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_mmxext;
- h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_mmxext;
- h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_mmxext;
- h->pred4x4 [DC_PRED ] = ff_pred4x4_dc_mmxext;
- if (codec_id == CODEC_ID_VP8) {
- h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_tm_vp8_mmxext;
- h->pred8x8 [DC_PRED8x8 ] = ff_pred8x8_dc_rv40_mmxext;
- h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_mmxext;
- h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_mmxext;
- h->pred4x4 [VERT_PRED ] = ff_pred4x4_vertical_vp8_mmxext;
- }
- }
-
- if (mm_flags & FF_MM_SSE) {
- h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_sse;
- }
-
- if (mm_flags & FF_MM_SSE2) {
- h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_sse2;
- if (codec_id == CODEC_ID_VP8) {
- h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_tm_vp8_sse2;
- h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_sse2;
- }
- }
-
- if (mm_flags & FF_MM_SSSE3) {
- h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_ssse3;
- h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_ssse3;
- h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_ssse3;
- if (codec_id == CODEC_ID_VP8) {
- h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_tm_vp8_ssse3;
- h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_ssse3;
+ if (mm_flags & AV_CPU_FLAG_MMX) {
+ c->h264_idct_dc_add=
+ c->h264_idct_add= ff_h264_idct_add_mmx;
+ c->h264_idct8_dc_add=
+ c->h264_idct8_add= ff_h264_idct8_add_mmx;
+
+ c->h264_idct_add16 = ff_h264_idct_add16_mmx;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_mmx;
+ c->h264_idct_add8 = ff_h264_idct_add8_mmx;
+ c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;
+
+ if (mm_flags & AV_CPU_FLAG_MMX2) {
+ c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
+ c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
+ c->h264_idct_add16 = ff_h264_idct_add16_mmx2;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2;
+ c->h264_idct_add8 = ff_h264_idct_add8_mmx2;
+ c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
+
+ c->h264_v_loop_filter_chroma= ff_x264_deblock_v_chroma_mmxext;
+ c->h264_h_loop_filter_chroma= ff_x264_deblock_h_chroma_mmxext;
+ c->h264_v_loop_filter_chroma_intra= ff_x264_deblock_v_chroma_intra_mmxext;
+ c->h264_h_loop_filter_chroma_intra= ff_x264_deblock_h_chroma_intra_mmxext;
+#if ARCH_X86_32
+ c->h264_v_loop_filter_luma= ff_x264_deblock_v_luma_mmxext;
+ c->h264_h_loop_filter_luma= ff_x264_deblock_h_luma_mmxext;
+ c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
+ c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
+#endif
+ c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
+ c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
+ c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
+ c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
+ c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
+ c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
+ c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
+ c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
+
+ c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
+ c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
+ c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
+ c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
+ c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
+ c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
+ c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
+ c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
+
+ if (mm_flags&AV_CPU_FLAG_SSE2) {
+ c->h264_idct8_add = ff_h264_idct8_add_sse2;
+ c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
+
+ c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
+ c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
+ c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
+ c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
+ c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;
+
+ c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2;
+ c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2;
+ c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
+ c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
+ c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
+
+#if ARCH_X86_64 || !defined(__ICC) || __ICC > 1110
+ c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2;
+ c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2;
+ c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2;
+ c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2;
+#endif
+ c->h264_idct_add16 = ff_h264_idct_add16_sse2;
+ c->h264_idct_add8 = ff_h264_idct_add8_sse2;
+ c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2;
+ }
+ if (mm_flags&AV_CPU_FLAG_SSSE3) {
+ c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
+ c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3;
+ c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
+ c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
+ c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
+ }
}
}
#endif
}
-#endif
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/idct_sse2_xvid.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/idct_sse2_xvid.c index fc670e25d..d8a534240 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/idct_sse2_xvid.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/idct_sse2_xvid.c @@ -385,11 +385,11 @@ inline void ff_idct_xvid_sse2(short *block) void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block) { ff_idct_xvid_sse2(block); - put_pixels_clamped_mmx(block, dest, line_size); + ff_put_pixels_clamped_mmx(block, dest, line_size); } void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block) { ff_idct_xvid_sse2(block); - add_pixels_clamped_mmx(block, dest, line_size); + ff_add_pixels_clamped_mmx(block, dest, line_size); } diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/mpegvideo_mmx.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/mpegvideo_mmx.c index 75ec4b2cf..f3d0eb336 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/mpegvideo_mmx.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/mpegvideo_mmx.c @@ -22,6 +22,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/dsputil.h"
@@ -625,9 +626,9 @@ static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){ void MPV_common_init_mmx(MpegEncContext *s)
{
- int mm_flags = mm_support();
+ int mm_flags = av_get_cpu_flags();
- if (mm_flags & FF_MM_MMX) {
+ if (mm_flags & AV_CPU_FLAG_MMX) {
const int dct_algo = s->avctx->dct_algo;
s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
@@ -638,7 +639,7 @@ void MPV_common_init_mmx(MpegEncContext *s) s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
- if (mm_flags & FF_MM_SSE2) {
+ if (mm_flags & AV_CPU_FLAG_SSE2) {
s->denoise_dct= denoise_dct_sse2;
} else {
s->denoise_dct= denoise_dct_mmx;
@@ -646,13 +647,13 @@ void MPV_common_init_mmx(MpegEncContext *s) if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
#if HAVE_SSSE3
- if(mm_flags & FF_MM_SSSE3){
+ if(mm_flags & AV_CPU_FLAG_SSSE3){
s->dct_quantize= dct_quantize_SSSE3;
} else
#endif
- if(mm_flags & FF_MM_SSE2){
+ if(mm_flags & AV_CPU_FLAG_SSE2){
s->dct_quantize= dct_quantize_SSE2;
- } else if(mm_flags & FF_MM_MMX2){
+ } else if(mm_flags & AV_CPU_FLAG_MMX2){
s->dct_quantize= dct_quantize_MMX2;
} else {
s->dct_quantize= dct_quantize_MMX;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/simple_idct_mmx.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/simple_idct_mmx.c index e32b8f0b4..8ad0d3192 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/simple_idct_mmx.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/simple_idct_mmx.c @@ -1287,10 +1287,10 @@ void ff_simple_idct_mmx(int16_t *block) void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
idct(block);
- put_pixels_clamped_mmx(block, dest, line_size);
+ ff_put_pixels_clamped_mmx(block, dest, line_size);
}
void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
{
idct(block);
- add_pixels_clamped_mmx(block, dest, line_size);
+ ff_add_pixels_clamped_mmx(block, dest, line_size);
}
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_mmx.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_mmx.c index eb3ad2c32..8889bb36e 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_mmx.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_mmx.c @@ -24,6 +24,7 @@ * OTHER DEALINGS IN THE SOFTWARE. */ +#include "libavutil/cpu.h" #include "libavutil/x86_cpu.h" #include "libavcodec/dsputil.h" #include "dsputil_mmx.h" @@ -714,7 +715,7 @@ static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq) #endif void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) { - int mm_flags = mm_support(); + int mm_flags = av_get_cpu_flags(); dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx; dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx; @@ -736,7 +737,7 @@ void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) { dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx; dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx; - if (mm_flags & FF_MM_MMX2){ + if (mm_flags & AV_CPU_FLAG_MMX2){ dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_vc1_mspel_mc00_mmx2; dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmx2; dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmx2; @@ -772,23 +773,23 @@ void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) { dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_ ## EXT #if HAVE_YASM - if (mm_flags & FF_MM_MMX) { + if (mm_flags & AV_CPU_FLAG_MMX) { ASSIGN_LF(mmx); } return; - if (mm_flags & FF_MM_MMX2) { + if (mm_flags & AV_CPU_FLAG_MMX2) { ASSIGN_LF(mmx2); } - if (mm_flags & FF_MM_SSE2) { + if (mm_flags & AV_CPU_FLAG_SSE2) { dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2; dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse2; dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2; dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2; } - if (mm_flags & FF_MM_SSSE3) { + if (mm_flags & AV_CPU_FLAG_SSSE3) { ASSIGN_LF(ssse3); } - if (mm_flags & FF_MM_SSE4) { + if (mm_flags & AV_CPU_FLAG_SSE4) { dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse4; dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse4; } diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_yasm.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_yasm.asm index 660ff1169..3ea9d8db4 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_yasm.asm +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_yasm.asm @@ -36,7 +36,7 @@ section .text %endmacro %macro STORE_4_WORDS_MMX 6 - movd %6, %5 + movd %6d, %5 %if mmsize==16 psrldq %5, 4 %else @@ -45,7 +45,7 @@ section .text mov %1, %6w shr %6, 16 mov %2, %6w - movd %6, %5 + movd %6d, %5 mov %3, %6w shr %6, 16 mov %4, %6w @@ -88,7 +88,7 @@ section .text pxor m7, m3 ; d_sign ^= a0_sign pxor m5, m5 - movd m3, r2 + movd m3, r2d %if %1 > 4 punpcklbw m3, m3 %endif diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp.asm new file mode 100644 index 000000000..f2b0af326 --- /dev/null +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp.asm @@ -0,0 +1,618 @@ +;****************************************************************************** +;* MMX/SSE2-optimized functions for the VP3 decoder +;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +; MMX-optimized functions cribbed from the original VP3 source code. + +SECTION_RODATA + +vp3_idct_data: times 8 dw 64277 + times 8 dw 60547 + times 8 dw 54491 + times 8 dw 46341 + times 8 dw 36410 + times 8 dw 25080 + times 8 dw 12785 + +cextern pb_1 +cextern pb_3 +cextern pb_7 +cextern pb_1F +cextern pb_81 + +cextern pw_8 + +cextern put_signed_pixels_clamped_mmx +cextern add_pixels_clamped_mmx + +SECTION .text + +; this is off by one or two for some cases when filter_limit is greater than 63 +; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1 +; out: p1 in mm4, p2 in mm3 +%macro VP3_LOOP_FILTER 0 + movq m7, m6 + pand m6, [pb_7] ; p0&7 + psrlw m7, 3 + pand m7, [pb_1F] ; p0>>3 + movq m3, m2 ; p2 + pxor m2, m4 + pand m2, [pb_1] ; (p2^p1)&1 + movq m5, m2 + paddb m2, m2 + paddb m2, m5 ; 3*(p2^p1)&1 + paddb m2, m6 ; extra bits lost in shifts + pcmpeqb m0, m0 + pxor m1, m0 ; 255 - p3 + pavgb m1, m2 ; (256 - p3 + extrabits) >> 1 + pxor m0, m4 ; 255 - p1 + pavgb m0, m3 ; (256 + p2-p1) >> 1 + paddb m1, [pb_3] + pavgb m1, m0 ; 128+2+( p2-p1 - p3) >> 2 + pavgb m1, m0 ; 128+1+(3*(p2-p1) - p3) >> 3 + paddusb m7, m1 ; d+128+1 + movq m6, [pb_81] + psubusb m6, m7 + psubusb m7, [pb_81] + + movq m5, [r2+516] ; flim + pminub m6, m5 + pminub m7, m5 + movq m0, m6 + movq m1, m7 + paddb m6, m6 + paddb m7, m7 + pminub m6, m5 + pminub m7, m5 + psubb m6, m0 + psubb m7, m1 + paddusb m4, m7 + psubusb m4, m6 + psubusb m3, m7 + paddusb m3, m6 +%endmacro + +%macro STORE_4_WORDS 1 + movd r2d, %1 + mov [r0 -1], r2w + psrlq %1, 32 + shr r2, 16 + mov [r0+r1 -1], r2w + movd r2d, %1 + mov [r0+r1*2-1], r2w + shr r2, 16 + mov [r0+r3 -1], r2w +%endmacro + +INIT_MMX +cglobal vp3_v_loop_filter_mmx2, 3, 4 +%ifdef ARCH_X86_64 + movsxd r1, r1d +%endif + mov r3, r1 + neg r1 + movq m6, [r0+r1*2] + movq m4, [r0+r1 ] + movq m2, [r0 ] + movq m1, [r0+r3 ] + + VP3_LOOP_FILTER + + movq [r0+r1], m4 + movq [r0 ], m3 + RET + +cglobal vp3_h_loop_filter_mmx2, 3, 4 +%ifdef ARCH_X86_64 + movsxd r1, r1d +%endif + lea r3, [r1*3] + + movd m6, [r0 -2] + movd m4, [r0+r1 -2] + movd m2, [r0+r1*2-2] + movd m1, [r0+r3 -2] + lea r0, [r0+r1*4 ] + punpcklbw m6, [r0 -2] + punpcklbw m4, [r0+r1 -2] + punpcklbw m2, [r0+r1*2-2] + punpcklbw m1, [r0+r3 -2] + sub r0, r3 + sub r0, r1 + + TRANSPOSE4x4B 6, 4, 2, 1, 0 + VP3_LOOP_FILTER + SBUTTERFLY bw, 4, 3, 5 + + STORE_4_WORDS m4 + lea r0, [r0+r1*4 ] + STORE_4_WORDS m3 + RET + +; from original comments: The Macro does IDct on 4 1-D Dcts +%macro BeginIDCT 0 + movq m2, I(3) + movq m6, C(3) + movq m4, m2 + movq m7, J(5) + pmulhw m4, m6 ; r4 = c3*i3 - i3 + movq m1, C(5) + pmulhw m6, m7 ; r6 = c3*i5 - i5 + movq m5, m1 + pmulhw m1, m2 ; r1 = c5*i3 - i3 + movq m3, I(1) + pmulhw m5, m7 ; r5 = c5*i5 - i5 + movq m0, C(1) + paddw m4, m2 ; r4 = c3*i3 + paddw m6, m7 ; r6 = c3*i5 + paddw m2, m1 ; r2 = c5*i3 + movq m1, J(7) + paddw m7, m5 ; r7 = c5*i5 + movq m5, m0 ; r5 = c1 + pmulhw m0, m3 ; r0 = c1*i1 - i1 + paddsw m4, m7 ; r4 = C = c3*i3 + c5*i5 + pmulhw m5, m1 ; r5 = c1*i7 - i7 + movq m7, C(7) + psubsw m6, m2 ; r6 = D = c3*i5 - c5*i3 + paddw m0, m3 ; r0 = c1*i1 + pmulhw m3, m7 ; r3 = c7*i1 + movq m2, I(2) + pmulhw m7, m1 ; r7 = c7*i7 + paddw m5, m1 ; r5 = c1*i7 + movq m1, m2 ; r1 = i2 + pmulhw m2, C(2) ; r2 = c2*i2 - i2 + psubsw m3, m5 ; r3 = B = c7*i1 - c1*i7 + movq m5, J(6) + paddsw m0, m7 ; r0 = A = c1*i1 + c7*i7 + movq m7, m5 ; r7 = i6 + psubsw m0, m4 ; r0 = A - C + pmulhw m5, C(2) ; r5 = c2*i6 - i6 + paddw m2, m1 ; r2 = c2*i2 + pmulhw m1, C(6) ; r1 = c6*i2 + paddsw m4, m4 ; r4 = C + C + paddsw m4, m0 ; r4 = C. = A + C + psubsw m3, m6 ; r3 = B - D + paddw m5, m7 ; r5 = c2*i6 + paddsw m6, m6 ; r6 = D + D + pmulhw m7, C(6) ; r7 = c6*i6 + paddsw m6, m3 ; r6 = D. = B + D + movq I(1), m4 ; save C. at I(1) + psubsw m1, m5 ; r1 = H = c6*i2 - c2*i6 + movq m4, C(4) + movq m5, m3 ; r5 = B - D + pmulhw m3, m4 ; r3 = (c4 - 1) * (B - D) + paddsw m7, m2 ; r3 = (c4 - 1) * (B - D) + movq I(2), m6 ; save D. at I(2) + movq m2, m0 ; r2 = A - C + movq m6, I(0) + pmulhw m0, m4 ; r0 = (c4 - 1) * (A - C) + paddw m5, m3 ; r5 = B. = c4 * (B - D) + movq m3, J(4) + psubsw m5, m1 ; r5 = B.. = B. - H + paddw m2, m0 ; r0 = A. = c4 * (A - C) + psubsw m6, m3 ; r6 = i0 - i4 + movq m0, m6 + pmulhw m6, m4 ; r6 = (c4 - 1) * (i0 - i4) + paddsw m3, m3 ; r3 = i4 + i4 + paddsw m1, m1 ; r1 = H + H + paddsw m3, m0 ; r3 = i0 + i4 + paddsw m1, m5 ; r1 = H. = B + H + pmulhw m4, m3 ; r4 = (c4 - 1) * (i0 + i4) + paddsw m6, m0 ; r6 = F = c4 * (i0 - i4) + psubsw m6, m2 ; r6 = F. = F - A. + paddsw m2, m2 ; r2 = A. + A. + movq m0, I(1) ; r0 = C. + paddsw m2, m6 ; r2 = A.. = F + A. + paddw m4, m3 ; r4 = E = c4 * (i0 + i4) + psubsw m2, m1 ; r2 = R2 = A.. - H. +%endmacro + +; RowIDCT gets ready to transpose +%macro RowIDCT 0 + BeginIDCT + movq m3, I(2) ; r3 = D. + psubsw m4, m7 ; r4 = E. = E - G + paddsw m1, m1 ; r1 = H. + H. + paddsw m7, m7 ; r7 = G + G + paddsw m1, m2 ; r1 = R1 = A.. + H. + paddsw m7, m4 ; r1 = R1 = A.. + H. + psubsw m4, m3 ; r4 = R4 = E. - D. + paddsw m3, m3 + psubsw m6, m5 ; r6 = R6 = F. - B.. + paddsw m5, m5 + paddsw m3, m4 ; r3 = R3 = E. + D. + paddsw m5, m6 ; r5 = R5 = F. + B.. + psubsw m7, m0 ; r7 = R7 = G. - C. + paddsw m0, m0 + movq I(1), m1 ; save R1 + paddsw m0, m7 ; r0 = R0 = G. + C. +%endmacro + +; Column IDCT normalizes and stores final results +%macro ColumnIDCT 0 + BeginIDCT + paddsw m2, OC_8 ; adjust R2 (and R1) for shift + paddsw m1, m1 ; r1 = H. + H. + paddsw m1, m2 ; r1 = R1 = A.. + H. + psraw m2, 4 ; r2 = NR2 + psubsw m4, m7 ; r4 = E. = E - G + psraw m1, 4 ; r1 = NR2 + movq m3, I(2) ; r3 = D. + paddsw m7, m7 ; r7 = G + G + movq I(2), m2 ; store NR2 at I2 + paddsw m7, m4 ; r7 = G. = E + G + movq I(1), m1 ; store NR1 at I1 + psubsw m4, m3 ; r4 = R4 = E. - D. + paddsw m4, OC_8 ; adjust R4 (and R3) for shift + paddsw m3, m3 ; r3 = D. + D. + paddsw m3, m4 ; r3 = R3 = E. + D. + psraw m4, 4 ; r4 = NR4 + psubsw m6, m5 ; r6 = R6 = F. - B.. + psraw m3, 4 ; r3 = NR3 + paddsw m6, OC_8 ; adjust R6 (and R5) for shift + paddsw m5, m5 ; r5 = B.. + B.. + paddsw m5, m6 ; r5 = R5 = F. + B.. + psraw m6, 4 ; r6 = NR6 + movq J(4), m4 ; store NR4 at J4 + psraw m5, 4 ; r5 = NR5 + movq I(3), m3 ; store NR3 at I3 + psubsw m7, m0 ; r7 = R7 = G. - C. + paddsw m7, OC_8 ; adjust R7 (and R0) for shift + paddsw m0, m0 ; r0 = C. + C. + paddsw m0, m7 ; r0 = R0 = G. + C. + psraw m7, 4 ; r7 = NR7 + movq J(6), m6 ; store NR6 at J6 + psraw m0, 4 ; r0 = NR0 + movq J(5), m5 ; store NR5 at J5 + movq J(7), m7 ; store NR7 at J7 + movq I(0), m0 ; store NR0 at I0 +%endmacro + +; Following macro does two 4x4 transposes in place. +; +; At entry (we assume): +; +; r0 = a3 a2 a1 a0 +; I(1) = b3 b2 b1 b0 +; r2 = c3 c2 c1 c0 +; r3 = d3 d2 d1 d0 +; +; r4 = e3 e2 e1 e0 +; r5 = f3 f2 f1 f0 +; r6 = g3 g2 g1 g0 +; r7 = h3 h2 h1 h0 +; +; At exit, we have: +; +; I(0) = d0 c0 b0 a0 +; I(1) = d1 c1 b1 a1 +; I(2) = d2 c2 b2 a2 +; I(3) = d3 c3 b3 a3 +; +; J(4) = h0 g0 f0 e0 +; J(5) = h1 g1 f1 e1 +; J(6) = h2 g2 f2 e2 +; J(7) = h3 g3 f3 e3 +; +; I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. +; J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. +; +; Since r1 is free at entry, we calculate the Js first. +%macro Transpose 0 + movq m1, m4 ; r1 = e3 e2 e1 e0 + punpcklwd m4, m5 ; r4 = f1 e1 f0 e0 + movq I(0), m0 ; save a3 a2 a1 a0 + punpckhwd m1, m5 ; r1 = f3 e3 f2 e2 + movq m0, m6 ; r0 = g3 g2 g1 g0 + punpcklwd m6, m7 ; r6 = h1 g1 h0 g0 + movq m5, m4 ; r5 = f1 e1 f0 e0 + punpckldq m4, m6 ; r4 = h0 g0 f0 e0 = R4 + punpckhdq m5, m6 ; r5 = h1 g1 f1 e1 = R5 + movq m6, m1 ; r6 = f3 e3 f2 e2 + movq J(4), m4 + punpckhwd m0, m7 ; r0 = h3 g3 h2 g2 + movq J(5), m5 + punpckhdq m6, m0 ; r6 = h3 g3 f3 e3 = R7 + movq m4, I(0) ; r4 = a3 a2 a1 a0 + punpckldq m1, m0 ; r1 = h2 g2 f2 e2 = R6 + movq m5, I(1) ; r5 = b3 b2 b1 b0 + movq m0, m4 ; r0 = a3 a2 a1 a0 + movq J(7), m6 + punpcklwd m0, m5 ; r0 = b1 a1 b0 a0 + movq J(6), m1 + punpckhwd m4, m5 ; r4 = b3 a3 b2 a2 + movq m5, m2 ; r5 = c3 c2 c1 c0 + punpcklwd m2, m3 ; r2 = d1 c1 d0 c0 + movq m1, m0 ; r1 = b1 a1 b0 a0 + punpckldq m0, m2 ; r0 = d0 c0 b0 a0 = R0 + punpckhdq m1, m2 ; r1 = d1 c1 b1 a1 = R1 + movq m2, m4 ; r2 = b3 a3 b2 a2 + movq I(0), m0 + punpckhwd m5, m3 ; r5 = d3 c3 d2 c2 + movq I(1), m1 + punpckhdq m4, m5 ; r4 = d3 c3 b3 a3 = R3 + punpckldq m2, m5 ; r2 = d2 c2 b2 a2 = R2 + movq I(3), m4 + movq I(2), m2 +%endmacro + +%macro VP3_IDCT_mmx 1 + ; eax = quantized input + ; ebx = dequantizer matrix + ; ecx = IDCT constants + ; M(I) = ecx + MaskOffset(0) + I * 8 + ; C(I) = ecx + CosineOffset(32) + (I-1) * 8 + ; edx = output + ; r0..r7 = mm0..mm7 +%define OC_8 [pw_8] +%define C(x) [vp3_idct_data+16*(x-1)] + + ; at this point, function has completed dequantization + dezigzag + + ; partial transposition; now do the idct itself +%define I(x) [%1+16* x ] +%define J(x) [%1+16*(x-4)+8] + RowIDCT + Transpose + +%define I(x) [%1+16* x +64] +%define J(x) [%1+16*(x-4)+72] + RowIDCT + Transpose + +%define I(x) [%1+16*x] +%define J(x) [%1+16*x] + ColumnIDCT + +%define I(x) [%1+16*x+8] +%define J(x) [%1+16*x+8] + ColumnIDCT +%endmacro + +%macro VP3_1D_IDCT_SSE2 0 + movdqa m2, I(3) ; xmm2 = i3 + movdqa m6, C(3) ; xmm6 = c3 + movdqa m4, m2 ; xmm4 = i3 + movdqa m7, I(5) ; xmm7 = i5 + pmulhw m4, m6 ; xmm4 = c3 * i3 - i3 + movdqa m1, C(5) ; xmm1 = c5 + pmulhw m6, m7 ; xmm6 = c3 * i5 - i5 + movdqa m5, m1 ; xmm5 = c5 + pmulhw m1, m2 ; xmm1 = c5 * i3 - i3 + movdqa m3, I(1) ; xmm3 = i1 + pmulhw m5, m7 ; xmm5 = c5 * i5 - i5 + movdqa m0, C(1) ; xmm0 = c1 + paddw m4, m2 ; xmm4 = c3 * i3 + paddw m6, m7 ; xmm6 = c3 * i5 + paddw m2, m1 ; xmm2 = c5 * i3 + movdqa m1, I(7) ; xmm1 = i7 + paddw m7, m5 ; xmm7 = c5 * i5 + movdqa m5, m0 ; xmm5 = c1 + pmulhw m0, m3 ; xmm0 = c1 * i1 - i1 + paddsw m4, m7 ; xmm4 = c3 * i3 + c5 * i5 = C + pmulhw m5, m1 ; xmm5 = c1 * i7 - i7 + movdqa m7, C(7) ; xmm7 = c7 + psubsw m6, m2 ; xmm6 = c3 * i5 - c5 * i3 = D + paddw m0, m3 ; xmm0 = c1 * i1 + pmulhw m3, m7 ; xmm3 = c7 * i1 + movdqa m2, I(2) ; xmm2 = i2 + pmulhw m7, m1 ; xmm7 = c7 * i7 + paddw m5, m1 ; xmm5 = c1 * i7 + movdqa m1, m2 ; xmm1 = i2 + pmulhw m2, C(2) ; xmm2 = i2 * c2 -i2 + psubsw m3, m5 ; xmm3 = c7 * i1 - c1 * i7 = B + movdqa m5, I(6) ; xmm5 = i6 + paddsw m0, m7 ; xmm0 = c1 * i1 + c7 * i7 = A + movdqa m7, m5 ; xmm7 = i6 + psubsw m0, m4 ; xmm0 = A - C + pmulhw m5, C(2) ; xmm5 = c2 * i6 - i6 + paddw m2, m1 ; xmm2 = i2 * c2 + pmulhw m1, C(6) ; xmm1 = c6 * i2 + paddsw m4, m4 ; xmm4 = C + C + paddsw m4, m0 ; xmm4 = A + C = C. + psubsw m3, m6 ; xmm3 = B - D + paddw m5, m7 ; xmm5 = c2 * i6 + paddsw m6, m6 ; xmm6 = D + D + pmulhw m7, C(6) ; xmm7 = c6 * i6 + paddsw m6, m3 ; xmm6 = B + D = D. + movdqa I(1), m4 ; Save C. at I(1) + psubsw m1, m5 ; xmm1 = c6 * i2 - c2 * i6 = H + movdqa m4, C(4) ; xmm4 = C4 + movdqa m5, m3 ; xmm5 = B - D + pmulhw m3, m4 ; xmm3 = ( c4 -1 ) * ( B - D ) + paddsw m7, m2 ; xmm7 = c2 * i2 + c6 * i6 = G + movdqa I(2), m6 ; save D. at I(2) + movdqa m2, m0 ; xmm2 = A - C + movdqa m6, I(0) ; xmm6 = i0 + pmulhw m0, m4 ; xmm0 = ( c4 - 1 ) * ( A - C ) = A. + paddw m5, m3 ; xmm5 = c4 * ( B - D ) = B. + movdqa m3, I(4) ; xmm3 = i4 + psubsw m5, m1 ; xmm5 = B. - H = B.. + paddw m2, m0 ; xmm2 = c4 * ( A - C) = A. + psubsw m6, m3 ; xmm6 = i0 - i4 + movdqa m0, m6 ; xmm0 = i0 - i4 + pmulhw m6, m4 ; xmm6 = (c4 - 1) * (i0 - i4) = F + paddsw m3, m3 ; xmm3 = i4 + i4 + paddsw m1, m1 ; xmm1 = H + H + paddsw m3, m0 ; xmm3 = i0 + i4 + paddsw m1, m5 ; xmm1 = B. + H = H. + pmulhw m4, m3 ; xmm4 = ( c4 - 1 ) * ( i0 + i4 ) + paddw m6, m0 ; xmm6 = c4 * ( i0 - i4 ) + psubsw m6, m2 ; xmm6 = F - A. = F. + paddsw m2, m2 ; xmm2 = A. + A. + movdqa m0, I(1) ; Load C. from I(1) + paddsw m2, m6 ; xmm2 = F + A. = A.. + paddw m4, m3 ; xmm4 = c4 * ( i0 + i4 ) = 3 + psubsw m2, m1 ; xmm2 = A.. - H. = R2 + ADD(m2) ; Adjust R2 and R1 before shifting + paddsw m1, m1 ; xmm1 = H. + H. + paddsw m1, m2 ; xmm1 = A.. + H. = R1 + SHIFT(m2) ; xmm2 = op2 + psubsw m4, m7 ; xmm4 = E - G = E. + SHIFT(m1) ; xmm1 = op1 + movdqa m3, I(2) ; Load D. from I(2) + paddsw m7, m7 ; xmm7 = G + G + paddsw m7, m4 ; xmm7 = E + G = G. + psubsw m4, m3 ; xmm4 = E. - D. = R4 + ADD(m4) ; Adjust R4 and R3 before shifting + paddsw m3, m3 ; xmm3 = D. + D. + paddsw m3, m4 ; xmm3 = E. + D. = R3 + SHIFT(m4) ; xmm4 = op4 + psubsw m6, m5 ; xmm6 = F. - B..= R6 + SHIFT(m3) ; xmm3 = op3 + ADD(m6) ; Adjust R6 and R5 before shifting + paddsw m5, m5 ; xmm5 = B.. + B.. + paddsw m5, m6 ; xmm5 = F. + B.. = R5 + SHIFT(m6) ; xmm6 = op6 + SHIFT(m5) ; xmm5 = op5 + psubsw m7, m0 ; xmm7 = G. - C. = R7 + ADD(m7) ; Adjust R7 and R0 before shifting + paddsw m0, m0 ; xmm0 = C. + C. + paddsw m0, m7 ; xmm0 = G. + C. + SHIFT(m7) ; xmm7 = op7 + SHIFT(m0) ; xmm0 = op0 +%endmacro + +%macro PUT_BLOCK 8 + movdqa O(0), m%1 + movdqa O(1), m%2 + movdqa O(2), m%3 + movdqa O(3), m%4 + movdqa O(4), m%5 + movdqa O(5), m%6 + movdqa O(6), m%7 + movdqa O(7), m%8 +%endmacro + +%macro VP3_IDCT_sse2 1 +%define I(x) [%1+16*x] +%define O(x) [%1+16*x] +%define C(x) [vp3_idct_data+16*(x-1)] +%define SHIFT(x) +%define ADD(x) + VP3_1D_IDCT_SSE2 +%ifdef ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16] +%endif + PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7 + +%define SHIFT(x) psraw x, 4 +%define ADD(x) paddsw x, [pw_8] + VP3_1D_IDCT_SSE2 + PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7 +%endmacro + +%macro vp3_idct_funcs 3 +cglobal vp3_idct_%1, 1, 1, %2 + VP3_IDCT_%1 r0 + RET + +cglobal vp3_idct_put_%1, 3, %3, %2 + VP3_IDCT_%1 r2 +%ifdef ARCH_X86_64 + mov r3, r2 + mov r2, r1 + mov r1, r0 + mov r0, r3 +%else + mov r0m, r2 + mov r1m, r0 + mov r2m, r1 +%endif +%ifdef WIN64 + call put_signed_pixels_clamped_mmx + RET +%else + jmp put_signed_pixels_clamped_mmx +%endif + +cglobal vp3_idct_add_%1, 3, %3, %2 + VP3_IDCT_%1 r2 +%ifdef ARCH_X86_64 + mov r3, r2 + mov r2, r1 + mov r1, r0 + mov r0, r3 +%else + mov r0m, r2 + mov r1m, r0 + mov r2m, r1 +%endif +%ifdef WIN64 + call add_pixels_clamped_mmx + RET +%else + jmp add_pixels_clamped_mmx +%endif +%endmacro + +%ifdef ARCH_X86_64 +%define REGS 4 +%else +%define REGS 3 +%endif +INIT_MMX +vp3_idct_funcs mmx, 0, REGS +INIT_XMM +vp3_idct_funcs sse2, 9, REGS +%undef REGS + +%macro DC_ADD 0 + movq m2, [r0 ] + movq m3, [r0+r1 ] + paddusb m2, m0 + movq m4, [r0+r1*2] + paddusb m3, m0 + movq m5, [r0+r3 ] + paddusb m4, m0 + paddusb m5, m0 + psubusb m2, m1 + psubusb m3, m1 + movq [r0 ], m2 + psubusb m4, m1 + movq [r0+r1 ], m3 + psubusb m5, m1 + movq [r0+r1*2], m4 + movq [r0+r3 ], m5 +%endmacro + +INIT_MMX +cglobal vp3_idct_dc_add_mmx2, 3, 4 +%ifdef ARCH_X86_64 + movsxd r1, r1d +%endif + lea r3, [r1*3] + movsx r2, word [r2] + add r2, 15 + sar r2, 5 + movd m0, r2d + pshufw m0, m0, 0x0 + pxor m1, m1 + psubw m1, m0 + packuswb m0, m0 + packuswb m1, m1 + DC_ADD + lea r0, [r0+r1*4] + DC_ADD + RET diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_mmx.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_mmx.c deleted file mode 100644 index 92985921e..000000000 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_mmx.c +++ /dev/null @@ -1,436 +0,0 @@ -/*
- * Copyright (C) 2004 the ffmpeg project
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * MMX-optimized functions cribbed from the original VP3 source code.
- */
-
-#include "libavutil/x86_cpu.h"
-#include "libavcodec/dsputil.h"
-#include "dsputil_mmx.h"
-#include "vp3dsp_mmx.h"
-
-extern const uint16_t ff_vp3_idct_data[];
-
-// this is off by one or two for some cases when filter_limit is greater than 63
-// in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
-// out: p1 in mm4, p2 in mm3
-#define VP3_LOOP_FILTER(flim) \
- "movq %%mm6, %%mm7 \n\t" \
- "pand "MANGLE(ff_pb_7 )", %%mm6 \n\t" /* p0&7 */ \
- "psrlw $3, %%mm7 \n\t" \
- "pand "MANGLE(ff_pb_1F)", %%mm7 \n\t" /* p0>>3 */ \
- "movq %%mm2, %%mm3 \n\t" /* mm3 = p2 */ \
- "pxor %%mm4, %%mm2 \n\t" \
- "pand "MANGLE(ff_pb_1 )", %%mm2 \n\t" /* (p2^p1)&1 */ \
- "movq %%mm2, %%mm5 \n\t" \
- "paddb %%mm2, %%mm2 \n\t" \
- "paddb %%mm5, %%mm2 \n\t" /* 3*(p2^p1)&1 */ \
- "paddb %%mm6, %%mm2 \n\t" /* extra bits lost in shifts */ \
- "pcmpeqb %%mm0, %%mm0 \n\t" \
- "pxor %%mm0, %%mm1 \n\t" /* 255 - p3 */ \
- "pavgb %%mm2, %%mm1 \n\t" /* (256 - p3 + extrabits) >> 1 */ \
- "pxor %%mm4, %%mm0 \n\t" /* 255 - p1 */ \
- "pavgb %%mm3, %%mm0 \n\t" /* (256 + p2-p1) >> 1 */ \
- "paddb "MANGLE(ff_pb_3 )", %%mm1 \n\t" \
- "pavgb %%mm0, %%mm1 \n\t" /* 128+2+( p2-p1 - p3) >> 2 */ \
- "pavgb %%mm0, %%mm1 \n\t" /* 128+1+(3*(p2-p1) - p3) >> 3 */ \
- "paddusb %%mm1, %%mm7 \n\t" /* d+128+1 */ \
- "movq "MANGLE(ff_pb_81)", %%mm6 \n\t" \
- "psubusb %%mm7, %%mm6 \n\t" \
- "psubusb "MANGLE(ff_pb_81)", %%mm7 \n\t" \
-\
- "movq "#flim", %%mm5 \n\t" \
- "pminub %%mm5, %%mm6 \n\t" \
- "pminub %%mm5, %%mm7 \n\t" \
- "movq %%mm6, %%mm0 \n\t" \
- "movq %%mm7, %%mm1 \n\t" \
- "paddb %%mm6, %%mm6 \n\t" \
- "paddb %%mm7, %%mm7 \n\t" \
- "pminub %%mm5, %%mm6 \n\t" \
- "pminub %%mm5, %%mm7 \n\t" \
- "psubb %%mm0, %%mm6 \n\t" \
- "psubb %%mm1, %%mm7 \n\t" \
- "paddusb %%mm7, %%mm4 \n\t" \
- "psubusb %%mm6, %%mm4 \n\t" \
- "psubusb %%mm7, %%mm3 \n\t" \
- "paddusb %%mm6, %%mm3 \n\t"
-
-#define STORE_4_WORDS(dst0, dst1, dst2, dst3, mm) \
- "movd "#mm", %0 \n\t" \
- "movw %w0, -1"#dst0" \n\t" \
- "psrlq $32, "#mm" \n\t" \
- "shr $16, %0 \n\t" \
- "movw %w0, -1"#dst1" \n\t" \
- "movd "#mm", %0 \n\t" \
- "movw %w0, -1"#dst2" \n\t" \
- "shr $16, %0 \n\t" \
- "movw %w0, -1"#dst3" \n\t"
-
-void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values)
-{
- __asm__ volatile(
- "movq %0, %%mm6 \n\t"
- "movq %1, %%mm4 \n\t"
- "movq %2, %%mm2 \n\t"
- "movq %3, %%mm1 \n\t"
-
- VP3_LOOP_FILTER(%4)
-
- "movq %%mm4, %1 \n\t"
- "movq %%mm3, %2 \n\t"
-
- : "+m" (*(uint64_t*)(src - 2*stride)),
- "+m" (*(uint64_t*)(src - 1*stride)),
- "+m" (*(uint64_t*)(src + 0*stride)),
- "+m" (*(uint64_t*)(src + 1*stride))
- : "m"(*(uint64_t*)(bounding_values+129))
- );
-}
-
-void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values)
-{
- x86_reg tmp;
-
- __asm__ volatile(
- "movd -2(%1), %%mm6 \n\t"
- "movd -2(%1,%3), %%mm0 \n\t"
- "movd -2(%1,%3,2), %%mm1 \n\t"
- "movd -2(%1,%4), %%mm4 \n\t"
-
- TRANSPOSE8x4(%%mm6, %%mm0, %%mm1, %%mm4, -2(%2), -2(%2,%3), -2(%2,%3,2), -2(%2,%4), %%mm2)
- VP3_LOOP_FILTER(%5)
- SBUTTERFLY(%%mm4, %%mm3, %%mm5, bw, q)
-
- STORE_4_WORDS((%1), (%1,%3), (%1,%3,2), (%1,%4), %%mm4)
- STORE_4_WORDS((%2), (%2,%3), (%2,%3,2), (%2,%4), %%mm5)
-
- : "=&r"(tmp)
- : "r"(src), "r"(src+4*stride), "r"((x86_reg)stride), "r"((x86_reg)3*stride),
- "m"(*(uint64_t*)(bounding_values+129))
- : "memory"
- );
-}
-
-/* from original comments: The Macro does IDct on 4 1-D Dcts */
-#define BeginIDCT() \
- "movq "I(3)", %%mm2 \n\t" \
- "movq "C(3)", %%mm6 \n\t" \
- "movq %%mm2, %%mm4 \n\t" \
- "movq "J(5)", %%mm7 \n\t" \
- "pmulhw %%mm6, %%mm4 \n\t" /* r4 = c3*i3 - i3 */ \
- "movq "C(5)", %%mm1 \n\t" \
- "pmulhw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 - i5 */ \
- "movq %%mm1, %%mm5 \n\t" \
- "pmulhw %%mm2, %%mm1 \n\t" /* r1 = c5*i3 - i3 */ \
- "movq "I(1)", %%mm3 \n\t" \
- "pmulhw %%mm7, %%mm5 \n\t" /* r5 = c5*i5 - i5 */ \
- "movq "C(1)", %%mm0 \n\t" \
- "paddw %%mm2, %%mm4 \n\t" /* r4 = c3*i3 */ \
- "paddw %%mm7, %%mm6 \n\t" /* r6 = c3*i5 */ \
- "paddw %%mm1, %%mm2 \n\t" /* r2 = c5*i3 */ \
- "movq "J(7)", %%mm1 \n\t" \
- "paddw %%mm5, %%mm7 \n\t" /* r7 = c5*i5 */ \
- "movq %%mm0, %%mm5 \n\t" /* r5 = c1 */ \
- "pmulhw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 - i1 */ \
- "paddsw %%mm7, %%mm4 \n\t" /* r4 = C = c3*i3 + c5*i5 */ \
- "pmulhw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 - i7 */ \
- "movq "C(7)", %%mm7 \n\t" \
- "psubsw %%mm2, %%mm6 \n\t" /* r6 = D = c3*i5 - c5*i3 */ \
- "paddw %%mm3, %%mm0 \n\t" /* r0 = c1*i1 */ \
- "pmulhw %%mm7, %%mm3 \n\t" /* r3 = c7*i1 */ \
- "movq "I(2)", %%mm2 \n\t" \
- "pmulhw %%mm1, %%mm7 \n\t" /* r7 = c7*i7 */ \
- "paddw %%mm1, %%mm5 \n\t" /* r5 = c1*i7 */ \
- "movq %%mm2, %%mm1 \n\t" /* r1 = i2 */ \
- "pmulhw "C(2)", %%mm2 \n\t" /* r2 = c2*i2 - i2 */ \
- "psubsw %%mm5, %%mm3 \n\t" /* r3 = B = c7*i1 - c1*i7 */ \
- "movq "J(6)", %%mm5 \n\t" \
- "paddsw %%mm7, %%mm0 \n\t" /* r0 = A = c1*i1 + c7*i7 */ \
- "movq %%mm5, %%mm7 \n\t" /* r7 = i6 */ \
- "psubsw %%mm4, %%mm0 \n\t" /* r0 = A - C */ \
- "pmulhw "C(2)", %%mm5 \n\t" /* r5 = c2*i6 - i6 */ \
- "paddw %%mm1, %%mm2 \n\t" /* r2 = c2*i2 */ \
- "pmulhw "C(6)", %%mm1 \n\t" /* r1 = c6*i2 */ \
- "paddsw %%mm4, %%mm4 \n\t" /* r4 = C + C */ \
- "paddsw %%mm0, %%mm4 \n\t" /* r4 = C. = A + C */ \
- "psubsw %%mm6, %%mm3 \n\t" /* r3 = B - D */ \
- "paddw %%mm7, %%mm5 \n\t" /* r5 = c2*i6 */ \
- "paddsw %%mm6, %%mm6 \n\t" /* r6 = D + D */ \
- "pmulhw "C(6)", %%mm7 \n\t" /* r7 = c6*i6 */ \
- "paddsw %%mm3, %%mm6 \n\t" /* r6 = D. = B + D */ \
- "movq %%mm4, "I(1)"\n\t" /* save C. at I(1) */ \
- "psubsw %%mm5, %%mm1 \n\t" /* r1 = H = c6*i2 - c2*i6 */ \
- "movq "C(4)", %%mm4 \n\t" \
- "movq %%mm3, %%mm5 \n\t" /* r5 = B - D */ \
- "pmulhw %%mm4, %%mm3 \n\t" /* r3 = (c4 - 1) * (B - D) */ \
- "paddsw %%mm2, %%mm7 \n\t" /* r3 = (c4 - 1) * (B - D) */ \
- "movq %%mm6, "I(2)"\n\t" /* save D. at I(2) */ \
- "movq %%mm0, %%mm2 \n\t" /* r2 = A - C */ \
- "movq "I(0)", %%mm6 \n\t" \
- "pmulhw %%mm4, %%mm0 \n\t" /* r0 = (c4 - 1) * (A - C) */ \
- "paddw %%mm3, %%mm5 \n\t" /* r5 = B. = c4 * (B - D) */ \
- "movq "J(4)", %%mm3 \n\t" \
- "psubsw %%mm1, %%mm5 \n\t" /* r5 = B.. = B. - H */ \
- "paddw %%mm0, %%mm2 \n\t" /* r0 = A. = c4 * (A - C) */ \
- "psubsw %%mm3, %%mm6 \n\t" /* r6 = i0 - i4 */ \
- "movq %%mm6, %%mm0 \n\t" \
- "pmulhw %%mm4, %%mm6 \n\t" /* r6 = (c4 - 1) * (i0 - i4) */ \
- "paddsw %%mm3, %%mm3 \n\t" /* r3 = i4 + i4 */ \
- "paddsw %%mm1, %%mm1 \n\t" /* r1 = H + H */ \
- "paddsw %%mm0, %%mm3 \n\t" /* r3 = i0 + i4 */ \
- "paddsw %%mm5, %%mm1 \n\t" /* r1 = H. = B + H */ \
- "pmulhw %%mm3, %%mm4 \n\t" /* r4 = (c4 - 1) * (i0 + i4) */ \
- "paddsw %%mm0, %%mm6 \n\t" /* r6 = F = c4 * (i0 - i4) */ \
- "psubsw %%mm2, %%mm6 \n\t" /* r6 = F. = F - A. */ \
- "paddsw %%mm2, %%mm2 \n\t" /* r2 = A. + A. */ \
- "movq "I(1)", %%mm0 \n\t" /* r0 = C. */ \
- "paddsw %%mm6, %%mm2 \n\t" /* r2 = A.. = F + A. */ \
- "paddw %%mm3, %%mm4 \n\t" /* r4 = E = c4 * (i0 + i4) */ \
- "psubsw %%mm1, %%mm2 \n\t" /* r2 = R2 = A.. - H. */
-
-/* RowIDCT gets ready to transpose */
-#define RowIDCT() \
- BeginIDCT() \
- "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \
- "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \
- "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \
- "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \
- "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \
- "paddsw %%mm4, %%mm7 \n\t" /* r1 = R1 = A.. + H. */ \
- "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \
- "paddsw %%mm3, %%mm3 \n\t" \
- "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \
- "paddsw %%mm5, %%mm5 \n\t" \
- "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \
- "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \
- "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \
- "paddsw %%mm0, %%mm0 \n\t" \
- "movq %%mm1, "I(1)"\n\t" /* save R1 */ \
- "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */
-
-/* Column IDCT normalizes and stores final results */
-#define ColumnIDCT() \
- BeginIDCT() \
- "paddsw "OC_8", %%mm2 \n\t" /* adjust R2 (and R1) for shift */ \
- "paddsw %%mm1, %%mm1 \n\t" /* r1 = H. + H. */ \
- "paddsw %%mm2, %%mm1 \n\t" /* r1 = R1 = A.. + H. */ \
- "psraw $4, %%mm2 \n\t" /* r2 = NR2 */ \
- "psubsw %%mm7, %%mm4 \n\t" /* r4 = E. = E - G */ \
- "psraw $4, %%mm1 \n\t" /* r1 = NR1 */ \
- "movq "I(2)", %%mm3 \n\t" /* r3 = D. */ \
- "paddsw %%mm7, %%mm7 \n\t" /* r7 = G + G */ \
- "movq %%mm2, "I(2)"\n\t" /* store NR2 at I2 */ \
- "paddsw %%mm4, %%mm7 \n\t" /* r7 = G. = E + G */ \
- "movq %%mm1, "I(1)"\n\t" /* store NR1 at I1 */ \
- "psubsw %%mm3, %%mm4 \n\t" /* r4 = R4 = E. - D. */ \
- "paddsw "OC_8", %%mm4 \n\t" /* adjust R4 (and R3) for shift */ \
- "paddsw %%mm3, %%mm3 \n\t" /* r3 = D. + D. */ \
- "paddsw %%mm4, %%mm3 \n\t" /* r3 = R3 = E. + D. */ \
- "psraw $4, %%mm4 \n\t" /* r4 = NR4 */ \
- "psubsw %%mm5, %%mm6 \n\t" /* r6 = R6 = F. - B.. */ \
- "psraw $4, %%mm3 \n\t" /* r3 = NR3 */ \
- "paddsw "OC_8", %%mm6 \n\t" /* adjust R6 (and R5) for shift */ \
- "paddsw %%mm5, %%mm5 \n\t" /* r5 = B.. + B.. */ \
- "paddsw %%mm6, %%mm5 \n\t" /* r5 = R5 = F. + B.. */ \
- "psraw $4, %%mm6 \n\t" /* r6 = NR6 */ \
- "movq %%mm4, "J(4)"\n\t" /* store NR4 at J4 */ \
- "psraw $4, %%mm5 \n\t" /* r5 = NR5 */ \
- "movq %%mm3, "I(3)"\n\t" /* store NR3 at I3 */ \
- "psubsw %%mm0, %%mm7 \n\t" /* r7 = R7 = G. - C. */ \
- "paddsw "OC_8", %%mm7 \n\t" /* adjust R7 (and R0) for shift */ \
- "paddsw %%mm0, %%mm0 \n\t" /* r0 = C. + C. */ \
- "paddsw %%mm7, %%mm0 \n\t" /* r0 = R0 = G. + C. */ \
- "psraw $4, %%mm7 \n\t" /* r7 = NR7 */ \
- "movq %%mm6, "J(6)"\n\t" /* store NR6 at J6 */ \
- "psraw $4, %%mm0 \n\t" /* r0 = NR0 */ \
- "movq %%mm5, "J(5)"\n\t" /* store NR5 at J5 */ \
- "movq %%mm7, "J(7)"\n\t" /* store NR7 at J7 */ \
- "movq %%mm0, "I(0)"\n\t" /* store NR0 at I0 */
-
-/* Following macro does two 4x4 transposes in place.
-
- At entry (we assume):
-
- r0 = a3 a2 a1 a0
- I(1) = b3 b2 b1 b0
- r2 = c3 c2 c1 c0
- r3 = d3 d2 d1 d0
-
- r4 = e3 e2 e1 e0
- r5 = f3 f2 f1 f0
- r6 = g3 g2 g1 g0
- r7 = h3 h2 h1 h0
-
- At exit, we have:
-
- I(0) = d0 c0 b0 a0
- I(1) = d1 c1 b1 a1
- I(2) = d2 c2 b2 a2
- I(3) = d3 c3 b3 a3
-
- J(4) = h0 g0 f0 e0
- J(5) = h1 g1 f1 e1
- J(6) = h2 g2 f2 e2
- J(7) = h3 g3 f3 e3
-
- I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
- J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
-
- Since r1 is free at entry, we calculate the Js first. */
-#define Transpose() \
- "movq %%mm4, %%mm1 \n\t" /* r1 = e3 e2 e1 e0 */ \
- "punpcklwd %%mm5, %%mm4 \n\t" /* r4 = f1 e1 f0 e0 */ \
- "movq %%mm0, "I(0)"\n\t" /* save a3 a2 a1 a0 */ \
- "punpckhwd %%mm5, %%mm1 \n\t" /* r1 = f3 e3 f2 e2 */ \
- "movq %%mm6, %%mm0 \n\t" /* r0 = g3 g2 g1 g0 */ \
- "punpcklwd %%mm7, %%mm6 \n\t" /* r6 = h1 g1 h0 g0 */ \
- "movq %%mm4, %%mm5 \n\t" /* r5 = f1 e1 f0 e0 */ \
- "punpckldq %%mm6, %%mm4 \n\t" /* r4 = h0 g0 f0 e0 = R4 */ \
- "punpckhdq %%mm6, %%mm5 \n\t" /* r5 = h1 g1 f1 e1 = R5 */ \
- "movq %%mm1, %%mm6 \n\t" /* r6 = f3 e3 f2 e2 */ \
- "movq %%mm4, "J(4)"\n\t" \
- "punpckhwd %%mm7, %%mm0 \n\t" /* r0 = h3 g3 h2 g2 */ \
- "movq %%mm5, "J(5)"\n\t" \
- "punpckhdq %%mm0, %%mm6 \n\t" /* r6 = h3 g3 f3 e3 = R7 */ \
- "movq "I(0)", %%mm4 \n\t" /* r4 = a3 a2 a1 a0 */ \
- "punpckldq %%mm0, %%mm1 \n\t" /* r1 = h2 g2 f2 e2 = R6 */ \
- "movq "I(1)", %%mm5 \n\t" /* r5 = b3 b2 b1 b0 */ \
- "movq %%mm4, %%mm0 \n\t" /* r0 = a3 a2 a1 a0 */ \
- "movq %%mm6, "J(7)"\n\t" \
- "punpcklwd %%mm5, %%mm0 \n\t" /* r0 = b1 a1 b0 a0 */ \
- "movq %%mm1, "J(6)"\n\t" \
- "punpckhwd %%mm5, %%mm4 \n\t" /* r4 = b3 a3 b2 a2 */ \
- "movq %%mm2, %%mm5 \n\t" /* r5 = c3 c2 c1 c0 */ \
- "punpcklwd %%mm3, %%mm2 \n\t" /* r2 = d1 c1 d0 c0 */ \
- "movq %%mm0, %%mm1 \n\t" /* r1 = b1 a1 b0 a0 */ \
- "punpckldq %%mm2, %%mm0 \n\t" /* r0 = d0 c0 b0 a0 = R0 */ \
- "punpckhdq %%mm2, %%mm1 \n\t" /* r1 = d1 c1 b1 a1 = R1 */ \
- "movq %%mm4, %%mm2 \n\t" /* r2 = b3 a3 b2 a2 */ \
- "movq %%mm0, "I(0)"\n\t" \
- "punpckhwd %%mm3, %%mm5 \n\t" /* r5 = d3 c3 d2 c2 */ \
- "movq %%mm1, "I(1)"\n\t" \
- "punpckhdq %%mm5, %%mm4 \n\t" /* r4 = d3 c3 b3 a3 = R3 */ \
- "punpckldq %%mm5, %%mm2 \n\t" /* r2 = d2 c2 b2 a2 = R2 */ \
- "movq %%mm4, "I(3)"\n\t" \
- "movq %%mm2, "I(2)"\n\t"
-
-void ff_vp3_idct_mmx(int16_t *output_data)
-{
- /* eax = quantized input
- * ebx = dequantizer matrix
- * ecx = IDCT constants
- * M(I) = ecx + MaskOffset(0) + I * 8
- * C(I) = ecx + CosineOffset(32) + (I-1) * 8
- * edx = output
- * r0..r7 = mm0..mm7
- */
-
-#define C(x) AV_STRINGIFY(16*(x-1))"(%1)"
-#define OC_8 "%2"
-
- /* at this point, function has completed dequantization + dezigzag +
- * partial transposition; now do the idct itself */
-#define I(x) AV_STRINGIFY(16* x )"(%0)"
-#define J(x) AV_STRINGIFY(16*(x-4) + 8)"(%0)"
-
- __asm__ volatile (
- RowIDCT()
- Transpose()
-
-#undef I
-#undef J
-#define I(x) AV_STRINGIFY(16* x + 64)"(%0)"
-#define J(x) AV_STRINGIFY(16*(x-4) + 72)"(%0)"
-
- RowIDCT()
- Transpose()
-
-#undef I
-#undef J
-#define I(x) AV_STRINGIFY(16*x)"(%0)"
-#define J(x) AV_STRINGIFY(16*x)"(%0)"
-
- ColumnIDCT()
-
-#undef I
-#undef J
-#define I(x) AV_STRINGIFY(16*x + 8)"(%0)"
-#define J(x) AV_STRINGIFY(16*x + 8)"(%0)"
-
- ColumnIDCT()
- :: "r"(output_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8)
- );
-#undef I
-#undef J
-
-}
-
-void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
-{
- ff_vp3_idct_mmx(block);
- put_signed_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
-{
- ff_vp3_idct_mmx(block);
- add_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int linesize, const DCTELEM *block)
-{
- int dc = (block[0] + 15) >> 5;
-
- __asm__ volatile(
- "movd %3, %%mm0 \n\t"
- "pshufw $0, %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "psubw %%mm0, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
-
-#define DC_ADD \
- "movq (%0), %%mm2 \n\t" \
- "movq (%0,%1), %%mm3 \n\t" \
- "paddusb %%mm0, %%mm2 \n\t" \
- "movq (%0,%1,2), %%mm4 \n\t" \
- "paddusb %%mm0, %%mm3 \n\t" \
- "movq (%0,%2), %%mm5 \n\t" \
- "paddusb %%mm0, %%mm4 \n\t" \
- "paddusb %%mm0, %%mm5 \n\t" \
- "psubusb %%mm1, %%mm2 \n\t" \
- "psubusb %%mm1, %%mm3 \n\t" \
- "movq %%mm2, (%0) \n\t" \
- "psubusb %%mm1, %%mm4 \n\t" \
- "movq %%mm3, (%0,%1) \n\t" \
- "psubusb %%mm1, %%mm5 \n\t" \
- "movq %%mm4, (%0,%1,2) \n\t" \
- "movq %%mm5, (%0,%2) \n\t"
-
- DC_ADD
- "lea (%0,%1,4), %0 \n\t"
- DC_ADD
-
- : "+r"(dest)
- : "r"((x86_reg)linesize), "r"((x86_reg)3*linesize), "r"(dc)
- );
-}
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_mmx.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_mmx.h deleted file mode 100644 index e0ebf0b0f..000000000 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_mmx.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * vp3dsp MMX function declarations - * Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_X86_VP3DSP_MMX_H -#define AVCODEC_X86_VP3DSP_MMX_H - -#include <stdint.h> -#include "libavcodec/dsputil.h" - -void ff_vp3_idct_mmx(int16_t *data); -void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block); -void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block); -void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block); - -void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values); -void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values); - -#endif /* AVCODEC_X86_VP3DSP_MMX_H */ diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_sse2.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_sse2.c deleted file mode 100644 index b54ffa39e..000000000 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_sse2.c +++ /dev/null @@ -1,187 +0,0 @@ -/*
- * Copyright (C) 2004 the ffmpeg project
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * SSE2-optimized functions cribbed from the original VP3 source code.
- */
-
-#include "libavcodec/dsputil.h"
-#include "dsputil_mmx.h"
-#include "vp3dsp_sse2.h"
-
-DECLARE_ALIGNED(16, const uint16_t, ff_vp3_idct_data)[7 * 8] =
-{
- 64277,64277,64277,64277,64277,64277,64277,64277,
- 60547,60547,60547,60547,60547,60547,60547,60547,
- 54491,54491,54491,54491,54491,54491,54491,54491,
- 46341,46341,46341,46341,46341,46341,46341,46341,
- 36410,36410,36410,36410,36410,36410,36410,36410,
- 25080,25080,25080,25080,25080,25080,25080,25080,
- 12785,12785,12785,12785,12785,12785,12785,12785
-};
-
-
-#define VP3_1D_IDCT_SSE2(ADD, SHIFT) \
- "movdqa "I(3)", %%xmm2 \n\t" /* xmm2 = i3 */ \
- "movdqa "C(3)", %%xmm6 \n\t" /* xmm6 = c3 */ \
- "movdqa %%xmm2, %%xmm4 \n\t" /* xmm4 = i3 */ \
- "movdqa "I(5)", %%xmm7 \n\t" /* xmm7 = i5 */ \
- "pmulhw %%xmm6, %%xmm4 \n\t" /* xmm4 = c3 * i3 - i3 */ \
- "movdqa "C(5)", %%xmm1 \n\t" /* xmm1 = c5 */ \
- "pmulhw %%xmm7, %%xmm6 \n\t" /* xmm6 = c3 * i5 - i5 */ \
- "movdqa %%xmm1, %%xmm5 \n\t" /* xmm5 = c5 */ \
- "pmulhw %%xmm2, %%xmm1 \n\t" /* xmm1 = c5 * i3 - i3 */ \
- "movdqa "I(1)", %%xmm3 \n\t" /* xmm3 = i1 */ \
- "pmulhw %%xmm7, %%xmm5 \n\t" /* xmm5 = c5 * i5 - i5 */ \
- "movdqa "C(1)", %%xmm0 \n\t" /* xmm0 = c1 */ \
- "paddw %%xmm2, %%xmm4 \n\t" /* xmm4 = c3 * i3 */ \
- "paddw %%xmm7, %%xmm6 \n\t" /* xmm6 = c3 * i5 */ \
- "paddw %%xmm1, %%xmm2 \n\t" /* xmm2 = c5 * i3 */ \
- "movdqa "I(7)", %%xmm1 \n\t" /* xmm1 = i7 */ \
- "paddw %%xmm5, %%xmm7 \n\t" /* xmm7 = c5 * i5 */ \
- "movdqa %%xmm0, %%xmm5 \n\t" /* xmm5 = c1 */ \
- "pmulhw %%xmm3, %%xmm0 \n\t" /* xmm0 = c1 * i1 - i1 */ \
- "paddsw %%xmm7, %%xmm4 \n\t" /* xmm4 = c3 * i3 + c5 * i5 = C */ \
- "pmulhw %%xmm1, %%xmm5 \n\t" /* xmm5 = c1 * i7 - i7 */ \
- "movdqa "C(7)", %%xmm7 \n\t" /* xmm7 = c7 */ \
- "psubsw %%xmm2, %%xmm6 \n\t" /* xmm6 = c3 * i5 - c5 * i3 = D */ \
- "paddw %%xmm3, %%xmm0 \n\t" /* xmm0 = c1 * i1 */ \
- "pmulhw %%xmm7, %%xmm3 \n\t" /* xmm3 = c7 * i1 */ \
- "movdqa "I(2)", %%xmm2 \n\t" /* xmm2 = i2 */ \
- "pmulhw %%xmm1, %%xmm7 \n\t" /* xmm7 = c7 * i7 */ \
- "paddw %%xmm1, %%xmm5 \n\t" /* xmm5 = c1 * i7 */ \
- "movdqa %%xmm2, %%xmm1 \n\t" /* xmm1 = i2 */ \
- "pmulhw "C(2)", %%xmm2 \n\t" /* xmm2 = i2 * c2 -i2 */ \
- "psubsw %%xmm5, %%xmm3 \n\t" /* xmm3 = c7 * i1 - c1 * i7 = B */ \
- "movdqa "I(6)", %%xmm5 \n\t" /* xmm5 = i6 */ \
- "paddsw %%xmm7, %%xmm0 \n\t" /* xmm0 = c1 * i1 + c7 * i7 = A */ \
- "movdqa %%xmm5, %%xmm7 \n\t" /* xmm7 = i6 */ \
- "psubsw %%xmm4, %%xmm0 \n\t" /* xmm0 = A - C */ \
- "pmulhw "C(2)", %%xmm5 \n\t" /* xmm5 = c2 * i6 - i6 */ \
- "paddw %%xmm1, %%xmm2 \n\t" /* xmm2 = i2 * c2 */ \
- "pmulhw "C(6)", %%xmm1 \n\t" /* xmm1 = c6 * i2 */ \
- "paddsw %%xmm4, %%xmm4 \n\t" /* xmm4 = C + C */ \
- "paddsw %%xmm0, %%xmm4 \n\t" /* xmm4 = A + C = C. */ \
- "psubsw %%xmm6, %%xmm3 \n\t" /* xmm3 = B - D */ \
- "paddw %%xmm7, %%xmm5 \n\t" /* xmm5 = c2 * i6 */ \
- "paddsw %%xmm6, %%xmm6 \n\t" /* xmm6 = D + D */ \
- "pmulhw "C(6)", %%xmm7 \n\t" /* xmm7 = c6 * i6 */ \
- "paddsw %%xmm3, %%xmm6 \n\t" /* xmm6 = B + D = D. */ \
- "movdqa %%xmm4, "I(1)" \n\t" /* Save C. at I(1) */ \
- "psubsw %%xmm5, %%xmm1 \n\t" /* xmm1 = c6 * i2 - c2 * i6 = H */ \
- "movdqa "C(4)", %%xmm4 \n\t" /* xmm4 = c4 */ \
- "movdqa %%xmm3, %%xmm5 \n\t" /* xmm5 = B - D */ \
- "pmulhw %%xmm4, %%xmm3 \n\t" /* xmm3 = ( c4 -1 ) * ( B - D ) */ \
- "paddsw %%xmm2, %%xmm7 \n\t" /* xmm7 = c2 * i2 + c6 * i6 = G */ \
- "movdqa %%xmm6, "I(2)" \n\t" /* Save D. at I(2) */ \
- "movdqa %%xmm0, %%xmm2 \n\t" /* xmm2 = A - C */ \
- "movdqa "I(0)", %%xmm6 \n\t" /* xmm6 = i0 */ \
- "pmulhw %%xmm4, %%xmm0 \n\t" /* xmm0 = ( c4 - 1 ) * ( A - C ) = A. */ \
- "paddw %%xmm3, %%xmm5 \n\t" /* xmm5 = c4 * ( B - D ) = B. */ \
- "movdqa "I(4)", %%xmm3 \n\t" /* xmm3 = i4 */ \
- "psubsw %%xmm1, %%xmm5 \n\t" /* xmm5 = B. - H = B.. */ \
- "paddw %%xmm0, %%xmm2 \n\t" /* xmm2 = c4 * ( A - C) = A. */ \
- "psubsw %%xmm3, %%xmm6 \n\t" /* xmm6 = i0 - i4 */ \
- "movdqa %%xmm6, %%xmm0 \n\t" /* xmm0 = i0 - i4 */ \
- "pmulhw %%xmm4, %%xmm6 \n\t" /* xmm6 = (c4 - 1) * (i0 - i4) = F */ \
- "paddsw %%xmm3, %%xmm3 \n\t" /* xmm3 = i4 + i4 */ \
- "paddsw %%xmm1, %%xmm1 \n\t" /* xmm1 = H + H */ \
- "paddsw %%xmm0, %%xmm3 \n\t" /* xmm3 = i0 + i4 */ \
- "paddsw %%xmm5, %%xmm1 \n\t" /* xmm1 = B. + H = H. */ \
- "pmulhw %%xmm3, %%xmm4 \n\t" /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \
- "paddw %%xmm0, %%xmm6 \n\t" /* xmm6 = c4 * ( i0 - i4 ) */ \
- "psubsw %%xmm2, %%xmm6 \n\t" /* xmm6 = F - A. = F. */ \
- "paddsw %%xmm2, %%xmm2 \n\t" /* xmm2 = A. + A. */ \
- "movdqa "I(1)", %%xmm0 \n\t" /* Load C. from I(1) */ \
- "paddsw %%xmm6, %%xmm2 \n\t" /* xmm2 = F + A. = A.. */ \
- "paddw %%xmm3, %%xmm4 \n\t" /* xmm4 = c4 * ( i0 + i4 ) = 3 */ \
- "psubsw %%xmm1, %%xmm2 \n\t" /* xmm2 = A.. - H. = R2 */ \
- ADD(%%xmm2) /* Adjust R2 and R1 before shifting */ \
- "paddsw %%xmm1, %%xmm1 \n\t" /* xmm1 = H. + H. */ \
- "paddsw %%xmm2, %%xmm1 \n\t" /* xmm1 = A.. + H. = R1 */ \
- SHIFT(%%xmm2) /* xmm2 = op2 */ \
- "psubsw %%xmm7, %%xmm4 \n\t" /* xmm4 = E - G = E. */ \
- SHIFT(%%xmm1) /* xmm1 = op1 */ \
- "movdqa "I(2)", %%xmm3 \n\t" /* Load D. from I(2) */ \
- "paddsw %%xmm7, %%xmm7 \n\t" /* xmm7 = G + G */ \
- "paddsw %%xmm4, %%xmm7 \n\t" /* xmm7 = E + G = G. */ \
- "psubsw %%xmm3, %%xmm4 \n\t" /* xmm4 = E. - D. = R4 */ \
- ADD(%%xmm4) /* Adjust R4 and R3 before shifting */ \
- "paddsw %%xmm3, %%xmm3 \n\t" /* xmm3 = D. + D. */ \
- "paddsw %%xmm4, %%xmm3 \n\t" /* xmm3 = E. + D. = R3 */ \
- SHIFT(%%xmm4) /* xmm4 = op4 */ \
- "psubsw %%xmm5, %%xmm6 \n\t" /* xmm6 = F. - B..= R6 */ \
- SHIFT(%%xmm3) /* xmm3 = op3 */ \
- ADD(%%xmm6) /* Adjust R6 and R5 before shifting */ \
- "paddsw %%xmm5, %%xmm5 \n\t" /* xmm5 = B.. + B.. */ \
- "paddsw %%xmm6, %%xmm5 \n\t" /* xmm5 = F. + B.. = R5 */ \
- SHIFT(%%xmm6) /* xmm6 = op6 */ \
- SHIFT(%%xmm5) /* xmm5 = op5 */ \
- "psubsw %%xmm0, %%xmm7 \n\t" /* xmm7 = G. - C. = R7 */ \
- ADD(%%xmm7) /* Adjust R7 and R0 before shifting */ \
- "paddsw %%xmm0, %%xmm0 \n\t" /* xmm0 = C. + C. */ \
- "paddsw %%xmm7, %%xmm0 \n\t" /* xmm0 = G. + C. */ \
- SHIFT(%%xmm7) /* xmm7 = op7 */ \
- SHIFT(%%xmm0) /* xmm0 = op0 */
-
-#define PUT_BLOCK(r0, r1, r2, r3, r4, r5, r6, r7) \
- "movdqa " #r0 ", " O(0) "\n\t" \
- "movdqa " #r1 ", " O(1) "\n\t" \
- "movdqa " #r2 ", " O(2) "\n\t" \
- "movdqa " #r3 ", " O(3) "\n\t" \
- "movdqa " #r4 ", " O(4) "\n\t" \
- "movdqa " #r5 ", " O(5) "\n\t" \
- "movdqa " #r6 ", " O(6) "\n\t" \
- "movdqa " #r7 ", " O(7) "\n\t"
-
-#define NOP(xmm)
-#define SHIFT4(xmm) "psraw $4, "#xmm"\n\t"
-#define ADD8(xmm) "paddsw %2, "#xmm"\n\t"
-
-void ff_vp3_idct_sse2(int16_t *input_data)
-{
-#define I(x) AV_STRINGIFY(16*x)"(%0)"
-#define O(x) I(x)
-#define C(x) AV_STRINGIFY(16*(x-1))"(%1)"
-
- __asm__ volatile (
- VP3_1D_IDCT_SSE2(NOP, NOP)
-
- TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%0))
- PUT_BLOCK(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)
-
- VP3_1D_IDCT_SSE2(ADD8, SHIFT4)
- PUT_BLOCK(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
- :: "r"(input_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8)
- );
-}
-
-void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block)
-{
- ff_vp3_idct_sse2(block);
- put_signed_pixels_clamped_mmx(block, dest, line_size);
-}
-
-void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block)
-{
- ff_vp3_idct_sse2(block);
- add_pixels_clamped_mmx(block, dest, line_size);
-}
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_sse2.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_sse2.h deleted file mode 100644 index 9094620eb..000000000 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp3dsp_sse2.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * vp3dsp SSE2 function declarations - * Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_X86_VP3DSP_SSE2_H -#define AVCODEC_X86_VP3DSP_SSE2_H - -#include "libavcodec/dsputil.h" - -void ff_vp3_idct_sse2(int16_t *input_data); -void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block); -void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block); - -#endif /* AVCODEC_X86_VP3DSP_SSE2_H */ diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp56dsp.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp56dsp.asm index 1b3165e54..0543ba00c 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp56dsp.asm +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp56dsp.asm @@ -48,8 +48,8 @@ SECTION .text movq m5, m2 punpcklbw m1, m7 punpcklbw m2, m7 - punpcklbw m4, m7 - punpcklbw m5, m7 + punpckhbw m4, m7 + punpckhbw m5, m7 pmullw m1, [rsp+8*13] ; src[x+8 ] * biweight [2] pmullw m2, [rsp+8*14] ; src[x+16] * biweight [3] pmullw m4, [rsp+8*13] ; src[x+8 ] * biweight [2] @@ -95,13 +95,13 @@ SECTION .text punpckldq m3, m3 punpckhdq m4, m4 punpckhwd m5, m5 - movq m6, m5 - punpckhdq m6, m6 + movq m2, m5 + punpckhdq m2, m2 punpckldq m5, m5 movq [rsp+8*11], m3 movq [rsp+8*12], m4 movq [rsp+8*13], m5 - movq [rsp+8*14], m6 + movq [rsp+8*14], m2 %endmacro %macro SPLAT4REGS_SSE2 0 diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp56dsp_init.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp56dsp_init.c index 5120ed231..87fc93531 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp56dsp_init.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp56dsp_init.c @@ -20,6 +20,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "libavutil/cpu.h" #include "libavutil/x86_cpu.h" #include "libavcodec/dsputil.h" #include "libavcodec/vp56dsp.h" @@ -32,14 +33,14 @@ void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, int stride, av_cold void ff_vp56dsp_init_x86(VP56DSPContext* c, enum CodecID codec) { #if HAVE_YASM - int mm_flags = mm_support(); + int mm_flags = av_get_cpu_flags(); if (CONFIG_VP6_DECODER && codec == CODEC_ID_VP6) { - if (mm_flags & FF_MM_MMX) { + if (mm_flags & AV_CPU_FLAG_MMX) { c->vp6_filter_diag4 = ff_vp6_filter_diag4_mmx; } - if (mm_flags & FF_MM_SSE2) { + if (mm_flags & AV_CPU_FLAG_SSE2) { c->vp6_filter_diag4 = ff_vp6_filter_diag4_sse2; } } diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp-init.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp-init.c index ed5cf4602..201b34e24 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp-init.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp-init.c @@ -20,6 +20,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#include "libavutil/cpu.h" #include "libavutil/x86_cpu.h" #include "libavcodec/vp8dsp.h" @@ -282,10 +283,10 @@ DECLARE_LOOP_FILTER(sse4) av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) { - int mm_flags = mm_support(); + int mm_flags = av_get_cpu_flags(); #if HAVE_YASM - if (mm_flags & FF_MM_MMX) { + if (mm_flags & AV_CPU_FLAG_MMX) { c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx; c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx; c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx; @@ -312,7 +313,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) /* note that 4-tap width=16 functions are missing because w=16 * is only used for luma, and luma is always a copy or sixtap. */ - if (mm_flags & FF_MM_MMX2) { + if (mm_flags & AV_CPU_FLAG_MMX2) { VP8_LUMA_MC_FUNC(0, 16, mmxext); VP8_MC_FUNC(1, 8, mmxext); VP8_MC_FUNC(2, 4, mmxext); @@ -334,14 +335,14 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext; } - if (mm_flags & FF_MM_SSE) { + if (mm_flags & AV_CPU_FLAG_SSE) { c->vp8_idct_add = ff_vp8_idct_add_sse; c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse; c->put_vp8_epel_pixels_tab[0][0][0] = c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; } - if (mm_flags & (FF_MM_SSE2|FF_MM_SSE2SLOW)) { + if (mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) { VP8_LUMA_MC_FUNC(0, 16, sse2); VP8_MC_FUNC(1, 8, sse2); VP8_BILINEAR_MC_FUNC(0, 16, sse2); @@ -356,7 +357,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2; } - if (mm_flags & FF_MM_SSE2) { + if (mm_flags & AV_CPU_FLAG_SSE2) { c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2; c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2; @@ -368,7 +369,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2; } - if (mm_flags & FF_MM_SSSE3) { + if (mm_flags & AV_CPU_FLAG_SSSE3) { VP8_LUMA_MC_FUNC(0, 16, ssse3); VP8_MC_FUNC(1, 8, ssse3); VP8_MC_FUNC(2, 4, ssse3); @@ -390,7 +391,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3; } - if (mm_flags & FF_MM_SSE4) { + if (mm_flags & AV_CPU_FLAG_SSE4) { c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4; c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4; diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp.asm index 8cdbb3c7a..bc5ccc8e3 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp.asm +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp.asm @@ -1342,7 +1342,7 @@ VP8_DC_WHT sse psrldq m%2, 4 %if %10 == 8 movd [%5+%8*2], m%1 - movd %5, m%3 + movd %5d, m%3 %endif psrldq m%3, 4 psrldq m%4, 4 @@ -1379,26 +1379,26 @@ VP8_DC_WHT sse ; 4 is a pointer to the destination's 4th line ; 5/6 is -stride and +stride %macro WRITE_2x4W 6 - movd %3, %1 + movd %3d, %1 punpckhdq %1, %1 mov [%4+%5*4], %3w shr %3, 16 add %4, %6 mov [%4+%5*4], %3w - movd %3, %1 + movd %3d, %1 add %4, %5 mov [%4+%5*2], %3w shr %3, 16 mov [%4+%5 ], %3w - movd %3, %2 + movd %3d, %2 punpckhdq %2, %2 mov [%4 ], %3w shr %3, 16 mov [%4+%6 ], %3w - movd %3, %2 + movd %3d, %2 add %4, %6 mov [%4+%6 ], %3w shr %3, 16 @@ -1407,27 +1407,27 @@ VP8_DC_WHT sse %endmacro %macro WRITE_8W_SSE2 5 - movd %2, %1 + movd %2d, %1 psrldq %1, 4 mov [%3+%4*4], %2w shr %2, 16 add %3, %5 mov [%3+%4*4], %2w - movd %2, %1 + movd %2d, %1 psrldq %1, 4 add %3, %4 mov [%3+%4*2], %2w shr %2, 16 mov [%3+%4 ], %2w - movd %2, %1 + movd %2d, %1 psrldq %1, 4 mov [%3 ], %2w shr %2, 16 mov [%3+%5 ], %2w - movd %2, %1 + movd %2d, %1 add %3, %5 mov [%3+%5 ], %2w shr %2, 16 @@ -1446,27 +1446,27 @@ VP8_DC_WHT sse %endmacro %macro SPLATB_REG_MMX 2-3 - movd %1, %2 + movd %1, %2d punpcklbw %1, %1 punpcklwd %1, %1 punpckldq %1, %1 %endmacro %macro SPLATB_REG_MMXEXT 2-3 - movd %1, %2 + movd %1, %2d punpcklbw %1, %1 pshufw %1, %1, 0x0 %endmacro %macro SPLATB_REG_SSE2 2-3 - movd %1, %2 + movd %1, %2d punpcklbw %1, %1 pshuflw %1, %1, 0x0 punpcklqdq %1, %1 %endmacro %macro SPLATB_REG_SSSE3 3 - movd %1, %2 + movd %1, %2d pshufb %1, %3 %endmacro diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcore/avcore.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcore/avcore.h index 5ddb6167a..d73572e27 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcore/avcore.h +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcore/avcore.h @@ -24,10 +24,10 @@ * shared media utilities for the libav* libraries */ -#include <libavutil/avutil.h> +#include "libavutil/avutil.h" #define LIBAVCORE_VERSION_MAJOR 0 -#define LIBAVCORE_VERSION_MINOR 2 +#define LIBAVCORE_VERSION_MINOR 9 #define LIBAVCORE_VERSION_MICRO 0 #define LIBAVCORE_VERSION_INT AV_VERSION_INT(LIBAVCORE_VERSION_MAJOR, \ @@ -55,4 +55,12 @@ const char *avcore_configuration(void); */ const char *avcore_license(void); +/** + * Those FF_API_* defines are not part of public API. + * They may change, break or disappear at any time. + */ +#ifndef FF_API_OLD_IMAGE_NAMES +#define FF_API_OLD_IMAGE_NAMES (LIBAVCORE_VERSION_MAJOR < 1) +#endif + #endif /* AVCORE_AVCORE_H */ diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcore/imgutils.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcore/imgutils.c index ebaeff16c..0a21f6de2 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcore/imgutils.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcore/imgutils.c @@ -24,7 +24,25 @@ #include "imgutils.h" #include "libavutil/pixdesc.h" -int av_get_image_linesize(enum PixelFormat pix_fmt, int width, int plane) +void av_image_fill_max_pixsteps(int max_pixsteps[4], int max_pixstep_comps[4], + const AVPixFmtDescriptor *pixdesc) +{ + int i; + memset(max_pixsteps, 0, 4*sizeof(max_pixsteps[0])); + if (max_pixstep_comps) + memset(max_pixstep_comps, 0, 4*sizeof(max_pixstep_comps[0])); + + for (i = 0; i < 4; i++) { + const AVComponentDescriptor *comp = &(pixdesc->comp[i]); + if ((comp->step_minus1+1) > max_pixsteps[comp->plane]) { + max_pixsteps[comp->plane] = comp->step_minus1+1; + if (max_pixstep_comps) + max_pixstep_comps[comp->plane] = i; + } + } +} + +int av_image_get_linesize(enum PixelFormat pix_fmt, int width, int plane) { const AVPixFmtDescriptor *desc = &av_pix_fmt_descriptors[pix_fmt]; int max_step [4]; /* max pixel step for each plane */ @@ -34,12 +52,12 @@ int av_get_image_linesize(enum PixelFormat pix_fmt, int width, int plane) if (desc->flags & PIX_FMT_BITSTREAM) return (width * (desc->comp[0].step_minus1+1) + 7) >> 3; - av_fill_image_max_pixsteps(max_step, max_step_comp, desc); + av_image_fill_max_pixsteps(max_step, max_step_comp, desc); s = (max_step_comp[plane] == 1 || max_step_comp[plane] == 2) ? desc->log2_chroma_w : 0; return max_step[plane] * (((width + (1 << s) - 1)) >> s); } -int av_fill_image_linesizes(int linesizes[4], enum PixelFormat pix_fmt, int width) +int av_image_fill_linesizes(int linesizes[4], enum PixelFormat pix_fmt, int width) { int i; const AVPixFmtDescriptor *desc = &av_pix_fmt_descriptors[pix_fmt]; @@ -48,7 +66,7 @@ int av_fill_image_linesizes(int linesizes[4], enum PixelFormat pix_fmt, int widt memset(linesizes, 0, 4*sizeof(linesizes[0])); - if (desc->flags & PIX_FMT_HWACCEL) + if ((unsigned)pix_fmt >= PIX_FMT_NB || desc->flags & PIX_FMT_HWACCEL) return AVERROR(EINVAL); if (desc->flags & PIX_FMT_BITSTREAM) { @@ -56,7 +74,7 @@ int av_fill_image_linesizes(int linesizes[4], enum PixelFormat pix_fmt, int widt return 0; } - av_fill_image_max_pixsteps(max_step, max_step_comp, desc); + av_image_fill_max_pixsteps(max_step, max_step_comp, desc); for (i = 0; i < 4; i++) { int s = (max_step_comp[i] == 1 || max_step_comp[i] == 2) ? desc->log2_chroma_w : 0; linesizes[i] = max_step[i] * (((width + (1 << s) - 1)) >> s); @@ -65,7 +83,7 @@ int av_fill_image_linesizes(int linesizes[4], enum PixelFormat pix_fmt, int widt return 0; } -int av_fill_image_pointers(uint8_t *data[4], enum PixelFormat pix_fmt, int height, +int av_image_fill_pointers(uint8_t *data[4], enum PixelFormat pix_fmt, int height, uint8_t *ptr, const int linesizes[4]) { int i, total_size, size[4], has_plane[4]; @@ -75,7 +93,7 @@ int av_fill_image_pointers(uint8_t *data[4], enum PixelFormat pix_fmt, int heigh memset(size , 0, sizeof(size)); memset(has_plane, 0, sizeof(has_plane)); - if (desc->flags & PIX_FMT_HWACCEL) + if ((unsigned)pix_fmt >= PIX_FMT_NB || desc->flags & PIX_FMT_HWACCEL) return AVERROR(EINVAL); data[0] = ptr; @@ -110,7 +128,7 @@ typedef struct ImgUtils { static const AVClass imgutils_class = { "IMGUTILS", av_default_item_name, NULL, LIBAVUTIL_VERSION_INT, offsetof(ImgUtils, log_offset), offsetof(ImgUtils, log_ctx) }; -int av_check_image_size(unsigned int w, unsigned int h, int log_offset, void *log_ctx) +int av_image_check_size(unsigned int w, unsigned int h, int log_offset, void *log_ctx) { ImgUtils imgutils = { &imgutils_class, log_offset, log_ctx }; @@ -120,3 +138,79 @@ int av_check_image_size(unsigned int w, unsigned int h, int log_offset, void *lo av_log(&imgutils, AV_LOG_ERROR, "Picture size %ux%u is invalid\n", w, h); return AVERROR(EINVAL); } + +void av_image_copy_plane(uint8_t *dst, int dst_linesize, + const uint8_t *src, int src_linesize, + int bytewidth, int height) +{ + if (!dst || !src) + return; + for (;height > 0; height--) { + memcpy(dst, src, bytewidth); + dst += dst_linesize; + src += src_linesize; + } +} + +void av_image_copy(uint8_t *dst_data[4], int dst_linesizes[4], + const uint8_t *src_data[4], const int src_linesizes[4], + enum PixelFormat pix_fmt, int width, int height) +{ + const AVPixFmtDescriptor *desc = &av_pix_fmt_descriptors[pix_fmt]; + + if (desc->flags & PIX_FMT_HWACCEL) + return; + + if (desc->flags & PIX_FMT_PAL) { + av_image_copy_plane(dst_data[0], dst_linesizes[0], + src_data[0], src_linesizes[0], + width, height); + /* copy the palette */ + memcpy(dst_data[1], src_data[1], 4*256); + } else { + int i, planes_nb = 0; + + for (i = 0; i < desc->nb_components; i++) + planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1); + + for (i = 0; i < planes_nb; i++) { + int h = height; + int bwidth = av_image_get_linesize(pix_fmt, width, i); + if (i == 1 || i == 2) { + h= -((-height)>>desc->log2_chroma_h); + } + av_image_copy_plane(dst_data[i], dst_linesizes[i], + src_data[i], src_linesizes[i], + bwidth, h); + } + } +} + +#if FF_API_OLD_IMAGE_NAMES +void av_fill_image_max_pixsteps(int max_pixsteps[4], int max_pixstep_comps[4], + const AVPixFmtDescriptor *pixdesc) +{ + av_image_fill_max_pixsteps(max_pixsteps, max_pixstep_comps, pixdesc); +} + +int av_get_image_linesize(enum PixelFormat pix_fmt, int width, int plane) +{ + return av_image_get_linesize(pix_fmt, width, plane); +} + +int av_fill_image_linesizes(int linesizes[4], enum PixelFormat pix_fmt, int width) +{ + return av_image_fill_linesizes(linesizes, pix_fmt, width); +} + +int av_fill_image_pointers(uint8_t *data[4], enum PixelFormat pix_fmt, int height, + uint8_t *ptr, const int linesizes[4]) +{ + return av_image_fill_pointers(data, pix_fmt, height, ptr, linesizes); +} + +int av_check_image_size(unsigned int w, unsigned int h, int log_offset, void *log_ctx) +{ + return av_image_check_size(w, h, log_offset, log_ctx); +} +#endif diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcore/imgutils.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcore/imgutils.h index 8e08d4738..8458fc6bb 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcore/imgutils.h +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcore/imgutils.h @@ -25,7 +25,6 @@ */ #include "libavutil/pixdesc.h" -#include "libavutil/pixfmt.h" #include "avcore.h" /** @@ -44,23 +43,8 @@ * @param max_pixstep_comps an array which is filled with the component * for each plane which has the max pixel step. May be NULL. */ -static inline void av_fill_image_max_pixsteps(int max_pixsteps[4], int max_pixstep_comps[4], - const AVPixFmtDescriptor *pixdesc) -{ - int i; - memset(max_pixsteps, 0, 4*sizeof(max_pixsteps[0])); - if (max_pixstep_comps) - memset(max_pixstep_comps, 0, 4*sizeof(max_pixstep_comps[0])); - - for (i = 0; i < 4; i++) { - const AVComponentDescriptor *comp = &(pixdesc->comp[i]); - if ((comp->step_minus1+1) > max_pixsteps[comp->plane]) { - max_pixsteps[comp->plane] = comp->step_minus1+1; - if (max_pixstep_comps) - max_pixstep_comps[comp->plane] = i; - } - } -} +void av_image_fill_max_pixsteps(int max_pixsteps[4], int max_pixstep_comps[4], + const AVPixFmtDescriptor *pixdesc); /** * Compute the size of an image line with format pix_fmt and width @@ -68,7 +52,7 @@ static inline void av_fill_image_max_pixsteps(int max_pixsteps[4], int max_pixst * * @return the computed size in bytes */ -int av_get_image_linesize(enum PixelFormat pix_fmt, int width, int plane); +int av_image_get_linesize(enum PixelFormat pix_fmt, int width, int plane); /** * Fill plane linesizes for an image with pixel format pix_fmt and @@ -77,7 +61,7 @@ int av_get_image_linesize(enum PixelFormat pix_fmt, int width, int plane); * @param linesizes array to be filled with the linesize for each plane * @return >= 0 in case of success, a negative error code otherwise */ -int av_fill_image_linesizes(int linesizes[4], enum PixelFormat pix_fmt, int width); +int av_image_fill_linesizes(int linesizes[4], enum PixelFormat pix_fmt, int width); /** * Fill plane data pointers for an image with pixel format pix_fmt and @@ -86,14 +70,37 @@ int av_fill_image_linesizes(int linesizes[4], enum PixelFormat pix_fmt, int widt * @param data pointers array to be filled with the pointer for each image plane * @param ptr the pointer to a buffer which will contain the image * @param linesizes[4] the array containing the linesize for each - * plane, should be filled by av_fill_image_linesizes() + * plane, should be filled by av_image_fill_linesizes() * @return the size in bytes required for the image buffer, a negative * error code in case of failure */ -int av_fill_image_pointers(uint8_t *data[4], enum PixelFormat pix_fmt, int height, +int av_image_fill_pointers(uint8_t *data[4], enum PixelFormat pix_fmt, int height, uint8_t *ptr, const int linesizes[4]); /** + * Copy image plane from src to dst. + * That is, copy "height" number of lines of "bytewidth" bytes each. + * The first byte of each successive line is separated by *_linesize + * bytes. + * + * @param dst_linesize linesize for the image plane in dst + * @param src_linesize linesize for the image plane in src + */ +void av_image_copy_plane(uint8_t *dst, int dst_linesize, + const uint8_t *src, int src_linesize, + int bytewidth, int height); + +/** + * Copy image in src_data to dst_data. + * + * @param dst_linesize linesizes for the image in dst_data + * @param src_linesize linesizes for the image in src_data + */ +void av_image_copy(uint8_t *dst_data[4], int dst_linesizes[4], + const uint8_t *src_data[4], const int src_linesizes[4], + enum PixelFormat pix_fmt, int width, int height); + +/** * Check if the given dimension of an image is valid, meaning that all * bytes of the image can be addressed with a signed int. * @@ -103,6 +110,25 @@ int av_fill_image_pointers(uint8_t *data[4], enum PixelFormat pix_fmt, int heigh * @param log_ctx the parent logging context, it may be NULL * @return >= 0 if valid, a negative error code otherwise */ +int av_image_check_size(unsigned int w, unsigned int h, int log_offset, void *log_ctx); + +#if FF_API_OLD_IMAGE_NAMES +attribute_deprecated +void av_fill_image_max_pixsteps(int max_pixsteps[4], int max_pixstep_comps[4], + const AVPixFmtDescriptor *pixdesc); + +attribute_deprecated +int av_get_image_linesize(enum PixelFormat pix_fmt, int width, int plane); + +attribute_deprecated +int av_fill_image_linesizes(int linesizes[4], enum PixelFormat pix_fmt, int width); + +attribute_deprecated +int av_fill_image_pointers(uint8_t *data[4], enum PixelFormat pix_fmt, int height, + uint8_t *ptr, const int linesizes[4]); + +attribute_deprecated int av_check_image_size(unsigned int w, unsigned int h, int log_offset, void *log_ctx); +#endif #endif /* AVCORE_IMGUTILS_H */ diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/avstring.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/avstring.c new file mode 100644 index 000000000..1ce680ac4 --- /dev/null +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/avstring.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2000, 2001, 2002 Fabrice Bellard + * Copyright (c) 2007 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdarg.h> +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include "avstring.h" +#include "mem.h" + +#define vsnprintf _vsnprintf +#define snprintf _snprintf + +int av_strstart(const char *str, const char *pfx, const char **ptr) +{ + while (*pfx && *pfx == *str) { + pfx++; + str++; + } + if (!*pfx && ptr) + *ptr = str; + return !*pfx; +} + +int av_stristart(const char *str, const char *pfx, const char **ptr) +{ + while (*pfx && toupper((unsigned)*pfx) == toupper((unsigned)*str)) { + pfx++; + str++; + } + if (!*pfx && ptr) + *ptr = str; + return !*pfx; +} + +char *av_stristr(const char *s1, const char *s2) +{ + if (!*s2) + return s1; + + do { + if (av_stristart(s1, s2, NULL)) + return s1; + } while (*s1++); + + return NULL; +} + +size_t av_strlcpy(char *dst, const char *src, size_t size) +{ + size_t len = 0; + while (++len < size && *src) + *dst++ = *src++; + if (len <= size) + *dst = 0; + return len + strlen(src) - 1; +} + +size_t av_strlcat(char *dst, const char *src, size_t size) +{ + size_t len = strlen(dst); + if (size <= len + 1) + return len + strlen(src); + return len + av_strlcpy(dst + len, src, size - len); +} + +size_t av_strlcatf(char *dst, size_t size, const char *fmt, ...) +{ + int len = strlen(dst); + va_list vl; + + va_start(vl, fmt); + len += vsnprintf(dst + len, size > len ? size - len : 0, fmt, vl); + va_end(vl); + + return len; +} + +char *av_d2str(double d) +{ + char *str= av_malloc(16); + if(str) snprintf(str, 16, "%f", d); + return str; +} diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/avutil.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/avutil.h index 2c6d2e5b6..a2b8f0130 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/avutil.h +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/avutil.h @@ -40,7 +40,7 @@ #define AV_VERSION(a, b, c) AV_VERSION_DOT(a, b, c)
#define LIBAVUTIL_VERSION_MAJOR 50
-#define LIBAVUTIL_VERSION_MINOR 22
+#define LIBAVUTIL_VERSION_MINOR 27
#define LIBAVUTIL_VERSION_MICRO 0
#define LIBAVUTIL_VERSION_INT AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/common.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/common.h index 5d5e0f2c4..670d3b934 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/common.h +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/common.h @@ -198,6 +198,20 @@ static inline av_const int av_ceil_log2_c(int x) return av_log2((x - 1) << 1);
}
+/**
+ * Count number of bits set to one in x
+ * @param x value to count bits of
+ * @return the number of bits set to one in x
+ */
+static inline av_const int av_popcount_c(uint32_t x)
+{
+ x -= (x >> 1) & 0x55555555;
+ x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
+ x = (x + (x >> 4)) & 0x0F0F0F0F;
+ x += x >> 8;
+ return (x + (x >> 16)) & 0x3F;
+}
+
#define MKTAG(a,b,c,d) ((a) | ((b) << 8) | ((c) << 16) | ((d) << 24))
#define MKBETAG(a,b,c,d) ((d) | ((c) << 8) | ((b) << 16) | ((a) << 24))
@@ -360,5 +374,8 @@ static inline av_const int av_ceil_log2_c(int x) #ifndef av_clipf
# define av_clipf av_clipf_c
#endif
+#ifndef av_popcount
+# define av_popcount av_popcount_c
+#endif
#endif /* HAVE_AV_CONFIG_H */
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/cpu.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/cpu.c new file mode 100644 index 000000000..1548530b8 --- /dev/null +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/cpu.c @@ -0,0 +1,68 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "cpu.h" +#include "config.h" + +int av_get_cpu_flags(void) +{ + static int flags, checked; + + if (checked) + return flags; + + if (ARCH_ARM) flags = ff_get_cpu_flags_arm(); + if (ARCH_PPC) flags = ff_get_cpu_flags_ppc(); + if (ARCH_X86) flags = ff_get_cpu_flags_x86(); + + checked = 1; + return flags; +} + +#ifdef TEST + +#undef printf + +int main(void) +{ + int cpu_flags = av_get_cpu_flags(); + + printf("cpu_flags = 0x%08X\n", cpu_flags); + printf("cpu_flags = %s%s%s%s%s%s%s%s%s%s%s%s\n", +#if ARCH_ARM + cpu_flags & AV_CPU_FLAG_IWMMXT ? "IWMMXT " : "", +#elif ARCH_PPC + cpu_flags & AV_CPU_FLAG_ALTIVEC ? "ALTIVEC " : "", +#elif ARCH_X86 + cpu_flags & AV_CPU_FLAG_MMX ? "MMX " : "", + cpu_flags & AV_CPU_FLAG_MMX2 ? "MMX2 " : "", + cpu_flags & AV_CPU_FLAG_SSE ? "SSE " : "", + cpu_flags & AV_CPU_FLAG_SSE2 ? "SSE2 " : "", + cpu_flags & AV_CPU_FLAG_SSE2SLOW ? "SSE2(slow) " : "", + cpu_flags & AV_CPU_FLAG_SSE3 ? "SSE3 " : "", + cpu_flags & AV_CPU_FLAG_SSE3SLOW ? "SSE3(slow) " : "", + cpu_flags & AV_CPU_FLAG_SSSE3 ? "SSSE3 " : "", + cpu_flags & AV_CPU_FLAG_SSE4 ? "SSE4.1 " : "", + cpu_flags & AV_CPU_FLAG_SSE42 ? "SSE4.2 " : "", + cpu_flags & AV_CPU_FLAG_3DNOW ? "3DNow " : "", + cpu_flags & AV_CPU_FLAG_3DNOWEXT ? "3DNowExt " : ""); +#endif + return 0; +} + +#endif diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/cpu.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/cpu.h new file mode 100644 index 000000000..71cc26529 --- /dev/null +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/cpu.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2000, 2001, 2002 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_CPU_H +#define AVUTIL_CPU_H + +#define AV_CPU_FLAG_FORCE 0x80000000 /* force usage of selected flags (OR) */ + + /* lower 16 bits - CPU features */ +#define AV_CPU_FLAG_MMX 0x0001 ///< standard MMX +#define AV_CPU_FLAG_MMX2 0x0002 ///< SSE integer functions or AMD MMX ext +#define AV_CPU_FLAG_3DNOW 0x0004 ///< AMD 3DNOW +#define AV_CPU_FLAG_SSE 0x0008 ///< SSE functions +#define AV_CPU_FLAG_SSE2 0x0010 ///< PIV SSE2 functions +#define AV_CPU_FLAG_SSE2SLOW 0x40000000 ///< SSE2 supported, but usually not faster +#define AV_CPU_FLAG_3DNOWEXT 0x0020 ///< AMD 3DNowExt +#define AV_CPU_FLAG_SSE3 0x0040 ///< Prescott SSE3 functions +#define AV_CPU_FLAG_SSE3SLOW 0x20000000 ///< SSE3 supported, but usually not faster +#define AV_CPU_FLAG_SSSE3 0x0080 ///< Conroe SSSE3 functions +#define AV_CPU_FLAG_SSE4 0x0100 ///< Penryn SSE4.1 functions +#define AV_CPU_FLAG_SSE42 0x0200 ///< Nehalem SSE4.2 functions +#define AV_CPU_FLAG_IWMMXT 0x0100 ///< XScale IWMMXT +#define AV_CPU_FLAG_ALTIVEC 0x0001 ///< standard + +/** + * Return the flags which specify extensions supported by the CPU. + */ +int av_get_cpu_flags(void); + +/* The following CPU-specific functions shall not be called directly. */ +int ff_get_cpu_flags_arm(void); +int ff_get_cpu_flags_ppc(void); +int ff_get_cpu_flags_x86(void); + +#endif /* AVUTIL_CPU_H */ diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/eval.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/eval.c new file mode 100644 index 000000000..db6e63a5e --- /dev/null +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/eval.c @@ -0,0 +1,523 @@ +/*
+ * Copyright (c) 2002-2006 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2006 Oded Shimon <ods15@ods15.dyndns.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * simple arithmetic expression evaluator.
+ *
+ * see http://joe.hotchkiss.com/programming/eval/eval.html
+ */
+
+#include "libavutil/avutil.h"
+#include "eval.h"
+
+typedef struct Parser {
+ const AVClass *class;
+ int stack_index;
+ char *s;
+ const double *const_values;
+ const char * const *const_names; // NULL terminated
+ double (* const *funcs1)(void *, double a); // NULL terminated
+ const char * const *func1_names; // NULL terminated
+ double (* const *funcs2)(void *, double a, double b); // NULL terminated
+ const char * const *func2_names; // NULL terminated
+ void *opaque;
+ int log_offset;
+ void *log_ctx;
+#define VARS 10
+ double var[VARS];
+} Parser;
+
+static const AVClass class = { "Eval", av_default_item_name, NULL, LIBAVUTIL_VERSION_INT, offsetof(Parser,log_offset), offsetof(Parser,log_ctx) };
+
+#ifdef __GNUC__
+static const int8_t si_prefixes['z' - 'E' + 1]={
+ ['y'-'E']= -24,
+ ['z'-'E']= -21,
+ ['a'-'E']= -18,
+ ['f'-'E']= -15,
+ ['p'-'E']= -12,
+ ['n'-'E']= - 9,
+ ['u'-'E']= - 6,
+ ['m'-'E']= - 3,
+ ['c'-'E']= - 2,
+ ['d'-'E']= - 1,
+ ['h'-'E']= 2,
+ ['k'-'E']= 3,
+ ['K'-'E']= 3,
+ ['M'-'E']= 6,
+ ['G'-'E']= 9,
+ ['T'-'E']= 12,
+ ['P'-'E']= 15,
+ ['E'-'E']= 18,
+ ['Z'-'E']= 21,
+ ['Y'-'E']= 24,
+};
+#else
+static const int8_t si_prefixes['z' - 'E' + 1];
+#endif
+
+double av_strtod(const char *numstr, char **tail)
+{
+ double d;
+ char *next;
+ d = strtod(numstr, &next);
+ /* if parsing succeeded, check for and interpret postfixes */
+ if (next!=numstr) {
+ if (*next >= 'E' && *next <= 'z') {
+ int e= si_prefixes[*next - 'E'];
+ if (e) {
+ if (next[1] == 'i') {
+ d*= pow( 2, e/0.3);
+ next+=2;
+ } else {
+ d*= pow(10, e);
+ next++;
+ }
+ }
+ }
+
+ if (*next=='B') {
+ d*=8;
+ next++;
+ }
+ }
+ /* if requested, fill in tail with the position after the last parsed
+ character */
+ if (tail)
+ *tail = next;
+ return d;
+}
+
+static int strmatch(const char *s, const char *prefix)
+{
+ int i;
+ for (i=0; prefix[i]; i++) {
+ if (prefix[i] != s[i]) return 0;
+ }
+ return 1;
+}
+
+struct AVExpr {
+ enum {
+ e_value, e_const, e_func0, e_func1, e_func2,
+ e_squish, e_gauss, e_ld,
+ e_mod, e_max, e_min, e_eq, e_gt, e_gte,
+ e_pow, e_mul, e_div, e_add,
+ e_last, e_st, e_while,
+ } type;
+ double value; // is sign in other types
+ union {
+ int const_index;
+ double (*func0)(double);
+ double (*func1)(void *, double);
+ double (*func2)(void *, double, double);
+ } a;
+ struct AVExpr *param[2];
+};
+
+static double eval_expr(Parser *p, AVExpr *e)
+{
+ switch (e->type) {
+ case e_value: return e->value;
+ case e_const: return e->value * p->const_values[e->a.const_index];
+ case e_func0: return e->value * e->a.func0(eval_expr(p, e->param[0]));
+ case e_func1: return e->value * e->a.func1(p->opaque, eval_expr(p, e->param[0]));
+ case e_func2: return e->value * e->a.func2(p->opaque, eval_expr(p, e->param[0]), eval_expr(p, e->param[1]));
+ case e_squish: return 1/(1+exp(4*eval_expr(p, e->param[0])));
+ case e_gauss: { double d = eval_expr(p, e->param[0]); return exp(-d*d/2)/sqrt(2*M_PI); }
+ case e_ld: return e->value * p->var[av_clip(eval_expr(p, e->param[0]), 0, VARS-1)];
+ case e_while: {
+ double d = NAN;
+ while (eval_expr(p, e->param[0]))
+ d=eval_expr(p, e->param[1]);
+ return d;
+ }
+ default: {
+ double d = eval_expr(p, e->param[0]);
+ double d2 = eval_expr(p, e->param[1]);
+ switch (e->type) {
+ case e_mod: return e->value * (d - floor(d/d2)*d2);
+ case e_max: return e->value * (d > d2 ? d : d2);
+ case e_min: return e->value * (d < d2 ? d : d2);
+ case e_eq: return e->value * (d == d2 ? 1.0 : 0.0);
+ case e_gt: return e->value * (d > d2 ? 1.0 : 0.0);
+ case e_gte: return e->value * (d >= d2 ? 1.0 : 0.0);
+ case e_pow: return e->value * pow(d, d2);
+ case e_mul: return e->value * (d * d2);
+ case e_div: return e->value * (d / d2);
+ case e_add: return e->value * (d + d2);
+ case e_last:return e->value * d2;
+ case e_st : return e->value * (p->var[av_clip(d, 0, VARS-1)]= d2);
+ }
+ }
+ }
+ return NAN;
+}
+
+static int parse_expr(AVExpr **e, Parser *p);
+
+void av_free_expr(AVExpr *e)
+{
+ if (!e) return;
+ av_free_expr(e->param[0]);
+ av_free_expr(e->param[1]);
+ av_freep(&e);
+}
+
+static int parse_primary(AVExpr **e, Parser *p)
+{
+ AVExpr *d = av_mallocz(sizeof(AVExpr));
+ char *next = p->s, *s0 = p->s;
+ int ret, i;
+
+ if (!d)
+ return AVERROR(ENOMEM);
+
+ /* number */
+ d->value = av_strtod(p->s, &next);
+ if (next != p->s) {
+ d->type = e_value;
+ p->s= next;
+ *e = d;
+ return 0;
+ }
+ d->value = 1;
+
+ /* named constants */
+ for (i=0; p->const_names && p->const_names[i]; i++) {
+ if (strmatch(p->s, p->const_names[i])) {
+ p->s+= strlen(p->const_names[i]);
+ d->type = e_const;
+ d->a.const_index = i;
+ *e = d;
+ return 0;
+ }
+ }
+
+ p->s= strchr(p->s, '(');
+ if (p->s==NULL) {
+ av_log(p, AV_LOG_ERROR, "Undefined constant or missing '(' in '%s'\n", s0);
+ p->s= next;
+ av_free_expr(d);
+ return AVERROR(EINVAL);
+ }
+ p->s++; // "("
+ if (*next == '(') { // special case do-nothing
+ av_freep(&d);
+ if ((ret = parse_expr(&d, p)) < 0)
+ return ret;
+ if (p->s[0] != ')') {
+ av_log(p, AV_LOG_ERROR, "Missing ')' in '%s'\n", s0);
+ av_free_expr(d);
+ return AVERROR(EINVAL);
+ }
+ p->s++; // ")"
+ *e = d;
+ return 0;
+ }
+ if ((ret = parse_expr(&(d->param[0]), p)) < 0) {
+ av_free_expr(d);
+ return ret;
+ }
+ if (p->s[0]== ',') {
+ p->s++; // ","
+ parse_expr(&d->param[1], p);
+ }
+ if (p->s[0] != ')') {
+ av_log(p, AV_LOG_ERROR, "Missing ')' or too many args in '%s'\n", s0);
+ av_free_expr(d);
+ return AVERROR(EINVAL);
+ }
+ p->s++; // ")"
+
+ d->type = e_func0;
+ if (strmatch(next, "sinh" )) d->a.func0 = sinh;
+ else if (strmatch(next, "cosh" )) d->a.func0 = cosh;
+ else if (strmatch(next, "tanh" )) d->a.func0 = tanh;
+ else if (strmatch(next, "sin" )) d->a.func0 = sin;
+ else if (strmatch(next, "cos" )) d->a.func0 = cos;
+ else if (strmatch(next, "tan" )) d->a.func0 = tan;
+ else if (strmatch(next, "atan" )) d->a.func0 = atan;
+ else if (strmatch(next, "asin" )) d->a.func0 = asin;
+ else if (strmatch(next, "acos" )) d->a.func0 = acos;
+ else if (strmatch(next, "exp" )) d->a.func0 = exp;
+ else if (strmatch(next, "log" )) d->a.func0 = log;
+ else if (strmatch(next, "abs" )) d->a.func0 = fabs;
+ else if (strmatch(next, "squish")) d->type = e_squish;
+ else if (strmatch(next, "gauss" )) d->type = e_gauss;
+ else if (strmatch(next, "mod" )) d->type = e_mod;
+ else if (strmatch(next, "max" )) d->type = e_max;
+ else if (strmatch(next, "min" )) d->type = e_min;
+ else if (strmatch(next, "eq" )) d->type = e_eq;
+ else if (strmatch(next, "gte" )) d->type = e_gte;
+ else if (strmatch(next, "gt" )) d->type = e_gt;
+ else if (strmatch(next, "lte" )) { AVExpr *tmp = d->param[1]; d->param[1] = d->param[0]; d->param[0] = tmp; d->type = e_gt; }
+ else if (strmatch(next, "lt" )) { AVExpr *tmp = d->param[1]; d->param[1] = d->param[0]; d->param[0] = tmp; d->type = e_gte; }
+ else if (strmatch(next, "ld" )) d->type = e_ld;
+ else if (strmatch(next, "st" )) d->type = e_st;
+ else if (strmatch(next, "while" )) d->type = e_while;
+ else {
+ for (i=0; p->func1_names && p->func1_names[i]; i++) {
+ if (strmatch(next, p->func1_names[i])) {
+ d->a.func1 = p->funcs1[i];
+ d->type = e_func1;
+ *e = d;
+ return 0;
+ }
+ }
+
+ for (i=0; p->func2_names && p->func2_names[i]; i++) {
+ if (strmatch(next, p->func2_names[i])) {
+ d->a.func2 = p->funcs2[i];
+ d->type = e_func2;
+ *e = d;
+ return 0;
+ }
+ }
+
+ av_log(p, AV_LOG_ERROR, "Unknown function in '%s'\n", s0);
+ av_free_expr(d);
+ return AVERROR(EINVAL);
+ }
+
+ *e = d;
+ return 0;
+}
+
+static AVExpr *new_eval_expr(int type, int value, AVExpr *p0, AVExpr *p1)
+{
+ AVExpr *e = av_mallocz(sizeof(AVExpr));
+ if (!e)
+ return NULL;
+ e->type =type ;
+ e->value =value ;
+ e->param[0] =p0 ;
+ e->param[1] =p1 ;
+ return e;
+}
+
+static int parse_pow(AVExpr **e, Parser *p, int *sign)
+{
+ *sign= (*p->s == '+') - (*p->s == '-');
+ p->s += *sign&1;
+ return parse_primary(e, p);
+}
+
+static int parse_factor(AVExpr **e, Parser *p)
+{
+ int sign, sign2, ret;
+ AVExpr *e0, *e1, *e2;
+ if ((ret = parse_pow(&e0, p, &sign)) < 0)
+ return ret;
+ while(p->s[0]=='^'){
+ e1 = e0;
+ p->s++;
+ if ((ret = parse_pow(&e2, p, &sign2)) < 0) {
+ av_free_expr(e1);
+ return ret;
+ }
+ e0 = new_eval_expr(e_pow, 1, e1, e2);
+ if (!e0) {
+ av_free_expr(e1);
+ av_free_expr(e2);
+ return AVERROR(ENOMEM);
+ }
+ if (e0->param[1]) e0->param[1]->value *= (sign2|1);
+ }
+ if (e0) e0->value *= (sign|1);
+
+ *e = e0;
+ return 0;
+}
+
+static int parse_term(AVExpr **e, Parser *p)
+{
+ int ret;
+ AVExpr *e0, *e1, *e2;
+ if ((ret = parse_factor(&e0, p)) < 0)
+ return ret;
+ while (p->s[0]=='*' || p->s[0]=='/') {
+ int c= *p->s++;
+ e1 = e0;
+ if ((ret = parse_factor(&e2, p)) < 0) {
+ av_free_expr(e1);
+ return ret;
+ }
+ e0 = new_eval_expr(c == '*' ? e_mul : e_div, 1, e1, e2);
+ if (!e0) {
+ av_free_expr(e1);
+ av_free_expr(e2);
+ return AVERROR(ENOMEM);
+ }
+ }
+ *e = e0;
+ return 0;
+}
+
+static int parse_subexpr(AVExpr **e, Parser *p)
+{
+ int ret;
+ AVExpr *e0, *e1, *e2;
+ if ((ret = parse_term(&e0, p)) < 0)
+ return ret;
+ while (*p->s == '+' || *p->s == '-') {
+ e1 = e0;
+ if ((ret = parse_term(&e2, p)) < 0) {
+ av_free_expr(e1);
+ return ret;
+ }
+ e0 = new_eval_expr(e_add, 1, e1, e2);
+ if (!e0) {
+ av_free_expr(e1);
+ av_free_expr(e2);
+ return AVERROR(ENOMEM);
+ }
+ };
+
+ *e = e0;
+ return 0;
+}
+
+static int parse_expr(AVExpr **e, Parser *p)
+{
+ int ret;
+ AVExpr *e0, *e1, *e2;
+ if (p->stack_index <= 0) //protect against stack overflows
+ return AVERROR(EINVAL);
+ p->stack_index--;
+
+ if ((ret = parse_subexpr(&e0, p)) < 0)
+ return ret;
+ while (*p->s == ';') {
+ e1 = e0;
+ if ((ret = parse_subexpr(&e2, p)) < 0) {
+ av_free_expr(e1);
+ return ret;
+ }
+ p->s++;
+ e0 = new_eval_expr(e_last, 1, e1, e2);
+ if (!e0) {
+ av_free_expr(e1);
+ av_free_expr(e2);
+ return AVERROR(ENOMEM);
+ }
+ };
+
+ p->stack_index++;
+ *e = e0;
+ return 0;
+}
+
+static int verify_expr(AVExpr *e)
+{
+ if (!e) return 0;
+ switch (e->type) {
+ case e_value:
+ case e_const: return 1;
+ case e_func0:
+ case e_func1:
+ case e_squish:
+ case e_ld:
+ case e_gauss: return verify_expr(e->param[0]);
+ default: return verify_expr(e->param[0]) && verify_expr(e->param[1]);
+ }
+}
+
+int av_parse_expr(AVExpr **expr, const char *s,
+ const char * const *const_names,
+ const char * const *func1_names, double (* const *funcs1)(void *, double),
+ const char * const *func2_names, double (* const *funcs2)(void *, double, double),
+ int log_offset, void *log_ctx)
+{
+ Parser p;
+ AVExpr *e = NULL;
+ char *w = av_malloc(strlen(s) + 1);
+ char *wp = w;
+ const char *s0 = s;
+ int ret = 0;
+
+ if (!w)
+ return AVERROR(ENOMEM);
+
+ while (*s)
+ if (!isspace(*s++)) *wp++ = s[-1];
+ *wp++ = 0;
+
+ p.class = &class;
+ p.stack_index=100;
+ p.s= w;
+ p.const_names = const_names;
+ p.funcs1 = funcs1;
+ p.func1_names = func1_names;
+ p.funcs2 = funcs2;
+ p.func2_names = func2_names;
+ p.log_offset = log_offset;
+ p.log_ctx = log_ctx;
+
+ if ((ret = parse_expr(&e, &p)) < 0)
+ goto end;
+ if (*p.s) {
+ av_log(&p, AV_LOG_ERROR, "Invalid chars '%s' at the end of expression '%s'\n", p.s, s0);
+ ret = AVERROR(EINVAL);
+ goto end;
+ }
+ if (!verify_expr(e)) {
+ av_free_expr(e);
+ ret = AVERROR(EINVAL);
+ goto end;
+ }
+ *expr = e;
+end:
+ av_free(w);
+ return ret;
+}
+
+double av_eval_expr(AVExpr *e, const double *const_values, void *opaque)
+{
+ Parser p;
+
+ p.const_values = const_values;
+ p.opaque = opaque;
+ return eval_expr(&p, e);
+}
+
+int av_parse_and_eval_expr(double *d, const char *s,
+ const char * const *const_names, const double *const_values,
+ const char * const *func1_names, double (* const *funcs1)(void *, double),
+ const char * const *func2_names, double (* const *funcs2)(void *, double, double),
+ void *opaque, int log_offset, void *log_ctx)
+{
+ AVExpr *e = NULL;
+ int ret = av_parse_expr(&e, s, const_names, func1_names, funcs1, func2_names, funcs2, log_offset, log_ctx);
+
+ if (ret < 0) {
+ *d = NAN;
+ return ret;
+ }
+ *d = av_eval_expr(e, const_values, opaque);
+ av_free_expr(e);
+#ifdef __GNUC__
+ return isnan(*d) ? AVERROR(EINVAL) : 0;
+#else
+ return 0;
+#endif
+}
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/internal.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/internal.h index 41f14ba89..1ebf77b83 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/internal.h +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/internal.h @@ -106,7 +106,7 @@ #endif
/* MPC custom code start */
-#if defined(_DEBUG)
+#if defined (_DEBUG)
# define snprintf _snprintf
#endif
/* MPC custom code end */
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/intfloat_readwrite.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/intfloat_readwrite.c index 1e60fc1db..036a2d130 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/intfloat_readwrite.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/intfloat_readwrite.c @@ -21,7 +21,7 @@ */
/**
- * @file libavutil/intfloat_readwrite.c
+ * @file
* portable IEEE float/double read/write functions
*/
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/lls.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/lls.c index 047f976c3..385579276 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/lls.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/lls.c @@ -21,7 +21,7 @@ */ /** - * @file libavutil/lls.c + * @file * linear least squares model */ diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/log.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/log.c index b9f4e902a..4274c3ea7 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/log.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/log.c @@ -43,6 +43,7 @@ void av_log_default_callback(void* ptr, int level, const char* fmt, va_list vl) static int print_prefix=1;
static int count;
static char line[1024], prev[1024];
+ static int detect_repeats;
AVClass* avc= ptr ? *(AVClass**)ptr : NULL;
if(level>av_log_level)
return;
@@ -61,7 +62,12 @@ void av_log_default_callback(void* ptr, int level, const char* fmt, va_list vl) vsnprintf(line + strlen(line), sizeof(line) - strlen(line), fmt, vl);
print_prefix= line[strlen(line)-1] == '\n';
- if(print_prefix && !strcmp(line, prev)){
+
+#if HAVE_ISATTY
+ if(!detect_repeats) detect_repeats= isatty(2) ? 1 : -1;
+#endif
+
+ if(print_prefix && detect_repeats==1 && !strcmp(line, prev)){
count++;
fprintf(stderr, " Last message repeated %d times\r", count);
return;
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/pixdesc.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/pixdesc.c index 08daaa307..9015e4fa8 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/pixdesc.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/pixdesc.c @@ -54,11 +54,14 @@ void av_read_image_line(uint16_t *dst, const uint8_t *data[4], const int linesiz } } else { const uint8_t *p = data[plane]+ y*linesize[plane] + x*step + comp.offset_plus1-1; + int is_8bit = shift + depth <= 8; + + if (is_8bit) + p += !!(flags & PIX_FMT_BE); while(w--){ - int val; - if(flags & PIX_FMT_BE) val= AV_RB16(p); - else val= AV_RL16(p); + int val = is_8bit ? *p : + flags & PIX_FMT_BE ? AV_RB16(p) : AV_RL16(p); val = (val>>shift) & mask; if(read_pal_component) val= data[1][4*val + c]; @@ -92,15 +95,23 @@ void av_write_image_line(const uint16_t *src, uint8_t *data[4], const int linesi int shift = comp.shift; uint8_t *p = data[plane]+ y*linesize[plane] + x*step + comp.offset_plus1-1; - while (w--) { - if (flags & PIX_FMT_BE) { - uint16_t val = AV_RB16(p) | (*src++<<shift); - AV_WB16(p, val); - } else { - uint16_t val = AV_RL16(p) | (*src++<<shift); - AV_WL16(p, val); + if (shift + depth <= 8) { + p += !!(flags & PIX_FMT_BE); + while (w--) { + *p |= (*src++<<shift); + p += step; + } + } else { + while (w--) { + if (flags & PIX_FMT_BE) { + uint16_t val = AV_RB16(p) | (*src++<<shift); + AV_WB16(p, val); + } else { + uint16_t val = AV_RL16(p) | (*src++<<shift); + AV_WL16(p, val); + } + p+= step; } - p+= step; } } } diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/x86/bswap.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/x86/bswap.h index 462309f41..b6ceb76d3 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/x86/bswap.h +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/x86/bswap.h @@ -17,7 +17,7 @@ */ /** - * @file libavutil/x86/bswap.h + * @file * byte swapping routines */ @@ -28,15 +28,15 @@ #include "config.h" #include "libavutil/attributes.h" -#define bswap_16 bswap_16 -static av_always_inline av_const uint16_t bswap_16(uint16_t x) +#define av_bswap16 av_bswap16 +static av_always_inline av_const uint16_t av_bswap16(uint16_t x) { __asm__("rorw $8, %0" : "+r"(x)); return x; } -#define bswap_32 bswap_32 -static av_always_inline av_const uint32_t bswap_32(uint32_t x) +#define av_bswap32 av_bswap32 +static av_always_inline av_const uint32_t av_bswap32(uint32_t x) { #if HAVE_BSWAP __asm__("bswap %0" : "+r" (x)); @@ -50,8 +50,8 @@ static av_always_inline av_const uint32_t bswap_32(uint32_t x) } #if ARCH_X86_64 -#define bswap_64 bswap_64 -static inline uint64_t av_const bswap_64(uint64_t x) +#define av_bswap64 av_bswap64 +static inline uint64_t av_const av_bswap64(uint64_t x) { __asm__("bswap %0": "=r" (x) : "0" (x)); return x; diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/cpuid.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/x86/cpu.c index e96e3a93c..7f66ab7c3 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/cpuid.c +++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/x86/cpu.c @@ -21,10 +21,9 @@ */
#include <stdlib.h>
+#include <string.h>
#include "libavutil/x86_cpu.h"
-#include "libavcodec/dsputil.h"
-
-#undef printf
+#include "libavutil/cpu.h"
/* ebx saving is necessary for PIC. gcc seems unable to see it alone */
#define cpuid(index,eax,ebx,ecx,edx)\
@@ -37,7 +36,7 @@ : "0" (index));
/* Function to test if multimedia instructions are supported... */
-int mm_support(void)
+int ff_get_cpu_flags_x86(void)
{
int rval = 0;
int eax, ebx, ecx, edx;
@@ -79,21 +78,21 @@ int mm_support(void) family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
model = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
if (std_caps & (1<<23))
- rval |= FF_MM_MMX;
+ rval |= AV_CPU_FLAG_MMX;
if (std_caps & (1<<25))
- rval |= FF_MM_MMX2
+ rval |= AV_CPU_FLAG_MMX2
#if HAVE_SSE
- | FF_MM_SSE;
+ | AV_CPU_FLAG_SSE;
if (std_caps & (1<<26))
- rval |= FF_MM_SSE2;
+ rval |= AV_CPU_FLAG_SSE2;
if (ecx & 1)
- rval |= FF_MM_SSE3;
+ rval |= AV_CPU_FLAG_SSE3;
if (ecx & 0x00000200 )
- rval |= FF_MM_SSSE3;
+ rval |= AV_CPU_FLAG_SSSE3;
if (ecx & 0x00080000 )
- rval |= FF_MM_SSE4;
+ rval |= AV_CPU_FLAG_SSE4;
if (ecx & 0x00100000 )
- rval |= FF_MM_SSE42;
+ rval |= AV_CPU_FLAG_SSE42;
#endif
;
}
@@ -103,13 +102,13 @@ int mm_support(void) if(max_ext_level >= 0x80000001){
cpuid(0x80000001, eax, ebx, ecx, ext_caps);
if (ext_caps & (1<<31))
- rval |= FF_MM_3DNOW;
+ rval |= AV_CPU_FLAG_3DNOW;
if (ext_caps & (1<<30))
- rval |= FF_MM_3DNOWEXT;
+ rval |= AV_CPU_FLAG_3DNOWEXT;
if (ext_caps & (1<<23))
- rval |= FF_MM_MMX;
+ rval |= AV_CPU_FLAG_MMX;
if (ext_caps & (1<<22))
- rval |= FF_MM_MMX2;
+ rval |= AV_CPU_FLAG_MMX2;
}
if (!strncmp(vendor.c, "GenuineIntel", 12) &&
@@ -117,26 +116,9 @@ int mm_support(void) /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
* theoretically support sse2, but it's usually slower than mmx,
* so let's just pretend they don't. */
- if (rval & FF_MM_SSE2) rval ^= FF_MM_SSE2SLOW|FF_MM_SSE2;
- if (rval & FF_MM_SSE3) rval ^= FF_MM_SSE3SLOW|FF_MM_SSE3;
+ if (rval & AV_CPU_FLAG_SSE2) rval ^= AV_CPU_FLAG_SSE2SLOW|AV_CPU_FLAG_SSE2;
+ if (rval & AV_CPU_FLAG_SSE3) rval ^= AV_CPU_FLAG_SSE3SLOW|AV_CPU_FLAG_SSE3;
}
-#if 0
- av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s%s%s%s%s\n",
- (rval&FF_MM_MMX) ? "MMX ":"",
- (rval&FF_MM_MMX2) ? "MMX2 ":"",
- (rval&FF_MM_SSE) ? "SSE ":"",
- (rval&FF_MM_SSE2) ? "SSE2 ":"",
- (rval&FF_MM_SSE2SLOW) ? "SSE2(slow) ":"",
- (rval&FF_MM_SSE3) ? "SSE3 ":"",
- (rval&FF_MM_SSE3SLOW) ? "SSE3(slow) ":"",
- (rval&FF_MM_SSSE3) ? "SSSE3 ":"",
- (rval&FF_MM_SSE4) ? "SSE4.1 ":"",
- (rval&FF_MM_SSE42) ? "SSE4.2 ":"",
- (rval&FF_MM_3DNOW) ? "3DNow ":"",
- (rval&FF_MM_3DNOWEXT) ? "3DNowExt ":"");
-#endif
return rval;
-
- /* TODO: allow overriding with ffdshow settings for disabling extensions */
}
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/makefile_c.inc b/src/filters/transform/MPCVideoDec/ffmpeg/makefile_c.inc index 7217f1b83..a8bf2d476 100644 --- a/src/filters/transform/MPCVideoDec/ffmpeg/makefile_c.inc +++ b/src/filters/transform/MPCVideoDec/ffmpeg/makefile_c.inc @@ -21,7 +21,7 @@ ifeq ($(64BIT),yes) else
TARGET_OS=i686-pc-mingw32
CFLAGS+=-DWIN32 -D_WIN32
- OPTFLAGS+=-O3 -march=i686 -mmmx
+ OPTFLAGS+=-O2 -march=i686 -mmmx -msse -mfpmath=sse
endif
CFLAGS+=-mno-cygwin -mdll -mthreads -pipe
|