From 66a02159ea9a09965dfa3e06ea55f41e5f615f90 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje"
Date: Thu, 14 Jun 2012 15:03:08 +0100
Subject: x86: fmtconvert: add special asm for float_to_int16_interleave_misc_*
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This gets rid of a variable-length array and a for loop in C code.

Signed-off-by: Martin Storsjö
---
 libavcodec/x86/fmtconvert.asm | 78 +++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/fmtconvert_mmx.c | 12 ++++---
 2 files changed, 85 insertions(+), 5 deletions(-)

diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index 63befc94f6..4916e7af33 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -115,6 +115,84 @@ FLOAT_TO_INT16 sse, 0
 FLOAT_TO_INT16 3dnow, 0
 %undef cvtps2pi
 
+;------------------------------------------------------------------------------
+; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
+;------------------------------------------------------------------------------
+%macro FLOAT_TO_INT16_STEP 2 ; %1 = cpu suffix of the generated function, %2 = value forwarded to cglobal (presumably the XMM register count -- confirm against x86inc)
+cglobal float_to_int16_step_%1, 4,7,%2, dst, src, len, step, step3, v1, v2 ; converts floats to saturated int16 and stores one sample every `step` int16 elements of dst
+    add lenq, lenq ; lenq = 2*len, so the [srcq+2*lenq] addressing below covers 4*len bytes
+    lea srcq, [srcq+2*lenq] ; srcq now points 4*len bytes past the original src
+    lea step3q, [stepq*3] ; step3q = 3*step, used for the fourth store of each group of four
+    neg lenq ; negative index that counts up toward zero
+.loop: ; NOTE(review): the body runs before the length test and handles 8 samples per pass, so len is presumably a positive multiple of 8 -- confirm with callers
+%ifidn %1, sse2
+    cvtps2dq m0, [srcq+2*lenq ] ; 4 floats -> 4 int32; NOTE(review): memory-operand form presumably needs 16-byte aligned src -- confirm
+    cvtps2dq m1, [srcq+2*lenq+16] ; next 4 floats -> 4 int32
+    packssdw m0, m1 ; signed-saturate to int16: samples 0..7 now in m0
+    movd v1d, m0 ; v1d = sample 0 (low word) and sample 1 (high word)
+    psrldq m0, 4 ; shift the register right by 4 bytes
+    movd v2d, m0 ; v2d = samples 2 and 3
+    psrldq m0, 4 ; low dword of m0 now holds samples 4 and 5
+    mov [dstq], v1w ; store sample 0
+    mov [dstq+stepq*4], v2w ; store sample 2 at byte offset 4*step (2*step int16 elements)
+    shr v1d, 16 ; bring sample 1 into the low word
+    shr v2d, 16 ; bring sample 3 into the low word
+    mov [dstq+stepq*2], v1w ; store sample 1 at byte offset 2*step
+    mov [dstq+step3q*2], v2w ; store sample 3 at byte offset 6*step
+    lea dstq, [dstq+stepq*8] ; advance dst by 4 output slots (8*step bytes)
+    movd v1d, m0 ; v1d = samples 4 and 5
+    psrldq m0, 4
+    movd v2d, m0 ; v2d = samples 6 and 7
+    mov [dstq], v1w ; store sample 4
+    mov [dstq+stepq*4], v2w ; store sample 6
+    shr v1d, 16 ; bring sample 5 into the low word
+    shr v2d, 16 ; bring sample 7 into the low word
+    mov [dstq+stepq*2], v1w ; store sample 5
+    mov [dstq+step3q*2], v2w ; store sample 7
+    lea dstq, [dstq+stepq*8] ; advance dst by 4 more output slots
+%else
+    cvtps2pi m0, [srcq+2*lenq ] ; 2 floats -> 2 int32 (the 3dnow instantiation substitutes pf2id here)
+    cvtps2pi m1, [srcq+2*lenq+ 8] ; floats 2..3
+    cvtps2pi m2, [srcq+2*lenq+16] ; floats 4..5
+    cvtps2pi m3, [srcq+2*lenq+24] ; floats 6..7
+    packssdw m0, m1 ; signed-saturate to int16: samples 0..3 in m0
+    packssdw m2, m3 ; samples 4..7 in m2
+    movd v1d, m0 ; v1d = samples 0 and 1
+    psrlq m0, 32 ; move the upper dword down
+    movd v2d, m0 ; v2d = samples 2 and 3
+    mov [dstq], v1w ; store sample 0
+    mov [dstq+stepq*4], v2w ; store sample 2
+    shr v1d, 16 ; bring sample 1 into the low word
+    shr v2d, 16 ; bring sample 3 into the low word
+    mov [dstq+stepq*2], v1w ; store sample 1
+    mov [dstq+step3q*2], v2w ; store sample 3
+    lea dstq, [dstq+stepq*8] ; advance dst by 4 output slots (8*step bytes)
+    movd v1d, m2 ; v1d = samples 4 and 5
+    psrlq m2, 32
+    movd v2d, m2 ; v2d = samples 6 and 7
+    mov [dstq], v1w ; store sample 4
+    mov [dstq+stepq*4], v2w ; store sample 6
+    shr v1d, 16 ; bring sample 5 into the low word
+    shr v2d, 16 ; bring sample 7 into the low word
+    mov [dstq+stepq*2], v1w ; store sample 5
+    mov [dstq+step3q*2], v2w ; store sample 7
+    lea dstq, [dstq+stepq*8] ; advance dst by 4 more output slots
+%endif
+    add lenq, 16 ; 8 samples done; lenq counts 2 per sample
+    js .loop ; keep going while the index is still negative
+%ifnidn %1, sse2
+    emms ; this path used MMX registers, so reset the MMX state before returning
+%endif
+    REP_RET
+%endmacro
+
+INIT_XMM
+FLOAT_TO_INT16_STEP sse2, 2 ; instantiates float_to_int16_step_sse2 (cvtps2dq path, registers m0 and m1)
+INIT_MMX
+FLOAT_TO_INT16_STEP sse, 0 ; instantiates float_to_int16_step_sse (cvtps2pi path)
+%define cvtps2pi pf2id
+FLOAT_TO_INT16_STEP 3dnow, 0 ; instantiates float_to_int16_step_3dnow: same body with cvtps2pi replaced by pf2id
+%undef cvtps2pi
 
 ;-------------------------------------------------------------------------------
 ; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index 42cb0bc85b..aaf634d37f 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -25,6 +25,7 @@
 #include "libavutil/cpu.h"
 #include "libavutil/x86_cpu.h"
 #include "libavcodec/fmtconvert.h"
+#include "libavcodec/dsputil.h"
 
 #if HAVE_YASM
 
@@ -35,6 +36,10 @@ void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
 void ff_float_to_int16_sse (int16_t *dst, const float *src, long len);
 void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
 
+void ff_float_to_int16_step_3dnow(int16_t *dst, const float *src, long len, long step);
+void ff_float_to_int16_step_sse (int16_t *dst, const float *src, long len, long step);
+void ff_float_to_int16_step_sse2 (int16_t *dst, const float *src, long len, long step);
+
 void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len);
 void ff_float_to_int16_interleave2_sse (int16_t *dst, const float **src, long len);
 void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len);
@@ -48,12 +53,9 @@ void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len
 #define FLOAT_TO_INT16_INTERLEAVE(cpu) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\ - DECLARE_ALIGNED(16, int16_t, tmp)[len];\ - int i,j,c;\ + int c;\ for(c=0; c