Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/FFmpeg/FFmpeg.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristophe Gisquet <christophe.gisquet@gmail.com>2014-05-28 17:52:24 +0400
committerMichael Niedermayer <michaelni@gmx.at>2014-05-29 23:56:00 +0400
commit99a319c4e7538670847ac4633ef8b0f0629deb22 (patch)
tree5896b2cac40b1e50d28d115c2cd821cc391e78a7 /libavcodec/x86
parent226700398105075d27d07b652a0b67705aa06a1e (diff)
x86: huffyuvdsp: port add_bytes to yasm
C MMX SSE2 Cycles: 2972 587 302 Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--libavcodec/x86/huffyuvdsp.asm37
-rw-r--r--libavcodec/x86/huffyuvdsp_init.c9
-rw-r--r--libavcodec/x86/huffyuvdsp_mmx.c32
3 files changed, 45 insertions, 33 deletions
diff --git a/libavcodec/x86/huffyuvdsp.asm b/libavcodec/x86/huffyuvdsp.asm
index f183ebee54..a923e70e1e 100644
--- a/libavcodec/x86/huffyuvdsp.asm
+++ b/libavcodec/x86/huffyuvdsp.asm
@@ -163,3 +163,40 @@ cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
ADD_HFYU_LEFT_LOOP 0, 1
.src_unaligned:
ADD_HFYU_LEFT_LOOP 0, 0
+
+%macro ADD_BYTES 0
+cglobal add_bytes, 3,4,2, dst, src, w, size
+ mov sizeq, wq
+ and sizeq, -2*mmsize
+ jz .2
+ add dstq, sizeq
+ add srcq, sizeq
+ neg sizeq
+.1:
+ mova m0, [srcq + sizeq]
+ mova m1, [srcq + sizeq + mmsize]
+ paddb m0, [dstq + sizeq]
+ paddb m1, [dstq + sizeq + mmsize]
+ mova [dstq + sizeq], m0
+ mova [dstq + sizeq + mmsize], m1
+ add sizeq, 2*mmsize
+ jl .1
+.2:
+ and wq, 2*mmsize-1
+ jz .end
+ add dstq, wq
+ add srcq, wq
+ neg wq
+.3
+ mov sizeb, [srcq + wq]
+ add [dstq + wq], sizeb
+ inc wq
+ jl .3
+.end:
+ REP_RET
+%endmacro
+
+INIT_MMX mmx
+ADD_BYTES
+INIT_XMM sse2
+ADD_BYTES
diff --git a/libavcodec/x86/huffyuvdsp_init.c b/libavcodec/x86/huffyuvdsp_init.c
index 1efb34dbbe..8a755e65b0 100644
--- a/libavcodec/x86/huffyuvdsp_init.c
+++ b/libavcodec/x86/huffyuvdsp_init.c
@@ -23,7 +23,8 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/huffyuvdsp.h"
-void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w);
+void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, intptr_t w);
+void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, intptr_t w);
void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, int w,
@@ -46,7 +47,7 @@ av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
c->add_hfyu_median_pred = ff_add_hfyu_median_pred_cmov;
#endif
- if (INLINE_MMX(cpu_flags))
+ if (EXTERNAL_MMX(cpu_flags))
c->add_bytes = ff_add_bytes_mmx;
if (EXTERNAL_MMXEXT(cpu_flags)) {
@@ -55,6 +56,10 @@ av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
c->add_hfyu_median_pred = ff_add_hfyu_median_pred_mmxext;
}
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->add_bytes = ff_add_bytes_sse2;
+ }
+
if (EXTERNAL_SSSE3(cpu_flags)) {
c->add_hfyu_left_pred = ff_add_hfyu_left_pred_ssse3;
if (cpu_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe
diff --git a/libavcodec/x86/huffyuvdsp_mmx.c b/libavcodec/x86/huffyuvdsp_mmx.c
index 59422107d3..ee6ec91287 100644
--- a/libavcodec/x86/huffyuvdsp_mmx.c
+++ b/libavcodec/x86/huffyuvdsp_mmx.c
@@ -22,9 +22,7 @@
#include "libavutil/x86/asm.h"
#include "huffyuvdsp.h"
-#if HAVE_INLINE_ASM
-
-#if HAVE_7REGS
+#if HAVE_INLINE_ASM && HAVE_7REGS
void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, int w,
int *left, int *left_top)
@@ -61,31 +59,3 @@ void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
*left_top = tl;
}
#endif
-
-void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
-{
- x86_reg i = 0;
-
- __asm__ volatile (
- "jmp 2f \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq (%2, %0), %%mm1 \n\t"
- "paddb %%mm0, %%mm1 \n\t"
- "movq %%mm1, (%2, %0) \n\t"
- "movq 8(%1, %0), %%mm0 \n\t"
- "movq 8(%2, %0), %%mm1 \n\t"
- "paddb %%mm0, %%mm1 \n\t"
- "movq %%mm1, 8(%2, %0) \n\t"
- "add $16, %0 \n\t"
- "2: \n\t"
- "cmp %3, %0 \n\t"
- "js 1b \n\t"
- : "+r" (i)
- : "r" (src), "r" (dst), "r" ((x86_reg) w - 15));
-
- for (; i < w; i++)
- dst[i + 0] += src[i + 0];
-}
-
-#endif /* HAVE_INLINE_ASM */