diff options
author | Hendrik Leppkes <h.leppkes@gmail.com> | 2012-02-15 18:59:29 +0400 |
---|---|---|
committer | Hendrik Leppkes <h.leppkes@gmail.com> | 2012-02-15 19:40:14 +0400 |
commit | 565be07c7eb82bf5429a04bc7a616c331e89602a (patch) | |
tree | 9899459d1d44bfd0bf301002a8536cb0a13927f3 /decoder/LAVVideo/pixconv | |
parent | 17a9db8345a49354b11abaf9e6b016903078d6c1 (diff) |
pixconv: factor sse2 memcpy into macros
Diffstat (limited to 'decoder/LAVVideo/pixconv')
-rw-r--r-- | decoder/LAVVideo/pixconv/pixconv_sse2_templates.h | 51 | ||||
-rw-r--r-- | decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp | 57 |
2 files changed, 67 insertions, 41 deletions
diff --git a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h index e4f9a0fb..d5d82e82 100644 --- a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h +++ b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h @@ -90,3 +90,54 @@ // src - source memory #define PIXCONV_LOAD_4PIXEL16(reg,src) \ reg = _mm_loadl_epi64((const __m128i *)(src)); /* load 64-bit (4 pixel) */ + +// SSE2 Aligned memcpy +// dst - memory destination +// src - memory source +// len - size in bytes +#define PIXCONV_MEMCPY_ALIGNED(dst,src,len) \ + { \ + __m128i reg; \ + __m128i *dst128 = (__m128i *)(dst); \ + for (int i = 0; i < len; i+=16) { \ + PIXCONV_LOAD_PIXEL8_ALIGNED(reg,(src)+i); \ + _mm_stream_si128(dst128++, reg); \ + } \ + } + +// SSE2 Aligned memcpy (for 32-bit aligned data) +// dst - memory destination +// src - memory source +// len - size in bytes +#define PIXCONV_MEMCPY_ALIGNED32(dst,src,len) \ + { \ + __m128i reg1,reg2; \ + __m128i *dst128 = (__m128i *)(dst); \ + for (int i = 0; i < len; i+=32) { \ + PIXCONV_LOAD_PIXEL8_ALIGNED(reg1,(src)+i); \ + PIXCONV_LOAD_PIXEL8_ALIGNED(reg2,(src)+i+16); \ + _mm_stream_si128(dst128++, reg1); \ + _mm_stream_si128(dst128++, reg2); \ + } \ + } + +// SSE2 Aligned memcpy +// Copies the same size from two sources into two destinations at the same time +// Can be useful to copy U/V planes in one go +// dst1 - memory destination +// src1 - memory source +// dst2 - memory destination +// src2 - memory source +// len - size in bytes +#define PIXCONV_MEMCPY_ALIGNED_TWO(dst1,src1,dst2,src2,len) \ + { \ + __m128i reg1,reg2; \ + __m128i *dst128_1 = (__m128i *)(dst1); \ + __m128i *dst128_2 = (__m128i *)(dst2); \ + for (int i = 0; i < len; i+=16) { \ + PIXCONV_LOAD_PIXEL8_ALIGNED(reg1,(src1)+i); \ + PIXCONV_LOAD_PIXEL8_ALIGNED(reg2,(src2)+i); \ + _mm_stream_si128(dst128_1++, reg1); \ + _mm_stream_si128(dst128_2++, reg2); \ + } \ + } diff --git a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp 
b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp index e45b9d6c..53699e49 100644 --- a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp +++ b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp @@ -216,26 +216,28 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv) outChromaStride = outChromaStride >> 1; } + uint8_t *dstY = dst; + uint8_t *dstV = dstY + height * outLumaStride; + uint8_t *dstU = dstV + chromaHeight * outChromaStride; + // Copy planes + _mm_sfence(); + // Y for(line = 0; line < height; ++line) { - memcpy(dst, y, width); + PIXCONV_MEMCPY_ALIGNED(dstY + outLumaStride * line, y, width); y += inLumaStride; - dst += outLumaStride; } - uint8_t *dstV = dst; - uint8_t *dstU = dst + chromaHeight * outChromaStride; - // U/V for(line = 0; line < chromaHeight; ++line) { - memcpy(dstU, u, chromaWidth); - memcpy(dstV, v, chromaWidth); + PIXCONV_MEMCPY_ALIGNED_TWO( + dstU + outChromaStride * line, u, + dstV + outChromaStride * line, v, + chromaWidth); u += inChromaStride; v += inChromaStride; - dstU += outChromaStride; - dstV += outChromaStride; } return S_OK; @@ -264,15 +266,7 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv420_nv12) // Y for(line = 0; line < height; ++line) { - __m128i *dstY128 = (__m128i *)(dstY + outStride * line); - - for (i = 0; i < width; i+=32) { - PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, y+i+0); - PIXCONV_LOAD_PIXEL8_ALIGNED(xmm1, y+i+16); - _mm_stream_si128(dstY128++, xmm0); - _mm_stream_si128(dstY128++, xmm1); - } - + PIXCONV_MEMCPY_ALIGNED32(dstY + outStride * line, y, width); y += inLumaStride; } @@ -469,13 +463,7 @@ DECLARE_CONV_FUNC_IMPL(convert_nv12_yv12) // Copy the y for (line = 0; line < height; line++) { - __m128i *dstY128 = (__m128i *)(dstY + outLumaStride * line); - - for (i = 0; i < width; i+=16) { - PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, y+i+0); - _mm_stream_si128(dstY128++, xmm0); - } - + PIXCONV_MEMCPY_ALIGNED(dstY + outLumaStride * line, y, width); y += inStride; } @@ -520,31 +508,18 @@ DECLARE_CONV_FUNC_IMPL(convert_nv12_nv12) uint8_t *dstY = dst; uint8_t *dstUV = 
dstY + height * outStride; - int line, i; - __m128i xmm0; + int line; _mm_sfence(); // Copy the data for (line = 0; line < height; line++) { - __m128i *dstY128 = (__m128i *)(dstY + outStride * line); - - for (i = 0; i < width; i+=16) { - PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, y+i+0); - _mm_stream_si128(dstY128++, xmm0); - } - + PIXCONV_MEMCPY_ALIGNED(dstY + outStride * line, y, width); y += inStride; } for (line = 0; line < chromaHeight; line++) { - __m128i *dstUV128 = (__m128i *)(dstUV + outStride * line); - - for (i = 0; i < width; i+=16) { - PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, uv+i+0); - _mm_stream_si128(dstUV128++, xmm0); - } - + PIXCONV_MEMCPY_ALIGNED(dstUV + outStride * line, uv, width); uv += inStride; } |