diff options
author | Hendrik Leppkes <h.leppkes@gmail.com> | 2014-05-24 19:16:28 +0400 |
---|---|---|
committer | Hendrik Leppkes <h.leppkes@gmail.com> | 2014-05-24 19:17:47 +0400 |
commit | 55ff38d650aaed710fdce5a8b4d16faaf6bf0763 (patch) | |
tree | 4ec933521be1010b49f8fc5d6548aeab08ec200c /decoder | |
parent | 836874f9d44011de1bc8427d24bdba725db53ca3 (diff) |
Switch optimized converter functions back to custom SSE2 memory copy.
Diffstat (limited to 'decoder')
-rw-r--r-- | decoder/LAVVideo/pixconv/pixconv_sse2_templates.h | 84 | ||||
-rw-r--r-- | decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp | 28 |
2 files changed, 73 insertions, 39 deletions
diff --git a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h index 79ad73f2..610ff490 100644 --- a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h +++ b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h @@ -83,6 +83,10 @@ #define PIXCONV_LOAD_PIXEL8_ALIGNED PIXCONV_LOAD_ALIGNED +// Put 128-bit into memory, using streaming write +#define PIXCONV_PUT_STREAM(dst,reg) \ + _mm_stream_si128((__m128i *)(dst), reg); /* streaming write */ + // Load 4 8-bit pixels into the register // reg - register to store pixels in // src - source memory @@ -99,49 +103,65 @@ // dst - memory destination // src - memory source // len - size in bytes -#define PIXCONV_MEMCPY_ALIGNED(dst,src,len) \ - { \ - __m128i reg; \ - __m128i *dst128 = (__m128i *)(dst); \ - for (int i = 0; i < len; i+=16) { \ - PIXCONV_LOAD_PIXEL8_ALIGNED(reg,(src)+i); \ - _mm_stream_si128(dst128++, reg); \ - } \ - } - -// SSE2 Aligned memcpy (for 32-bit aligned data) -// dst - memory destination -// src - memory source -// len - size in bytes -#define PIXCONV_MEMCPY_ALIGNED32(dst,src,len) \ +#define PIXCONV_MEMCPY_ALIGNED(dst,src,len) \ { \ - __m128i reg1,reg2; \ - __m128i *dst128 = (__m128i *)(dst); \ - for (int i = 0; i < len; i+=32) { \ - PIXCONV_LOAD_PIXEL8_ALIGNED(reg1,(src)+i); \ - PIXCONV_LOAD_PIXEL8_ALIGNED(reg2,(src)+i+16); \ - _mm_stream_si128(dst128++, reg1); \ - _mm_stream_si128(dst128++, reg2); \ + const uint8_t * const srcLinePtr = (src); \ + uint8_t * const dstLinePtr = (dst); \ + __m128i r1, r2, r3, r4; \ + ptrdiff_t i; \ + for (i = 0; i < (len - 63); i += 64) { \ + PIXCONV_LOAD_ALIGNED(r1, srcLinePtr+i+ 0) \ + PIXCONV_LOAD_ALIGNED(r2, srcLinePtr+i+16); \ + PIXCONV_LOAD_ALIGNED(r3, srcLinePtr+i+32); \ + PIXCONV_LOAD_ALIGNED(r4, srcLinePtr+i+48); \ + PIXCONV_PUT_STREAM(dstLinePtr+i+ 0, r1); \ + PIXCONV_PUT_STREAM(dstLinePtr+i+16, r2); \ + PIXCONV_PUT_STREAM(dstLinePtr+i+32, r3); \ + PIXCONV_PUT_STREAM(dstLinePtr+i+48, r4); \ + } \ + for (; i < len; i += 16) { \ + PIXCONV_LOAD_ALIGNED(r1, srcLinePtr+i); \ + PIXCONV_PUT_STREAM(dstLinePtr+i, r1); \ } \ } // SSE2 Aligned memcpy // Copys the same size from two source into two destinations at the same time -// Can be useful to copy U/V planes in one go // dst1 - memory destination // src1 - memory source // dst2 - memory destination // src2 - memory source // len - size in bytes -#define PIXCONV_MEMCPY_ALIGNED_TWO(dst1,src1,dst2,src2,len) \ +#define PIXCONV_MEMCPY_ALIGNED_TWO(dst1,src1,dst2,src2,len) \ { \ - __m128i reg1,reg2; \ - __m128i *dst128_1 = (__m128i *)(dst1); \ - __m128i *dst128_2 = (__m128i *)(dst2); \ - for (int i = 0; i < len; i+=16) { \ - PIXCONV_LOAD_PIXEL8_ALIGNED(reg1,(src1)+i); \ - PIXCONV_LOAD_PIXEL8_ALIGNED(reg2,(src2)+i); \ - _mm_stream_si128(dst128_1++, reg1); \ - _mm_stream_si128(dst128_2++, reg2); \ + const uint8_t * const src1LinePtr = (src1); \ + const uint8_t * const src2LinePtr = (src2); \ + uint8_t * const dst1LinePtr = (dst1); \ + uint8_t * const dst2LinePtr = (dst2); \ + __m128i r1, r2, r3, r4, r5, r6, r7, r8; \ + ptrdiff_t i; \ + for (i = 0; i < (len - 63); i += 64) { \ + PIXCONV_LOAD_ALIGNED(r1, src1LinePtr+i+ 0); \ + PIXCONV_LOAD_ALIGNED(r2, src1LinePtr+i+16); \ + PIXCONV_LOAD_ALIGNED(r3, src1LinePtr+i+32); \ + PIXCONV_LOAD_ALIGNED(r4, src1LinePtr+i+48); \ + PIXCONV_LOAD_ALIGNED(r5, src2LinePtr+i+ 0); \ + PIXCONV_LOAD_ALIGNED(r6, src2LinePtr+i+16); \ + PIXCONV_LOAD_ALIGNED(r7, src2LinePtr+i+32); \ + PIXCONV_LOAD_ALIGNED(r8, src2LinePtr+i+48); \ + PIXCONV_PUT_STREAM(dst1LinePtr+i+ 0, r1); \ + PIXCONV_PUT_STREAM(dst1LinePtr+i+16, r2); \ + PIXCONV_PUT_STREAM(dst1LinePtr+i+32, r3); \ + PIXCONV_PUT_STREAM(dst1LinePtr+i+48, r4); \ + PIXCONV_PUT_STREAM(dst2LinePtr+i+ 0, r5); \ + PIXCONV_PUT_STREAM(dst2LinePtr+i+16, r6); \ + PIXCONV_PUT_STREAM(dst2LinePtr+i+32, r7); \ + PIXCONV_PUT_STREAM(dst2LinePtr+i+48, r8); \ + } \ + for (; i < len; i += 16) { \ + PIXCONV_LOAD_ALIGNED(r1, src1LinePtr+i); \ + PIXCONV_LOAD_ALIGNED(r2, src2LinePtr+i); \ + PIXCONV_PUT_STREAM(dst1LinePtr+i, r1); \ + PIXCONV_PUT_STREAM(dst2LinePtr+i, r2); \ } \ } diff --git a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp index 7ce23345..e3da8841 100644 --- a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp +++ b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp @@ -199,14 +199,28 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv) _mm_sfence(); // Y - for (line = 0; line < height; ++line) { - memcpy(dst[0] + outLumaStride * line, y + inLumaStride * line, width); + if ((outLumaStride % 16) == 0 && ((intptr_t)dst[0] % 16u) == 0) { + for (line = 0; line < height; ++line) { + PIXCONV_MEMCPY_ALIGNED(dst[0] + outLumaStride * line, y + inLumaStride * line, width); + } + } else { + for (line = 0; line < height; ++line) { + memcpy(dst[0] + outLumaStride * line, y + inLumaStride * line, width); + } } // U/V - for(line = 0; line < chromaHeight; ++line) { - memcpy(dst[2] + outChromaStride * line, u + inChromaStride * line, chromaWidth); - memcpy(dst[1] + outChromaStride * line, v + inChromaStride * line, chromaWidth); + if ((outChromaStride % 16) == 0 && ((intptr_t)dst[1] % 16u) == 0) { + for(line = 0; line < chromaHeight; ++line) { + PIXCONV_MEMCPY_ALIGNED_TWO(dst[2] + outChromaStride * line, u + inChromaStride * line, + dst[1] + outChromaStride * line, v + inChromaStride * line, + chromaWidth); + } + } else { + for (line = 0; line < chromaHeight; ++line) { + memcpy(dst[2] + outChromaStride * line, u + inChromaStride * line, chromaWidth); + memcpy(dst[1] + outChromaStride * line, v + inChromaStride * line, chromaWidth); + } } return S_OK; @@ -230,7 +244,7 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv420_nv12) // Y for(line = 0; line < height; ++line) { - memcpy(dst[0] + outLumaStride * line, src[0] + inLumaStride * line, width); + PIXCONV_MEMCPY_ALIGNED(dst[0] + outLumaStride * line, src[0] + inLumaStride * line, width); } // U/V @@ -414,7 +428,7 @@ DECLARE_CONV_FUNC_IMPL(convert_nv12_yv12) // Copy the y for (line = 0; line < height; line++) { - memcpy(dst[0] + outLumaStride * line, src[0] + inLumaStride * line, width); + PIXCONV_MEMCPY_ALIGNED(dst[0] + outLumaStride * line, src[0] + inLumaStride * line, width); } for (line = 0; line < chromaHeight; line++) { |