pixconv: factor sse2 memcpy into macros

author: Hendrik Leppkes <h.leppkes@gmail.com> 2012-02-15 18:59:29 +0400
committer: Hendrik Leppkes <h.leppkes@gmail.com> 2012-02-15 19:40:14 +0400
commit: 565be07c7eb82bf5429a04bc7a616c331e89602a (patch)
tree: 9899459d1d44bfd0bf301002a8536cb0a13927f3 /decoder/LAVVideo/pixconv
parent: 17a9db8345a49354b11abaf9e6b016903078d6c1 (diff)
2 files changed, 67 insertions, 41 deletions
diff --git a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
index e4f9a0fb..d5d82e82 100644
--- a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
+++ b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
@@ -90,3 +90,54 @@
 // src     - source memory
 #define PIXCONV_LOAD_4PIXEL16(reg,src) \
    reg = _mm_loadl_epi64((const __m128i *)(src)); /* load 64-bit (4 pixel) */
+
+// SSE2 aligned memcpy using streaming stores (_mm_stream_si128)
+// dst - memory destination, written in 16-byte steps
+// src - memory source, read through PIXCONV_LOAD_PIXEL8_ALIGNED
+// len - size in bytes; whole 16-byte blocks are copied, so a partial tail is rounded up
+#define PIXCONV_MEMCPY_ALIGNED(dst,src,len)     \
+  {                                             \
+    __m128i reg;                                \
+    __m128i *dst128 =  (__m128i *)(dst);        \
+    for (int i = 0; i < (len); i+=16) {         \
+      PIXCONV_LOAD_PIXEL8_ALIGNED(reg,(src)+i); \
+      _mm_stream_si128(dst128++, reg);          \
+    }                                           \
+  }
+
+// SSE2 aligned memcpy, unrolled to copy 32 bytes (two 16-byte blocks) per iteration
+// dst - memory destination, written with streaming stores (_mm_stream_si128)
+// src - memory source, read through PIXCONV_LOAD_PIXEL8_ALIGNED
+// len - size in bytes; whole 32-byte blocks are copied, so a partial tail is rounded up
+#define PIXCONV_MEMCPY_ALIGNED32(dst,src,len)    \
+  {                                              \
+    __m128i reg1,reg2;                           \
+    __m128i *dst128 =  (__m128i *)(dst);         \
+    for (int i = 0; i < (len); i+=32) {          \
+      PIXCONV_LOAD_PIXEL8_ALIGNED(reg1,(src)+i); \
+      PIXCONV_LOAD_PIXEL8_ALIGNED(reg2,(src)+i+16); \
+      _mm_stream_si128(dst128++, reg1);          \
+      _mm_stream_si128(dst128++, reg2);          \
+    }                                            \
+  }
+
+// SSE2 aligned memcpy for two buffers, using streaming stores (_mm_stream_si128)
+// Copies the same number of bytes from two sources into two destinations at the same time
+// Can be useful to copy U/V planes in one go
+// dst1 - first memory destination
+// src1 - first memory source
+// dst2 - second memory destination
+// src2 - second memory source
+// len  - size in bytes; whole 16-byte blocks are copied, so a partial tail is rounded up
+#define PIXCONV_MEMCPY_ALIGNED_TWO(dst1,src1,dst2,src2,len)     \
+  {                                               \
+    __m128i reg1,reg2;                            \
+    __m128i *dst128_1 =  (__m128i *)(dst1);       \
+    __m128i *dst128_2 =  (__m128i *)(dst2);       \
+    for (int i = 0; i < (len); i+=16) {           \
+      PIXCONV_LOAD_PIXEL8_ALIGNED(reg1,(src1)+i); \
+      PIXCONV_LOAD_PIXEL8_ALIGNED(reg2,(src2)+i); \
+      _mm_stream_si128(dst128_1++, reg1);         \
+      _mm_stream_si128(dst128_2++, reg2);         \
+    }                                             \
+  }
diff --git a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp
index e45b9d6c..53699e49 100644
--- a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp
+++ b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp
@@ -216,26 +216,28 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv)
     outChromaStride = outChromaStride >> 1;
   }
 
+  uint8_t *dstY = dst;
+  uint8_t *dstV = dstY + height * outLumaStride;
+  uint8_t *dstU = dstV + chromaHeight * outChromaStride;
+
   // Copy planes
 
+  _mm_sfence();
+
   // Y
   for(line = 0; line < height; ++line) {
-    memcpy(dst, y, width);
+    PIXCONV_MEMCPY_ALIGNED(dstY + outLumaStride * line, y, width);
     y += inLumaStride;
-    dst += outLumaStride;
   }
 
-  uint8_t *dstV = dst;
-  uint8_t *dstU = dst + chromaHeight * outChromaStride;
-
   // U/V
   for(line = 0; line < chromaHeight; ++line) {
-    memcpy(dstU, u, chromaWidth);
-    memcpy(dstV, v, chromaWidth);
+    PIXCONV_MEMCPY_ALIGNED_TWO(
+      dstU + outChromaStride * line, u,
+      dstV + outChromaStride * line, v,
+      chromaWidth);
     u += inChromaStride;
     v += inChromaStride;
-    dstU += outChromaStride;
-    dstV += outChromaStride;
   }
 
   return S_OK;
@@ -264,15 +266,7 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv420_nv12)
 
   // Y
   for(line = 0; line < height; ++line) {
-    __m128i *dstY128 = (__m128i *)(dstY + outStride * line);
-
-    for (i = 0; i < width; i+=32) {
-      PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, y+i+0);
-      PIXCONV_LOAD_PIXEL8_ALIGNED(xmm1, y+i+16);
-      _mm_stream_si128(dstY128++, xmm0);
-      _mm_stream_si128(dstY128++, xmm1);
-    }
-
+    PIXCONV_MEMCPY_ALIGNED32(dstY + outStride * line, y, width);
     y += inLumaStride;
   }
 
@@ -469,13 +463,7 @@ DECLARE_CONV_FUNC_IMPL(convert_nv12_yv12)
 
   // Copy the y
   for (line = 0; line < height; line++) {
-    __m128i *dstY128 = (__m128i *)(dstY + outLumaStride * line);
-
-    for (i = 0; i < width; i+=16) {
-      PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, y+i+0);
-      _mm_stream_si128(dstY128++, xmm0);
-    }
-
+    PIXCONV_MEMCPY_ALIGNED(dstY + outLumaStride * line, y, width);
     y += inStride;
   }
 
@@ -520,31 +508,18 @@ DECLARE_CONV_FUNC_IMPL(convert_nv12_nv12)
   uint8_t *dstY = dst;
   uint8_t *dstUV = dstY + height * outStride;
 
-  int line, i;
-  __m128i xmm0;
+  int line;
 
   _mm_sfence();
 
   // Copy the data
   for (line = 0; line < height; line++) {
-    __m128i *dstY128 = (__m128i *)(dstY + outStride * line);
-
-    for (i = 0; i < width; i+=16) {
-      PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, y+i+0);
-      _mm_stream_si128(dstY128++, xmm0);
-    }
-
+    PIXCONV_MEMCPY_ALIGNED(dstY + outStride * line, y, width);
     y += inStride;
   }
 
   for (line = 0; line < chromaHeight; line++) {
-    __m128i *dstUV128 = (__m128i *)(dstUV + outStride * line);
-
-    for (i = 0; i < width; i+=16) {
-      PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, uv+i+0);
-      _mm_stream_si128(dstUV128++, xmm0);
-    }
-
+    PIXCONV_MEMCPY_ALIGNED(dstUV + outStride * line, uv, width);
     uv += inStride;
   }
author	Hendrik Leppkes <h.leppkes@gmail.com>	2012-02-15 18:59:29 +0400
committer	Hendrik Leppkes <h.leppkes@gmail.com>	2012-02-15 19:40:14 +0400
commit	565be07c7eb82bf5429a04bc7a616c331e89602a (patch)
tree	9899459d1d44bfd0bf301002a8536cb0a13927f3 /decoder/LAVVideo/pixconv
parent	17a9db8345a49354b11abaf9e6b016903078d6c1 (diff)