From cfb2c60541714d1110b2a257f835e5d4244a421b Mon Sep 17 00:00:00 2001
From: Hendrik Leppkes <h.leppkes@gmail.com>
Date: Sat, 24 May 2014 17:16:48 +0200
Subject: Optimize and simplify SSE2 load/store

---
 decoder/LAVVideo/pixconv/pixconv_sse2_templates.h |  16 +++
 decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp     | 124 ++++++++++------------
 2 files changed, 75 insertions(+), 65 deletions(-)

(limited to 'decoder/LAVVideo/pixconv')

diff --git a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
index 610ff490..c4aa3c45 100644
--- a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
+++ b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
@@ -38,6 +38,22 @@
   reg = _mm_load_si128((const __m128i *)(src));  /* load (aligned) */  \
   reg = _mm_slli_epi16(reg, 16-bpp);             /* shift to 16-bit */
 
+
+// Load 2x8 16-bit pixels into two registers using aligned 128-bit loads,
+// then shift every 16-bit lane left by (16 - bpp) bits.
+// reg1, reg2 - destination registers (assigned, so must be lvalues)
+// src1, src2 - source pointers for reg1 / reg2; each is handed to
+//              _mm_load_si128 and must therefore be 16-byte aligned
+// bpp        - bit depth of the pixels (any integer expression)
+#define PIXCONV_LOAD_PIXEL16X2(reg1,reg2,src1,src2,bpp)              \
+  {                                                                  \
+    const __m128i pixconv_shift_ = _mm_cvtsi32_si128(16 - (bpp));    \
+    reg1 = _mm_load_si128((const __m128i *)(src1));                  \
+    reg2 = _mm_load_si128((const __m128i *)(src2));                  \
+    reg1 = _mm_sll_epi16(reg1, pixconv_shift_);                      \
+    reg2 = _mm_sll_epi16(reg2, pixconv_shift_);                      \
+  }
+
 // Load 8 16-bit pixels into a register, and dither them to 8 bit
 // The 8-bit pixels will be in the high-bytes of the 8 16-bit parts
 // NOTE: the low-bytes are clobbered, and not empty.
diff --git a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp
index e3da8841..1bef94ad 100644
--- a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp
+++ b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp
@@ -65,8 +65,8 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv_nv12_dither_le)
       xmm4 = xmm5 = xmm6 = xmm7;
     }
 
-    __m128i *dst128Y = (__m128i *)(dst[0] + line * outYStride);
-    const uint16_t *y = (const uint16_t *)(src[0] + line * inYStride);
+    const uint16_t * const y  = (const uint16_t *)(src[0] + line * inYStride);
+          uint16_t * const dy = (      uint16_t *)(dst[0] + line * outYStride);
 
     for (i = 0; i < width; i+=32) {
       // Load pixels into registers, and apply dithering
@@ -78,18 +78,18 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv_nv12_dither_le)
       xmm2 = _mm_packus_epi16(xmm2, xmm3);                     /* YYYYYYYY */
 
       // Write data back
-      _mm_stream_si128(dst128Y++, xmm0);
-      _mm_stream_si128(dst128Y++, xmm2);
+      PIXCONV_PUT_STREAM(dy + (i >> 1) + 0, xmm0);
+      PIXCONV_PUT_STREAM(dy + (i >> 1) + 8, xmm2);
     }
 
     // Process U/V for chromaHeight lines
     if (line < chromaHeight) {
-      __m128i *dst128UV = (__m128i *)(dst[1] + line * outUVStride);
-      __m128i *dst128U = (__m128i *)(dst[2] + line * outUVStride);
-      __m128i *dst128V = (__m128i *)(dst[1] + line * outUVStride);
+      const uint16_t * const u = (const uint16_t *)(src[1] + line * inUVStride);
+      const uint16_t * const v = (const uint16_t *)(src[2] + line * inUVStride);
 
-      const uint16_t *u = (const uint16_t *)(src[1] + line * inUVStride);
-      const uint16_t *v = (const uint16_t *)(src[2] + line * inUVStride);
+      uint8_t * const duv = (uint8_t *)(dst[1] + line * outUVStride);
+      uint8_t * const du  = (uint8_t *)(dst[2] + line * outUVStride);
+      uint8_t * const dv  = (uint8_t *)(dst[1] + line * outUVStride);
 
        for (i = 0; i < chromaWidth; i+=16) {
         PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (u+i+0), bpp);  /* U0U0U0U0 */
@@ -104,11 +104,11 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv_nv12_dither_le)
           xmm0 = _mm_unpacklo_epi8(xmm0, xmm2);
           xmm1 = _mm_unpackhi_epi8(xmm1, xmm2);
 
-          _mm_stream_si128(dst128UV++, xmm0);
-          _mm_stream_si128(dst128UV++, xmm1);
+          PIXCONV_PUT_STREAM(duv + (i << 1) + 0, xmm0);
+          PIXCONV_PUT_STREAM(duv + (i << 1) + 16, xmm1);
         } else {
-          _mm_stream_si128(dst128U++, xmm0);
-          _mm_stream_si128(dst128V++, xmm2);
+          PIXCONV_PUT_STREAM(du + i, xmm0);
+          PIXCONV_PUT_STREAM(dv + i, xmm2);
         }
       }
     }
@@ -137,36 +137,34 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv420_px1x_le)
 
   // Process Y
   for (line = 0; line < height; ++line) {
-    __m128i *dst128Y = (__m128i *)(dst[0] + line * outYStride);
-    const uint16_t *y = (const uint16_t *)(src[0] + line * inYStride);
+    const uint16_t * const y = (const uint16_t *)(src[0] + line * inYStride);
+          uint16_t * const d = (      uint16_t *)(dst[0] + line * outYStride);
 
     for (i = 0; i < width; i+=16) {
-      // Load 8 pixels into register
-      PIXCONV_LOAD_PIXEL16(xmm0, (y+i+0), bpp); /* YYYY */
-      PIXCONV_LOAD_PIXEL16(xmm1, (y+i+8), bpp); /* YYYY */
+      // Load 2x8 pixels into registers
+      PIXCONV_LOAD_PIXEL16X2(xmm0, xmm1, (y+i+0), (y+i+8), bpp);
       // and write them out
-      _mm_stream_si128(dst128Y++, xmm0);
-      _mm_stream_si128(dst128Y++, xmm1);
+      PIXCONV_PUT_STREAM(d+i+0, xmm0);
+      PIXCONV_PUT_STREAM(d+i+8, xmm1);
     }
   }
 
   // Process UV
   for (line = 0; line < uvHeight; ++line) {
-    __m128i *dst128UV = (__m128i *)(dst[1] + line * outUVStride);
-    const uint16_t *u = (const uint16_t *)(src[1] + line * inUVStride);
-    const uint16_t *v = (const uint16_t *)(src[2] + line * inUVStride);
+    const uint16_t * const u = (const uint16_t *)(src[1] + line * inUVStride);
+    const uint16_t * const v = (const uint16_t *)(src[2] + line * inUVStride);
+          uint16_t * const d = (      uint16_t *)(dst[1] + line * outUVStride);
 
     for (i = 0; i < uvWidth; i+=8) {
       // Load 8 pixels into register
-      PIXCONV_LOAD_PIXEL16(xmm0, (v+i), bpp); /* VVVV */
-      PIXCONV_LOAD_PIXEL16(xmm1, (u+i), bpp); /* UUUU */
+      PIXCONV_LOAD_PIXEL16X2(xmm0, xmm1, (v+i), (u+i), bpp); // Load V and U
 
       xmm2 = xmm0;
       xmm0 = _mm_unpacklo_epi16(xmm1, xmm0);    /* UVUV */
       xmm2 = _mm_unpackhi_epi16(xmm1, xmm2);    /* UVUV */
 
-      _mm_stream_si128(dst128UV++, xmm0);
-      _mm_stream_si128(dst128UV++, xmm2);
+      PIXCONV_PUT_STREAM(d + (i << 1) + 0, xmm0);
+      PIXCONV_PUT_STREAM(d + (i << 1) + 8, xmm2);
     }
   }
 
@@ -249,19 +247,19 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv420_nv12)
 
   // U/V
   for(line = 0; line < chromaHeight; ++line) {
-    __m128i *dst128UV = (__m128i *)(dst[1] + line * outChromaStride);
-    const uint8_t *u = src[1] + line * inChromaStride;
-    const uint8_t *v = src[2] + line * inChromaStride;
+    const uint8_t * const u = src[1] + line * inChromaStride;
+    const uint8_t * const v = src[2] + line * inChromaStride;
+          uint8_t * const d = dst[1] + line * outChromaStride;
 
     for (i = 0; i < chromaWidth; i+=16) {
-      PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, (v+i));  /* VVVV */
-      PIXCONV_LOAD_PIXEL8_ALIGNED(xmm1, (u+i));  /* UUUU */
+      PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, v+i);  /* VVVV */
+      PIXCONV_LOAD_PIXEL8_ALIGNED(xmm1, u+i);  /* UUUU */
 
       xmm2 = _mm_unpacklo_epi8(xmm1, xmm0);      /* UVUV */
       xmm3 = _mm_unpackhi_epi8(xmm1, xmm0);      /* UVUV */
 
-      _mm_stream_si128(dst128UV++, xmm2);
-      _mm_stream_si128(dst128UV++, xmm3);
+      PIXCONV_PUT_STREAM(d + (i << 1) +  0, xmm2);
+      PIXCONV_PUT_STREAM(d + (i << 1) + 16, xmm3);
     }
   }
 
@@ -284,17 +282,17 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv422_yuy2_uyvy)
   _mm_sfence();
 
   for (line = 0;  line < height; ++line) {
-    __m128i *dst128 = (__m128i *)(dst[0] + line * outStride);
-    const uint8_t *y = src[0] + line * inLumaStride;
-    const uint8_t *u = src[1] + line * inChromaStride;
-    const uint8_t *v = src[2] + line * inChromaStride;
+    const uint8_t * const y = src[0] + line * inLumaStride;
+    const uint8_t * const u = src[1] + line * inChromaStride;
+    const uint8_t * const v = src[2] + line * inChromaStride;
+          uint8_t * const d = dst[0] + line * outStride;
 
     for (i = 0; i < chromaWidth; i+=16) {
       // Load pixels
-      PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, (y+(i*2)+0));  /* YYYY */
-      PIXCONV_LOAD_PIXEL8_ALIGNED(xmm1, (y+(i*2)+16)); /* YYYY */
-      PIXCONV_LOAD_PIXEL8_ALIGNED(xmm2, (u+i));        /* UUUU */
-      PIXCONV_LOAD_PIXEL8_ALIGNED(xmm3, (v+i));        /* VVVV */
+      PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, (y+(i<<1)+ 0)); /* YYYY */
+      PIXCONV_LOAD_PIXEL8_ALIGNED(xmm1, (y+(i<<1)+16)); /* YYYY */
+      PIXCONV_LOAD_PIXEL8_ALIGNED(xmm2, (u+i));         /* UUUU */
+      PIXCONV_LOAD_PIXEL8_ALIGNED(xmm3, (v+i));         /* VVVV */
 
       // Interleave Us and Vs
       xmm4 = xmm2;
@@ -306,28 +304,24 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv422_yuy2_uyvy)
         xmm3 = xmm4;
         xmm3 = _mm_unpacklo_epi8(xmm3, xmm0);
         xmm4 = _mm_unpackhi_epi8(xmm4, xmm0);
-      } else {
-        xmm3 = xmm0;
-        xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
-        xmm4 = _mm_unpackhi_epi8(xmm0, xmm4);
-      }
-
-      _mm_stream_si128(dst128++, xmm3);
-      _mm_stream_si128(dst128++, xmm4);
 
-      // Interlave those with the Ys
-      if (uyvy) {
         xmm5 = xmm2;
         xmm5 = _mm_unpacklo_epi8(xmm5, xmm1);
         xmm2 = _mm_unpackhi_epi8(xmm2, xmm1);
       } else {
+        xmm3 = xmm0;
+        xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
+        xmm4 = _mm_unpackhi_epi8(xmm0, xmm4);
+
         xmm5 = xmm1;
         xmm5 = _mm_unpacklo_epi8(xmm5, xmm2);
         xmm2 = _mm_unpackhi_epi8(xmm1, xmm2);
       }
 
-      _mm_stream_si128(dst128++, xmm5);
-      _mm_stream_si128(dst128++, xmm2);
+      PIXCONV_PUT_STREAM(d + (i << 2) +  0, xmm3); /* d is uint8_t*: offsets are bytes, */
+      PIXCONV_PUT_STREAM(d + (i << 2) + 16, xmm4); /* so each 128-bit store advances 16 */
+      PIXCONV_PUT_STREAM(d + (i << 2) + 32, xmm5); /* (4 stores = 64 bytes per 16 chroma */
+      PIXCONV_PUT_STREAM(d + (i << 2) + 48, xmm2); /* samples, matching the i << 2 base) */
     }
   }
 
@@ -357,10 +351,10 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv422_yuy2_uyvy_dither_le)
   _mm_sfence();
 
   for (line = 0;  line < height; ++line) {
-    __m128i *dst128 = (__m128i *)(dst[0] + line * outStride);
-    const uint16_t *y = (const uint16_t *)(src[0] + line * inLumaStride);
-    const uint16_t *u = (const uint16_t *)(src[1] + line * inChromaStride);
-    const uint16_t *v = (const uint16_t *)(src[2] + line * inChromaStride);
+    const uint16_t * const y = (const uint16_t *)(src[0] + line * inLumaStride);
+    const uint16_t * const u = (const uint16_t *)(src[1] + line * inChromaStride);
+    const uint16_t * const v = (const uint16_t *)(src[2] + line * inChromaStride);
+          uint16_t * const d = (      uint16_t *)(dst[0] + line * outStride);
 
     // Load dithering coefficients for this line
     if (ditherMode == LAVDither_Random) {
@@ -399,8 +393,8 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv422_yuy2_uyvy_dither_le)
         xmm2 = _mm_unpackhi_epi8(xmm0, xmm2);
       }
 
-      _mm_stream_si128(dst128++, xmm3);
-      _mm_stream_si128(dst128++, xmm2);
+      PIXCONV_PUT_STREAM(d + (i << 1) + 0, xmm3);
+      PIXCONV_PUT_STREAM(d + (i << 1) + 8, xmm2);
     }
   }
 
@@ -432,9 +426,9 @@ DECLARE_CONV_FUNC_IMPL(convert_nv12_yv12)
   }
 
   for (line = 0; line < chromaHeight; line++) {
-    __m128i *dstV128 = (__m128i *)(dst[1] + outChromaStride * line);
-    __m128i *dstU128 = (__m128i *)(dst[2] + outChromaStride * line);
-    const uint8_t *uv = src[1] + line * inChromaStride;
+    const uint8_t * const uv = src[1] + line * inChromaStride;
+          uint8_t * const dv = dst[1] + outChromaStride * line;
+          uint8_t * const du = dst[2] + outChromaStride * line;
 
     for (i = 0; i < width; i+=32) {
       PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, uv+i+0);
@@ -452,8 +446,8 @@ DECLARE_CONV_FUNC_IMPL(convert_nv12_yv12)
       xmm0 = _mm_packus_epi16(xmm0, xmm1);
       xmm2 = _mm_packus_epi16(xmm2, xmm3);
 
-      _mm_stream_si128(dstU128++, xmm0);
-      _mm_stream_si128(dstV128++, xmm2);
+      PIXCONV_PUT_STREAM(du + (i>>1), xmm0);
+      PIXCONV_PUT_STREAM(dv + (i>>1), xmm2);
     }
   }
 
-- 
cgit v1.2.3