Unify shift/bpp parameters of SSE2 pixel loading macros

author: Hendrik Leppkes <h.leppkes@gmail.com> 2012-07-05 15:57:12 +0400
committer: Hendrik Leppkes <h.leppkes@gmail.com> 2012-07-05 18:16:48 +0400
commit: 691f56aa69f9f7efc7641932281b9aaa62ef865a (patch)
tree: 55f7ee75e61f7a7dce0da2b686f399897779a883 /decoder/LAVVideo/pixconv
parent: 2bf951f323555e448255e6959d30b0a95aa39004 (diff)
4 files changed, 48 insertions, 55 deletions
diff --git a/decoder/LAVVideo/pixconv/interleave.cpp b/decoder/LAVVideo/pixconv/interleave.cpp
index cbd8773a..00280dff 100644
--- a/decoder/LAVVideo/pixconv/interleave.cpp
+++ b/decoder/LAVVideo/pixconv/interleave.cpp
@@ -47,9 +47,12 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv444_y410)
     __m128i *dst128 = (__m128i *)(dst + line * outStride);
 
     for (i = 0; i < width; i+=8) {
-      PIXCONV_LOAD_PIXEL16(xmm0, (y+i), shift);
-      PIXCONV_LOAD_PIXEL16(xmm1, (u+i), shift);
-      PIXCONV_LOAD_PIXEL16(xmm2, (v+i), shift+4); // +4 so its directly aligned properly (data from bit 14 to bit 4)
+      PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, (y+i));
+      xmm0 = _mm_slli_epi16(xmm0, shift);
+      PIXCONV_LOAD_PIXEL8_ALIGNED(xmm1, (u+i));
+      xmm1 = _mm_slli_epi16(xmm1, shift);
+      PIXCONV_LOAD_PIXEL8_ALIGNED(xmm2, (v+i));
+      xmm2 = _mm_slli_epi16(xmm2, shift+4);  // +4 so its directly aligned properly (data from bit 14 to bit 4)
 
       xmm3 = _mm_unpacklo_epi16(xmm1, xmm2); // 0VVVVV00000UUUUU
       xmm4 = _mm_unpackhi_epi16(xmm1, xmm2); // 0VVVVV00000UUUUU
diff --git a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
index d5d82e82..366e87c2 100644
--- a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
+++ b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
@@ -19,6 +19,8 @@
 
 #pragma once
 
+#include <emmintrin.h>
+
 // Load the dithering coefficients for this line
 // reg   - register to load coefficients into
 // line  - index of line to process (0 based)
@@ -28,16 +30,23 @@
   reg = _mm_load_si128((const __m128i *)name);          \
   reg = _mm_srli_epi16(reg, 8-bits); /* shift to the required dithering strength */
 
+// Load 8 16-bit pixels into a register using an aligned load, then shift them left by (16 - bpp) bits
+// reg   - register to store pixels in (assigned twice: once by the load, once by the shift)
+// src   - memory pointer of the source
+// bpp   - bit depth of the pixels (ie. 10 for 10bit); expands to "16-bpp" unparenthesized, so pass a simple value
+#define PIXCONV_LOAD_PIXEL16(reg,src,bpp)                              \
+  reg = _mm_load_si128((const __m128i *)(src));  /* load (aligned) */  \
+  reg = _mm_slli_epi16(reg, 16-bpp);             /* shift to 16-bit */
+
 // Load 8 16-bit pixels into a register, and dither them to 8 bit
 // The 8-bit pixels will be in the high-bytes of the 8 16-bit parts
 // NOTE: the low-bytes are clobbered, and not empty.
 // reg   - register to store pixels in
 // dreg  - register with dithering coefficients
 // src   - memory pointer of the source
-// shift - shift offset to 8-bit (ie. 2 for 10bit)
-#define PIXCONV_LOAD_PIXEL16_DITHER_HIGH(reg,dreg,src,shift)          \
-  reg = _mm_load_si128((const __m128i *)(src)); /* load (aligned) */  \
-  reg = _mm_slli_epi16(reg, 8-shift);           /* shift to 16-bit */ \
+// bpp   - bit depth of the source pixels (ie. 10 for 10bit)
+#define PIXCONV_LOAD_PIXEL16_DITHER_HIGH(reg,dreg,src,bpp)      \
+  PIXCONV_LOAD_PIXEL16(reg,src,bpp)                             \
   reg = _mm_adds_epu16(reg, dreg);              /* dither */
 
 // Load 8 16-bit pixels into a register, and dither them to 8 bit
@@ -45,9 +54,9 @@
 // reg   - register to store pixels in
 // dreg  - register with dithering coefficients
 // src   - memory pointer of the source
-// shift - shift offset to 8-bit (ie. 2 for 10bit)
-#define PIXCONV_LOAD_PIXEL16_DITHER(reg,dreg,src,shift)          \
-  PIXCONV_LOAD_PIXEL16_DITHER_HIGH(reg,dreg,src,shift)           \
+// bpp   - bit depth of the source pixels (ie. 10 for 10bit)
+#define PIXCONV_LOAD_PIXEL16_DITHER(reg,dreg,src,bpp)          \
+  PIXCONV_LOAD_PIXEL16_DITHER_HIGH(reg,dreg,src,bpp)           \
   reg = _mm_srli_epi16(reg, 8);                 /* shift to 8-bit */
 
 // Load 8 16-bit pixels into a register, and dither them to 8 bit
@@ -55,9 +64,9 @@
 // reg   - register to store pixels in
 // dreg  - register with dithering coefficients
 // src   - memory pointer of the source
-// shift - shift offset to 16-bit (ie. 6 for 10bit)
-#define PIXCONV_LOAD_PIXEL16_DITHER_PACKED(reg,dreg,zero,src,shift)   \
-  PIXCONV_LOAD_PIXEL16_DITHER(reg,dreg,src,shift) /* load unpacked */ \
+// bpp   - bit depth of the source pixels (ie. 10 for 10bit)
+#define PIXCONV_LOAD_PIXEL16_DITHER_PACKED(reg,dreg,zero,src,bpp)   \
+  PIXCONV_LOAD_PIXEL16_DITHER(reg,dreg,src,bpp) /* load unpacked */ \
   reg = _mm_packus_epi16(reg, zero);              /* pack */
 
 // Load 16 8-bit pixels into a register
@@ -72,13 +81,6 @@
 #define PIXCONV_LOAD_PIXEL8_ALIGNED(reg,src) \
   reg = _mm_load_si128((const __m128i *)(src));      /* load (aligned) */
 
-// Load 8 16-bit pixels into a register, using aligned memory access
-// reg   - register to store pixels in
-// src   - memory pointer of the source
-#define PIXCONV_LOAD_PIXEL16(reg,src,shift)                             \
-  reg = _mm_load_si128((const __m128i *)(src));  /* load (aligned) */     \
-  reg = _mm_slli_epi16(reg, shift);            /* shift to 16-bit */
-
 // Load 4 8-bit pixels into the register
 // reg     - register to store pixels in
 // src     - source memory
diff --git a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp
index c3c9a16e..c91d6f63 100644
--- a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp
+++ b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp
@@ -34,8 +34,6 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv_nv12_dither_le)
   const int inYStride = srcStride[0] >> 1;
   const int inUVStride = srcStride[1] >> 1;
 
-  const int shift = bpp - 8;
-
   int outLumaStride    = dstStride;
   int outChromaStride  = dstStride;
   int chromaWidth      = width;
@@ -80,12 +78,12 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv_nv12_dither_le)
 
     for (i = 0; i < width; i+=32) {
       // Load pixels into registers, and apply dithering
-      PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (y+i+ 0), shift);  /* Y0Y0Y0Y0 */
-      PIXCONV_LOAD_PIXEL16_DITHER(xmm1, xmm5, (y+i+ 8), shift);  /* Y0Y0Y0Y0 */
-      PIXCONV_LOAD_PIXEL16_DITHER(xmm2, xmm6, (y+i+16), shift);  /* Y0Y0Y0Y0 */
-      PIXCONV_LOAD_PIXEL16_DITHER(xmm3, xmm7, (y+i+24), shift);  /* Y0Y0Y0Y0 */
-      xmm0 = _mm_packus_epi16(xmm0, xmm1);                       /* YYYYYYYY */
-      xmm2 = _mm_packus_epi16(xmm2, xmm3);                       /* YYYYYYYY */
+      PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (y+i+ 0), bpp);  /* Y0Y0Y0Y0 */
+      PIXCONV_LOAD_PIXEL16_DITHER(xmm1, xmm5, (y+i+ 8), bpp);  /* Y0Y0Y0Y0 */
+      PIXCONV_LOAD_PIXEL16_DITHER(xmm2, xmm6, (y+i+16), bpp);  /* Y0Y0Y0Y0 */
+      PIXCONV_LOAD_PIXEL16_DITHER(xmm3, xmm7, (y+i+24), bpp);  /* Y0Y0Y0Y0 */
+      xmm0 = _mm_packus_epi16(xmm0, xmm1);                     /* YYYYYYYY */
+      xmm2 = _mm_packus_epi16(xmm2, xmm3);                     /* YYYYYYYY */
 
       // Write data back
       _mm_stream_si128(dst128Y++, xmm0);
@@ -99,13 +97,13 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv_nv12_dither_le)
       __m128i *dst128V = (__m128i *)(dstV + line * outChromaStride);
 
        for (i = 0; i < chromaWidth; i+=16) {
-        PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (u+i), shift);    /* U0U0U0U0 */
-        PIXCONV_LOAD_PIXEL16_DITHER(xmm1, xmm5, (u+i+8), shift);  /* U0U0U0U0 */
-        PIXCONV_LOAD_PIXEL16_DITHER(xmm2, xmm6, (v+i), shift);    /* V0V0V0V0 */
-        PIXCONV_LOAD_PIXEL16_DITHER(xmm3, xmm7, (v+i+8), shift);  /* V0V0V0V0 */
+        PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (u+i+0), bpp);  /* U0U0U0U0 */
+        PIXCONV_LOAD_PIXEL16_DITHER(xmm1, xmm5, (u+i+8), bpp);  /* U0U0U0U0 */
+        PIXCONV_LOAD_PIXEL16_DITHER(xmm2, xmm6, (v+i+0), bpp);  /* V0V0V0V0 */
+        PIXCONV_LOAD_PIXEL16_DITHER(xmm3, xmm7, (v+i+8), bpp);  /* V0V0V0V0 */
 
-        xmm0 = _mm_packus_epi16(xmm0, xmm1);                      /* UUUUUUUU */
-        xmm2 = _mm_packus_epi16(xmm2, xmm3);                      /* VVVVVVVV */
+        xmm0 = _mm_packus_epi16(xmm0, xmm1);                    /* UUUUUUUU */
+        xmm2 = _mm_packus_epi16(xmm2, xmm3);                    /* VVVVVVVV */
         if (nv12) {
           xmm1 = xmm0;
           xmm0 = _mm_unpacklo_epi8(xmm0, xmm2);
@@ -133,7 +131,6 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv_nv12_dither_le)
 template HRESULT CLAVPixFmtConverter::convert_yuv_yv_nv12_dither_le<0>CONV_FUNC_PARAMS;
 template HRESULT CLAVPixFmtConverter::convert_yuv_yv_nv12_dither_le<1>CONV_FUNC_PARAMS;
 
-template <int shift>
 DECLARE_CONV_FUNC_IMPL(convert_yuv420_px1x_le)
 {
   const uint16_t *y = (const uint16_t *)src[0];
@@ -157,8 +154,8 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv420_px1x_le)
 
     for (i = 0; i < width; i+=16) {
       // Load 8 pixels into register
-      PIXCONV_LOAD_PIXEL16(xmm0, (y+i), shift); /* YYYY */
-      PIXCONV_LOAD_PIXEL16(xmm1, (y+i+8), shift); /* YYYY */
+      PIXCONV_LOAD_PIXEL16(xmm0, (y+i+0), bpp); /* YYYY */
+      PIXCONV_LOAD_PIXEL16(xmm1, (y+i+8), bpp); /* YYYY */
       // and write them out
       _mm_stream_si128(dst128Y++, xmm0);
       _mm_stream_si128(dst128Y++, xmm1);
@@ -175,8 +172,8 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv420_px1x_le)
 
     for (i = 0; i < uvWidth; i+=8) {
       // Load 8 pixels into register
-      PIXCONV_LOAD_PIXEL16(xmm0, (v+i), shift); /* VVVV */
-      PIXCONV_LOAD_PIXEL16(xmm1, (u+i), shift); /* UUUU */
+      PIXCONV_LOAD_PIXEL16(xmm0, (v+i), bpp); /* VVVV */
+      PIXCONV_LOAD_PIXEL16(xmm1, (u+i), bpp); /* UUUU */
 
       xmm2 = xmm0;
       xmm0 = _mm_unpacklo_epi16(xmm1, xmm0);    /* UVUV */
@@ -193,11 +190,6 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv420_px1x_le)
   return S_OK;
 }
 
-// Force creation of these two variants
-template HRESULT CLAVPixFmtConverter::convert_yuv420_px1x_le<0>CONV_FUNC_PARAMS;
-template HRESULT CLAVPixFmtConverter::convert_yuv420_px1x_le<6>CONV_FUNC_PARAMS;
-template HRESULT CLAVPixFmtConverter::convert_yuv420_px1x_le<7>CONV_FUNC_PARAMS;
-
 DECLARE_CONV_FUNC_IMPL(convert_yuv_yv)
 {
   const uint8_t *y = src[0];
@@ -396,7 +388,6 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv422_yuy2_uyvy_dither_le)
   const int inChromaStride  = srcStride[1] >> 1;
   const int outStride       = dstStride << 1;
   const int chromaWidth     = (width + 1) >> 1;
-  const int shift           = bpp - 8;
 
   LAVDitherMode ditherMode = m_pSettings->GetDitherMode();
   const uint16_t *dithers = GetRandomDitherCoeffs(height, 4, 8, 0);
@@ -424,10 +415,10 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv422_yuy2_uyvy_dither_le)
 
     for (i = 0; i < chromaWidth; i+=8) {
       // Load pixels
-      PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (y+(i*2)+0), shift);  /* YYYY */
-      PIXCONV_LOAD_PIXEL16_DITHER(xmm1, xmm5, (y+(i*2)+8), shift);  /* YYYY */
-      PIXCONV_LOAD_PIXEL16_DITHER(xmm2, xmm6, (u+i), shift);        /* UUUU */
-      PIXCONV_LOAD_PIXEL16_DITHER(xmm3, xmm7, (v+i), shift);        /* VVVV */
+      PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (y+(i*2)+0), bpp);  /* YYYY */
+      PIXCONV_LOAD_PIXEL16_DITHER(xmm1, xmm5, (y+(i*2)+8), bpp);  /* YYYY */
+      PIXCONV_LOAD_PIXEL16_DITHER(xmm2, xmm6, (u+i), bpp);        /* UUUU */
+      PIXCONV_LOAD_PIXEL16_DITHER(xmm3, xmm7, (v+i), bpp);        /* VVVV */
 
       // Pack Ys
       xmm0 = _mm_packus_epi16(xmm0, xmm1);
diff --git a/decoder/LAVVideo/pixconv/yuv444_ayuv.cpp b/decoder/LAVVideo/pixconv/yuv444_ayuv.cpp
index 0c3f328f..926fe3be 100644
--- a/decoder/LAVVideo/pixconv/yuv444_ayuv.cpp
+++ b/decoder/LAVVideo/pixconv/yuv444_ayuv.cpp
@@ -102,9 +102,6 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv444_ayuv_dither_le)
   if (dithers == NULL)
     ditherMode = LAVDither_Ordered;
 
-  // Number of bits to shift to reach 8
-  int shift = bpp - 8;
-
   int line, i;
 
   __m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;
@@ -128,9 +125,9 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv444_ayuv_dither_le)
 
     for (i = 0; i < width; i+=8) {
       // Load pixels into registers, and apply dithering
-      PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (y+i), shift); /* Y0Y0Y0Y0 */
-      PIXCONV_LOAD_PIXEL16_DITHER_HIGH(xmm1, xmm5, (u+i), shift); /* U0U0U0U0 */
-      PIXCONV_LOAD_PIXEL16_DITHER(xmm2, xmm6, (v+i), shift); /* V0V0V0V0 */
+      PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (y+i), bpp); /* Y0Y0Y0Y0 */
+      PIXCONV_LOAD_PIXEL16_DITHER_HIGH(xmm1, xmm5, (u+i), bpp); /* U0U0U0U0 */
+      PIXCONV_LOAD_PIXEL16_DITHER(xmm2, xmm6, (v+i), bpp); /* V0V0V0V0 */
 
       // Interlave into AYUV
       xmm0 = _mm_or_si128(xmm0, xmm7);          /* YAYAYAYA */
author	Hendrik Leppkes <h.leppkes@gmail.com>	2012-07-05 15:57:12 +0400
committer	Hendrik Leppkes <h.leppkes@gmail.com>	2012-07-05 18:16:48 +0400
commit	691f56aa69f9f7efc7641932281b9aaa62ef865a (patch)
tree	55f7ee75e61f7a7dce0da2b686f399897779a883 /decoder/LAVVideo/pixconv
parent	2bf951f323555e448255e6959d30b0a95aa39004 (diff)