From 4724973327077d99e73460531986effe97b5504a Mon Sep 17 00:00:00 2001
From: Hendrik Leppkes <h.leppkes@gmail.com>
Date: Thu, 9 May 2013 17:25:07 +0200
Subject: Add SSSE3 RGB48->RGB32/RGB24 converter

---
 decoder/LAVVideo/LAVPixFmtConverter.cpp       |   7 +-
 decoder/LAVVideo/LAVPixFmtConverter.h         |   3 +
 decoder/LAVVideo/LAVVideo.vcxproj             |   1 +
 decoder/LAVVideo/LAVVideo.vcxproj.filters     |   3 +
 decoder/LAVVideo/pixconv/rgb2rgb_unscaled.cpp | 130 ++++++++++++++++++++++++++
 5 files changed, 143 insertions(+), 1 deletion(-)
 create mode 100644 decoder/LAVVideo/pixconv/rgb2rgb_unscaled.cpp

(limited to 'decoder')
diff --git a/decoder/LAVVideo/LAVPixFmtConverter.cpp b/decoder/LAVVideo/LAVPixFmtConverter.cpp
index 6362e870..b60540f6 100644
--- a/decoder/LAVVideo/LAVPixFmtConverter.cpp
+++ b/decoder/LAVVideo/LAVPixFmtConverter.cpp
@@ -45,7 +45,7 @@
  * RGB24          -       -       -       -       -       -       -       -       -       -       -       -       -       -       -      x       -
  * RGB32          -       -       -       -       -       -       -       -       -       -       -       -       -       -       -      -       x
  * ARGB32         -       -       -       -       -       -       -       -       -       -       -       -       -       -       -      -       x
- * RGB48          -       -       -       -       -       -       -       -       -       -       -       -       -       -       -      -       -
+ * RGB48          -       -       -       -       -       -       -       -       -       -       -       -       -       -       -      x       x
  *
  * Every processing path has a swscale fallback (even those with a "-" above), every combination of input/output is possible, just not optimized (ugly and/or slow)
  */
@@ -322,6 +322,11 @@ void CLAVPixFmtConverter::SelectConvertFunction()
     || (m_OutputPixFmt == LAVOutPixFmt_RGB24 && m_InputPixFmt == LAVPixFmt_RGB24)) {
     convert = &CLAVPixFmtConverter::plane_copy;
     m_RequiredAlignment = 0;
+  } else if (m_InputPixFmt == LAVPixFmt_RGB48 && (m_OutputPixFmt == LAVOutPixFmt_RGB32 || m_OutputPixFmt == LAVOutPixFmt_RGB24) && (cpu & AV_CPU_FLAG_SSSE3)) {
+    if (m_OutputPixFmt == LAVOutPixFmt_RGB32)
+      convert = &CLAVPixFmtConverter::convert_rgb48_rgb32_ssse3;
+    else
+      convert = &CLAVPixFmtConverter::convert_rgb48_rgb24_ssse3;
   } else if (cpu & AV_CPU_FLAG_SSE2) {
     if (m_OutputPixFmt == LAVOutPixFmt_AYUV && m_InputPixFmt == LAVPixFmt_YUV444bX) {
       convert = &CLAVPixFmtConverter::convert_yuv444_ayuv_dither_le;
diff --git a/decoder/LAVVideo/LAVPixFmtConverter.h b/decoder/LAVVideo/LAVPixFmtConverter.h
index 7ac3e228..1b4895bf 100644
--- a/decoder/LAVVideo/LAVPixFmtConverter.h
+++ b/decoder/LAVVideo/LAVPixFmtConverter.h
@@ -143,6 +143,9 @@ private:
   template <int uyvy> DECLARE_CONV_FUNC(convert_yuv422_yuy2_uyvy_dither_le);
   template <int nv12> DECLARE_CONV_FUNC(convert_yuv_yv_nv12_dither_le);
 
+  DECLARE_CONV_FUNC(convert_rgb48_rgb32_ssse3);
+  DECLARE_CONV_FUNC(convert_rgb48_rgb24_ssse3);
+
   template <int out32> DECLARE_CONV_FUNC(convert_yuv_rgb);
   RGBCoeffs* getRGBCoeffs(int width, int height);
   const uint16_t* GetRandomDitherCoeffs(int height, int coeffs, int bits, int line);
diff --git a/decoder/LAVVideo/LAVVideo.vcxproj b/decoder/LAVVideo/LAVVideo.vcxproj
index 8e4c760a..01678879 100644
--- a/decoder/LAVVideo/LAVVideo.vcxproj
+++ b/decoder/LAVVideo/LAVVideo.vcxproj
@@ -116,6 +116,7 @@
     <ClCompile Include="pixconv\convert_generic.cpp" />
     <ClCompile Include="pixconv\interleave.cpp" />
     <ClCompile Include="pixconv\pixconv.cpp" />
+    <ClCompile Include="pixconv\rgb2rgb_unscaled.cpp" />
     <ClCompile Include="pixconv\yuv2rgb.cpp" />
     <ClCompile Include="pixconv\yuv2yuv_unscaled.cpp" />
     <ClCompile Include="pixconv\yuv420_yuy2.cpp" />
diff --git a/decoder/LAVVideo/LAVVideo.vcxproj.filters b/decoder/LAVVideo/LAVVideo.vcxproj.filters
index 4433b291..7164bbf5 100644
--- a/decoder/LAVVideo/LAVVideo.vcxproj.filters
+++ b/decoder/LAVVideo/LAVVideo.vcxproj.filters
@@ -153,6 +153,9 @@
     <ClCompile Include="VideoInputPin.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="pixconv\rgb2rgb_unscaled.cpp">
+      <Filter>Source Files\pixconv</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="stdafx.h">
diff --git a/decoder/LAVVideo/pixconv/rgb2rgb_unscaled.cpp b/decoder/LAVVideo/pixconv/rgb2rgb_unscaled.cpp
new file mode 100644
index 00000000..c7219659
--- /dev/null
+++ b/decoder/LAVVideo/pixconv/rgb2rgb_unscaled.cpp
@@ -0,0 +1,130 @@
+/*
+ *      Copyright (C) 2010-2013 Hendrik Leppkes
+ *      http://www.1f0.de
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "stdafx.h"
+
+#include <emmintrin.h>
+
+#include "pixconv_internal.h"
+#include "pixconv_sse2_templates.h"
+
+// Converts 16-bit-per-channel RGB48 to 8-bit RGB32, 8 pixels (24 samples) per
+// inner iteration: add dither coefficients with unsigned saturation (the intrinsic
+// returns the sum, so it must be assigned back), shift each sample down to 8 bits,
+// and insert a zero 4th byte per pixel. Width is rounded up to whole 8-pixel groups.
+DECLARE_CONV_FUNC_IMPL(convert_rgb48_rgb32_ssse3)
+{
+  const uint16_t *rgb = (const uint16_t *)src[0];
+  const ptrdiff_t inStride = srcStride[0] >> 1; // halved: rgb is indexed in uint16_t units
+  const ptrdiff_t outStride = dstStride * 4;    // presumably dstStride is in pixels, 4 bytes each
+  ptrdiff_t line, i;
+  int processWidth = width * 3;                 // 16-bit samples per line
+
+  LAVDitherMode ditherMode = m_pSettings->GetDitherMode();
+  const uint16_t *dithers = GetRandomDitherCoeffs(height, 4, 8, 0);
+  if (dithers == NULL)
+    ditherMode = LAVDither_Ordered; // no random coefficients, use the ordered path
+
+  __m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;
+  __m128i mask = _mm_setr_epi8(0,1,2,3,4,5,-1,-1,6,7,8,9,10,11,-1,-1);
+
+  _mm_sfence();
+  for (line = 0; line < height; line++) {
+    __m128i *dst128 = (__m128i *)(dst + line * outStride);
+
+    // Load dithering coefficients for this line (32 random values per line, or one ordered set)
+    if (ditherMode == LAVDither_Random) {
+      xmm5 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 0));
+      xmm6 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 8));
+      xmm7 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 16));
+    } else {
+      PIXCONV_LOAD_DITHER_COEFFS(xmm7,line,8,dithers);
+      xmm5 = xmm6 = xmm7;
+    }
+    for (i = 0; i < processWidth; i += 24) {
+      PIXCONV_LOAD_ALIGNED(xmm0, (rgb + i));      /* load */
+      xmm0 = _mm_adds_epu16(xmm0, xmm5);          /* apply dithering coefficients */
+      xmm0 = _mm_srli_epi16(xmm0, 8);             /* shift to 8-bit */
+      PIXCONV_LOAD_ALIGNED(xmm1, (rgb + i + 8));  /* load */
+      xmm1 = _mm_adds_epu16(xmm1, xmm6);          /* apply dithering coefficients */
+      xmm1 = _mm_srli_epi16(xmm1, 8);             /* shift to 8-bit */
+      PIXCONV_LOAD_ALIGNED(xmm2, (rgb + i + 16)); /* load */
+      xmm2 = _mm_adds_epu16(xmm2, xmm7);          /* apply dithering coefficients */
+      xmm2 = _mm_srli_epi16(xmm2, 8);             /* shift to 8-bit */
+      // Regroup the 24 samples into 4 registers of two pixels each; mask index -1 zeroes the pad word
+      xmm3 = _mm_shuffle_epi8(xmm0, mask);
+      xmm4 = _mm_shuffle_epi8(_mm_alignr_epi8(xmm1, xmm0, 12), mask);
+      xmm0 = _mm_shuffle_epi8(_mm_alignr_epi8(xmm2, xmm1, 8),  mask);
+      xmm1 = _mm_shuffle_epi8(_mm_alignr_epi8(xmm2, xmm2, 4),  mask);
+      // Narrow the 16-bit lanes to bytes: 4 output pixels per register
+      xmm3 = _mm_packus_epi16(xmm3, xmm4);
+      xmm0 = _mm_packus_epi16(xmm0, xmm1);
+      _mm_stream_si128(dst128++, xmm3);
+      _mm_stream_si128(dst128++, xmm0);
+    }
+    rgb += inStride;
+  }
+  return S_OK;
+}
+
+// Converts 16-bit-per-channel RGB48 to 8-bit RGB24, 16 samples per inner iteration:
+// add dither coefficients with unsigned saturation (the intrinsic returns the sum,
+// so it must be assigned back), shift down to 8 bits and pack to 16 output bytes.
+DECLARE_CONV_FUNC_IMPL(convert_rgb48_rgb24_ssse3)
+{
+  const uint16_t *rgb = (const uint16_t *)src[0];
+  const ptrdiff_t inStride = srcStride[0] >> 1; // halved: rgb is indexed in uint16_t units
+  const ptrdiff_t outStride = dstStride * 3;    // presumably dstStride is in pixels, 3 bytes each
+  ptrdiff_t line, i;
+  int processWidth = width * 3;                 // 16-bit samples per line
+
+  LAVDitherMode ditherMode = m_pSettings->GetDitherMode();
+  const uint16_t *dithers = GetRandomDitherCoeffs(height, 2, 8, 0);
+  if (dithers == NULL)
+    ditherMode = LAVDither_Ordered; // no random coefficients, use the ordered path
+
+  __m128i xmm0,xmm1,xmm6,xmm7;
+
+  _mm_sfence();
+  for (line = 0; line < height; line++) {
+    __m128i *dst128 = (__m128i *)(dst + line * outStride);
+
+    // Load dithering coefficients for this line (16 random values per line, or one ordered set)
+    if (ditherMode == LAVDither_Random) {
+      xmm6 = _mm_load_si128((const __m128i *)(dithers + (line << 4) + 0));
+      xmm7 = _mm_load_si128((const __m128i *)(dithers + (line << 4) + 8));
+    } else {
+      PIXCONV_LOAD_DITHER_COEFFS(xmm7,line,8,dithers);
+      xmm6 = xmm7;
+    }
+    for (i = 0; i < processWidth; i += 16) {
+      PIXCONV_LOAD_ALIGNED(xmm0, (rgb + i));      /* load */
+      xmm0 = _mm_adds_epu16(xmm0, xmm6);          /* apply dithering coefficients */
+      xmm0 = _mm_srli_epi16(xmm0, 8);             /* shift to 8-bit */
+      PIXCONV_LOAD_ALIGNED(xmm1, (rgb + i + 8));  /* load */
+      xmm1 = _mm_adds_epu16(xmm1, xmm7);          /* apply dithering coefficients */
+      xmm1 = _mm_srli_epi16(xmm1, 8);             /* shift to 8-bit */
+      // Narrow the 16 16-bit lanes to 16 bytes, keeping the sample order
+      xmm0 = _mm_packus_epi16(xmm0, xmm1);
+      _mm_stream_si128(dst128++, xmm0);
+    }
+    rgb += inStride;
+  }
+  return S_OK;
+}
-- 
cgit v1.2.3