From e521e0cb195bf42dccb04dd354ea5ff0186b6ae1 Mon Sep 17 00:00:00 2001
From: Hendrik Leppkes
Date: Mon, 2 Feb 2015 15:29:06 +0100
Subject: Add optimized P010 -> NV12 converters, and allow this conversion for direct mode

---
 decoder/LAVVideo/pixconv/convert_direct.cpp   | 127 ++++++++++++++++++++++++++
 decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp |  78 ++++++++++++++++
 2 files changed, 205 insertions(+)

(limited to 'decoder/LAVVideo/pixconv')

diff --git a/decoder/LAVVideo/pixconv/convert_direct.cpp b/decoder/LAVVideo/pixconv/convert_direct.cpp
index f61dc38a..b2857317 100644
--- a/decoder/LAVVideo/pixconv/convert_direct.cpp
+++ b/decoder/LAVVideo/pixconv/convert_direct.cpp
@@ -88,3 +88,130 @@ DECLARE_CONV_FUNC_IMPL(plane_copy_direct_sse4)
 
   return S_OK;
 }
+
+DECLARE_CONV_FUNC_IMPL(convert_p010_nv12_direct_sse4)
+{
+  const ptrdiff_t inStride = srcStride[0];
+  const ptrdiff_t outStride = dstStride[0];
+  const ptrdiff_t chromaHeight = (height >> 1);
+
+  const ptrdiff_t byteWidth = width << 1;
+  const ptrdiff_t stride = min(FFALIGN(byteWidth, 64), min(inStride, outStride));
+
+  LAVDitherMode ditherMode = m_pSettings->GetDitherMode();
+  const uint16_t *dithers = GetRandomDitherCoeffs(height, 4, 8, 0);
+  if (dithers == nullptr)
+    ditherMode = LAVDither_Ordered;
+
+  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+  _mm_sfence();
+
+  ptrdiff_t line, i;
+
+  for (line = 0; line < height; line++) {
+    // Load dithering coefficients for this line
+    if (ditherMode == LAVDither_Random) {
+      xmm4 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 0));
+      xmm5 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 8));
+      xmm6 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 16));
+      xmm7 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 24));
+    } else {
+      PIXCONV_LOAD_DITHER_COEFFS(xmm7, line, 8, dithers);
+      xmm4 = xmm5 = xmm6 = xmm7;
+    }
+
+    const uint8_t *y = (src[0] + line * inStride);
+    uint8_t *dy = (dst[0] + line * outStride);
+
+    for (i = 0; i < (stride - 63); i += 64) {
+      PIXCONV_STREAM_LOAD(xmm0, y + i + 0);
+      PIXCONV_STREAM_LOAD(xmm1, y + i + 16);
+      PIXCONV_STREAM_LOAD(xmm2, y + i + 32);
+      PIXCONV_STREAM_LOAD(xmm3, y + i + 48);
+
+      _ReadWriteBarrier();
+
+      // apply dithering coeffs
+      xmm0 = _mm_adds_epu16(xmm0, xmm4);
+      xmm1 = _mm_adds_epu16(xmm1, xmm5);
+      xmm2 = _mm_adds_epu16(xmm2, xmm6);
+      xmm3 = _mm_adds_epu16(xmm3, xmm7);
+
+      // shift and pack to 8-bit
+      xmm0 = _mm_packus_epi16(_mm_srli_epi16(xmm0, 8), _mm_srli_epi16(xmm1, 8));
+      xmm2 = _mm_packus_epi16(_mm_srli_epi16(xmm2, 8), _mm_srli_epi16(xmm3, 8));
+
+      PIXCONV_PUT_STREAM(dy + (i >> 1) + 0, xmm0);
+      PIXCONV_PUT_STREAM(dy + (i >> 1) + 16, xmm2);
+    }
+
+    for (; i < byteWidth; i += 32) {
+      PIXCONV_LOAD_ALIGNED(xmm0, y + i + 0);
+      PIXCONV_LOAD_ALIGNED(xmm1, y + i + 16);
+
+      // apply dithering coeffs
+      xmm0 = _mm_adds_epu16(xmm0, xmm4);
+      xmm1 = _mm_adds_epu16(xmm1, xmm5);
+
+      // shift and pack to 8-bit
+      xmm0 = _mm_packus_epi16(_mm_srli_epi16(xmm0, 8), _mm_srli_epi16(xmm1, 8));
+
+      PIXCONV_PUT_STREAM(dy + (i >> 1), xmm0);
+    }
+  }
+
+  for (line = 0; line < chromaHeight; line++) {
+    // Load dithering coefficients for this line
+    if (ditherMode == LAVDither_Random) {
+      xmm4 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 0));
+      xmm5 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 8));
+      xmm6 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 16));
+      xmm7 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 24));
+    } else {
+      PIXCONV_LOAD_DITHER_COEFFS(xmm7, line, 8, dithers);
+      xmm4 = xmm5 = xmm6 = xmm7;
+    }
+
+    const uint8_t *uv = (src[1] + line * inStride);
+    uint8_t *duv = (dst[1] + line * outStride);
+
+    for (i = 0; i < (stride - 63); i += 64) {
+      PIXCONV_STREAM_LOAD(xmm0, uv + i + 0);
+      PIXCONV_STREAM_LOAD(xmm1, uv + i + 16);
+      PIXCONV_STREAM_LOAD(xmm2, uv + i + 32);
+      PIXCONV_STREAM_LOAD(xmm3, uv + i + 48);
+
+      _ReadWriteBarrier();
+
+      // apply dithering coeffs
+      xmm0 = _mm_adds_epu16(xmm0, xmm4);
+      xmm1 = _mm_adds_epu16(xmm1, xmm5);
+      xmm2 = _mm_adds_epu16(xmm2, xmm6);
+      xmm3 = _mm_adds_epu16(xmm3, xmm7);
+
+      // shift and pack to 8-bit
+      xmm0 = _mm_packus_epi16(_mm_srli_epi16(xmm0, 8), _mm_srli_epi16(xmm1, 8));
+      xmm2 = _mm_packus_epi16(_mm_srli_epi16(xmm2, 8), _mm_srli_epi16(xmm3, 8));
+
+      PIXCONV_PUT_STREAM(duv + (i >> 1) + 0, xmm0);
+      PIXCONV_PUT_STREAM(duv + (i >> 1) + 16, xmm2);
+    }
+
+    for (; i < byteWidth; i += 32) {
+      PIXCONV_LOAD_ALIGNED(xmm0, uv + i + 0);
+      PIXCONV_LOAD_ALIGNED(xmm1, uv + i + 16);
+
+      // apply dithering coeffs
+      xmm0 = _mm_adds_epu16(xmm0, xmm4);
+      xmm1 = _mm_adds_epu16(xmm1, xmm5);
+
+      // shift and pack to 8-bit
+      xmm0 = _mm_packus_epi16(_mm_srli_epi16(xmm0, 8), _mm_srli_epi16(xmm1, 8));
+
+      PIXCONV_PUT_STREAM(duv + (i >> 1), xmm0);
+    }
+  }
+
+  return S_OK;
+}
diff --git a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp
index 0e7eeb63..7ebb30f4 100644
--- a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp
+++ b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp
@@ -473,3 +473,81 @@ DECLARE_CONV_FUNC_IMPL(convert_nv12_yv12)
 
   return S_OK;
 }
+
+DECLARE_CONV_FUNC_IMPL(convert_p010_nv12_sse2)
+{
+  const ptrdiff_t inStride = srcStride[0];
+  const ptrdiff_t outStride = dstStride[0];
+  const ptrdiff_t chromaHeight = (height >> 1);
+
+  const ptrdiff_t byteWidth = width << 1;
+
+  LAVDitherMode ditherMode = m_pSettings->GetDitherMode();
+  const uint16_t *dithers = GetRandomDitherCoeffs(height, 2, 8, 0);
+  if (dithers == nullptr)
+    ditherMode = LAVDither_Ordered;
+
+  __m128i xmm0, xmm1, xmm2, xmm3;
+
+  _mm_sfence();
+
+  ptrdiff_t line, i;
+
+  for (line = 0; line < height; line++) {
+    // Load dithering coefficients for this line
+    if (ditherMode == LAVDither_Random) {
+      xmm2 = _mm_load_si128((const __m128i *)(dithers + (line << 4) + 0));
+      xmm3 = _mm_load_si128((const __m128i *)(dithers + (line << 4) + 8));
+    } else {
+      PIXCONV_LOAD_DITHER_COEFFS(xmm2, line, 8, dithers);
+      xmm3 = xmm2;
+    }
+
+    const uint8_t *y = (src[0] + line * inStride);
+    uint8_t *dy = (dst[0] + line * outStride);
+
+    for (i = 0; i < byteWidth; i += 32) {
+      PIXCONV_LOAD_ALIGNED(xmm0, y + i + 0);
+      PIXCONV_LOAD_ALIGNED(xmm1, y + i + 16);
+
+      // apply dithering coeffs
+      xmm0 = _mm_adds_epu16(xmm0, xmm2);
+      xmm1 = _mm_adds_epu16(xmm1, xmm3);
+
+      // shift and pack to 8-bit
+      xmm0 = _mm_packus_epi16(_mm_srli_epi16(xmm0, 8), _mm_srli_epi16(xmm1, 8));
+
+      PIXCONV_PUT_STREAM(dy + (i >> 1), xmm0);
+    }
+  }
+
+  for (line = 0; line < chromaHeight; line++) {
+    // Load dithering coefficients for this line
+    if (ditherMode == LAVDither_Random) {
+      xmm2 = _mm_load_si128((const __m128i *)(dithers + (line << 4) + 0));
+      xmm3 = _mm_load_si128((const __m128i *)(dithers + (line << 4) + 8));
+    } else {
+      PIXCONV_LOAD_DITHER_COEFFS(xmm2, line, 8, dithers);
+      xmm3 = xmm2;
+    }
+
+    const uint8_t *uv = (src[1] + line * inStride);
+    uint8_t *duv = (dst[1] + line * outStride);
+
+    for (i = 0; i < byteWidth; i += 32) {
+      PIXCONV_LOAD_ALIGNED(xmm0, uv + i + 0);
+      PIXCONV_LOAD_ALIGNED(xmm1, uv + i + 16);
+
+      // apply dithering coeffs
+      xmm0 = _mm_adds_epu16(xmm0, xmm2);
+      xmm1 = _mm_adds_epu16(xmm1, xmm3);
+
+      // shift and pack to 8-bit
+      xmm0 = _mm_packus_epi16(_mm_srli_epi16(xmm0, 8), _mm_srli_epi16(xmm1, 8));
+
+      PIXCONV_PUT_STREAM(duv + (i >> 1), xmm0);
+    }
+  }
+
+  return S_OK;
+}
--
cgit v1.2.3
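For context on what the new converters do: P010 stores 10-bit 4:2:0 video as MSB-aligned 16-bit words in a Y plane and an interleaved UV plane, while NV12 uses the same layout with 8-bit samples, so the conversion amounts to adding a dither value to each 16-bit word and keeping its high byte. The scalar sketch below illustrates that per-sample operation under simplified assumptions (a repeating 8-entry dither pattern); it is not part of the patch, and the names p010_plane_to_8bit and p010_sample_to_8bit are hypothetical. The committed code does the same arithmetic with SSE intrinsics (_mm_adds_epu16 for the saturating add, _mm_srli_epi16 plus _mm_packus_epi16 for the shift-and-pack) and LAV's random or ordered dither tables, running once over the Y plane (height lines) and once over the UV plane (height/2 lines).

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Scalar equivalent of the per-sample work in the SIMD loops above:
// saturating add of a dither value, then keep the high byte of the 16-bit word.
static inline uint8_t p010_sample_to_8bit(uint16_t sample, uint16_t dither)
{
    uint32_t v = std::min<uint32_t>((uint32_t)sample + dither, 0xFFFFu);
    return (uint8_t)(v >> 8);
}

// Convert one P010 plane (16-bit words) to its 8-bit NV12 counterpart.
// width is in samples per line (Y plane: image width; UV plane: image width,
// with U/V interleaved), strides are in bytes, and dither may be nullptr to
// truncate without dithering. Illustrative only; not the patch's code path.
static void p010_plane_to_8bit(const uint8_t *src, ptrdiff_t srcStride,
                               uint8_t *dst, ptrdiff_t dstStride,
                               ptrdiff_t width, ptrdiff_t height,
                               const uint16_t *dither /* 8 entries, repeated */)
{
    for (ptrdiff_t line = 0; line < height; line++) {
        const uint16_t *s = (const uint16_t *)(src + line * srcStride);
        uint8_t *d = dst + line * dstStride;
        for (ptrdiff_t x = 0; x < width; x++)
            d[x] = p010_sample_to_8bit(s[x], dither ? dither[x & 7] : 0);
    }
}

Calling this once for the Y plane (height lines) and once for the interleaved UV plane (height/2 lines) mirrors the two loops in each of the converters added by the patch.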