dxva2cb: implement direct output mode

This reduces the performance overhead and CPU usage by up to 50%
author: Hendrik Leppkes <h.leppkes@gmail.com> 2015-01-29 16:29:41 +0300
committer: Hendrik Leppkes <h.leppkes@gmail.com> 2015-01-31 17:48:15 +0300
commit: 4e576410b52577ff7d3274a94ae955fbb1c7e443 (patch)
tree: f30d4d6056ee64b815bcd9ec99b2122a933cf463 /decoder/LAVVideo/pixconv
parent: f2f72474d685df1c293996550d3ec2bbb7f38e2f (diff)
2 files changed, 96 insertions, 0 deletions
diff --git a/decoder/LAVVideo/pixconv/convert_direct.cpp b/decoder/LAVVideo/pixconv/convert_direct.cpp
new file mode 100644
index 00000000..f61dc38a
--- /dev/null
+++ b/decoder/LAVVideo/pixconv/convert_direct.cpp
@@ -0,0 +1,90 @@
+/*
+ *      Copyright (C) 2010-2015 Hendrik Leppkes
+ *      http://www.1f0.de
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "stdafx.h"
+
+#include <emmintrin.h>
+
+#include "pixconv_internal.h"
+#include "pixconv_sse2_templates.h"
+
+// Copies both planes of an NV12-like frame (NV12, P010, P016, ...) from src to dst through 128-bit registers; only designed for such formats. Always returns S_OK.
+DECLARE_CONV_FUNC_IMPL(plane_copy_direct_sse4)
+{
+  const ptrdiff_t inStride     = srcStride[0];   // plane 0 source stride; also used to address plane 1 below
+  const ptrdiff_t outStride    = dstStride[0];   // plane 0 destination stride; also used to address plane 1 below
+  const ptrdiff_t chromaHeight = (height >> 1);  // number of lines copied from the second plane
+  // Bytes to copy per line: width << 1 for P010/P016 output, width otherwise.
+  const ptrdiff_t byteWidth    = (outputFormat == LAVOutPixFmt_P010 || outputFormat == LAVOutPixFmt_P016) ? width << 1 : width;
+  const ptrdiff_t stride       = min(FFALIGN(byteWidth, 64), min(inStride, outStride)); // bound for the 64-byte loops: aligned width, capped by both strides
+
+  __m128i xmm0,xmm1,xmm2,xmm3;
+  // NOTE(review): store fence issued once before any load - presumably to order earlier stores ahead of the streaming loads; confirm intent.
+  _mm_sfence();
+
+  ptrdiff_t line, i;
+  // Plane 0 (src[0] -> dst[0]): every one of the `height` lines.
+  for (line = 0; line < height; line++) {
+    const uint8_t *y  = (src[0] + line * inStride);
+          uint8_t *dy = (dst[0] + line * outStride);
+    for (i = 0; i < (stride - 63); i += 64) {  // 64 bytes per iteration: four streaming loads, then four streaming stores
+      PIXCONV_STREAM_LOAD(xmm0, y + i +  0);
+      PIXCONV_STREAM_LOAD(xmm1, y + i + 16);
+      PIXCONV_STREAM_LOAD(xmm2, y + i + 32);
+      PIXCONV_STREAM_LOAD(xmm3, y + i + 48);
+
+      _ReadWriteBarrier();  // NOTE(review): compiler barrier - presumably keeps the four loads grouped ahead of the stores; confirm
+
+      PIXCONV_PUT_STREAM(dy + i +  0, xmm0);
+      PIXCONV_PUT_STREAM(dy + i + 16, xmm1);
+      PIXCONV_PUT_STREAM(dy + i + 32, xmm2);
+      PIXCONV_PUT_STREAM(dy + i + 48, xmm3);
+    }
+    // Remainder in 16-byte steps while i < byteWidth; the last step can touch up to 15 bytes past byteWidth (assumes padded lines - TODO confirm).
+    for (; i < byteWidth; i += 16) {
+      PIXCONV_LOAD_ALIGNED(xmm0, y + i);  // aligned load: assumes y + i is 16-byte aligned - TODO confirm against callers
+      PIXCONV_PUT_STREAM(dy + i, xmm0);
+    }
+  }
+  // Plane 1 (src[1] -> dst[1]): chromaHeight lines, addressed with the plane 0 strides (assumes both planes share a stride - TODO confirm).
+  for (line = 0; line < chromaHeight; line++) {
+    const uint8_t *uv  = (src[1] + line * inStride);
+          uint8_t *duv = (dst[1] + line * outStride);
+    for (i = 0; i < (stride - 63); i += 64) {  // same 64-byte streaming copy as for plane 0
+      PIXCONV_STREAM_LOAD(xmm0, uv + i +  0);
+      PIXCONV_STREAM_LOAD(xmm1, uv + i + 16);
+      PIXCONV_STREAM_LOAD(xmm2, uv + i + 32);
+      PIXCONV_STREAM_LOAD(xmm3, uv + i + 48);
+
+      _ReadWriteBarrier();  // NOTE(review): compiler barrier, same role as in the plane 0 loop; confirm
+
+      PIXCONV_PUT_STREAM(duv + i +  0, xmm0);
+      PIXCONV_PUT_STREAM(duv + i + 16, xmm1);
+      PIXCONV_PUT_STREAM(duv + i + 32, xmm2);
+      PIXCONV_PUT_STREAM(duv + i + 48, xmm3);
+    }
+    // Remainder in 16-byte steps while i < byteWidth, same as for plane 0 (may touch up to 15 bytes past byteWidth).
+    for (; i < byteWidth; i += 16) {
+      PIXCONV_LOAD_ALIGNED(xmm0, uv + i);  // aligned load: assumes uv + i is 16-byte aligned - TODO confirm against callers
+      PIXCONV_PUT_STREAM(duv + i, xmm0);
+    }
+  }
+
+  return S_OK;
+}
diff --git a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
index 6b913ad8..602b6905 100644
--- a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
+++ b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
@@ -97,6 +97,12 @@
 #define PIXCONV_LOAD_ALIGNED(reg,src) \
   reg = _mm_load_si128((const __m128i *)(src));      /* load (aligned) */
 
+// Load 128-bit into a register, using a streaming memory load (_mm_stream_load_si128)
+// reg   - register to store pixels in (assigned by the macro; the expansion already ends in ';')
+// src   - memory pointer of the source (cast to a non-const __m128i *, so constness is dropped)
+#define PIXCONV_STREAM_LOAD(reg,src) \
+  reg = _mm_stream_load_si128((__m128i *)(src));      /* load (streaming) */
+
 #define PIXCONV_LOAD_PIXEL8_ALIGNED PIXCONV_LOAD_ALIGNED
 
 // Put 128-bit into memory, using streaming write
author	Hendrik Leppkes <h.leppkes@gmail.com>	2015-01-29 16:29:41 +0300
committer	Hendrik Leppkes <h.leppkes@gmail.com>	2015-01-31 17:48:15 +0300
commit	4e576410b52577ff7d3274a94ae955fbb1c7e443 (patch)
tree	f30d4d6056ee64b815bcd9ec99b2122a933cf463 /decoder/LAVVideo/pixconv
parent	f2f72474d685df1c293996550d3ec2bbb7f38e2f (diff)