Implement plane_copy_sse2, optimized plain copying of frames.

author: Hendrik Leppkes <h.leppkes@gmail.com> 2014-05-24 21:01:03 +0400
committer: Hendrik Leppkes <h.leppkes@gmail.com> 2014-05-24 21:01:03 +0400
commit: 87f98796793260a7c1f40779055ecf84e5e8595f (patch)
tree: d7cd3e4701a091f4628b1da4616d2c3ffd8d6ffe /decoder/LAVVideo
parent: ac6002a0a47a9c60bacda9ef21117f8a49faf9d2 (diff)
3 files changed, 37 insertions, 1 deletions
diff --git a/decoder/LAVVideo/LAVPixFmtConverter.cpp b/decoder/LAVVideo/LAVPixFmtConverter.cpp
index f3f89cdc..f6c533d8 100644
--- a/decoder/LAVVideo/LAVPixFmtConverter.cpp
+++ b/decoder/LAVVideo/LAVPixFmtConverter.cpp
@@ -323,7 +323,10 @@ void CLAVPixFmtConverter::SelectConvertFunction()
   } else if ((m_OutputPixFmt == LAVOutPixFmt_RGB32 && (m_InputPixFmt == LAVPixFmt_RGB32 || m_InputPixFmt == LAVPixFmt_ARGB32))
     || (m_OutputPixFmt == LAVOutPixFmt_RGB24 && m_InputPixFmt == LAVPixFmt_RGB24) || (m_OutputPixFmt == LAVOutPixFmt_RGB48 && m_InputPixFmt == LAVPixFmt_RGB48)
     || (m_OutputPixFmt == LAVOutPixFmt_NV12 && m_InputPixFmt == LAVPixFmt_NV12)) {
-    convert = &CLAVPixFmtConverter::plane_copy;
+    if (cpu & AV_CPU_FLAG_SSE2)
+      convert = &CLAVPixFmtConverter::plane_copy_sse2;
+    else
+      convert = &CLAVPixFmtConverter::plane_copy;
     m_RequiredAlignment = 0;
   } else if (m_InputPixFmt == LAVPixFmt_RGB48 && m_OutputPixFmt == LAVOutPixFmt_RGB32 && (cpu & AV_CPU_FLAG_SSSE3)) {
     convert = &CLAVPixFmtConverter::convert_rgb48_rgb32_ssse3;
diff --git a/decoder/LAVVideo/LAVPixFmtConverter.h b/decoder/LAVVideo/LAVPixFmtConverter.h
index 07533c2e..923b1841 100644
--- a/decoder/LAVVideo/LAVPixFmtConverter.h
+++ b/decoder/LAVVideo/LAVPixFmtConverter.h
@@ -114,6 +114,7 @@ private:
   // Pixel Implementations
   DECLARE_CONV_FUNC(convert_generic);
   DECLARE_CONV_FUNC(plane_copy);
+  DECLARE_CONV_FUNC(plane_copy_sse2);
   DECLARE_CONV_FUNC(convert_yuv444_ayuv);
   DECLARE_CONV_FUNC(convert_yuv444_ayuv_dither_le);
   DECLARE_CONV_FUNC(convert_yuv444_y410);
diff --git a/decoder/LAVVideo/pixconv/pixconv.cpp b/decoder/LAVVideo/pixconv/pixconv.cpp
index 68a5e4a0..b900c1e9 100644
--- a/decoder/LAVVideo/pixconv/pixconv.cpp
+++ b/decoder/LAVVideo/pixconv/pixconv.cpp
@@ -19,6 +19,7 @@
 
 #include "stdafx.h"
 #include "pixconv_internal.h"
+#include "pixconv_sse2_templates.h"
 
 // 8x8 Bayes ordered dithering table, scaled to the 0-255 range for 16->8 conversion
 // stored as 16-bit unsigned for optimized SIMD access
@@ -56,3 +57,34 @@ DECLARE_CONV_FUNC_IMPL(plane_copy)
 
   return S_OK;
 }
+
+DECLARE_CONV_FUNC_IMPL(plane_copy_sse2) // Copies every plane of the frame from src to dst, one row at a time
+{
+  LAVOutPixFmtDesc desc = lav_pixfmt_desc[outputFormat]; // descriptor looked up by the output format
+  // NOTE(review): codedbytes is presumably the number of bytes per coded sample -- confirm against LAVOutPixFmtDesc
+  const int widthBytes = width * desc.codedbytes;
+  const int planes = max(desc.planes, 1); // process at least one plane even if the descriptor reports fewer
+  // Loop counters are ptrdiff_t so that line * stride below is computed in ptrdiff_t rather than int
+  ptrdiff_t line, plane;
+  // Each plane is copied independently, using its own pointers, strides and dimensions
+  for (plane = 0; plane < planes; plane++) {
+    const int planeWidth = widthBytes / desc.planeWidth[plane]; // bytes copied per row; planeWidth[] acts as a divisor
+    const int planeHeight = height / desc.planeHeight[plane]; // rows copied; planeHeight[] acts as a divisor
+    const ptrdiff_t srcPlaneStride = srcStride[plane];
+    const ptrdiff_t dstPlaneStride = dstStride[plane];
+    const uint8_t * const srcBuf = src[plane];
+          uint8_t * const dstBuf = dst[plane];
+    // Macro path only if dst base and dst stride are multiples of 16, so every dst row start is 16-byte aligned
+    if ((dstPlaneStride % 16) == 0 && ((intptr_t)dstBuf % 16u) == 0) { // the source is not checked here
+      for (line = 0; line < planeHeight; ++line) {
+        PIXCONV_MEMCPY_ALIGNED(dstBuf + line * dstPlaneStride, srcBuf + line * srcPlaneStride, planeWidth); // NOTE(review): macro not visible here -- confirm it tolerates an unaligned source and does not write past planeWidth
+      }
+    } else { // destination not 16-byte aligned: plain memcpy of planeWidth bytes per row
+      for (line = 0; line < planeHeight; ++line) {
+        memcpy(dstBuf + line * dstPlaneStride, srcBuf + line * srcPlaneStride, planeWidth);
+      }
+    }
+  }
+
+  return S_OK; // no failure path in this function
+}
author	Hendrik Leppkes <h.leppkes@gmail.com>	2014-05-24 21:01:03 +0400
committer	Hendrik Leppkes <h.leppkes@gmail.com>	2014-05-24 21:01:03 +0400
commit	87f98796793260a7c1f40779055ecf84e5e8595f (patch)
tree	d7cd3e4701a091f4628b1da4616d2c3ffd8d6ffe /decoder/LAVVideo
parent	ac6002a0a47a9c60bacda9ef21117f8a49faf9d2 (diff)