Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mpc-hc/LAVFilters.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Hendrik Leppkes <h.leppkes@gmail.com> 2012-02-15 18:59:29 +0400
committer Hendrik Leppkes <h.leppkes@gmail.com> 2012-02-15 19:40:14 +0400
commit 565be07c7eb82bf5429a04bc7a616c331e89602a (patch)
tree 9899459d1d44bfd0bf301002a8536cb0a13927f3 /decoder/LAVVideo/pixconv
parent 17a9db8345a49354b11abaf9e6b016903078d6c1 (diff)
pixconv: factor sse2 memcpy into macros
Diffstat (limited to 'decoder/LAVVideo/pixconv')
-rw-r--r-- decoder/LAVVideo/pixconv/pixconv_sse2_templates.h 51
-rw-r--r-- decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp 57
2 files changed, 67 insertions, 41 deletions
diff --git a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
index e4f9a0fb..d5d82e82 100644
--- a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
+++ b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
@@ -90,3 +90,54 @@
// src - source memory
#define PIXCONV_LOAD_4PIXEL16(reg,src) \
reg = _mm_loadl_epi64((const __m128i *)(src)); /* load 64-bit (4 pixel) */
+
+// SSE2 Aligned memcpy
+// dst - memory destination
+// src - memory source
+// len - size in bytes
+#define PIXCONV_MEMCPY_ALIGNED(dst,src,len) \
+ { \
+ __m128i reg; \
+ __m128i *dst128 = (__m128i *)(dst); \
+ for (int i = 0; i < len; i+=16) { \
+ PIXCONV_LOAD_PIXEL8_ALIGNED(reg,(src)+i); \
+ _mm_stream_si128(dst128++, reg); \
+ } \
+ }
+
+// SSE2 Aligned memcpy (unrolled: copies 32 bytes per iteration, so len is effectively rounded up to a multiple of 32)
+// dst - memory destination
+// src - memory source
+// len - size in bytes
+#define PIXCONV_MEMCPY_ALIGNED32(dst,src,len) \
+ { \
+ __m128i reg1,reg2; \
+ __m128i *dst128 = (__m128i *)(dst); \
+ for (int i = 0; i < len; i+=32) { \
+ PIXCONV_LOAD_PIXEL8_ALIGNED(reg1,(src)+i); \
+ PIXCONV_LOAD_PIXEL8_ALIGNED(reg2,(src)+i+16); \
+ _mm_stream_si128(dst128++, reg1); \
+ _mm_stream_si128(dst128++, reg2); \
+ } \
+ }
+
+// SSE2 Aligned memcpy
+// Copies the same number of bytes from two sources into two destinations at the same time
+// Can be useful to copy U/V planes in one go
+// dst1 - memory destination
+// src1 - memory source
+// dst2 - memory destination
+// src2 - memory source
+// len - size in bytes
+#define PIXCONV_MEMCPY_ALIGNED_TWO(dst1,src1,dst2,src2,len) \
+ { \
+ __m128i reg1,reg2; \
+ __m128i *dst128_1 = (__m128i *)(dst1); \
+ __m128i *dst128_2 = (__m128i *)(dst2); \
+ for (int i = 0; i < len; i+=16) { \
+ PIXCONV_LOAD_PIXEL8_ALIGNED(reg1,(src1)+i); \
+ PIXCONV_LOAD_PIXEL8_ALIGNED(reg2,(src2)+i); \
+ _mm_stream_si128(dst128_1++, reg1); \
+ _mm_stream_si128(dst128_2++, reg2); \
+ } \
+ }
diff --git a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp
index e45b9d6c..53699e49 100644
--- a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp
+++ b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp
@@ -216,26 +216,28 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv)
outChromaStride = outChromaStride >> 1;
}
+ uint8_t *dstY = dst;
+ uint8_t *dstV = dstY + height * outLumaStride;
+ uint8_t *dstU = dstV + chromaHeight * outChromaStride;
+
// Copy planes
+ _mm_sfence();
+
// Y
for(line = 0; line < height; ++line) {
- memcpy(dst, y, width);
+ PIXCONV_MEMCPY_ALIGNED(dstY + outLumaStride * line, y, width);
y += inLumaStride;
- dst += outLumaStride;
}
- uint8_t *dstV = dst;
- uint8_t *dstU = dst + chromaHeight * outChromaStride;
-
// U/V
for(line = 0; line < chromaHeight; ++line) {
- memcpy(dstU, u, chromaWidth);
- memcpy(dstV, v, chromaWidth);
+ PIXCONV_MEMCPY_ALIGNED_TWO(
+ dstU + outChromaStride * line, u,
+ dstV + outChromaStride * line, v,
+ chromaWidth);
u += inChromaStride;
v += inChromaStride;
- dstU += outChromaStride;
- dstV += outChromaStride;
}
return S_OK;
@@ -264,15 +266,7 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv420_nv12)
// Y
for(line = 0; line < height; ++line) {
- __m128i *dstY128 = (__m128i *)(dstY + outStride * line);
-
- for (i = 0; i < width; i+=32) {
- PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, y+i+0);
- PIXCONV_LOAD_PIXEL8_ALIGNED(xmm1, y+i+16);
- _mm_stream_si128(dstY128++, xmm0);
- _mm_stream_si128(dstY128++, xmm1);
- }
-
+ PIXCONV_MEMCPY_ALIGNED32(dstY + outStride * line, y, width);
y += inLumaStride;
}
@@ -469,13 +463,7 @@ DECLARE_CONV_FUNC_IMPL(convert_nv12_yv12)
// Copy the y
for (line = 0; line < height; line++) {
- __m128i *dstY128 = (__m128i *)(dstY + outLumaStride * line);
-
- for (i = 0; i < width; i+=16) {
- PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, y+i+0);
- _mm_stream_si128(dstY128++, xmm0);
- }
-
+ PIXCONV_MEMCPY_ALIGNED(dstY + outLumaStride * line, y, width);
y += inStride;
}
@@ -520,31 +508,18 @@ DECLARE_CONV_FUNC_IMPL(convert_nv12_nv12)
uint8_t *dstY = dst;
uint8_t *dstUV = dstY + height * outStride;
- int line, i;
- __m128i xmm0;
+ int line;
_mm_sfence();
// Copy the data
for (line = 0; line < height; line++) {
- __m128i *dstY128 = (__m128i *)(dstY + outStride * line);
-
- for (i = 0; i < width; i+=16) {
- PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, y+i+0);
- _mm_stream_si128(dstY128++, xmm0);
- }
-
+ PIXCONV_MEMCPY_ALIGNED(dstY + outStride * line, y, width);
y += inStride;
}
for (line = 0; line < chromaHeight; line++) {
- __m128i *dstUV128 = (__m128i *)(dstUV + outStride * line);
-
- for (i = 0; i < width; i+=16) {
- PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, uv+i+0);
- _mm_stream_si128(dstUV128++, xmm0);
- }
-
+ PIXCONV_MEMCPY_ALIGNED(dstUV + outStride * line, uv, width);
uv += inStride;
}