Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mpc-hc/LAVFilters.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Hendrik Leppkes <h.leppkes@gmail.com> 2014-05-24 19:16:48 +0400
committer Hendrik Leppkes <h.leppkes@gmail.com> 2014-05-24 19:47:17 +0400
commit cfb2c60541714d1110b2a257f835e5d4244a421b (patch)
tree 1c69e871f8bb9a1f53771220cfcd33ee0088540e /decoder/LAVVideo/pixconv
parent 55ff38d650aaed710fdce5a8b4d16faaf6bf0763 (diff)
Optimize and simplify SSE2 load/store
Diffstat (limited to 'decoder/LAVVideo/pixconv')
-rw-r--r-- decoder/LAVVideo/pixconv/pixconv_sse2_templates.h 16
-rw-r--r-- decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp 124
2 files changed, 75 insertions, 65 deletions
diff --git a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
index 610ff490..c4aa3c45 100644
--- a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
+++ b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h
@@ -38,6 +38,22 @@
reg = _mm_load_si128((const __m128i *)(src)); /* load (aligned) */ \
reg = _mm_slli_epi16(reg, 16-bpp); /* shift to 16-bit */
+
+// Load 2x8 16-bit pixels into two registers (aligned loads), then left-shift every 16-bit lane by (16 - bpp)
+// reg1 - register receiving the 8 pixels loaded from src1
+// reg2 - register receiving the 8 pixels loaded from src2
+// src1 - memory pointer of the source for reg1 (read with _mm_load_si128)
+// src2 - memory pointer of the source for reg2 (read with _mm_load_si128)
+// bpp - bit depth of the pixels; may be any expression, the shift count is built once and shared by both registers
+#define PIXCONV_LOAD_PIXEL16X2(reg1,reg2,src1,src2,bpp) \
+ { \
+ const __m128i shift = _mm_cvtsi32_si128(16 - (bpp)); \
+ reg1 = _mm_load_si128((const __m128i *)(src1)); \
+ reg2 = _mm_load_si128((const __m128i *)(src2)); \
+ reg1 = _mm_sll_epi16(reg1, shift); \
+ reg2 = _mm_sll_epi16(reg2, shift); \
+ }
+
// Load 8 16-bit pixels into a register, and dither them to 8 bit
// The 8-bit pixels will be in the high-bytes of the 8 16-bit parts
// NOTE: the low-bytes are clobbered, and not empty.
diff --git a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp
index e3da8841..1bef94ad 100644
--- a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp
+++ b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp
@@ -65,8 +65,8 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv_nv12_dither_le)
xmm4 = xmm5 = xmm6 = xmm7;
}
- __m128i *dst128Y = (__m128i *)(dst[0] + line * outYStride);
- const uint16_t *y = (const uint16_t *)(src[0] + line * inYStride);
+ const uint16_t * const y = (const uint16_t *)(src[0] + line * inYStride);
+ uint16_t * const dy = ( uint16_t *)(dst[0] + line * outYStride);
for (i = 0; i < width; i+=32) {
// Load pixels into registers, and apply dithering
@@ -78,18 +78,18 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv_nv12_dither_le)
xmm2 = _mm_packus_epi16(xmm2, xmm3); /* YYYYYYYY */
// Write data back
- _mm_stream_si128(dst128Y++, xmm0);
- _mm_stream_si128(dst128Y++, xmm2);
+ PIXCONV_PUT_STREAM(dy + (i >> 1) + 0, xmm0);
+ PIXCONV_PUT_STREAM(dy + (i >> 1) + 8, xmm2);
}
// Process U/V for chromaHeight lines
if (line < chromaHeight) {
- __m128i *dst128UV = (__m128i *)(dst[1] + line * outUVStride);
- __m128i *dst128U = (__m128i *)(dst[2] + line * outUVStride);
- __m128i *dst128V = (__m128i *)(dst[1] + line * outUVStride);
+ const uint16_t * const u = (const uint16_t *)(src[1] + line * inUVStride);
+ const uint16_t * const v = (const uint16_t *)(src[2] + line * inUVStride);
- const uint16_t *u = (const uint16_t *)(src[1] + line * inUVStride);
- const uint16_t *v = (const uint16_t *)(src[2] + line * inUVStride);
+ uint8_t * const duv = (uint8_t *)(dst[1] + line * outUVStride);
+ uint8_t * const du = (uint8_t *)(dst[2] + line * outUVStride);
+ uint8_t * const dv = (uint8_t *)(dst[1] + line * outUVStride);
for (i = 0; i < chromaWidth; i+=16) {
PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (u+i+0), bpp); /* U0U0U0U0 */
@@ -104,11 +104,11 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv_nv12_dither_le)
xmm0 = _mm_unpacklo_epi8(xmm0, xmm2);
xmm1 = _mm_unpackhi_epi8(xmm1, xmm2);
- _mm_stream_si128(dst128UV++, xmm0);
- _mm_stream_si128(dst128UV++, xmm1);
+ PIXCONV_PUT_STREAM(duv + (i << 1) + 0, xmm0);
+ PIXCONV_PUT_STREAM(duv + (i << 1) + 16, xmm1);
} else {
- _mm_stream_si128(dst128U++, xmm0);
- _mm_stream_si128(dst128V++, xmm2);
+ PIXCONV_PUT_STREAM(du + i, xmm0);
+ PIXCONV_PUT_STREAM(dv + i, xmm2);
}
}
}
@@ -137,36 +137,34 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv420_px1x_le)
// Process Y
for (line = 0; line < height; ++line) {
- __m128i *dst128Y = (__m128i *)(dst[0] + line * outYStride);
- const uint16_t *y = (const uint16_t *)(src[0] + line * inYStride);
+ const uint16_t * const y = (const uint16_t *)(src[0] + line * inYStride);
+ uint16_t * const d = ( uint16_t *)(dst[0] + line * outYStride);
for (i = 0; i < width; i+=16) {
- // Load 8 pixels into register
- PIXCONV_LOAD_PIXEL16(xmm0, (y+i+0), bpp); /* YYYY */
- PIXCONV_LOAD_PIXEL16(xmm1, (y+i+8), bpp); /* YYYY */
+ // Load 2x8 pixels into registers
+ PIXCONV_LOAD_PIXEL16X2(xmm0, xmm1, (y+i+0), (y+i+8), bpp);
// and write them out
- _mm_stream_si128(dst128Y++, xmm0);
- _mm_stream_si128(dst128Y++, xmm1);
+ PIXCONV_PUT_STREAM(d+i+0, xmm0);
+ PIXCONV_PUT_STREAM(d+i+8, xmm1);
}
}
// Process UV
for (line = 0; line < uvHeight; ++line) {
- __m128i *dst128UV = (__m128i *)(dst[1] + line * outUVStride);
- const uint16_t *u = (const uint16_t *)(src[1] + line * inUVStride);
- const uint16_t *v = (const uint16_t *)(src[2] + line * inUVStride);
+ const uint16_t * const u = (const uint16_t *)(src[1] + line * inUVStride);
+ const uint16_t * const v = (const uint16_t *)(src[2] + line * inUVStride);
+ uint16_t * const d = ( uint16_t *)(dst[1] + line * outUVStride);
for (i = 0; i < uvWidth; i+=8) {
// Load 8 pixels into register
- PIXCONV_LOAD_PIXEL16(xmm0, (v+i), bpp); /* VVVV */
- PIXCONV_LOAD_PIXEL16(xmm1, (u+i), bpp); /* UUUU */
+ PIXCONV_LOAD_PIXEL16X2(xmm0, xmm1, (v+i), (u+i), bpp); // Load V and U
xmm2 = xmm0;
xmm0 = _mm_unpacklo_epi16(xmm1, xmm0); /* UVUV */
xmm2 = _mm_unpackhi_epi16(xmm1, xmm2); /* UVUV */
- _mm_stream_si128(dst128UV++, xmm0);
- _mm_stream_si128(dst128UV++, xmm2);
+ PIXCONV_PUT_STREAM(d + (i << 1) + 0, xmm0);
+ PIXCONV_PUT_STREAM(d + (i << 1) + 8, xmm2);
}
}
@@ -249,19 +247,19 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv420_nv12)
// U/V
for(line = 0; line < chromaHeight; ++line) {
- __m128i *dst128UV = (__m128i *)(dst[1] + line * outChromaStride);
- const uint8_t *u = src[1] + line * inChromaStride;
- const uint8_t *v = src[2] + line * inChromaStride;
+ const uint8_t * const u = src[1] + line * inChromaStride;
+ const uint8_t * const v = src[2] + line * inChromaStride;
+ uint8_t * const d = dst[1] + line * outChromaStride;
for (i = 0; i < chromaWidth; i+=16) {
- PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, (v+i)); /* VVVV */
- PIXCONV_LOAD_PIXEL8_ALIGNED(xmm1, (u+i)); /* UUUU */
+ PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, v+i); /* VVVV */
+ PIXCONV_LOAD_PIXEL8_ALIGNED(xmm1, u+i); /* UUUU */
xmm2 = _mm_unpacklo_epi8(xmm1, xmm0); /* UVUV */
xmm3 = _mm_unpackhi_epi8(xmm1, xmm0); /* UVUV */
- _mm_stream_si128(dst128UV++, xmm2);
- _mm_stream_si128(dst128UV++, xmm3);
+ PIXCONV_PUT_STREAM(d + (i << 1) + 0, xmm2);
+ PIXCONV_PUT_STREAM(d + (i << 1) + 16, xmm3);
}
}
@@ -284,17 +282,17 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv422_yuy2_uyvy)
_mm_sfence();
for (line = 0; line < height; ++line) {
- __m128i *dst128 = (__m128i *)(dst[0] + line * outStride);
- const uint8_t *y = src[0] + line * inLumaStride;
- const uint8_t *u = src[1] + line * inChromaStride;
- const uint8_t *v = src[2] + line * inChromaStride;
+ const uint8_t * const y = src[0] + line * inLumaStride;
+ const uint8_t * const u = src[1] + line * inChromaStride;
+ const uint8_t * const v = src[2] + line * inChromaStride;
+ uint8_t * const d = dst[0] + line * outStride;
for (i = 0; i < chromaWidth; i+=16) {
// Load pixels
- PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, (y+(i*2)+0)); /* YYYY */
- PIXCONV_LOAD_PIXEL8_ALIGNED(xmm1, (y+(i*2)+16)); /* YYYY */
- PIXCONV_LOAD_PIXEL8_ALIGNED(xmm2, (u+i)); /* UUUU */
- PIXCONV_LOAD_PIXEL8_ALIGNED(xmm3, (v+i)); /* VVVV */
+ PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, (y+(i<<1)+ 0)); /* YYYY */
+ PIXCONV_LOAD_PIXEL8_ALIGNED(xmm1, (y+(i<<1)+16)); /* YYYY */
+ PIXCONV_LOAD_PIXEL8_ALIGNED(xmm2, (u+i)); /* UUUU */
+ PIXCONV_LOAD_PIXEL8_ALIGNED(xmm3, (v+i)); /* VVVV */
// Interleave Us and Vs
xmm4 = xmm2;
@@ -306,28 +304,24 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv422_yuy2_uyvy)
xmm3 = xmm4;
xmm3 = _mm_unpacklo_epi8(xmm3, xmm0);
xmm4 = _mm_unpackhi_epi8(xmm4, xmm0);
- } else {
- xmm3 = xmm0;
- xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
- xmm4 = _mm_unpackhi_epi8(xmm0, xmm4);
- }
-
- _mm_stream_si128(dst128++, xmm3);
- _mm_stream_si128(dst128++, xmm4);
- // Interlave those with the Ys
- if (uyvy) {
xmm5 = xmm2;
xmm5 = _mm_unpacklo_epi8(xmm5, xmm1);
xmm2 = _mm_unpackhi_epi8(xmm2, xmm1);
} else {
+ xmm3 = xmm0;
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
+ xmm4 = _mm_unpackhi_epi8(xmm0, xmm4);
+
xmm5 = xmm1;
xmm5 = _mm_unpacklo_epi8(xmm5, xmm2);
xmm2 = _mm_unpackhi_epi8(xmm1, xmm2);
}
- _mm_stream_si128(dst128++, xmm5);
- _mm_stream_si128(dst128++, xmm2);
+ PIXCONV_PUT_STREAM(d + (i << 2) +  0, xmm3); /* d is uint8_t*, so offsets are in bytes: 16 per 128-bit store */
+ PIXCONV_PUT_STREAM(d + (i << 2) + 16, xmm4);
+ PIXCONV_PUT_STREAM(d + (i << 2) + 32, xmm5);
+ PIXCONV_PUT_STREAM(d + (i << 2) + 48, xmm2);
}
}
@@ -357,10 +351,10 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv422_yuy2_uyvy_dither_le)
_mm_sfence();
for (line = 0; line < height; ++line) {
- __m128i *dst128 = (__m128i *)(dst[0] + line * outStride);
- const uint16_t *y = (const uint16_t *)(src[0] + line * inLumaStride);
- const uint16_t *u = (const uint16_t *)(src[1] + line * inChromaStride);
- const uint16_t *v = (const uint16_t *)(src[2] + line * inChromaStride);
+ const uint16_t * const y = (const uint16_t *)(src[0] + line * inLumaStride);
+ const uint16_t * const u = (const uint16_t *)(src[1] + line * inChromaStride);
+ const uint16_t * const v = (const uint16_t *)(src[2] + line * inChromaStride);
+ uint16_t * const d = ( uint16_t *)(dst[0] + line * outStride);
// Load dithering coefficients for this line
if (ditherMode == LAVDither_Random) {
@@ -399,8 +393,8 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv422_yuy2_uyvy_dither_le)
xmm2 = _mm_unpackhi_epi8(xmm0, xmm2);
}
- _mm_stream_si128(dst128++, xmm3);
- _mm_stream_si128(dst128++, xmm2);
+ PIXCONV_PUT_STREAM(d + (i << 1) + 0, xmm3);
+ PIXCONV_PUT_STREAM(d + (i << 1) + 8, xmm2);
}
}
@@ -432,9 +426,9 @@ DECLARE_CONV_FUNC_IMPL(convert_nv12_yv12)
}
for (line = 0; line < chromaHeight; line++) {
- __m128i *dstV128 = (__m128i *)(dst[1] + outChromaStride * line);
- __m128i *dstU128 = (__m128i *)(dst[2] + outChromaStride * line);
- const uint8_t *uv = src[1] + line * inChromaStride;
+ const uint8_t * const uv = src[1] + line * inChromaStride;
+ uint8_t * const dv = dst[1] + outChromaStride * line;
+ uint8_t * const du = dst[2] + outChromaStride * line;
for (i = 0; i < width; i+=32) {
PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, uv+i+0);
@@ -452,8 +446,8 @@ DECLARE_CONV_FUNC_IMPL(convert_nv12_yv12)
xmm0 = _mm_packus_epi16(xmm0, xmm1);
xmm2 = _mm_packus_epi16(xmm2, xmm3);
- _mm_stream_si128(dstU128++, xmm0);
- _mm_stream_si128(dstV128++, xmm2);
+ PIXCONV_PUT_STREAM(du + (i>>1), xmm0);
+ PIXCONV_PUT_STREAM(dv + (i>>1), xmm2);
}
}