From cfb2c60541714d1110b2a257f835e5d4244a421b Mon Sep 17 00:00:00 2001 From: Hendrik Leppkes Date: Sat, 24 May 2014 17:16:48 +0200 Subject: Optimize and simplify SSE2 load/store --- decoder/LAVVideo/pixconv/pixconv_sse2_templates.h | 16 +++ decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp | 124 ++++++++++------------ 2 files changed, 75 insertions(+), 65 deletions(-) (limited to 'decoder/LAVVideo/pixconv') diff --git a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h index 610ff490..c4aa3c45 100644 --- a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h +++ b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h @@ -38,6 +38,22 @@ reg = _mm_load_si128((const __m128i *)(src)); /* load (aligned) */ \ reg = _mm_slli_epi16(reg, 16-bpp); /* shift to 16-bit */ + +// Load 2x8 16-bit pixels into two registers, using aligned memory access, and shift both left by (16 - bpp) bits +// reg1 - register receiving the 8 pixels loaded from src1 +// reg2 - register receiving the 8 pixels loaded from src2 +// src1 - source pointer for reg1 (aligned load, must be 16-byte aligned) +// src2 - source pointer for reg2 (aligned load, must be 16-byte aligned) +// bpp - bit depth of the pixels (parenthesized on expansion, so expressions are safe) +#define PIXCONV_LOAD_PIXEL16X2(reg1,reg2,src1,src2,bpp) \ + { \ + const __m128i shift = _mm_cvtsi32_si128(16 - (bpp)); \ + reg1 = _mm_load_si128((const __m128i *)(src1)); \ + reg2 = _mm_load_si128((const __m128i *)(src2)); \ + reg1 = _mm_sll_epi16(reg1, shift); \ + reg2 = _mm_sll_epi16(reg2, shift); \ + } + // Load 8 16-bit pixels into a register, and dither them to 8 bit // The 8-bit pixels will be in the high-bytes of the 8 16-bit parts // NOTE: the low-bytes are clobbered, and not empty. 
diff --git a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp index e3da8841..1bef94ad 100644 --- a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp +++ b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp @@ -65,8 +65,8 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv_nv12_dither_le) xmm4 = xmm5 = xmm6 = xmm7; } - __m128i *dst128Y = (__m128i *)(dst[0] + line * outYStride); - const uint16_t *y = (const uint16_t *)(src[0] + line * inYStride); + const uint16_t * const y = (const uint16_t *)(src[0] + line * inYStride); + uint16_t * const dy = ( uint16_t *)(dst[0] + line * outYStride); for (i = 0; i < width; i+=32) { // Load pixels into registers, and apply dithering @@ -78,18 +78,18 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv_nv12_dither_le) xmm2 = _mm_packus_epi16(xmm2, xmm3); /* YYYYYYYY */ // Write data back - _mm_stream_si128(dst128Y++, xmm0); - _mm_stream_si128(dst128Y++, xmm2); + PIXCONV_PUT_STREAM(dy + (i >> 1) + 0, xmm0); + PIXCONV_PUT_STREAM(dy + (i >> 1) + 8, xmm2); } // Process U/V for chromaHeight lines if (line < chromaHeight) { - __m128i *dst128UV = (__m128i *)(dst[1] + line * outUVStride); - __m128i *dst128U = (__m128i *)(dst[2] + line * outUVStride); - __m128i *dst128V = (__m128i *)(dst[1] + line * outUVStride); + const uint16_t * const u = (const uint16_t *)(src[1] + line * inUVStride); + const uint16_t * const v = (const uint16_t *)(src[2] + line * inUVStride); - const uint16_t *u = (const uint16_t *)(src[1] + line * inUVStride); - const uint16_t *v = (const uint16_t *)(src[2] + line * inUVStride); + uint8_t * const duv = (uint8_t *)(dst[1] + line * outUVStride); + uint8_t * const du = (uint8_t *)(dst[2] + line * outUVStride); + uint8_t * const dv = (uint8_t *)(dst[1] + line * outUVStride); for (i = 0; i < chromaWidth; i+=16) { PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (u+i+0), bpp); /* U0U0U0U0 */ @@ -104,11 +104,11 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv_nv12_dither_le) xmm0 = _mm_unpacklo_epi8(xmm0, xmm2); xmm1 = 
_mm_unpackhi_epi8(xmm1, xmm2); - _mm_stream_si128(dst128UV++, xmm0); - _mm_stream_si128(dst128UV++, xmm1); + PIXCONV_PUT_STREAM(duv + (i << 1) + 0, xmm0); + PIXCONV_PUT_STREAM(duv + (i << 1) + 16, xmm1); } else { - _mm_stream_si128(dst128U++, xmm0); - _mm_stream_si128(dst128V++, xmm2); + PIXCONV_PUT_STREAM(du + i, xmm0); + PIXCONV_PUT_STREAM(dv + i, xmm2); } } } @@ -137,36 +137,34 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv420_px1x_le) // Process Y for (line = 0; line < height; ++line) { - __m128i *dst128Y = (__m128i *)(dst[0] + line * outYStride); - const uint16_t *y = (const uint16_t *)(src[0] + line * inYStride); + const uint16_t * const y = (const uint16_t *)(src[0] + line * inYStride); + uint16_t * const d = ( uint16_t *)(dst[0] + line * outYStride); for (i = 0; i < width; i+=16) { - // Load 8 pixels into register - PIXCONV_LOAD_PIXEL16(xmm0, (y+i+0), bpp); /* YYYY */ - PIXCONV_LOAD_PIXEL16(xmm1, (y+i+8), bpp); /* YYYY */ + // Load 2x8 pixels into registers + PIXCONV_LOAD_PIXEL16X2(xmm0, xmm1, (y+i+0), (y+i+8), bpp); // and write them out - _mm_stream_si128(dst128Y++, xmm0); - _mm_stream_si128(dst128Y++, xmm1); + PIXCONV_PUT_STREAM(d+i+0, xmm0); + PIXCONV_PUT_STREAM(d+i+8, xmm1); } } // Process UV for (line = 0; line < uvHeight; ++line) { - __m128i *dst128UV = (__m128i *)(dst[1] + line * outUVStride); - const uint16_t *u = (const uint16_t *)(src[1] + line * inUVStride); - const uint16_t *v = (const uint16_t *)(src[2] + line * inUVStride); + const uint16_t * const u = (const uint16_t *)(src[1] + line * inUVStride); + const uint16_t * const v = (const uint16_t *)(src[2] + line * inUVStride); + uint16_t * const d = ( uint16_t *)(dst[1] + line * outUVStride); for (i = 0; i < uvWidth; i+=8) { // Load 8 pixels into register - PIXCONV_LOAD_PIXEL16(xmm0, (v+i), bpp); /* VVVV */ - PIXCONV_LOAD_PIXEL16(xmm1, (u+i), bpp); /* UUUU */ + PIXCONV_LOAD_PIXEL16X2(xmm0, xmm1, (v+i), (u+i), bpp); // Load V and U xmm2 = xmm0; xmm0 = _mm_unpacklo_epi16(xmm1, xmm0); /* UVUV */ xmm2 = 
_mm_unpackhi_epi16(xmm1, xmm2); /* UVUV */ - _mm_stream_si128(dst128UV++, xmm0); - _mm_stream_si128(dst128UV++, xmm2); + PIXCONV_PUT_STREAM(d + (i << 1) + 0, xmm0); + PIXCONV_PUT_STREAM(d + (i << 1) + 8, xmm2); } } @@ -249,19 +247,19 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv420_nv12) // U/V for(line = 0; line < chromaHeight; ++line) { - __m128i *dst128UV = (__m128i *)(dst[1] + line * outChromaStride); - const uint8_t *u = src[1] + line * inChromaStride; - const uint8_t *v = src[2] + line * inChromaStride; + const uint8_t * const u = src[1] + line * inChromaStride; + const uint8_t * const v = src[2] + line * inChromaStride; + uint8_t * const d = dst[1] + line * outChromaStride; for (i = 0; i < chromaWidth; i+=16) { - PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, (v+i)); /* VVVV */ - PIXCONV_LOAD_PIXEL8_ALIGNED(xmm1, (u+i)); /* UUUU */ + PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, v+i); /* VVVV */ + PIXCONV_LOAD_PIXEL8_ALIGNED(xmm1, u+i); /* UUUU */ xmm2 = _mm_unpacklo_epi8(xmm1, xmm0); /* UVUV */ xmm3 = _mm_unpackhi_epi8(xmm1, xmm0); /* UVUV */ - _mm_stream_si128(dst128UV++, xmm2); - _mm_stream_si128(dst128UV++, xmm3); + PIXCONV_PUT_STREAM(d + (i << 1) + 0, xmm2); + PIXCONV_PUT_STREAM(d + (i << 1) + 16, xmm3); } } @@ -284,17 +282,17 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv422_yuy2_uyvy) _mm_sfence(); for (line = 0; line < height; ++line) { - __m128i *dst128 = (__m128i *)(dst[0] + line * outStride); - const uint8_t *y = src[0] + line * inLumaStride; - const uint8_t *u = src[1] + line * inChromaStride; - const uint8_t *v = src[2] + line * inChromaStride; + const uint8_t * const y = src[0] + line * inLumaStride; + const uint8_t * const u = src[1] + line * inChromaStride; + const uint8_t * const v = src[2] + line * inChromaStride; + uint8_t * const d = dst[0] + line * outStride; for (i = 0; i < chromaWidth; i+=16) { // Load pixels - PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, (y+(i*2)+0)); /* YYYY */ - PIXCONV_LOAD_PIXEL8_ALIGNED(xmm1, (y+(i*2)+16)); /* YYYY */ - PIXCONV_LOAD_PIXEL8_ALIGNED(xmm2, (u+i)); /* 
UUUU */ - PIXCONV_LOAD_PIXEL8_ALIGNED(xmm3, (v+i)); /* VVVV */ + PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, (y+(i<<1)+ 0)); /* YYYY */ + PIXCONV_LOAD_PIXEL8_ALIGNED(xmm1, (y+(i<<1)+16)); /* YYYY */ + PIXCONV_LOAD_PIXEL8_ALIGNED(xmm2, (u+i)); /* UUUU */ + PIXCONV_LOAD_PIXEL8_ALIGNED(xmm3, (v+i)); /* VVVV */ // Interleave Us and Vs xmm4 = xmm2; @@ -306,28 +304,24 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv422_yuy2_uyvy) xmm3 = xmm4; xmm3 = _mm_unpacklo_epi8(xmm3, xmm0); xmm4 = _mm_unpackhi_epi8(xmm4, xmm0); - } else { - xmm3 = xmm0; - xmm3 = _mm_unpacklo_epi8(xmm3, xmm4); - xmm4 = _mm_unpackhi_epi8(xmm0, xmm4); - } - - _mm_stream_si128(dst128++, xmm3); - _mm_stream_si128(dst128++, xmm4); - // Interlave those with the Ys - if (uyvy) { xmm5 = xmm2; xmm5 = _mm_unpacklo_epi8(xmm5, xmm1); xmm2 = _mm_unpackhi_epi8(xmm2, xmm1); } else { + xmm3 = xmm0; + xmm3 = _mm_unpacklo_epi8(xmm3, xmm4); + xmm4 = _mm_unpackhi_epi8(xmm0, xmm4); + xmm5 = xmm1; xmm5 = _mm_unpacklo_epi8(xmm5, xmm2); xmm2 = _mm_unpackhi_epi8(xmm1, xmm2); } - _mm_stream_si128(dst128++, xmm5); - _mm_stream_si128(dst128++, xmm2); + PIXCONV_PUT_STREAM(d + (i << 2) + 0, xmm3); /* d is uint8_t*: offsets are bytes, one 128-bit store = 16 bytes */ + PIXCONV_PUT_STREAM(d + (i << 2) + 16, xmm4); + PIXCONV_PUT_STREAM(d + (i << 2) + 32, xmm5); + PIXCONV_PUT_STREAM(d + (i << 2) + 48, xmm2); /* 4 stores cover the 64 output bytes per 16 chroma samples */ } } @@ -357,10 +351,10 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv422_yuy2_uyvy_dither_le) _mm_sfence(); for (line = 0; line < height; ++line) { - __m128i *dst128 = (__m128i *)(dst[0] + line * outStride); - const uint16_t *y = (const uint16_t *)(src[0] + line * inLumaStride); - const uint16_t *u = (const uint16_t *)(src[1] + line * inChromaStride); - const uint16_t *v = (const uint16_t *)(src[2] + line * inChromaStride); + const uint16_t * const y = (const uint16_t *)(src[0] + line * inLumaStride); + const uint16_t * const u = (const uint16_t *)(src[1] + line * inChromaStride); + const uint16_t * const v = (const uint16_t *)(src[2] + line * inChromaStride); + uint16_t * const d = ( uint16_t *)(dst[0] + line * outStride); // 
Load dithering coefficients for this line if (ditherMode == LAVDither_Random) { @@ -399,8 +393,8 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv422_yuy2_uyvy_dither_le) xmm2 = _mm_unpackhi_epi8(xmm0, xmm2); } - _mm_stream_si128(dst128++, xmm3); - _mm_stream_si128(dst128++, xmm2); + PIXCONV_PUT_STREAM(d + (i << 1) + 0, xmm3); + PIXCONV_PUT_STREAM(d + (i << 1) + 8, xmm2); } } @@ -432,9 +426,9 @@ DECLARE_CONV_FUNC_IMPL(convert_nv12_yv12) } for (line = 0; line < chromaHeight; line++) { - __m128i *dstV128 = (__m128i *)(dst[1] + outChromaStride * line); - __m128i *dstU128 = (__m128i *)(dst[2] + outChromaStride * line); - const uint8_t *uv = src[1] + line * inChromaStride; + const uint8_t * const uv = src[1] + line * inChromaStride; + uint8_t * const dv = dst[1] + outChromaStride * line; + uint8_t * const du = dst[2] + outChromaStride * line; for (i = 0; i < width; i+=32) { PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, uv+i+0); @@ -452,8 +446,8 @@ DECLARE_CONV_FUNC_IMPL(convert_nv12_yv12) xmm0 = _mm_packus_epi16(xmm0, xmm1); xmm2 = _mm_packus_epi16(xmm2, xmm3); - _mm_stream_si128(dstU128++, xmm0); - _mm_stream_si128(dstV128++, xmm2); + PIXCONV_PUT_STREAM(du + (i>>1), xmm0); + PIXCONV_PUT_STREAM(dv + (i>>1), xmm2); } } -- cgit v1.2.3