diff options
author | Hendrik Leppkes <h.leppkes@gmail.com> | 2012-07-05 15:57:12 +0400 |
---|---|---|
committer | Hendrik Leppkes <h.leppkes@gmail.com> | 2012-07-05 18:16:48 +0400 |
commit | 691f56aa69f9f7efc7641932281b9aaa62ef865a (patch) | |
tree | 55f7ee75e61f7a7dce0da2b686f399897779a883 /decoder/LAVVideo/pixconv | |
parent | 2bf951f323555e448255e6959d30b0a95aa39004 (diff) |
Unify shift/bpp parameters of SSE2 pixel loading macros
Diffstat (limited to 'decoder/LAVVideo/pixconv')
-rw-r--r-- | decoder/LAVVideo/pixconv/interleave.cpp | 9 | ||||
-rw-r--r-- | decoder/LAVVideo/pixconv/pixconv_sse2_templates.h | 36 | ||||
-rw-r--r-- | decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp | 49 | ||||
-rw-r--r-- | decoder/LAVVideo/pixconv/yuv444_ayuv.cpp | 9 |
4 files changed, 48 insertions, 55 deletions
diff --git a/decoder/LAVVideo/pixconv/interleave.cpp b/decoder/LAVVideo/pixconv/interleave.cpp index cbd8773a..00280dff 100644 --- a/decoder/LAVVideo/pixconv/interleave.cpp +++ b/decoder/LAVVideo/pixconv/interleave.cpp @@ -47,9 +47,12 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv444_y410) __m128i *dst128 = (__m128i *)(dst + line * outStride); for (i = 0; i < width; i+=8) { - PIXCONV_LOAD_PIXEL16(xmm0, (y+i), shift); - PIXCONV_LOAD_PIXEL16(xmm1, (u+i), shift); - PIXCONV_LOAD_PIXEL16(xmm2, (v+i), shift+4); // +4 so its directly aligned properly (data from bit 14 to bit 4) + PIXCONV_LOAD_PIXEL8_ALIGNED(xmm0, (y+i)); + xmm0 = _mm_slli_epi16(xmm0, shift); + PIXCONV_LOAD_PIXEL8_ALIGNED(xmm1, (u+i)); + xmm1 = _mm_slli_epi16(xmm1, shift); + PIXCONV_LOAD_PIXEL8_ALIGNED(xmm2, (v+i)); + xmm2 = _mm_slli_epi16(xmm2, shift+4); // +4 so its directly aligned properly (data from bit 14 to bit 4) xmm3 = _mm_unpacklo_epi16(xmm1, xmm2); // 0VVVVV00000UUUUU xmm4 = _mm_unpackhi_epi16(xmm1, xmm2); // 0VVVVV00000UUUUU diff --git a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h index d5d82e82..366e87c2 100644 --- a/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h +++ b/decoder/LAVVideo/pixconv/pixconv_sse2_templates.h @@ -19,6 +19,8 @@ #pragma once +#include <emmintrin.h> + // Load the dithering coefficients for this line // reg - register to load coefficients into // line - index of line to process (0 based) @@ -28,16 +30,23 @@ reg = _mm_load_si128((const __m128i *)name); \ reg = _mm_srli_epi16(reg, 8-bits); /* shift to the required dithering strength */ +// Load 8 16-bit pixels into a register, using aligned memory access +// reg - register to store pixels in +// src - memory pointer of the source +// bpp - bit depth of the pixels +#define PIXCONV_LOAD_PIXEL16(reg,src,bpp) \ + reg = _mm_load_si128((const __m128i *)(src)); /* load (aligned) */ \ + reg = _mm_slli_epi16(reg, 16-bpp); /* shift to 16-bit */ + // Load 8 16-bit pixels into a register, and dither them to 8 bit // The 8-bit pixels will be in the high-bytes of the 8 16-bit parts // NOTE: the low-bytes are clobbered, and not empty. // reg - register to store pixels in // dreg - register with dithering coefficients // src - memory pointer of the source -// shift - shift offset to 8-bit (ie. 2 for 10bit) -#define PIXCONV_LOAD_PIXEL16_DITHER_HIGH(reg,dreg,src,shift) \ - reg = _mm_load_si128((const __m128i *)(src)); /* load (aligned) */ \ - reg = _mm_slli_epi16(reg, 8-shift); /* shift to 16-bit */ \ +// bpp - bit depth of the pixels +#define PIXCONV_LOAD_PIXEL16_DITHER_HIGH(reg,dreg,src,bpp) \ + PIXCONV_LOAD_PIXEL16(reg,src,bpp) \ reg = _mm_adds_epu16(reg, dreg); /* dither */ // Load 8 16-bit pixels into a register, and dither them to 8 bit @@ -45,9 +54,9 @@ // reg - register to store pixels in // dreg - register with dithering coefficients // src - memory pointer of the source -// shift - shift offset to 8-bit (ie. 2 for 10bit) -#define PIXCONV_LOAD_PIXEL16_DITHER(reg,dreg,src,shift) \ - PIXCONV_LOAD_PIXEL16_DITHER_HIGH(reg,dreg,src,shift) \ +// bpp - bit depth of the pixels +#define PIXCONV_LOAD_PIXEL16_DITHER(reg,dreg,src,bpp) \ + PIXCONV_LOAD_PIXEL16_DITHER_HIGH(reg,dreg,src,bpp) \ reg = _mm_srli_epi16(reg, 8); /* shift to 8-bit */ // Load 8 16-bit pixels into a register, and dither them to 8 bit @@ -55,9 +64,9 @@ // reg - register to store pixels in // dreg - register with dithering coefficients // src - memory pointer of the source -// shift - shift offset to 16-bit (ie. 6 for 10bit) -#define PIXCONV_LOAD_PIXEL16_DITHER_PACKED(reg,dreg,zero,src,shift) \ - PIXCONV_LOAD_PIXEL16_DITHER(reg,dreg,src,shift) /* load unpacked */ \ +// bpp - bit depth of the pixels +#define PIXCONV_LOAD_PIXEL16_DITHER_PACKED(reg,dreg,zero,src,bpp) \ + PIXCONV_LOAD_PIXEL16_DITHER(reg,dreg,src,bpp) /* load unpacked */ \ reg = _mm_packus_epi16(reg, zero); /* pack */ // Load 16 8-bit pixels into a register @@ -72,13 +81,6 @@ #define PIXCONV_LOAD_PIXEL8_ALIGNED(reg,src) \ reg = _mm_load_si128((const __m128i *)(src)); /* load (aligned) */ -// Load 8 16-bit pixels into a register, using aligned memory access -// reg - register to store pixels in -// src - memory pointer of the source -#define PIXCONV_LOAD_PIXEL16(reg,src,shift) \ - reg = _mm_load_si128((const __m128i *)(src)); /* load (aligned) */ \ - reg = _mm_slli_epi16(reg, shift); /* shift to 16-bit */ - // Load 4 8-bit pixels into the register // reg - register to store pixels in // src - source memory diff --git a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp index c3c9a16e..c91d6f63 100644 --- a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp +++ b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp @@ -34,8 +34,6 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv_nv12_dither_le) const int inYStride = srcStride[0] >> 1; const int inUVStride = srcStride[1] >> 1; - const int shift = bpp - 8; - int outLumaStride = dstStride; int outChromaStride = dstStride; int chromaWidth = width; @@ -80,12 +78,12 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv_nv12_dither_le) for (i = 0; i < width; i+=32) { // Load pixels into registers, and apply dithering - PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (y+i+ 0), shift); /* Y0Y0Y0Y0 */ - PIXCONV_LOAD_PIXEL16_DITHER(xmm1, xmm5, (y+i+ 8), shift); /* Y0Y0Y0Y0 */ - PIXCONV_LOAD_PIXEL16_DITHER(xmm2, xmm6, (y+i+16), shift); /* Y0Y0Y0Y0 */ - PIXCONV_LOAD_PIXEL16_DITHER(xmm3, xmm7, (y+i+24), shift); /* Y0Y0Y0Y0 */ - xmm0 = _mm_packus_epi16(xmm0, xmm1); /* YYYYYYYY */ - xmm2 = _mm_packus_epi16(xmm2, xmm3); /* YYYYYYYY */ + PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (y+i+ 0), bpp); /* Y0Y0Y0Y0 */ + PIXCONV_LOAD_PIXEL16_DITHER(xmm1, xmm5, (y+i+ 8), bpp); /* Y0Y0Y0Y0 */ + PIXCONV_LOAD_PIXEL16_DITHER(xmm2, xmm6, (y+i+16), bpp); /* Y0Y0Y0Y0 */ + PIXCONV_LOAD_PIXEL16_DITHER(xmm3, xmm7, (y+i+24), bpp); /* Y0Y0Y0Y0 */ + xmm0 = _mm_packus_epi16(xmm0, xmm1); /* YYYYYYYY */ + xmm2 = _mm_packus_epi16(xmm2, xmm3); /* YYYYYYYY */ // Write data back _mm_stream_si128(dst128Y++, xmm0); @@ -99,13 +97,13 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv_nv12_dither_le) __m128i *dst128V = (__m128i *)(dstV + line * outChromaStride); for (i = 0; i < chromaWidth; i+=16) { - PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (u+i), shift); /* U0U0U0U0 */ - PIXCONV_LOAD_PIXEL16_DITHER(xmm1, xmm5, (u+i+8), shift); /* U0U0U0U0 */ - PIXCONV_LOAD_PIXEL16_DITHER(xmm2, xmm6, (v+i), shift); /* V0V0V0V0 */ - PIXCONV_LOAD_PIXEL16_DITHER(xmm3, xmm7, (v+i+8), shift); /* V0V0V0V0 */ + PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (u+i+0), bpp); /* U0U0U0U0 */ + PIXCONV_LOAD_PIXEL16_DITHER(xmm1, xmm5, (u+i+8), bpp); /* U0U0U0U0 */ + PIXCONV_LOAD_PIXEL16_DITHER(xmm2, xmm6, (v+i+0), bpp); /* V0V0V0V0 */ + PIXCONV_LOAD_PIXEL16_DITHER(xmm3, xmm7, (v+i+8), bpp); /* V0V0V0V0 */ - xmm0 = _mm_packus_epi16(xmm0, xmm1); /* UUUUUUUU */ - xmm2 = _mm_packus_epi16(xmm2, xmm3); /* VVVVVVVV */ + xmm0 = _mm_packus_epi16(xmm0, xmm1); /* UUUUUUUU */ + xmm2 = _mm_packus_epi16(xmm2, xmm3); /* VVVVVVVV */ if (nv12) { xmm1 = xmm0; xmm0 = _mm_unpacklo_epi8(xmm0, xmm2); @@ -133,7 +131,6 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv_nv12_dither_le) template HRESULT CLAVPixFmtConverter::convert_yuv_yv_nv12_dither_le<0>CONV_FUNC_PARAMS; template HRESULT CLAVPixFmtConverter::convert_yuv_yv_nv12_dither_le<1>CONV_FUNC_PARAMS; -template <int shift> DECLARE_CONV_FUNC_IMPL(convert_yuv420_px1x_le) { const uint16_t *y = (const uint16_t *)src[0]; @@ -157,8 +154,8 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv420_px1x_le) for (i = 0; i < width; i+=16) { // Load 8 pixels into register - PIXCONV_LOAD_PIXEL16(xmm0, (y+i), shift); /* YYYY */ - PIXCONV_LOAD_PIXEL16(xmm1, (y+i+8), shift); /* YYYY */ + PIXCONV_LOAD_PIXEL16(xmm0, (y+i+0), bpp); /* YYYY */ + PIXCONV_LOAD_PIXEL16(xmm1, (y+i+8), bpp); /* YYYY */ // and write them out _mm_stream_si128(dst128Y++, xmm0); _mm_stream_si128(dst128Y++, xmm1); @@ -175,8 +172,8 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv420_px1x_le) for (i = 0; i < uvWidth; i+=8) { // Load 8 pixels into register - PIXCONV_LOAD_PIXEL16(xmm0, (v+i), shift); /* VVVV */ - PIXCONV_LOAD_PIXEL16(xmm1, (u+i), shift); /* UUUU */ + PIXCONV_LOAD_PIXEL16(xmm0, (v+i), bpp); /* VVVV */ + PIXCONV_LOAD_PIXEL16(xmm1, (u+i), bpp); /* UUUU */ xmm2 = xmm0; xmm0 = _mm_unpacklo_epi16(xmm1, xmm0); /* UVUV */ @@ -193,11 +190,6 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv420_px1x_le) return S_OK; } -// Force creation of these two variants -template HRESULT CLAVPixFmtConverter::convert_yuv420_px1x_le<0>CONV_FUNC_PARAMS; -template HRESULT CLAVPixFmtConverter::convert_yuv420_px1x_le<6>CONV_FUNC_PARAMS; -template HRESULT CLAVPixFmtConverter::convert_yuv420_px1x_le<7>CONV_FUNC_PARAMS; - DECLARE_CONV_FUNC_IMPL(convert_yuv_yv) { const uint8_t *y = src[0]; @@ -396,7 +388,6 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv422_yuy2_uyvy_dither_le) const int inChromaStride = srcStride[1] >> 1; const int outStride = dstStride << 1; const int chromaWidth = (width + 1) >> 1; - const int shift = bpp - 8; LAVDitherMode ditherMode = m_pSettings->GetDitherMode(); const uint16_t *dithers = GetRandomDitherCoeffs(height, 4, 8, 0); @@ -424,10 +415,10 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv422_yuy2_uyvy_dither_le) for (i = 0; i < chromaWidth; i+=8) { // Load pixels - PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (y+(i*2)+0), shift); /* YYYY */ - PIXCONV_LOAD_PIXEL16_DITHER(xmm1, xmm5, (y+(i*2)+8), shift); /* YYYY */ - PIXCONV_LOAD_PIXEL16_DITHER(xmm2, xmm6, (u+i), shift); /* UUUU */ - PIXCONV_LOAD_PIXEL16_DITHER(xmm3, xmm7, (v+i), shift); /* VVVV */ + PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (y+(i*2)+0), bpp); /* YYYY */ + PIXCONV_LOAD_PIXEL16_DITHER(xmm1, xmm5, (y+(i*2)+8), bpp); /* YYYY */ + PIXCONV_LOAD_PIXEL16_DITHER(xmm2, xmm6, (u+i), bpp); /* UUUU */ + PIXCONV_LOAD_PIXEL16_DITHER(xmm3, xmm7, (v+i), bpp); /* VVVV */ // Pack Ys xmm0 = _mm_packus_epi16(xmm0, xmm1); diff --git a/decoder/LAVVideo/pixconv/yuv444_ayuv.cpp b/decoder/LAVVideo/pixconv/yuv444_ayuv.cpp index 0c3f328f..926fe3be 100644 --- a/decoder/LAVVideo/pixconv/yuv444_ayuv.cpp +++ b/decoder/LAVVideo/pixconv/yuv444_ayuv.cpp @@ -102,9 +102,6 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv444_ayuv_dither_le) if (dithers == NULL) ditherMode = LAVDither_Ordered; - // Number of bits to shift to reach 8 - int shift = bpp - 8; - int line, i; __m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7; @@ -128,9 +125,9 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv444_ayuv_dither_le) for (i = 0; i < width; i+=8) { // Load pixels into registers, and apply dithering - PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (y+i), shift); /* Y0Y0Y0Y0 */ - PIXCONV_LOAD_PIXEL16_DITHER_HIGH(xmm1, xmm5, (u+i), shift); /* U0U0U0U0 */ - PIXCONV_LOAD_PIXEL16_DITHER(xmm2, xmm6, (v+i), shift); /* V0V0V0V0 */ + PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (y+i), bpp); /* Y0Y0Y0Y0 */ + PIXCONV_LOAD_PIXEL16_DITHER_HIGH(xmm1, xmm5, (u+i), bpp); /* U0U0U0U0 */ + PIXCONV_LOAD_PIXEL16_DITHER(xmm2, xmm6, (v+i), bpp); /* V0V0V0V0 */ // Interlave into AYUV xmm0 = _mm_or_si128(xmm0, xmm7); /* YAYAYAYA */ |