From 4724973327077d99e73460531986effe97b5504a Mon Sep 17 00:00:00 2001 From: Hendrik Leppkes Date: Thu, 9 May 2013 17:25:07 +0200 Subject: Add SSSE3 RGB48->RGB32/RGB24 converter --- decoder/LAVVideo/LAVPixFmtConverter.cpp | 7 +- decoder/LAVVideo/LAVPixFmtConverter.h | 3 + decoder/LAVVideo/LAVVideo.vcxproj | 1 + decoder/LAVVideo/LAVVideo.vcxproj.filters | 3 + decoder/LAVVideo/pixconv/rgb2rgb_unscaled.cpp | 130 ++++++++++++++++++++++++++ 5 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 decoder/LAVVideo/pixconv/rgb2rgb_unscaled.cpp (limited to 'decoder') diff --git a/decoder/LAVVideo/LAVPixFmtConverter.cpp b/decoder/LAVVideo/LAVPixFmtConverter.cpp index 6362e870..b60540f6 100644 --- a/decoder/LAVVideo/LAVPixFmtConverter.cpp +++ b/decoder/LAVVideo/LAVPixFmtConverter.cpp @@ -45,7 +45,7 @@ * RGB24 - - - - - - - - - - - - - - - x - * RGB32 - - - - - - - - - - - - - - - - x * ARGB32 - - - - - - - - - - - - - - - - x - * RGB48 - - - - - - - - - - - - - - - - - + * RGB48 - - - - - - - - - - - - - - - x x * * Every processing path has a swscale fallback (even those with a "-" above), every combination of input/output is possible, just not optimized (ugly and/or slow) */ @@ -322,6 +322,11 @@ void CLAVPixFmtConverter::SelectConvertFunction() || (m_OutputPixFmt == LAVOutPixFmt_RGB24 && m_InputPixFmt == LAVPixFmt_RGB24)) { convert = &CLAVPixFmtConverter::plane_copy; m_RequiredAlignment = 0; + } else if (m_InputPixFmt == LAVPixFmt_RGB48 && (m_OutputPixFmt == LAVOutPixFmt_RGB32 || m_OutputPixFmt == LAVOutPixFmt_RGB24) && (cpu & AV_CPU_FLAG_SSSE3)) { + if (m_OutputPixFmt == LAVOutPixFmt_RGB32) + convert = &CLAVPixFmtConverter::convert_rgb48_rgb32_ssse3; + else + convert = &CLAVPixFmtConverter::convert_rgb48_rgb24_ssse3; } else if (cpu & AV_CPU_FLAG_SSE2) { if (m_OutputPixFmt == LAVOutPixFmt_AYUV && m_InputPixFmt == LAVPixFmt_YUV444bX) { convert = &CLAVPixFmtConverter::convert_yuv444_ayuv_dither_le; diff --git a/decoder/LAVVideo/LAVPixFmtConverter.h 
b/decoder/LAVVideo/LAVPixFmtConverter.h index 7ac3e228..1b4895bf 100644 --- a/decoder/LAVVideo/LAVPixFmtConverter.h +++ b/decoder/LAVVideo/LAVPixFmtConverter.h @@ -143,6 +143,9 @@ private: template DECLARE_CONV_FUNC(convert_yuv422_yuy2_uyvy_dither_le); template DECLARE_CONV_FUNC(convert_yuv_yv_nv12_dither_le); + DECLARE_CONV_FUNC(convert_rgb48_rgb32_ssse3); + DECLARE_CONV_FUNC(convert_rgb48_rgb24_ssse3); + template DECLARE_CONV_FUNC(convert_yuv_rgb); RGBCoeffs* getRGBCoeffs(int width, int height); const uint16_t* GetRandomDitherCoeffs(int height, int coeffs, int bits, int line); diff --git a/decoder/LAVVideo/LAVVideo.vcxproj b/decoder/LAVVideo/LAVVideo.vcxproj index 8e4c760a..01678879 100644 --- a/decoder/LAVVideo/LAVVideo.vcxproj +++ b/decoder/LAVVideo/LAVVideo.vcxproj @@ -116,6 +116,7 @@ + diff --git a/decoder/LAVVideo/LAVVideo.vcxproj.filters b/decoder/LAVVideo/LAVVideo.vcxproj.filters index 4433b291..7164bbf5 100644 --- a/decoder/LAVVideo/LAVVideo.vcxproj.filters +++ b/decoder/LAVVideo/LAVVideo.vcxproj.filters @@ -153,6 +153,9 @@ Source Files + + Source Files\pixconv + diff --git a/decoder/LAVVideo/pixconv/rgb2rgb_unscaled.cpp b/decoder/LAVVideo/pixconv/rgb2rgb_unscaled.cpp new file mode 100644 index 00000000..c7219659 --- /dev/null +++ b/decoder/LAVVideo/pixconv/rgb2rgb_unscaled.cpp @@ -0,0 +1,130 @@ +/* + * Copyright (C) 2010-2013 Hendrik Leppkes + * http://www.1f0.de + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "stdafx.h"
+
+#include <tmmintrin.h>
+
+#include "pixconv_internal.h"
+#include "pixconv_sse2_templates.h"
+
+/*
+ * Convert packed 16-bit-per-component RGB (three uint16_t per pixel) into
+ * packed 4-byte pixels.
+ *
+ * Each inner iteration consumes 24 input words (8 pixels) and emits two
+ * 16-byte non-temporal stores (8 output pixels). The component order of the
+ * source is preserved; the fourth byte of every output pixel is written as 0.
+ *
+ * The dithering coefficients are added with unsigned saturation before the
+ * values are reduced to 8 bits with a right shift by 8.
+ *
+ * NOTE(review): the inner loop always works on whole groups of 8 pixels, so
+ * for widths that are not a multiple of 8 it touches memory past width * 3
+ * words / width * 4 bytes - presumably the buffers are padded and aligned
+ * accordingly; confirm against the callers.
+ *
+ * Always returns S_OK.
+ */
+DECLARE_CONV_FUNC_IMPL(convert_rgb48_rgb32_ssse3)
+{
+  const uint16_t *rgb = (const uint16_t *)src[0];
+  const ptrdiff_t inStride = srcStride[0] >> 1;   /* source stride in 16-bit words */
+  const ptrdiff_t outStride = dstStride * 4;      /* 4 output bytes per pixel */
+  ptrdiff_t line, i;
+
+  int processWidth = width * 3;                   /* input words per line */
+
+  LAVDitherMode ditherMode = m_pSettings->GetDitherMode();
+  const uint16_t *dithers = GetRandomDitherCoeffs(height, 4, 8, 0);
+  if (dithers == NULL)
+    ditherMode = LAVDither_Ordered;
+
+  __m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;
+  /* Spread two 3-word pixels over two 4-word slots; the -1 entries zero the 4th word */
+  __m128i mask = _mm_setr_epi8(0,1,2,3,4,5,-1,-1,6,7,8,9,10,11,-1,-1);
+
+  _mm_sfence();
+  for (line = 0; line < height; line++) {
+    __m128i *dst128 = (__m128i *)(dst + line * outStride);
+
+    // Load dithering coefficients for this line
+    if (ditherMode == LAVDither_Random) {
+      /* 32 coefficient words are reserved per line, the first 24 are used */
+      xmm5 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 0));
+      xmm6 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 8));
+      xmm7 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 16));
+    } else {
+      PIXCONV_LOAD_DITHER_COEFFS(xmm7,line,8,dithers);
+      xmm5 = xmm6 = xmm7;
+    }
+    for (i = 0; i < processWidth; i += 24) {
+      /* The intrinsic returns the sum and leaves its operands untouched,
+         so the result has to be assigned back for the dithering to apply. */
+      PIXCONV_LOAD_ALIGNED(xmm0, (rgb + i));      /* load */
+      xmm0 = _mm_adds_epu16(xmm0, xmm5);          /* apply dithering coefficients */
+      xmm0 = _mm_srli_epi16(xmm0, 8);             /* shift to 8-bit */
+      PIXCONV_LOAD_ALIGNED(xmm1, (rgb + i + 8));  /* load */
+      xmm1 = _mm_adds_epu16(xmm1, xmm6);          /* apply dithering coefficients */
+      xmm1 = _mm_srli_epi16(xmm1, 8);             /* shift to 8-bit */
+      PIXCONV_LOAD_ALIGNED(xmm2, (rgb + i + 16)); /* load */
+      xmm2 = _mm_adds_epu16(xmm2, xmm7);          /* apply dithering coefficients */
+      xmm2 = _mm_srli_epi16(xmm2, 8);             /* shift to 8-bit */
+
+      /* Regroup the 24 words into four registers holding two padded pixels each */
+      xmm3 = _mm_shuffle_epi8(xmm0, mask);                            /* pixels 0-1 */
+      xmm4 = _mm_shuffle_epi8(_mm_alignr_epi8(xmm1, xmm0, 12), mask); /* pixels 2-3 */
+      xmm0 = _mm_shuffle_epi8(_mm_alignr_epi8(xmm2, xmm1, 8), mask);  /* pixels 4-5 */
+      xmm1 = _mm_shuffle_epi8(_mm_alignr_epi8(xmm2, xmm2, 4), mask);  /* pixels 6-7 */
+
+      /* Narrow the 16-bit lanes to bytes: 4 pixels per register */
+      xmm3 = _mm_packus_epi16(xmm3, xmm4);
+      xmm0 = _mm_packus_epi16(xmm0, xmm1);
+
+      _mm_stream_si128(dst128++, xmm3);
+      _mm_stream_si128(dst128++, xmm0);
+    }
+
+    rgb += inStride;
+  }
+
+  return S_OK;
+}
+
+/*
+ * Convert packed 16-bit-per-component RGB (three uint16_t per pixel) into
+ * packed 3-byte pixels.
+ *
+ * No component reordering is needed, so each inner iteration simply reduces
+ * 16 input words to 16 output bytes: add the dithering coefficients with
+ * unsigned saturation, shift right by 8 and pack to bytes.
+ *
+ * NOTE(review): the inner loop works on whole groups of 16 words, so it can
+ * touch memory past width * 3 components at the end of a line - presumably
+ * the buffers are padded and aligned accordingly; confirm against the callers.
+ *
+ * Always returns S_OK.
+ */
+DECLARE_CONV_FUNC_IMPL(convert_rgb48_rgb24_ssse3)
+{
+  const uint16_t *rgb = (const uint16_t *)src[0];
+  const ptrdiff_t inStride = srcStride[0] >> 1;   /* source stride in 16-bit words */
+  const ptrdiff_t outStride = dstStride * 3;      /* 3 output bytes per pixel */
+  ptrdiff_t line, i;
+
+  int processWidth = width * 3;                   /* input words per line */
+
+  LAVDitherMode ditherMode = m_pSettings->GetDitherMode();
+  const uint16_t *dithers = GetRandomDitherCoeffs(height, 2, 8, 0);
+  if (dithers == NULL)
+    ditherMode = LAVDither_Ordered;
+
+  __m128i xmm0,xmm1,xmm6,xmm7;
+
+  _mm_sfence();
+  for (line = 0; line < height; line++) {
+    __m128i *dst128 = (__m128i *)(dst + line * outStride);
+
+    // Load dithering coefficients for this line
+    if (ditherMode == LAVDither_Random) {
+      /* 16 coefficient words per line */
+      xmm6 = _mm_load_si128((const __m128i *)(dithers + (line << 4) + 0));
+      xmm7 = _mm_load_si128((const __m128i *)(dithers + (line << 4) + 8));
+    } else {
+      PIXCONV_LOAD_DITHER_COEFFS(xmm7,line,8,dithers);
+      xmm6 = xmm7;
+    }
+    for (i = 0; i < processWidth; i += 16) {
+      /* The intrinsic returns the sum and leaves its operands untouched,
+         so the result has to be assigned back for the dithering to apply. */
+      PIXCONV_LOAD_ALIGNED(xmm0, (rgb + i));      /* load */
+      xmm0 = _mm_adds_epu16(xmm0, xmm6);          /* apply dithering coefficients */
+      xmm0 = _mm_srli_epi16(xmm0, 8);             /* shift to 8-bit */
+      PIXCONV_LOAD_ALIGNED(xmm1, (rgb + i + 8));  /* load */
+      xmm1 = _mm_adds_epu16(xmm1, xmm7);          /* apply dithering coefficients */
+      xmm1 = _mm_srli_epi16(xmm1, 8);             /* shift to 8-bit */
+
+      xmm0 = _mm_packus_epi16(xmm0, xmm1);        /* narrow 16 words to 16 bytes */
+      _mm_stream_si128(dst128++, xmm0);
+    }
+
+    rgb += inStride;
+  }
+
+  return S_OK;
+}
-- 
cgit v1.2.3