diff options
| | | |
|---|---|---|
| author | Hendrik Leppkes <h.leppkes@gmail.com> | 2015-02-02 17:29:06 +0300 |
| committer | Hendrik Leppkes <h.leppkes@gmail.com> | 2015-02-02 17:49:10 +0300 |
| commit | e521e0cb195bf42dccb04dd354ea5ff0186b6ae1 (patch) | |
| tree | cfc9d6070609a1601426ae2bb9fac7185820ac70 /decoder | |
| parent | 84fc84c7b165ac83599e575469ac1389c1766cad (diff) | |
Add optimized P010 -> NV12 converters, and allow this conversion for direct mode
Diffstat (limited to 'decoder')
-rw-r--r-- | decoder/LAVVideo/LAVPixFmtConverter.cpp | 9 | ||||
-rw-r--r-- | decoder/LAVVideo/LAVPixFmtConverter.h | 2 | ||||
-rw-r--r-- | decoder/LAVVideo/LAVVideo.cpp | 2 | ||||
-rw-r--r-- | decoder/LAVVideo/pixconv/convert_direct.cpp | 127 | ||||
-rw-r--r-- | decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp | 78 |
5 files changed, 216 insertions, 2 deletions
diff --git a/decoder/LAVVideo/LAVPixFmtConverter.cpp b/decoder/LAVVideo/LAVPixFmtConverter.cpp index bc6f3661..367266a9 100644 --- a/decoder/LAVVideo/LAVPixFmtConverter.cpp +++ b/decoder/LAVVideo/LAVPixFmtConverter.cpp @@ -41,7 +41,7 @@ * YUV444 - - - - - x x - - - - - - - - x x * YUV444bX - - - - - x x - - - x - - - x x x * NV12 x x - x x - - - - - - - - - - x x - * P010 - - - - - - - x - - - - x - - x x + * P010 - x - - - - - x - - - - x - - x x * YUY2 - - - - - - - - - - - - - - - - - * RGB24 - - - - - - - - - - - - - - - x - * RGB32 - - - - - - - - - - - - - - - - x @@ -396,6 +396,8 @@ void CLAVPixFmtConverter::SelectConvertFunction() convert = &CLAVPixFmtConverter::convert_rgb48_rgb<1>; else convert = &CLAVPixFmtConverter::convert_rgb48_rgb<0>; + } else if (m_InputPixFmt == LAVPixFmt_P010 && m_OutputPixFmt == LAVOutPixFmt_NV12) { + convert = &CLAVPixFmtConverter::convert_p010_nv12_sse2; } } @@ -420,6 +422,11 @@ void CLAVPixFmtConverter::SelectConvertFunctionDirect() convert_direct = &CLAVPixFmtConverter::plane_copy_sse2; else convert_direct = &CLAVPixFmtConverter::plane_copy; + } else if (m_InputPixFmt == LAVPixFmt_P010 && m_OutputPixFmt == LAVOutPixFmt_NV12) { + if (cpu & AV_CPU_FLAG_SSE4) + convert_direct = &CLAVPixFmtConverter::convert_p010_nv12_direct_sse4; + else if (cpu & AV_CPU_FLAG_SSE2) + convert_direct = &CLAVPixFmtConverter::convert_p010_nv12_sse2; } if (convert_direct != nullptr) diff --git a/decoder/LAVVideo/LAVPixFmtConverter.h b/decoder/LAVVideo/LAVPixFmtConverter.h index 3ab7d9b0..5dcb4182 100644 --- a/decoder/LAVVideo/LAVPixFmtConverter.h +++ b/decoder/LAVVideo/LAVPixFmtConverter.h @@ -126,6 +126,7 @@ private: DECLARE_CONV_FUNC(convert_yuv420_nv12); DECLARE_CONV_FUNC(convert_yuv_yv); DECLARE_CONV_FUNC(convert_nv12_yv12); + DECLARE_CONV_FUNC(convert_p010_nv12_sse2); template <int uyvy> DECLARE_CONV_FUNC(convert_yuv420_yuy2); template <int uyvy> DECLARE_CONV_FUNC(convert_yuv422_yuy2_uyvy); template <int uyvy> 
DECLARE_CONV_FUNC(convert_yuv422_yuy2_uyvy_dither_le); @@ -135,6 +136,7 @@ private: template <int out32> DECLARE_CONV_FUNC(convert_rgb48_rgb); DECLARE_CONV_FUNC(plane_copy_direct_sse4); + DECLARE_CONV_FUNC(convert_p010_nv12_direct_sse4); DECLARE_CONV_FUNC(convert_yuv_rgb); const RGBCoeffs* getRGBCoeffs(int width, int height); diff --git a/decoder/LAVVideo/LAVVideo.cpp b/decoder/LAVVideo/LAVVideo.cpp index 3fca2e4c..332006f4 100644 --- a/decoder/LAVVideo/LAVVideo.cpp +++ b/decoder/LAVVideo/LAVVideo.cpp @@ -686,7 +686,7 @@ HRESULT CLAVVideo::CheckDirectMode() bDirect = FALSE; else if (pix == LAVPixFmt_NV12 && m_pOutput->CurrentMediaType().subtype != MEDIASUBTYPE_NV12) bDirect = FALSE; - else if (pix == LAVPixFmt_P010 && m_pOutput->CurrentMediaType().subtype != MEDIASUBTYPE_P010 && m_pOutput->CurrentMediaType().subtype != MEDIASUBTYPE_P016) + else if (pix == LAVPixFmt_P010 && m_pOutput->CurrentMediaType().subtype != MEDIASUBTYPE_P010 && m_pOutput->CurrentMediaType().subtype != MEDIASUBTYPE_P016 && m_pOutput->CurrentMediaType().subtype != MEDIASUBTYPE_NV12) bDirect = FALSE; else if (m_SubtitleConsumer && m_SubtitleConsumer->HasProvider()) bDirect = FALSE; diff --git a/decoder/LAVVideo/pixconv/convert_direct.cpp b/decoder/LAVVideo/pixconv/convert_direct.cpp index f61dc38a..b2857317 100644 --- a/decoder/LAVVideo/pixconv/convert_direct.cpp +++ b/decoder/LAVVideo/pixconv/convert_direct.cpp @@ -88,3 +88,130 @@ DECLARE_CONV_FUNC_IMPL(plane_copy_direct_sse4) return S_OK; } + +DECLARE_CONV_FUNC_IMPL(convert_p010_nv12_direct_sse4) +{ + const ptrdiff_t inStride = srcStride[0]; + const ptrdiff_t outStride = dstStride[0]; + const ptrdiff_t chromaHeight = (height >> 1); + + const ptrdiff_t byteWidth = width << 1; + const ptrdiff_t stride = min(FFALIGN(byteWidth, 64), min(inStride, outStride)); + + LAVDitherMode ditherMode = m_pSettings->GetDitherMode(); + const uint16_t *dithers = GetRandomDitherCoeffs(height, 4, 8, 0); + if (dithers == nullptr) + ditherMode = LAVDither_Ordered; + 
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; + + _mm_sfence(); + + ptrdiff_t line, i; + + for (line = 0; line < height; line++) { + // Load dithering coefficients for this line + if (ditherMode == LAVDither_Random) { + xmm4 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 0)); + xmm5 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 8)); + xmm6 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 16)); + xmm7 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 24)); + } else { + PIXCONV_LOAD_DITHER_COEFFS(xmm7, line, 8, dithers); + xmm4 = xmm5 = xmm6 = xmm7; + } + + const uint8_t *y = (src[0] + line * inStride); + uint8_t *dy = (dst[0] + line * outStride); + + for (i = 0; i < (stride - 63); i += 64) { + PIXCONV_STREAM_LOAD(xmm0, y + i + 0); + PIXCONV_STREAM_LOAD(xmm1, y + i + 16); + PIXCONV_STREAM_LOAD(xmm2, y + i + 32); + PIXCONV_STREAM_LOAD(xmm3, y + i + 48); + + _ReadWriteBarrier(); + + // apply dithering coeffs + xmm0 = _mm_adds_epu16(xmm0, xmm4); + xmm1 = _mm_adds_epu16(xmm1, xmm5); + xmm2 = _mm_adds_epu16(xmm2, xmm6); + xmm3 = _mm_adds_epu16(xmm3, xmm7); + + // shift and pack to 8-bit + xmm0 = _mm_packus_epi16(_mm_srli_epi16(xmm0, 8), _mm_srli_epi16(xmm1, 8)); + xmm2 = _mm_packus_epi16(_mm_srli_epi16(xmm2, 8), _mm_srli_epi16(xmm3, 8)); + + PIXCONV_PUT_STREAM(dy + (i >> 1) + 0, xmm0); + PIXCONV_PUT_STREAM(dy + (i >> 1) + 16, xmm2); + } + + for (; i < byteWidth; i += 32) { + PIXCONV_LOAD_ALIGNED(xmm0, y + i + 0); + PIXCONV_LOAD_ALIGNED(xmm1, y + i + 16); + + // apply dithering coeffs + xmm0 = _mm_adds_epu16(xmm0, xmm4); + xmm1 = _mm_adds_epu16(xmm1, xmm5); + + // shift and pack to 8-bit + xmm0 = _mm_packus_epi16(_mm_srli_epi16(xmm0, 8), _mm_srli_epi16(xmm1, 8)); + + PIXCONV_PUT_STREAM(dy + (i >> 1), xmm0); + } + } + + for (line = 0; line < chromaHeight; line++) { + // Load dithering coefficients for this line + if (ditherMode == LAVDither_Random) { + xmm4 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 
0)); + xmm5 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 8)); + xmm6 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 16)); + xmm7 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 24)); + } else { + PIXCONV_LOAD_DITHER_COEFFS(xmm7, line, 8, dithers); + xmm4 = xmm5 = xmm6 = xmm7; + } + + const uint8_t *uv = (src[1] + line * inStride); + uint8_t *duv = (dst[1] + line * outStride); + + for (i = 0; i < (stride - 63); i += 64) { + PIXCONV_STREAM_LOAD(xmm0, uv + i + 0); + PIXCONV_STREAM_LOAD(xmm1, uv + i + 16); + PIXCONV_STREAM_LOAD(xmm2, uv + i + 32); + PIXCONV_STREAM_LOAD(xmm3, uv + i + 48); + + _ReadWriteBarrier(); + + // apply dithering coeffs + xmm0 = _mm_adds_epu16(xmm0, xmm4); + xmm1 = _mm_adds_epu16(xmm1, xmm5); + xmm2 = _mm_adds_epu16(xmm2, xmm6); + xmm3 = _mm_adds_epu16(xmm3, xmm7); + + // shift and pack to 8-bit + xmm0 = _mm_packus_epi16(_mm_srli_epi16(xmm0, 8), _mm_srli_epi16(xmm1, 8)); + xmm2 = _mm_packus_epi16(_mm_srli_epi16(xmm2, 8), _mm_srli_epi16(xmm3, 8)); + + PIXCONV_PUT_STREAM(duv + (i >> 1) + 0, xmm0); + PIXCONV_PUT_STREAM(duv + (i >> 1) + 16, xmm2); + } + + for (; i < byteWidth; i += 32) { + PIXCONV_LOAD_ALIGNED(xmm0, uv + i + 0); + PIXCONV_LOAD_ALIGNED(xmm1, uv + i + 16); + + // apply dithering coeffs + xmm0 = _mm_adds_epu16(xmm0, xmm4); + xmm1 = _mm_adds_epu16(xmm1, xmm5); + + // shift and pack to 8-bit + xmm0 = _mm_packus_epi16(_mm_srli_epi16(xmm0, 8), _mm_srli_epi16(xmm1, 8)); + + PIXCONV_PUT_STREAM(duv + (i >> 1), xmm0); + } + } + + return S_OK; +} diff --git a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp index 0e7eeb63..7ebb30f4 100644 --- a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp +++ b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp @@ -473,3 +473,81 @@ DECLARE_CONV_FUNC_IMPL(convert_nv12_yv12) return S_OK; } + +DECLARE_CONV_FUNC_IMPL(convert_p010_nv12_sse2) +{ + const ptrdiff_t inStride = srcStride[0]; + const ptrdiff_t outStride = dstStride[0]; + const 
ptrdiff_t chromaHeight = (height >> 1); + + const ptrdiff_t byteWidth = width << 1; + + LAVDitherMode ditherMode = m_pSettings->GetDitherMode(); + const uint16_t *dithers = GetRandomDitherCoeffs(height, 2, 8, 0); + if (dithers == nullptr) + ditherMode = LAVDither_Ordered; + + __m128i xmm0, xmm1, xmm2, xmm3; + + _mm_sfence(); + + ptrdiff_t line, i; + + for (line = 0; line < height; line++) { + // Load dithering coefficients for this line + if (ditherMode == LAVDither_Random) { + xmm2 = _mm_load_si128((const __m128i *)(dithers + (line << 4) + 0)); + xmm3 = _mm_load_si128((const __m128i *)(dithers + (line << 4) + 8)); + } else { + PIXCONV_LOAD_DITHER_COEFFS(xmm2, line, 8, dithers); + xmm3 = xmm2; + } + + const uint8_t *y = (src[0] + line * inStride); + uint8_t *dy = (dst[0] + line * outStride); + + for (i = 0; i < byteWidth; i += 32) { + PIXCONV_LOAD_ALIGNED(xmm0, y + i + 0); + PIXCONV_LOAD_ALIGNED(xmm1, y + i + 16); + + // apply dithering coeffs + xmm0 = _mm_adds_epu16(xmm0, xmm2); + xmm1 = _mm_adds_epu16(xmm1, xmm3); + + // shift and pack to 8-bit + xmm0 = _mm_packus_epi16(_mm_srli_epi16(xmm0, 8), _mm_srli_epi16(xmm1, 8)); + + PIXCONV_PUT_STREAM(dy + (i >> 1), xmm0); + } + } + + for (line = 0; line < chromaHeight; line++) { + // Load dithering coefficients for this line + if (ditherMode == LAVDither_Random) { + xmm2 = _mm_load_si128((const __m128i *)(dithers + (line << 4) + 0)); + xmm3 = _mm_load_si128((const __m128i *)(dithers + (line << 4) + 8)); + } else { + PIXCONV_LOAD_DITHER_COEFFS(xmm2, line, 8, dithers); + xmm3 = xmm2; + } + + const uint8_t *uv = (src[1] + line * inStride); + uint8_t *duv = (dst[1] + line * outStride); + + for (i = 0 ; i < byteWidth; i += 32) { + PIXCONV_LOAD_ALIGNED(xmm0, uv + i + 0); + PIXCONV_LOAD_ALIGNED(xmm1, uv + i + 16); + + // apply dithering coeffs + xmm0 = _mm_adds_epu16(xmm0, xmm2); + xmm1 = _mm_adds_epu16(xmm1, xmm3); + + // shift and pack to 8-bit + xmm0 = _mm_packus_epi16(_mm_srli_epi16(xmm0, 8), _mm_srli_epi16(xmm1, 8)); + + 
PIXCONV_PUT_STREAM(duv + (i >> 1), xmm0); + } + } + + return S_OK; +} |