diff options
author | Hendrik Leppkes <h.leppkes@gmail.com> | 2015-11-11 15:36:06 +0300 |
---|---|---|
committer | Hendrik Leppkes <h.leppkes@gmail.com> | 2015-11-11 15:40:29 +0300 |
commit | 46f6482da52c56195a887405f2a69fb1d1414011 (patch) | |
tree | 238c6ecef55201743be1e544da6ddb0d91b96afc /decoder/LAVVideo/pixconv | |
parent | 4179e8848862e943027fed649bebff0a545f309f (diff) |
Support NV12->YV12 output in DXVA2-CB Direct Mode
Diffstat (limited to 'decoder/LAVVideo/pixconv')
-rw-r--r-- | decoder/LAVVideo/pixconv/convert_direct.cpp | 93 |
1 files changed, 93 insertions, 0 deletions
diff --git a/decoder/LAVVideo/pixconv/convert_direct.cpp b/decoder/LAVVideo/pixconv/convert_direct.cpp index b2857317..a8017e81 100644 --- a/decoder/LAVVideo/pixconv/convert_direct.cpp +++ b/decoder/LAVVideo/pixconv/convert_direct.cpp @@ -89,6 +89,99 @@ DECLARE_CONV_FUNC_IMPL(plane_copy_direct_sse4) return S_OK; } +DECLARE_CONV_FUNC_IMPL(convert_nv12_yv12_direct_sse4) +{ + const ptrdiff_t inStride = srcStride[0]; + const ptrdiff_t outStride = dstStride[0]; + const ptrdiff_t outChromaStride = dstStride[1]; + const ptrdiff_t chromaHeight = (height >> 1); + + const ptrdiff_t stride = min(FFALIGN(width, 64), min(inStride, outStride)); + + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7; + xmm7 = _mm_set1_epi16(0x00FF); + + _mm_sfence(); + + ptrdiff_t line, i; + + for (line = 0; line < height; line++) { + const uint8_t *y = (src[0] + line * inStride); + uint8_t *dy = (dst[0] + line * outStride); + for (i = 0; i < (stride - 63); i += 64) { + PIXCONV_STREAM_LOAD(xmm0, y + i + 0); + PIXCONV_STREAM_LOAD(xmm1, y + i + 16); + PIXCONV_STREAM_LOAD(xmm2, y + i + 32); + PIXCONV_STREAM_LOAD(xmm3, y + i + 48); + + _ReadWriteBarrier(); + + PIXCONV_PUT_STREAM(dy + i + 0, xmm0); + PIXCONV_PUT_STREAM(dy + i + 16, xmm1); + PIXCONV_PUT_STREAM(dy + i + 32, xmm2); + PIXCONV_PUT_STREAM(dy + i + 48, xmm3); + } + + for (; i < width; i += 16) { + PIXCONV_LOAD_ALIGNED(xmm0, y + i); + PIXCONV_PUT_STREAM(dy + i, xmm0); + } + } + + for (line = 0; line < chromaHeight; line++) { + const uint8_t *uv = (src[1] + line * inStride); + uint8_t *dv = (dst[1] + line * outChromaStride); + uint8_t *du = (dst[2] + line * outChromaStride); + for (i = 0; i < (stride - 63); i += 64) { + PIXCONV_STREAM_LOAD(xmm0, uv + i + 0); + PIXCONV_STREAM_LOAD(xmm1, uv + i + 16); + PIXCONV_STREAM_LOAD(xmm2, uv + i + 32); + PIXCONV_STREAM_LOAD(xmm3, uv + i + 48); + + _ReadWriteBarrier(); + + // process first pair + xmm4 = _mm_srli_epi16(xmm0, 8); + xmm5 = _mm_srli_epi16(xmm1, 8); + xmm0 = _mm_and_si128(xmm0, xmm7); + xmm1 = _mm_and_si128(xmm1, xmm7); + xmm0 = _mm_packus_epi16(xmm0, xmm1); + xmm4 = _mm_packus_epi16(xmm4, xmm5); + + PIXCONV_PUT_STREAM(du + (i >> 1) + 0, xmm0); + PIXCONV_PUT_STREAM(dv + (i >> 1) + 0, xmm4); + + // and second pair + xmm4 = _mm_srli_epi16(xmm2, 8); + xmm5 = _mm_srli_epi16(xmm3, 8); + xmm2 = _mm_and_si128(xmm2, xmm7); + xmm3 = _mm_and_si128(xmm3, xmm7); + xmm2 = _mm_packus_epi16(xmm2, xmm3); + xmm4 = _mm_packus_epi16(xmm4, xmm5); + + PIXCONV_PUT_STREAM(du + (i >> 1) + 16, xmm2); + PIXCONV_PUT_STREAM(dv + (i >> 1) + 16, xmm4); + } + + for (; i < width; i += 32) { + PIXCONV_LOAD_ALIGNED(xmm0, uv + i); + PIXCONV_LOAD_ALIGNED(xmm1, uv + i); + + xmm4 = _mm_srli_epi16(xmm0, 8); + xmm5 = _mm_srli_epi16(xmm1, 8); + xmm0 = _mm_and_si128(xmm0, xmm7); + xmm1 = _mm_and_si128(xmm1, xmm7); + xmm0 = _mm_packus_epi16(xmm0, xmm1); + xmm4 = _mm_packus_epi16(xmm4, xmm5); + + PIXCONV_PUT_STREAM(du + (i >> 1), xmm0); + PIXCONV_PUT_STREAM(dv + (i >> 1), xmm4); + } + } + + return S_OK; +} + DECLARE_CONV_FUNC_IMPL(convert_p010_nv12_direct_sse4) { const ptrdiff_t inStride = srcStride[0]; |