Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mpc-hc/LAVFilters.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Hendrik Leppkes <h.leppkes@gmail.com> 2015-11-11 15:36:06 +0300
committer Hendrik Leppkes <h.leppkes@gmail.com> 2015-11-11 15:40:29 +0300
commit 46f6482da52c56195a887405f2a69fb1d1414011 (patch)
tree 238c6ecef55201743be1e544da6ddb0d91b96afc /decoder/LAVVideo/pixconv
parent 4179e8848862e943027fed649bebff0a545f309f (diff)
Support NV12->YV12 output in DXVA2-CB Direct Mode
Diffstat (limited to 'decoder/LAVVideo/pixconv')
-rw-r--r-- decoder/LAVVideo/pixconv/convert_direct.cpp | 93
1 files changed, 93 insertions, 0 deletions
diff --git a/decoder/LAVVideo/pixconv/convert_direct.cpp b/decoder/LAVVideo/pixconv/convert_direct.cpp
index b2857317..a8017e81 100644
--- a/decoder/LAVVideo/pixconv/convert_direct.cpp
+++ b/decoder/LAVVideo/pixconv/convert_direct.cpp
@@ -89,6 +89,99 @@ DECLARE_CONV_FUNC_IMPL(plane_copy_direct_sse4)
return S_OK;
}
+// Convert an NV12 frame to YV12 (per the function name and commit message).
+//
+// - Plane 0 (src[0]) is copied row by row into dst[0].
+// - Plane 1 (src[1]) holds interleaved byte pairs; for each 16-bit pair the
+//   low byte is written to dst[2] and the high byte to dst[1]. Source chroma
+//   rows are addressed with srcStride[0]; both output chroma planes use
+//   dstStride[1]. Only (height >> 1) chroma rows are processed.
+//
+// The bulk of each row is handled 64 bytes at a time through
+// PIXCONV_STREAM_LOAD / PIXCONV_PUT_STREAM (macros defined elsewhere;
+// presumably non-temporal loads/stores - TODO confirm). The remainder is
+// handled in 16-byte (luma) or 32-byte (chroma) steps up to `width`, so the
+// tail may touch bytes beyond `width`.
+// NOTE(review): this assumes rows are padded/aligned accordingly - confirm
+// against the callers.
+//
+// Always returns S_OK.
+DECLARE_CONV_FUNC_IMPL(convert_nv12_yv12_direct_sse4)
+{
+ const ptrdiff_t inStride = srcStride[0];
+ const ptrdiff_t outStride = dstStride[0];
+ const ptrdiff_t outChromaStride = dstStride[1];
+ const ptrdiff_t chromaHeight = (height >> 1);
+
+ // Number of bytes per row eligible for the 64-byte fast path: the width
+ // rounded up to 64, but never more than either luma stride.
+ const ptrdiff_t stride = min(FFALIGN(width, 64), min(inStride, outStride));
+
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7;
+ // Mask selecting the low byte of every 16-bit lane.
+ xmm7 = _mm_set1_epi16(0x00FF);
+
+ _mm_sfence();
+
+ ptrdiff_t line, i;
+
+ // Luma: straight copy of plane 0.
+ for (line = 0; line < height; line++) {
+ const uint8_t *y = (src[0] + line * inStride);
+ uint8_t *dy = (dst[0] + line * outStride);
+ for (i = 0; i < (stride - 63); i += 64) {
+ PIXCONV_STREAM_LOAD(xmm0, y + i + 0);
+ PIXCONV_STREAM_LOAD(xmm1, y + i + 16);
+ PIXCONV_STREAM_LOAD(xmm2, y + i + 32);
+ PIXCONV_STREAM_LOAD(xmm3, y + i + 48);
+
+ // Keep the compiler from interleaving the stores with the loads above.
+ _ReadWriteBarrier();
+
+ PIXCONV_PUT_STREAM(dy + i + 0, xmm0);
+ PIXCONV_PUT_STREAM(dy + i + 16, xmm1);
+ PIXCONV_PUT_STREAM(dy + i + 32, xmm2);
+ PIXCONV_PUT_STREAM(dy + i + 48, xmm3);
+ }
+
+ // Remainder of the row, 16 bytes at a time.
+ for (; i < width; i += 16) {
+ PIXCONV_LOAD_ALIGNED(xmm0, y + i);
+ PIXCONV_PUT_STREAM(dy + i, xmm0);
+ }
+ }
+
+ // Chroma: de-interleave plane 1 into two separate planes.
+ for (line = 0; line < chromaHeight; line++) {
+ const uint8_t *uv = (src[1] + line * inStride);
+ uint8_t *dv = (dst[1] + line * outChromaStride);
+ uint8_t *du = (dst[2] + line * outChromaStride);
+ for (i = 0; i < (stride - 63); i += 64) {
+ PIXCONV_STREAM_LOAD(xmm0, uv + i + 0);
+ PIXCONV_STREAM_LOAD(xmm1, uv + i + 16);
+ PIXCONV_STREAM_LOAD(xmm2, uv + i + 32);
+ PIXCONV_STREAM_LOAD(xmm3, uv + i + 48);
+
+ _ReadWriteBarrier();
+
+ // process first pair: high bytes -> xmm4, low bytes -> xmm0
+ xmm4 = _mm_srli_epi16(xmm0, 8);
+ xmm5 = _mm_srli_epi16(xmm1, 8);
+ xmm0 = _mm_and_si128(xmm0, xmm7);
+ xmm1 = _mm_and_si128(xmm1, xmm7);
+ xmm0 = _mm_packus_epi16(xmm0, xmm1);
+ xmm4 = _mm_packus_epi16(xmm4, xmm5);
+
+ // 32 interleaved input bytes yield 16 bytes per output plane,
+ // hence the (i >> 1) output offset.
+ PIXCONV_PUT_STREAM(du + (i >> 1) + 0, xmm0);
+ PIXCONV_PUT_STREAM(dv + (i >> 1) + 0, xmm4);
+
+ // and second pair
+ xmm4 = _mm_srli_epi16(xmm2, 8);
+ xmm5 = _mm_srli_epi16(xmm3, 8);
+ xmm2 = _mm_and_si128(xmm2, xmm7);
+ xmm3 = _mm_and_si128(xmm3, xmm7);
+ xmm2 = _mm_packus_epi16(xmm2, xmm3);
+ xmm4 = _mm_packus_epi16(xmm4, xmm5);
+
+ PIXCONV_PUT_STREAM(du + (i >> 1) + 16, xmm2);
+ PIXCONV_PUT_STREAM(dv + (i >> 1) + 16, xmm4);
+ }
+
+ // Remainder of the row, 32 interleaved bytes at a time.
+ for (; i < width; i += 32) {
+ PIXCONV_LOAD_ALIGNED(xmm0, uv + i);
+ // Second half of the 32-byte group. Loading uv + i again here would
+ // duplicate the first 8 samples into the upper half of each output.
+ PIXCONV_LOAD_ALIGNED(xmm1, uv + i + 16);
+
+ xmm4 = _mm_srli_epi16(xmm0, 8);
+ xmm5 = _mm_srli_epi16(xmm1, 8);
+ xmm0 = _mm_and_si128(xmm0, xmm7);
+ xmm1 = _mm_and_si128(xmm1, xmm7);
+ xmm0 = _mm_packus_epi16(xmm0, xmm1);
+ xmm4 = _mm_packus_epi16(xmm4, xmm5);
+
+ PIXCONV_PUT_STREAM(du + (i >> 1), xmm0);
+ PIXCONV_PUT_STREAM(dv + (i >> 1), xmm4);
+ }
+ }
+
+ return S_OK;
+}
+
+
DECLARE_CONV_FUNC_IMPL(convert_p010_nv12_direct_sse4)
{
const ptrdiff_t inStride = srcStride[0];