Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mpc-hc/LAVFilters.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Hendrik Leppkes <h.leppkes@gmail.com>, 2012-02-05 16:05:49 +0400
committer: Hendrik Leppkes <h.leppkes@gmail.com>, 2012-02-05 16:05:49 +0400
commit: 19cb09f2564bfe1d9bc8eb731b43d9b63af750f3 (patch)
tree: 46e2eef6ee57041f97868c1ffd74929de388a8e2 /decoder/LAVVideo/pixconv
parent: 9701202bc2346baa0ace21370e2922ace78e232e (diff)
Optimize the YUV4:2:0 10-bit -> YV12/NV12 path.
By combining luma & chroma processing into one loop, we only need to load the dithering coefficients once per line, instead of loading them separately in the luma loop and again in the chroma loop.
Diffstat (limited to 'decoder/LAVVideo/pixconv')
-rw-r--r-- decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp | 79
1 file changed, 33 insertions, 46 deletions
diff --git a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp
index 59380697..3900efbe 100644
--- a/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp
+++ b/decoder/LAVVideo/pixconv/yuv2yuv_unscaled.cpp
@@ -90,55 +90,42 @@ DECLARE_CONV_FUNC_IMPL(convert_yuv_yv_nv12_dither_le)
_mm_stream_si128(dst128Y++, xmm2);
}
- y += inYStride;
- }
-
- xmm7 = _mm_set1_epi32(0xff00ff00);
-
- // Process U & V
- for (line = 0; line < chromaHeight; ++line) {
- // Load dithering coefficients for this line
- if (ditherMode == LAVDither_Random) {
- xmm4 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 0));
- xmm5 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 8));
- xmm6 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 16));
- xmm7 = _mm_load_si128((const __m128i *)(dithers + (line << 5) + 24));
- } else {
- PIXCONV_LOAD_DITHER_COEFFS(xmm4,line,8,dithers);
- xmm5 = xmm6 = xmm7 = xmm4;
- }
-
- __m128i *dst128UV = (__m128i *)(dstV + line * outLumaStride);
- __m128i *dst128U = (__m128i *)(dstU + line * outChromaStride);
- __m128i *dst128V = (__m128i *)(dstV + line * outChromaStride);
-
- for (i = 0; i < chromaWidth; i+=16) {
- PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (u+i), shift); /* U0U0U0U0 */
- PIXCONV_LOAD_PIXEL16_DITHER(xmm1, xmm5, (u+i+8), shift); /* U0U0U0U0 */
- PIXCONV_LOAD_PIXEL16_DITHER(xmm2, xmm6, (v+i), shift); /* V0V0V0V0 */
- PIXCONV_LOAD_PIXEL16_DITHER(xmm3, xmm7, (v+i+8), shift); /* V0V0V0V0 */
-
- if (nv12) {
- xmm0 = _mm_packus_epi16(xmm0, xmm1);
- xmm2 = _mm_packus_epi16(xmm2, xmm3);
-
- xmm1 = xmm0;
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm2);
- xmm1 = _mm_unpackhi_epi8(xmm1, xmm2);
-
- _mm_stream_si128(dst128UV++, xmm0);
- _mm_stream_si128(dst128UV++, xmm1);
- } else {
- xmm0 = _mm_packus_epi16(xmm0, xmm1); /* UUUUUUUU */
- xmm2 = _mm_packus_epi16(xmm2, xmm3); /* VVVVVVVV */
-
- _mm_stream_si128(dst128U++, xmm0);
- _mm_stream_si128(dst128V++, xmm2);
+ // Process U/V for chromaHeight lines
+ if (line < chromaHeight) {
+ __m128i *dst128UV = (__m128i *)(dstV + line * outLumaStride);
+ __m128i *dst128U = (__m128i *)(dstU + line * outChromaStride);
+ __m128i *dst128V = (__m128i *)(dstV + line * outChromaStride);
+
+ for (i = 0; i < chromaWidth; i+=16) {
+ PIXCONV_LOAD_PIXEL16_DITHER(xmm0, xmm4, (u+i), shift); /* U0U0U0U0 */
+ PIXCONV_LOAD_PIXEL16_DITHER(xmm1, xmm5, (u+i+8), shift); /* U0U0U0U0 */
+ PIXCONV_LOAD_PIXEL16_DITHER(xmm2, xmm6, (v+i), shift); /* V0V0V0V0 */
+ PIXCONV_LOAD_PIXEL16_DITHER(xmm3, xmm7, (v+i+8), shift); /* V0V0V0V0 */
+
+ if (nv12) {
+ xmm0 = _mm_packus_epi16(xmm0, xmm1);
+ xmm2 = _mm_packus_epi16(xmm2, xmm3);
+
+ xmm1 = xmm0;
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm2);
+ xmm1 = _mm_unpackhi_epi8(xmm1, xmm2);
+
+ _mm_stream_si128(dst128UV++, xmm0);
+ _mm_stream_si128(dst128UV++, xmm1);
+ } else {
+ xmm0 = _mm_packus_epi16(xmm0, xmm1); /* UUUUUUUU */
+ xmm2 = _mm_packus_epi16(xmm2, xmm3); /* VVVVVVVV */
+
+ _mm_stream_si128(dst128U++, xmm0);
+ _mm_stream_si128(dst128V++, xmm2);
+ }
}
+
+ u += inUVStride;
+ v += inUVStride;
}
- u += inUVStride;
- v += inUVStride;
+ y += inYStride;
}
return S_OK;