pixconv: don't use SSE4 instructions for YCgCo conversion

author: Hendrik Leppkes <h.leppkes@gmail.com> 2012-07-11 00:03:28 +0400
committer: Hendrik Leppkes <h.leppkes@gmail.com> 2012-07-11 00:03:28 +0400
commit: fc7ae01c070fd499ce44c3f35e33fdb57cb5074f (patch)
tree: 53957c673195fc1eeb44a4843454f14f897b4a25 /decoder/LAVVideo/pixconv
parent: e88cb68608e5c97d58d0598abf3397215639db44 (diff)
1 files changed, 11 insertions, 7 deletions
diff --git a/decoder/LAVVideo/pixconv/yuv2rgb.cpp b/decoder/LAVVideo/pixconv/yuv2rgb.cpp
index 62c1468e..4c62bd84 100644
--- a/decoder/LAVVideo/pixconv/yuv2rgb.cpp
+++ b/decoder/LAVVideo/pixconv/yuv2rgb.cpp
@@ -227,10 +227,6 @@ static int yuv2rgb_convert_pixels(const uint8_t* &srcY, const uint8_t* &srcU, co
 
   xmm0 = _mm_unpacklo_epi64(xmm0, xmm5);                        /* YYYYYYYY */
 
-  xmm2 = coeffs->CbCr_center;                                   /* move CbCr/CgCo to proper range */
-  xmm1 = _mm_subs_epi16(xmm1, xmm2);
-  xmm3 = _mm_subs_epi16(xmm3, xmm2);
-
   // After this step, xmm1 & xmm3 contain 4 UV pairs, each in a 16-bit value, filling 12-bit.
   if (!ycgco) {
     // YCbCr conversion
@@ -244,6 +240,10 @@ static int yuv2rgb_convert_pixels(const uint8_t* &srcY, const uint8_t* &srcU, co
     xmm0 = _mm_mulhi_epi16(xmm0, coeffs->cy);                   /* Y*cy (result is 28 bits, with 12 high-bits packed into the result) */
     xmm0 = _mm_add_epi16(xmm0, coeffs->rgb_add);                /* Y*cy + 16 (in case of range compression) */
 
+    xmm2 = coeffs->CbCr_center;                                 /* move CbCr to proper range */
+    xmm1 = _mm_subs_epi16(xmm1, xmm2);
+    xmm3 = _mm_subs_epi16(xmm3, xmm2);
+
     xmm6 = xmm1;
     xmm4 = xmm3;
     xmm6 = _mm_madd_epi16(xmm6, coeffs->cR_Cr);                 /* Result is 25 bits (12 from chroma, 13 from coeff) */
@@ -290,11 +290,15 @@ static int yuv2rgb_convert_pixels(const uint8_t* &srcY, const uint8_t* &srcU, co
     xmm1 = _mm_and_si128(xmm1, xmm7);                          /* null out the high-order bytes to get the Cg values */
     xmm2 = _mm_and_si128(xmm2, xmm7);
 
-    xmm3 = _mm_srli_epi32(xmm3, 16);                           /* right shift the V values */
+    xmm3 = _mm_srli_epi32(xmm3, 16);                           /* right shift the Co values */
     xmm4 = _mm_srli_epi32(xmm4, 16);
 
-    xmm1 = _mm_packus_epi32(xmm1, xmm2);                       /* Pack Cg into xmm1 */
-    xmm3 = _mm_packus_epi32(xmm3, xmm4);                       /* Pack Co into xmm3 */
+    xmm1 = _mm_packs_epi32(xmm1, xmm2);                       /* Pack Cg into xmm1 */
+    xmm3 = _mm_packs_epi32(xmm3, xmm4);                       /* Pack Co into xmm3 */
+
+    xmm2 = coeffs->CbCr_center;                               /* move CgCo to proper range */
+    xmm1 = _mm_subs_epi16(xmm1, xmm2);
+    xmm3 = _mm_subs_epi16(xmm3, xmm2);
 
     xmm2 = xmm0;
     xmm2 = _mm_subs_epi16(xmm2, xmm1);                         /* tmp = Y - Cg */
author	Hendrik Leppkes <h.leppkes@gmail.com>	2012-07-11 00:03:28 +0400
committer	Hendrik Leppkes <h.leppkes@gmail.com>	2012-07-11 00:03:28 +0400
commit	fc7ae01c070fd499ce44c3f35e33fdb57cb5074f (patch)
tree	53957c673195fc1eeb44a4843454f14f897b4a25 /decoder/LAVVideo/pixconv
parent	e88cb68608e5c97d58d0598abf3397215639db44 (diff)