github.com/mpc-hc/mpc-hc.git
Diffstat (limited to 'src/filters/transform')
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/simple_idct_mmx.c   400
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libswscale/swscale_template.c     4288
-rw-r--r--  src/filters/transform/MpaDecFilter/libflac/src/libFLAC/ia32/nasm.h          161
3 files changed, 2397 insertions, 2452 deletions
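
The first diff below is formatting-only: each hunk swaps a spaced-out code style back for the compact upstream ffmpeg style without changing behavior. One substantive anchor worth checking is the fixed-point coefficient table, whose comments derive every constant as cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5. A minimal sketch (not part of the diff) that recomputes the constants and compares them against the hard-coded values:

#include <math.h>
#include <stdio.h>

int main(void)
{
    /* Constants as they appear in idctCol()/idctRow() in the diff below. */
    const int table[8] = { 23170, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
    for (int i = 0; i < 8; i++) {
        int c = (int)(cos(i * M_PI / 16.0) * sqrt(2.0) * (1 << 14) + 0.5);
        /* Every entry matches the comment formula except C4, stored as
           16383 rather than the computed 16384, presumably to keep
           16-bit signed multiplies from overflowing. */
        printf("C%d: formula=%d table=%d\n", i, c, table[i]);
    }
    return 0;
}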
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/simple_idct_mmx.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/simple_idct_mmx.c
index f6437ab00..e32b8f0b4 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/simple_idct_mmx.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/simple_idct_mmx.c
@@ -49,46 +49,45 @@
#define ROW_SHIFT 11
#define COL_SHIFT 20 // 6
-DECLARE_ASM_CONST(8, uint64_t, wm1010) = 0xFFFF0000FFFF0000ULL;
-DECLARE_ASM_CONST(8, uint64_t, d40000) = 0x0000000000040000ULL;
+DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
+DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
-DECLARE_ALIGNED(8, static const int16_t, coeffs)[] =
-{
- 1 << (ROW_SHIFT - 1), 0, 1 << (ROW_SHIFT - 1), 0,
+DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
+ 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
- 1 << (ROW_SHIFT - 1), 1, 1 << (ROW_SHIFT - 1), 0,
- // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
+ 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
+ // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
// 0, 0, 0, 0,
// 0, 0, 0, 0,
- C4, C4, C4, C4,
- C4, -C4, C4, -C4,
+ C4, C4, C4, C4,
+ C4, -C4, C4, -C4,
- C2, C6, C2, C6,
- C6, -C2, C6, -C2,
+ C2, C6, C2, C6,
+ C6, -C2, C6, -C2,
- C1, C3, C1, C3,
- C5, C7, C5, C7,
+ C1, C3, C1, C3,
+ C5, C7, C5, C7,
- C3, -C7, C3, -C7,
- -C1, -C5, -C1, -C5,
+ C3, -C7, C3, -C7,
+-C1, -C5, -C1, -C5,
- C5, -C1, C5, -C1,
- C7, C3, C7, C3,
+ C5, -C1, C5, -C1,
+ C7, C3, C7, C3,
- C7, -C5, C7, -C5,
- C3, -C1, C3, -C1
+ C7, -C5, C7, -C5,
+ C3, -C1, C3, -C1
};
#if 0
static void unused_var_killer(void)
{
- int a = wm1010 + d40000;
- temp[0] = a;
+ int a= wm1010 + d40000;
+ temp[0]=a;
}
-static void inline idctCol(int16_t * col, int16_t *input)
+static void inline idctCol (int16_t * col, int16_t *input)
{
#undef C0
#undef C1
@@ -98,78 +97,107 @@ static void inline idctCol(int16_t * col, int16_t *input)
#undef C5
#undef C6
#undef C7
- int a0, a1, a2, a3, b0, b1, b2, b3;
- const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- /*
- if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
- col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
- col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
- return;
- }*/
-
- col[8*0] = input[8*0 + 0];
- col[8*1] = input[8*2 + 0];
- col[8*2] = input[8*0 + 1];
- col[8*3] = input[8*2 + 1];
- col[8*4] = input[8*4 + 0];
- col[8*5] = input[8*6 + 0];
- col[8*6] = input[8*4 + 1];
- col[8*7] = input[8*6 + 1];
-
- a0 = C4 * col[8*0] + C2 * col[8*2] + C4 * col[8*4] + C6 * col[8*6] + (1 << (COL_SHIFT - 1));
- a1 = C4 * col[8*0] + C6 * col[8*2] - C4 * col[8*4] - C2 * col[8*6] + (1 << (COL_SHIFT - 1));
- a2 = C4 * col[8*0] - C6 * col[8*2] - C4 * col[8*4] + C2 * col[8*6] + (1 << (COL_SHIFT - 1));
- a3 = C4 * col[8*0] - C2 * col[8*2] + C4 * col[8*4] - C6 * col[8*6] + (1 << (COL_SHIFT - 1));
-
- b0 = C1 * col[8*1] + C3 * col[8*3] + C5 * col[8*5] + C7 * col[8*7];
- b1 = C3 * col[8*1] - C7 * col[8*3] - C1 * col[8*5] - C5 * col[8*7];
- b2 = C5 * col[8*1] - C1 * col[8*3] + C7 * col[8*5] + C3 * col[8*7];
- b3 = C7 * col[8*1] - C5 * col[8*3] + C3 * col[8*5] - C1 * col[8*7];
-
- col[8*0] = (a0 + b0) >> COL_SHIFT;
- col[8*1] = (a1 + b1) >> COL_SHIFT;
- col[8*2] = (a2 + b2) >> COL_SHIFT;
- col[8*3] = (a3 + b3) >> COL_SHIFT;
- col[8*4] = (a3 - b3) >> COL_SHIFT;
- col[8*5] = (a2 - b2) >> COL_SHIFT;
- col[8*6] = (a1 - b1) >> COL_SHIFT;
- col[8*7] = (a0 - b0) >> COL_SHIFT;
+ int a0, a1, a2, a3, b0, b1, b2, b3;
+ const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+/*
+ if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
+ col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
+ col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
+ return;
+ }*/
+
+col[8*0] = input[8*0 + 0];
+col[8*1] = input[8*2 + 0];
+col[8*2] = input[8*0 + 1];
+col[8*3] = input[8*2 + 1];
+col[8*4] = input[8*4 + 0];
+col[8*5] = input[8*6 + 0];
+col[8*6] = input[8*4 + 1];
+col[8*7] = input[8*6 + 1];
+
+ a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
+ a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
+ a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
+ a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
+
+ b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
+ b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
+ b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
+ b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
+
+ col[8*0] = (a0 + b0) >> COL_SHIFT;
+ col[8*1] = (a1 + b1) >> COL_SHIFT;
+ col[8*2] = (a2 + b2) >> COL_SHIFT;
+ col[8*3] = (a3 + b3) >> COL_SHIFT;
+ col[8*4] = (a3 - b3) >> COL_SHIFT;
+ col[8*5] = (a2 - b2) >> COL_SHIFT;
+ col[8*6] = (a1 - b1) >> COL_SHIFT;
+ col[8*7] = (a0 - b0) >> COL_SHIFT;
}
-static void inline idctRow(int16_t * output, int16_t * input)
+static void inline idctRow (int16_t * output, int16_t * input)
{
- int16_t row[8];
-
- int a0, a1, a2, a3, b0, b1, b2, b3;
- const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
- const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-
- row[0] = input[0];
- row[2] = input[1];
- row[4] = input[4];
- row[6] = input[5];
- row[1] = input[8];
- row[3] = input[9];
- row[5] = input[12];
- row[7] = input[13];
-
- if(!(row[1] | row[2] | row[3] | row[4] | row[5] | row[6] | row[7]))
- {
- row[0] = row[1] = row[2] = row[3] = row[4] =
- row[5] = row[6] = row[7] = row[0] << 3;
+ int16_t row[8];
+
+ int a0, a1, a2, a3, b0, b1, b2, b3;
+ const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+ const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+
+row[0] = input[0];
+row[2] = input[1];
+row[4] = input[4];
+row[6] = input[5];
+row[1] = input[8];
+row[3] = input[9];
+row[5] = input[12];
+row[7] = input[13];
+
+ if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
+ row[0] = row[1] = row[2] = row[3] = row[4] =
+ row[5] = row[6] = row[7] = row[0]<<3;
+ output[0] = row[0];
+ output[2] = row[1];
+ output[4] = row[2];
+ output[6] = row[3];
+ output[8] = row[4];
+ output[10] = row[5];
+ output[12] = row[6];
+ output[14] = row[7];
+ return;
+ }
+
+ a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
+ a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
+ a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
+ a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
+
+ b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
+ b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
+ b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
+ b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
+
+ row[0] = (a0 + b0) >> ROW_SHIFT;
+ row[1] = (a1 + b1) >> ROW_SHIFT;
+ row[2] = (a2 + b2) >> ROW_SHIFT;
+ row[3] = (a3 + b3) >> ROW_SHIFT;
+ row[4] = (a3 - b3) >> ROW_SHIFT;
+ row[5] = (a2 - b2) >> ROW_SHIFT;
+ row[6] = (a1 - b1) >> ROW_SHIFT;
+ row[7] = (a0 - b0) >> ROW_SHIFT;
+
output[0] = row[0];
output[2] = row[1];
output[4] = row[2];
@@ -178,45 +206,15 @@ static void inline idctRow(int16_t * output, int16_t * input)
output[10] = row[5];
output[12] = row[6];
output[14] = row[7];
- return;
- }
-
- a0 = C4 * row[0] + C2 * row[2] + C4 * row[4] + C6 * row[6] + (1 << (ROW_SHIFT - 1));
- a1 = C4 * row[0] + C6 * row[2] - C4 * row[4] - C2 * row[6] + (1 << (ROW_SHIFT - 1));
- a2 = C4 * row[0] - C6 * row[2] - C4 * row[4] + C2 * row[6] + (1 << (ROW_SHIFT - 1));
- a3 = C4 * row[0] - C2 * row[2] + C4 * row[4] - C6 * row[6] + (1 << (ROW_SHIFT - 1));
-
- b0 = C1 * row[1] + C3 * row[3] + C5 * row[5] + C7 * row[7];
- b1 = C3 * row[1] - C7 * row[3] - C1 * row[5] - C5 * row[7];
- b2 = C5 * row[1] - C1 * row[3] + C7 * row[5] + C3 * row[7];
- b3 = C7 * row[1] - C5 * row[3] + C3 * row[5] - C1 * row[7];
-
- row[0] = (a0 + b0) >> ROW_SHIFT;
- row[1] = (a1 + b1) >> ROW_SHIFT;
- row[2] = (a2 + b2) >> ROW_SHIFT;
- row[3] = (a3 + b3) >> ROW_SHIFT;
- row[4] = (a3 - b3) >> ROW_SHIFT;
- row[5] = (a2 - b2) >> ROW_SHIFT;
- row[6] = (a1 - b1) >> ROW_SHIFT;
- row[7] = (a0 - b0) >> ROW_SHIFT;
-
- output[0] = row[0];
- output[2] = row[1];
- output[4] = row[2];
- output[6] = row[3];
- output[8] = row[4];
- output[10] = row[5];
- output[12] = row[6];
- output[14] = row[7];
}
#endif
static inline void idct(int16_t *block)
{
- DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
- int16_t * const temp = (int16_t*)align_tmp;
+ DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
+ int16_t * const temp= (int16_t*)align_tmp;
- __asm__ volatile(
+ __asm__ volatile(
#if 0 //Alternative, simpler variant
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
@@ -286,7 +284,7 @@ static inline void idct(int16_t *block)
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
"movq %%mm4, 16+" #dst " \n\t"\
-
+
#define COL_IDCT(src0, src4, src1, src5, dst, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
@@ -360,7 +358,7 @@ static inline void idct(int16_t *block)
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm4, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"\
-
+
#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
@@ -452,21 +450,21 @@ static inline void idct(int16_t *block)
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
- ROW_IDCT((%0), 8(%0), 16(%0), 24(%0), 0(%1), paddd 8(%2), 11)
- /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
- ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
- ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
+ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
+/*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
+ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
+ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
- DC_COND_ROW_IDCT(32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd(%2), 11)
- DC_COND_ROW_IDCT(64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd(%2), 11)
- DC_COND_ROW_IDCT(96(%0), 104(%0), 112(%0), 120(%0), 96(%1), paddd(%2), 11)
+DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
+DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
+DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
//IDCT( src0, src4, src1, src5, dst, shift)
- COL_IDCT((%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
- COL_IDCT(8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
- COL_IDCT(16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
- COL_IDCT(24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
#else
@@ -633,7 +631,7 @@ static inline void idct(int16_t *block)
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
"movq %%mm4, 16+" #dst " \n\t"\
-
+
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
@@ -701,12 +699,12 @@ static inline void idct(int16_t *block)
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
"movq %%mm4, 16+" #dst " \n\t"\
-
+
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
- DC_COND_IDCT(0(%0), 8(%0), 16(%0), 24(%0), 0(%1), paddd 8(%2), 11)
- Z_COND_IDCT(32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd(%2), 11, 4f)
- Z_COND_IDCT(64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd(%2), 11, 2f)
- Z_COND_IDCT(96(%0), 104(%0), 112(%0), 120(%0), 96(%1), paddd(%2), 11, 1f)
+DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
+Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
+Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
+Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
@@ -785,16 +783,16 @@ static inline void idct(int16_t *block)
//IDCT( src0, src4, src1, src5, dst, shift)
- IDCT((%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
- IDCT(8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
- IDCT(16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
- IDCT(24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
"jmp 9f \n\t"
"#" ASMALIGN(4) \
"4: \n\t"
- Z_COND_IDCT(64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd(%2), 11, 6f)
- Z_COND_IDCT(96(%0), 104(%0), 112(%0), 120(%0), 96(%1), paddd(%2), 11, 5f)
+Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
+Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
@@ -860,15 +858,15 @@ static inline void idct(int16_t *block)
"movd %%mm5, 80+" #dst " \n\t"
//IDCT( src0, src4, src1, src5, dst, shift)
- IDCT((%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
- IDCT(8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
- IDCT(16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
- IDCT(24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
"jmp 9f \n\t"
"#" ASMALIGN(4) \
"6: \n\t"
- Z_COND_IDCT(96(%0), 104(%0), 112(%0), 120(%0), 96(%1), paddd(%2), 11, 7f)
+Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
@@ -926,15 +924,15 @@ static inline void idct(int16_t *block)
//IDCT( src0, src4, src1, src5, dst, shift)
- IDCT((%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
- IDCT(8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
- IDCT(16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
- IDCT(24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
"jmp 9f \n\t"
"#" ASMALIGN(4) \
"2: \n\t"
- Z_COND_IDCT(96(%0), 104(%0), 112(%0), 120(%0), 96(%1), paddd(%2), 11, 3f)
+Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
@@ -1003,10 +1001,10 @@ static inline void idct(int16_t *block)
"movd %%mm5, 80+" #dst " \n\t"
//IDCT( src0, src4, src1, src5, dst, shift)
- IDCT((%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
- IDCT(8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
- IDCT(16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
- IDCT(24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
"jmp 9f \n\t"
"#" ASMALIGN(4) \
@@ -1067,10 +1065,10 @@ static inline void idct(int16_t *block)
//IDCT( src0, src4, src1, src5, dst, shift)
- IDCT((%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
- IDCT(8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
- IDCT(16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
- IDCT(24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
"jmp 9f \n\t"
"#" ASMALIGN(4) \
@@ -1131,9 +1129,9 @@ static inline void idct(int16_t *block)
//IDCT( src0, src4, src1, src5, dst, shift)
- IDCT(0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
- IDCT(16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
"jmp 9f \n\t"
@@ -1205,10 +1203,10 @@ static inline void idct(int16_t *block)
//IDCT( src0, src4, src1, src5, dst, shift)
- IDCT((%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
- IDCT(8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
- IDCT(16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
- IDCT(24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
+IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
"jmp 9f \n\t"
@@ -1243,40 +1241,40 @@ static inline void idct(int16_t *block)
"movq %%mm0, 80+" #dst " \n\t"
//IDCT( src0, src4, src1, src5, dst, shift)
- IDCT(0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
+IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
- IDCT(16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
+IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
#endif
- /*
- Input
- 00 40 04 44 20 60 24 64
- 10 30 14 34 50 70 54 74
- 01 41 03 43 21 61 23 63
- 11 31 13 33 51 71 53 73
- 02 42 06 46 22 62 26 66
- 12 32 16 36 52 72 56 76
- 05 45 07 47 25 65 27 67
- 15 35 17 37 55 75 57 77
-
- Temp
- 00 04 10 14 20 24 30 34
- 40 44 50 54 60 64 70 74
- 01 03 11 13 21 23 31 33
- 41 43 51 53 61 63 71 73
- 02 06 12 16 22 26 32 36
- 42 46 52 56 62 66 72 76
- 05 07 15 17 25 27 35 37
- 45 47 55 57 65 67 75 77
- */
-
- "9: \n\t"
- :: "r"(block), "r"(temp), "r"(coeffs)
- : "%eax"
- );
+/*
+Input
+ 00 40 04 44 20 60 24 64
+ 10 30 14 34 50 70 54 74
+ 01 41 03 43 21 61 23 63
+ 11 31 13 33 51 71 53 73
+ 02 42 06 46 22 62 26 66
+ 12 32 16 36 52 72 56 76
+ 05 45 07 47 25 65 27 67
+ 15 35 17 37 55 75 57 77
+
+Temp
+ 00 04 10 14 20 24 30 34
+ 40 44 50 54 60 64 70 74
+ 01 03 11 13 21 23 31 33
+ 41 43 51 53 61 63 71 73
+ 02 06 12 16 22 26 32 36
+ 42 46 52 56 62 66 72 76
+ 05 07 15 17 25 27 35 37
+ 45 47 55 57 65 67 75 77
+*/
+
+"9: \n\t"
+ :: "r" (block), "r" (temp), "r" (coeffs)
+ : "%eax"
+ );
}
void ff_simple_idct_mmx(int16_t *block)
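
A note on the disabled reference code above before the second file: idctRow() short-circuits rows whose AC terms are all zero and writes row[0]<<3 directly. That shortcut is the full rounding path collapsed: with C4 = 16383 (about 1<<14) and ROW_SHIFT = 11, (C4*dc + (1<<10)) >> 11 reduces to dc * 2^(14-11) = dc<<3. A quick check, assuming arithmetic right shift of negative values just as the original code does:

#include <assert.h>

int main(void)
{
    const int C4 = 16383, ROW_SHIFT = 11;
    /* Equality holds for -1023 <= dc <= 1024; for larger DC magnitudes
       the two paths can differ by one LSB. */
    for (int dc = -1023; dc <= 1024; dc++) {
        int full = (C4 * dc + (1 << (ROW_SHIFT - 1))) >> ROW_SHIFT;
        assert(full == dc * 8);   /* dc*8 == dc<<3 as in the code above */
    }
    return 0;
}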
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libswscale/swscale_template.c b/src/filters/transform/MPCVideoDec/ffmpeg/libswscale/swscale_template.c
index 635c3ab79..ca8bcc475 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libswscale/swscale_template.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libswscale/swscale_template.c
@@ -20,7 +20,7 @@
* The C code (not assembly, MMX, ...) of this file can be used
* under the LGPL license.
*/
-
+
/*
* Modified to support multi-thread related features
* by Haruhiko Yamagata <h.yamagata@nifty.com> in 2006.
@@ -236,7 +236,7 @@
"paddw %%mm5, %%mm7 \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
" jnz 2b \n\t"\
-
+
#define YSCALEYUV2PACKEDX_END\
:: "r" (&c->redDither), \
"m" (dummy), "m" (dummy), "m" (dummy),\
@@ -337,7 +337,7 @@
"paddw %%mm0, %%mm7 \n\t"\
"movq "U_TEMP"(%0), %%mm3 \n\t"\
"movq "V_TEMP"(%0), %%mm4 \n\t"\
-
+
#define YSCALEYUV2RGBX \
"psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
@@ -410,7 +410,7 @@
"psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
-
+
#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
#define REAL_YSCALEYUV2RGB(index, c) \
@@ -491,7 +491,7 @@
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t" \
-
+
#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
#define REAL_YSCALEYUV2RGB1(index, c) \
@@ -881,115 +881,108 @@
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
- int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
- uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW0, long chrDstW0)
+ int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
+ uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW0, long chrDstW0)
{
- const stride_t dstW = dstW0;
- const stride_t chrDstW = chrDstW0;
+ const stride_t dstW = dstW0;
+ const stride_t chrDstW = chrDstW0;
#if HAVE_MMX
- if(c->params.subsampling & SWS_ACCURATE_RND)
- {
- if(uDest)
- {
- YSCALEYUV2YV12X_ACCURATE(0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
- YSCALEYUV2YV12X_ACCURATE(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
- }
+ if(c->params.subsampling & SWS_ACCURATE_RND){
+ if(uDest){
+ YSCALEYUV2YV12X_ACCURATE( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
+ YSCALEYUV2YV12X_ACCURATE(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
+ }
- YSCALEYUV2YV12X_ACCURATE(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
- }
- else
- {
- if(uDest)
- {
- YSCALEYUV2YV12X(0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
- YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
- }
+ YSCALEYUV2YV12X_ACCURATE(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
+ }else{
+ if(uDest){
+ YSCALEYUV2YV12X( 0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
+ YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
+ }
- YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
- }
+ YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
+ }
#else
#ifdef HAVE_ALTIVEC
- yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
- chrFilter, chrSrc, chrFilterSize,
- dest, uDest, vDest, dstW, chrDstW);
+yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
+ chrFilter, chrSrc, chrFilterSize,
+ dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
- yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
- chrFilter, chrSrc, chrFilterSize,
- dest, uDest, vDest, dstW, chrDstW);
+yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
+ chrFilter, chrSrc, chrFilterSize,
+ dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif
}
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
- int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
- uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
+ int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
+ uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
- yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
- chrFilter, chrSrc, chrFilterSize,
- dest, uDest, dstW, chrDstW, dstFormat);
+yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
+ chrFilter, chrSrc, chrFilterSize,
+ dest, uDest, dstW, chrDstW, dstFormat);
}
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
- uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW0, long chrDstW0)
+ uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW0, long chrDstW0)
{
- const stride_t dstW = dstW0;
- const stride_t chrDstW = chrDstW0;
+ const stride_t dstW = dstW0;
+ const stride_t chrDstW = chrDstW0;
#if HAVE_MMX
- if(uDest != NULL)
- {
- asm volatile(
- YSCALEYUV2YV121
- :: "r"(chrSrc + chrDstW), "r"(uDest + chrDstW),
- "g"(-chrDstW)
- : "%"REG_a
- );
-
- asm volatile(
- YSCALEYUV2YV121
- :: "r"(chrSrc + 2048 + chrDstW), "r"(vDest + chrDstW),
- "g"(-chrDstW)
- : "%"REG_a
- );
- }
-
- asm volatile(
- YSCALEYUV2YV121
- :: "r"(lumSrc + dstW), "r"(dest + dstW),
- "g"(-dstW)
- : "%"REG_a
- );
+ if(uDest != NULL)
+ {
+ asm volatile(
+ YSCALEYUV2YV121
+ :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
+ "g" (-chrDstW)
+ : "%"REG_a
+ );
+
+ asm volatile(
+ YSCALEYUV2YV121
+ :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
+ "g" (-chrDstW)
+ : "%"REG_a
+ );
+ }
+
+ asm volatile(
+ YSCALEYUV2YV121
+ :: "r" (lumSrc + dstW), "r" (dest + dstW),
+ "g" (-dstW)
+ : "%"REG_a
+ );
#else
- int i;
- for(i = 0; i < dstW; i++)
- {
- int val = lumSrc[i] >> 7;
-
- if(val & 256)
- {
- if(val < 0) val = 0;
- else val = 255;
- }
-
- dest[i] = val;
- }
-
- if(uDest != NULL)
- for(i = 0; i < chrDstW; i++)
- {
- int u = chrSrc[i] >> 7;
- int v = chrSrc[i + 2048] >> 7;
-
- if((u | v) & 256)
- {
- if(u < 0) u = 0;
- else if(u > 255) u = 255;
- if(v < 0) v = 0;
- else if(v > 255) v = 255;
- }
-
- uDest[i] = u;
- vDest[i] = v;
- }
+ int i;
+ for(i=0; i<dstW; i++)
+ {
+ int val= lumSrc[i]>>7;
+
+ if(val&256){
+ if(val<0) val=0;
+ else val=255;
+ }
+
+ dest[i]= val;
+ }
+
+ if(uDest != NULL)
+ for(i=0; i<chrDstW; i++)
+ {
+ int u=chrSrc[i]>>7;
+ int v=chrSrc[i + 2048]>>7;
+
+ if((u|v)&256){
+ if(u<0) u=0;
+ else if (u>255) u=255;
+ if(v<0) v=0;
+ else if (v>255) v=255;
+ }
+
+ uDest[i]= u;
+ vDest[i]= v;
+ }
#endif
}
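
The C fallback in yuv2yuv1() above clamps with an unusual test, if(val&256): because lumSrc[i] is int16_t, val = lumSrc[i]>>7 lies in [-256, 255], and within that window bit 8 is set exactly when val falls outside [0, 255], so the branchy clamp only runs when actually needed. The same reasoning covers the (u|v)&256 chroma test. A small sketch verifying the invariant:

#include <assert.h>

int main(void)
{
    /* For every value a 16-bit source can produce after >>7 ... */
    for (int v = -256; v <= 255; v++) {
        int out_of_range = (v < 0 || v > 255);
        assert(((v & 256) != 0) == out_of_range); /* bit 8 flags it */
    }
    return 0;
}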
@@ -998,1573 +991,1562 @@ static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
* vertical scale YV12 to RGB
*/
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
- int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
- uint8_t *dest, long dstW, long dstY)
+ int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
+ uint8_t *dest, long dstW, long dstY)
{
- long dummy = 0;
+ long dummy=0;
#if HAVE_MMX
- if(c->params.subsampling & SWS_ACCURATE_RND)
- {
- switch(c->dstFormat)
- {
- case IMGFMT_BGR32:
- YSCALEYUV2PACKEDX_ACCURATE
- YSCALEYUV2RGBX
- WRITEBGR32( % 4, % 5, % % REGa)
-
- YSCALEYUV2PACKEDX_END
- return;
- case IMGFMT_BGR24:
- YSCALEYUV2PACKEDX_ACCURATE
- YSCALEYUV2RGBX
- "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
- "add %4, %%"REG_b" \n\t"
- WRITEBGR24( % % REGb, % 5, % % REGa)
-
-
- :: "r"(&c->redDither),
- "m"(dummy), "m"(dummy), "m"(dummy),
- "r"(dest), "m"(dstW)
- : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
- );
- return;
- case IMGFMT_BGR15:
- YSCALEYUV2PACKEDX_ACCURATE
- YSCALEYUV2RGBX
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+ if(c->params.subsampling & SWS_ACCURATE_RND){
+ switch(c->dstFormat){
+ case IMGFMT_BGR32:
+ YSCALEYUV2PACKEDX_ACCURATE
+ YSCALEYUV2RGBX
+ WRITEBGR32(%4, %5, %%REGa)
+
+ YSCALEYUV2PACKEDX_END
+ return;
+ case IMGFMT_BGR24:
+ YSCALEYUV2PACKEDX_ACCURATE
+ YSCALEYUV2RGBX
+ "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
+ "add %4, %%"REG_b" \n\t"
+ WRITEBGR24(%%REGb, %5, %%REGa)
+
+
+ :: "r" (&c->redDither),
+ "m" (dummy), "m" (dummy), "m" (dummy),
+ "r" (dest), "m" (dstW)
+ : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
+ );
+ return;
+ case IMGFMT_BGR15:
+ YSCALEYUV2PACKEDX_ACCURATE
+ YSCALEYUV2RGBX
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
- "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
- "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
- "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
+ "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
+ "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
+ "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
- WRITEBGR15( % 4, % 5, % % REGa)
- YSCALEYUV2PACKEDX_END
- return;
+ WRITEBGR15(%4, %5, %%REGa)
+ YSCALEYUV2PACKEDX_END
+ return;
case IMGFMT_BGR16:
- YSCALEYUV2PACKEDX_ACCURATE
- YSCALEYUV2RGBX
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+ YSCALEYUV2PACKEDX_ACCURATE
+ YSCALEYUV2RGBX
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
- "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
- "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
- "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
+ "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
+ "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
+ "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
- WRITEBGR16( % 4, % 5, % % REGa)
- YSCALEYUV2PACKEDX_END
+ WRITEBGR16(%4, %5, %%REGa)
+ YSCALEYUV2PACKEDX_END
return;
- case IMGFMT_YUY2:
- YSCALEYUV2PACKEDX_ACCURATE
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
-
- "psraw $3, %%mm3 \n\t"
- "psraw $3, %%mm4 \n\t"
- "psraw $3, %%mm1 \n\t"
- "psraw $3, %%mm7 \n\t"
- WRITEYUY2( % 4, % 5, % % REGa)
- YSCALEYUV2PACKEDX_END
- return;
- }
- }
- else
-{
- switch(c->dstFormat)
- {
- case IMGFMT_BGR32:
- YSCALEYUV2PACKEDX
- YSCALEYUV2RGBX
- WRITEBGR32( % 4, % 5, % % REGa)
- YSCALEYUV2PACKEDX_END
- return;
- case IMGFMT_BGR24:
- YSCALEYUV2PACKEDX
- YSCALEYUV2RGBX
- "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
- "add %4, %%"REG_b" \n\t"
- WRITEBGR24( % % REGb, % 5, % % REGa)
-
- :: "r"(&c->redDither),
- "m"(dummy), "m"(dummy), "m"(dummy),
- "r"(dest), "m"(dstW)
- : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
- );
- return;
- case IMGFMT_BGR15:
- YSCALEYUV2PACKEDX
- YSCALEYUV2RGBX
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+ case IMGFMT_YUY2:
+ YSCALEYUV2PACKEDX_ACCURATE
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+
+ "psraw $3, %%mm3 \n\t"
+ "psraw $3, %%mm4 \n\t"
+ "psraw $3, %%mm1 \n\t"
+ "psraw $3, %%mm7 \n\t"
+ WRITEYUY2(%4, %5, %%REGa)
+ YSCALEYUV2PACKEDX_END
+ return;
+ }
+ }else{
+ switch(c->dstFormat)
+ {
+ case IMGFMT_BGR32:
+ YSCALEYUV2PACKEDX
+ YSCALEYUV2RGBX
+ WRITEBGR32(%4, %5, %%REGa)
+ YSCALEYUV2PACKEDX_END
+ return;
+ case IMGFMT_BGR24:
+ YSCALEYUV2PACKEDX
+ YSCALEYUV2RGBX
+ "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
+ "add %4, %%"REG_b" \n\t"
+ WRITEBGR24(%%REGb, %5, %%REGa)
+
+ :: "r" (&c->redDither),
+ "m" (dummy), "m" (dummy), "m" (dummy),
+ "r" (dest), "m" (dstW)
+ : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
+ );
+ return;
+ case IMGFMT_BGR15:
+ YSCALEYUV2PACKEDX
+ YSCALEYUV2RGBX
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
- "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
- "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
- "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
+ "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
+ "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
+ "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
- WRITEBGR15( % 4, % 5, % % REGa)
- YSCALEYUV2PACKEDX_END
- return;
- case IMGFMT_BGR16:
- YSCALEYUV2PACKEDX
- YSCALEYUV2RGBX
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+ WRITEBGR15(%4, %5, %%REGa)
+ YSCALEYUV2PACKEDX_END
+ return;
+ case IMGFMT_BGR16:
+ YSCALEYUV2PACKEDX
+ YSCALEYUV2RGBX
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
- "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
- "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
- "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
+ "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
+ "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
+ "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
- WRITEBGR16( % 4, % 5, % % REGa)
- YSCALEYUV2PACKEDX_END
- return;
- case IMGFMT_YUY2:
- YSCALEYUV2PACKEDX
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
-
- "psraw $3, %%mm3 \n\t"
- "psraw $3, %%mm4 \n\t"
- "psraw $3, %%mm1 \n\t"
- "psraw $3, %%mm7 \n\t"
- WRITEYUY2( % 4, % 5, % % REGa)
- YSCALEYUV2PACKEDX_END
- return;
- }
+ WRITEBGR16(%4, %5, %%REGa)
+ YSCALEYUV2PACKEDX_END
+ return;
+ case IMGFMT_YUY2:
+ YSCALEYUV2PACKEDX
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+
+ "psraw $3, %%mm3 \n\t"
+ "psraw $3, %%mm4 \n\t"
+ "psraw $3, %%mm1 \n\t"
+ "psraw $3, %%mm7 \n\t"
+ WRITEYUY2(%4, %5, %%REGa)
+ YSCALEYUV2PACKEDX_END
+ return;
+ }
}
#endif
#ifdef HAVE_ALTIVEC
- /* The following list of supported dstFormat values should
- match what's found in the body of altivec_yuv2packedX() */
- if(c->dstFormat == IMGFMT_ABGR || c->dstFormat == IMGFMT_BGRA ||
- c->dstFormat == IMGFMT_BGR24 || c->dstFormat == IMGFMT_RGB24 ||
- c->dstFormat == IMGFMT_RGBA || c->dstFormat == IMGFMT_ARGB)
- altivec_yuv2packedX(c, lumFilter, lumSrc, lumFilterSize,
- chrFilter, chrSrc, chrFilterSize,
- dest, dstW, dstY);
- else
+ /* The following list of supported dstFormat values should
+ match what's found in the body of altivec_yuv2packedX() */
+ if(c->dstFormat==IMGFMT_ABGR || c->dstFormat==IMGFMT_BGRA ||
+ c->dstFormat==IMGFMT_BGR24 || c->dstFormat==IMGFMT_RGB24 ||
+ c->dstFormat==IMGFMT_RGBA || c->dstFormat==IMGFMT_ARGB)
+ altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
+ chrFilter, chrSrc, chrFilterSize,
+ dest, dstW, dstY);
+ else
#endif
- yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
- chrFilter, chrSrc, chrFilterSize,
- dest, dstW, dstY);
+ yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
+ chrFilter, chrSrc, chrFilterSize,
+ dest, dstW, dstY);
}
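
yuv2packed2() below derives its blend weight with an XOR, yalpha1 = yalpha^4095. For 0 <= yalpha <= 4095 this equals 4095 - yalpha (XOR with an all-ones 12-bit mask complements within 12 bits), so buf0[i]*yalpha1 + buf1[i]*yalpha is a two-tap vertical linear interpolation whose weights sum to 4095. A one-loop sketch of the identity:

#include <assert.h>

int main(void)
{
    /* XOR against the 12-bit mask behaves as subtraction from 4095. */
    for (int yalpha = 0; yalpha <= 4095; yalpha++)
        assert((yalpha ^ 4095) == 4095 - yalpha);
    return 0;
}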
/**
* vertical bilinear scale YV12 to RGB
*/
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
- uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
+ uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
- int yalpha1 = yalpha ^ 4095;
- int uvalpha1 = uvalpha ^ 4095;
- int i;
+ int yalpha1=yalpha^4095;
+ int uvalpha1=uvalpha^4095;
+ int i;
#if 0 //isn't used
- if(flags & SWS_FULL_CHR_H_INT)
- {
- switch(dstFormat)
- {
+ if(flags&SWS_FULL_CHR_H_INT)
+ {
+ switch(dstFormat)
+ {
#if HAVE_MMX
- case IMGFMT_BGR32:
- asm volatile(
+ case IMGFMT_BGR32:
+ asm volatile(
- FULL_YSCALEYUV2RGB
- "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
- "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
+FULL_YSCALEYUV2RGB
+ "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
+ "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
- "movq %%mm3, %%mm1 \n\t"
- "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
- "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
+ "movq %%mm3, %%mm1 \n\t"
+ "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
+ "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
- MOVNTQ(%%mm3, (%4, %%REGa, 4))
- MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
+ MOVNTQ(%%mm3, (%4, %%REGa, 4))
+ MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
- "add $4, %%"REG_a" \n\t"
- "cmp %5, %%"REG_a" \n\t"
- " jb 1b \n\t"
+ "add $4, %%"REG_a" \n\t"
+ "cmp %5, %%"REG_a" \n\t"
+ " jb 1b \n\t"
- :: "r"(buf0), "r"(buf1), "r"(uvbuf0), "r"(uvbuf1), "r"(dest), "m"((long)dstW),
- "m"(yalpha1), "m"(uvalpha1)
- : "%"REG_a
- );
- break;
- case IMGFMT_BGR24:
- asm volatile(
+ :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
+ "m" (yalpha1), "m" (uvalpha1)
+ : "%"REG_a
+ );
+ break;
+ case IMGFMT_BGR24:
+ asm volatile(
- FULL_YSCALEYUV2RGB
+FULL_YSCALEYUV2RGB
- // lsb ... msb
- "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
- "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
+ // lsb ... msb
+ "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
+ "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
- "movq %%mm3, %%mm1 \n\t"
- "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
- "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
+ "movq %%mm3, %%mm1 \n\t"
+ "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
+ "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
- "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
- "psrlq $8, %%mm3 \n\t" // GR0BGR00
- "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
- "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
- "por %%mm2, %%mm3 \n\t" // BGRBGR00
- "movq %%mm1, %%mm2 \n\t"
- "psllq $48, %%mm1 \n\t" // 000000BG
- "por %%mm1, %%mm3 \n\t" // BGRBGRBG
+ "movq %%mm3, %%mm2 \n\t" // BGR0BGR0
+ "psrlq $8, %%mm3 \n\t" // GR0BGR00
+ "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
+ "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
+ "por %%mm2, %%mm3 \n\t" // BGRBGR00
+ "movq %%mm1, %%mm2 \n\t"
+ "psllq $48, %%mm1 \n\t" // 000000BG
+ "por %%mm1, %%mm3 \n\t" // BGRBGRBG
- "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
- "psrld $16, %%mm2 \n\t" // R000R000
- "psrlq $24, %%mm1 \n\t" // 0BGR0000
- "por %%mm2, %%mm1 \n\t" // RBGRR000
+ "movq %%mm2, %%mm1 \n\t" // BGR0BGR0
+ "psrld $16, %%mm2 \n\t" // R000R000
+ "psrlq $24, %%mm1 \n\t" // 0BGR0000
+ "por %%mm2, %%mm1 \n\t" // RBGRR000
- "mov %4, %%"REG_b" \n\t"
- "add %%"REG_a", %%"REG_b" \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "add %%"REG_a", %%"REG_b" \n\t"
#if HAVE_MMX2
- //FIXME Alignment
- "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
- "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
+ //FIXME Alignment
+ "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
+ "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
#else
- "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
- "psrlq $32, %%mm3 \n\t"
- "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
- "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
+ "movd %%mm3, (%%"REG_b", %%"REG_a", 2) \n\t"
+ "psrlq $32, %%mm3 \n\t"
+ "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2) \n\t"
+ "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2) \n\t"
#endif
- "add $4, %%"REG_a" \n\t"
- "cmp %5, %%"REG_a" \n\t"
- " jb 1b \n\t"
-
- :: "r"(buf0), "r"(buf1), "r"(uvbuf0), "r"(uvbuf1), "m"(dest), "m"(dstW),
- "m"(yalpha1), "m"(uvalpha1)
- : "%"REG_a, "%"REG_b
- );
- break;
- case IMGFMT_BGR15:
- asm volatile(
-
- FULL_YSCALEYUV2RGB
+ "add $4, %%"REG_a" \n\t"
+ "cmp %5, %%"REG_a" \n\t"
+ " jb 1b \n\t"
+
+ :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
+ "m" (yalpha1), "m" (uvalpha1)
+ : "%"REG_a, "%"REG_b
+ );
+ break;
+ case IMGFMT_BGR15:
+ asm volatile(
+
+FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
- "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
- "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
- "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
+ "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
+ "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
+ "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
- "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
- "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
- "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
-
- "psrlw $3, %%mm3 \n\t"
- "psllw $2, %%mm1 \n\t"
- "psllw $7, %%mm0 \n\t"
- "pand "MANGLE(g15Mask)", %%mm1 \n\t"
- "pand "MANGLE(r15Mask)", %%mm0 \n\t"
-
- "por %%mm3, %%mm1 \n\t"
- "por %%mm1, %%mm0 \n\t"
-
- MOVNTQ(%%mm0, (%4, %%REGa, 2))
-
- "add $4, %%"REG_a" \n\t"
- "cmp %5, %%"REG_a" \n\t"
- " jb 1b \n\t"
-
- :: "r"(buf0), "r"(buf1), "r"(uvbuf0), "r"(uvbuf1), "r"(dest), "m"(dstW),
- "m"(yalpha1), "m"(uvalpha1)
- : "%"REG_a
- );
- break;
- case IMGFMT_BGR16:
- asm volatile(
-
- FULL_YSCALEYUV2RGB
+ "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
+ "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
+ "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
+
+ "psrlw $3, %%mm3 \n\t"
+ "psllw $2, %%mm1 \n\t"
+ "psllw $7, %%mm0 \n\t"
+ "pand "MANGLE(g15Mask)", %%mm1 \n\t"
+ "pand "MANGLE(r15Mask)", %%mm0 \n\t"
+
+ "por %%mm3, %%mm1 \n\t"
+ "por %%mm1, %%mm0 \n\t"
+
+ MOVNTQ(%%mm0, (%4, %%REGa, 2))
+
+ "add $4, %%"REG_a" \n\t"
+ "cmp %5, %%"REG_a" \n\t"
+ " jb 1b \n\t"
+
+ :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
+ "m" (yalpha1), "m" (uvalpha1)
+ : "%"REG_a
+ );
+ break;
+ case IMGFMT_BGR16:
+ asm volatile(
+
+FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
- "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
- "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
- "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
+ "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
+ "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
+ "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
- "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
- "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
- "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
-
- "psrlw $3, %%mm3 \n\t"
- "psllw $3, %%mm1 \n\t"
- "psllw $8, %%mm0 \n\t"
- "pand "MANGLE(g16Mask)", %%mm1 \n\t"
- "pand "MANGLE(r16Mask)", %%mm0 \n\t"
-
- "por %%mm3, %%mm1 \n\t"
- "por %%mm1, %%mm0 \n\t"
-
- MOVNTQ(%%mm0, (%4, %%REGa, 2))
-
- "add $4, %%"REG_a" \n\t"
- "cmp %5, %%"REG_a" \n\t"
- " jb 1b \n\t"
-
- :: "r"(buf0), "r"(buf1), "r"(uvbuf0), "r"(uvbuf1), "r"(dest), "m"(dstW),
- "m"(yalpha1), "m"(uvalpha1)
- : "%"REG_a
- );
- break;
+ "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
+ "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
+ "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
+
+ "psrlw $3, %%mm3 \n\t"
+ "psllw $3, %%mm1 \n\t"
+ "psllw $8, %%mm0 \n\t"
+ "pand "MANGLE(g16Mask)", %%mm1 \n\t"
+ "pand "MANGLE(r16Mask)", %%mm0 \n\t"
+
+ "por %%mm3, %%mm1 \n\t"
+ "por %%mm1, %%mm0 \n\t"
+
+ MOVNTQ(%%mm0, (%4, %%REGa, 2))
+
+ "add $4, %%"REG_a" \n\t"
+ "cmp %5, %%"REG_a" \n\t"
+ " jb 1b \n\t"
+
+ :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
+ "m" (yalpha1), "m" (uvalpha1)
+ : "%"REG_a
+ );
+ break;
#endif
- case IMGFMT_RGB32:
+ case IMGFMT_RGB32:
#if !HAVE_MMX
- case IMGFMT_BGR32:
+ case IMGFMT_BGR32:
#endif
- if(dstFormat == IMGFMT_BGR32)
- {
- int i;
+ if(dstFormat==IMGFMT_BGR32)
+ {
+ int i;
#ifdef WORDS_BIGENDIAN
- dest++;
+ dest++;
#endif
- for(i = 0; i < dstW; i++)
- {
- // vertical linear interpolation && yuv2rgb in a single step:
- int Y = yuvtab_2568[((buf0[i] * yalpha1 + buf1[i] * yalpha) >> 19)];
- int U = ((uvbuf0[i] * uvalpha1 + uvbuf1[i] * uvalpha) >> 19);
- int V = ((uvbuf0[i+2048] * uvalpha1 + uvbuf1[i+2048] * uvalpha) >> 19);
- dest[0] = clip_table[((Y + yuvtab_40cf[U]) >> 13)];
- dest[1] = clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >> 13)];
- dest[2] = clip_table[((Y + yuvtab_3343[V]) >> 13)];
- dest += 4;
- }
- }
- else if(dstFormat == IMGFMT_BGR24)
- {
- int i;
- for(i = 0; i < dstW; i++)
- {
- // vertical linear interpolation && yuv2rgb in a single step:
- int Y = yuvtab_2568[((buf0[i] * yalpha1 + buf1[i] * yalpha) >> 19)];
- int U = ((uvbuf0[i] * uvalpha1 + uvbuf1[i] * uvalpha) >> 19);
- int V = ((uvbuf0[i+2048] * uvalpha1 + uvbuf1[i+2048] * uvalpha) >> 19);
- dest[0] = clip_table[((Y + yuvtab_40cf[U]) >> 13)];
- dest[1] = clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >> 13)];
- dest[2] = clip_table[((Y + yuvtab_3343[V]) >> 13)];
- dest += 3;
- }
- }
- else if(dstFormat == IMGFMT_BGR16)
- {
- int i;
- for(i = 0; i < dstW; i++)
- {
- // vertical linear interpolation && yuv2rgb in a single step:
- int Y = yuvtab_2568[((buf0[i] * yalpha1 + buf1[i] * yalpha) >> 19)];
- int U = ((uvbuf0[i] * uvalpha1 + uvbuf1[i] * uvalpha) >> 19);
- int V = ((uvbuf0[i+2048] * uvalpha1 + uvbuf1[i+2048] * uvalpha) >> 19);
-
- ((uint16_t*)dest)[i] =
- clip_table16b[(Y + yuvtab_40cf[U]) >> 13] |
- clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >> 13] |
- clip_table16r[(Y + yuvtab_3343[V]) >> 13];
- }
- }
- else if(dstFormat == IMGFMT_BGR15)
- {
- int i;
- for(i = 0; i < dstW; i++)
- {
- // vertical linear interpolation && yuv2rgb in a single step:
- int Y = yuvtab_2568[((buf0[i] * yalpha1 + buf1[i] * yalpha) >> 19)];
- int U = ((uvbuf0[i] * uvalpha1 + uvbuf1[i] * uvalpha) >> 19);
- int V = ((uvbuf0[i+2048] * uvalpha1 + uvbuf1[i+2048] * uvalpha) >> 19);
-
- ((uint16_t*)dest)[i] =
- clip_table15b[(Y + yuvtab_40cf[U]) >> 13] |
- clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >> 13] |
- clip_table15r[(Y + yuvtab_3343[V]) >> 13];
- }
- }
- }//FULL_UV_IPOL
- else
- {
+ for(i=0;i<dstW;i++){
+ // vertical linear interpolation && yuv2rgb in a single step:
+ int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
+ int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
+ int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
+ dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
+ dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
+ dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
+ dest+= 4;
+ }
+ }
+ else if(dstFormat==IMGFMT_BGR24)
+ {
+ int i;
+ for(i=0;i<dstW;i++){
+ // vertical linear interpolation && yuv2rgb in a single step:
+ int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
+ int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
+ int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
+ dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
+ dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
+ dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
+ dest+= 3;
+ }
+ }
+ else if(dstFormat==IMGFMT_BGR16)
+ {
+ int i;
+ for(i=0;i<dstW;i++){
+ // vertical linear interpolation && yuv2rgb in a single step:
+ int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
+ int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
+ int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
+
+ ((uint16_t*)dest)[i] =
+ clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
+ clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
+ clip_table16r[(Y + yuvtab_3343[V]) >>13];
+ }
+ }
+ else if(dstFormat==IMGFMT_BGR15)
+ {
+ int i;
+ for(i=0;i<dstW;i++){
+ // vertical linear interpolation && yuv2rgb in a single step:
+ int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
+ int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
+ int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
+
+ ((uint16_t*)dest)[i] =
+ clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
+ clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
+ clip_table15r[(Y + yuvtab_3343[V]) >>13];
+ }
+ }
+ }//FULL_UV_IPOL
+ else
+ {
#endif // if 0
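
In the disabled IMGFMT_BGR16 branch above, the clip_table16b/g/r lookups fold clamping and 5-6-5 packing into one step, ORing three pre-shifted fields into a 16-bit pixel. A sketch of the equivalent direct packing for already-clamped 8-bit channels (assumption: blue in the low 5 bits; the real channel order is fixed by the clip_table16* definitions, which lie outside this diff):

#include <assert.h>

/* 5 bits red, 6 bits green, 5 bits blue; swap r and b for the opposite order */
static unsigned pack_565(unsigned r, unsigned g, unsigned b)
{
    return ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3);
}

int main(void)
{
    assert(pack_565(255, 255, 255) == 0xFFFF); /* white saturates all fields */
    assert(pack_565(0, 255, 0) == 0x07E0);     /* pure green: the 6-bit field */
    return 0;
}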
#if HAVE_MMX
- switch(c->dstFormat)
- {
+ switch(c->dstFormat)
+ {
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
- case IMGFMT_BGR32:
- asm volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB(%%REGBP, %5)
- WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
-
- :: "c"(buf0), "d"(buf1), "S"(uvbuf0), "D"(uvbuf1), "m"(dest),
- "a"(&c->redDither)
- );
- return;
- case IMGFMT_BGR24:
- asm volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB(%%REGBP, %5)
- WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
- :: "c"(buf0), "d"(buf1), "S"(uvbuf0), "D"(uvbuf1), "m"(dest),
- "a"(&c->redDither)
- );
- return;
- case IMGFMT_BGR15:
- asm volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB(%%REGBP, %5)
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+ case IMGFMT_BGR32:
+ asm volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB(%%REGBP, %5)
+ WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+
+ :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ return;
+ case IMGFMT_BGR24:
+ asm volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB(%%REGBP, %5)
+ WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+ :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ return;
+ case IMGFMT_BGR15:
+ asm volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB(%%REGBP, %5)
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
- "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
- "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
- "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
+ "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
+ "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
+ "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
- WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
-
- :: "c"(buf0), "d"(buf1), "S"(uvbuf0), "D"(uvbuf1), "m"(dest),
- "a"(&c->redDither)
- );
- return;
- case IMGFMT_BGR16:
- asm volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB(%%REGBP, %5)
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+ WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+
+ :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ return;
+ case IMGFMT_BGR16:
+ asm volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB(%%REGBP, %5)
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
- "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
- "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
- "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
+ "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
+ "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
+ "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
- WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
- :: "c"(buf0), "d"(buf1), "S"(uvbuf0), "D"(uvbuf1), "m"(dest),
- "a"(&c->redDither)
- );
- return;
- case IMGFMT_YUY2:
- asm volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2PACKED(%%REGBP, %5)
- WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
- :: "c"(buf0), "d"(buf1), "S"(uvbuf0), "D"(uvbuf1), "m"(dest),
- "a"(&c->redDither)
- );
- return;
- default:
- break;
- }
+ WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+ :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ return;
+ case IMGFMT_YUY2:
+ asm volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2PACKED(%%REGBP, %5)
+ WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+ :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ return;
+ default: break;
+ }
#endif //HAVE_MMX
- YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
- }
+YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
+}
- /**
- * YV12 to RGB without scaling or interpolating
- */
- static inline void RENAME(yuv2packed1)(SwsContext * c, uint16_t * buf0, uint16_t * uvbuf0, uint16_t * uvbuf1,
- uint8_t * dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
- {
- const int yalpha1 = 0;
- int i;
+/**
+ * YV12 to RGB without scaling or interpolating
+ */
+static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
+ uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
+{
+ const int yalpha1=0;
+ int i;
- uint16_t *buf1 = buf0; //FIXME needed for the rgb1/bgr1
- const int yalpha = 4096; //FIXME ...
+ uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
+ const int yalpha= 4096; //FIXME ...
- if(flags & SWS_FULL_CHR_H_INT)
- {
- RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
- return;
- }
+ if(flags&SWS_FULL_CHR_H_INT)
+ {
+ RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
+ return;
+ }
#if HAVE_MMX
- if(uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
- {
- switch(dstFormat)
- {
- case IMGFMT_BGR32:
- asm volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB1(%%REGBP, %5)
- WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
-
- :: "c"(buf0), "d"(buf1), "S"(uvbuf0), "D"(uvbuf1), "m"(dest),
- "a"(&c->redDither)
- );
- return;
- case IMGFMT_BGR24:
- asm volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB1(%%REGBP, %5)
- WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
-
- :: "c"(buf0), "d"(buf1), "S"(uvbuf0), "D"(uvbuf1), "m"(dest),
- "a"(&c->redDither)
- );
- return;
- case IMGFMT_BGR15:
- asm volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB1(%%REGBP, %5)
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+ if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
+ {
+ switch(dstFormat)
+ {
+ case IMGFMT_BGR32:
+ asm volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB1(%%REGBP, %5)
+ WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+
+ :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ return;
+ case IMGFMT_BGR24:
+ asm volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB1(%%REGBP, %5)
+ WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+
+ :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ return;
+ case IMGFMT_BGR15:
+ asm volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB1(%%REGBP, %5)
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
- "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
- "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
- "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
+ "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
+ "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
+ "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
- WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
-
- :: "c"(buf0), "d"(buf1), "S"(uvbuf0), "D"(uvbuf1), "m"(dest),
- "a"(&c->redDither)
- );
- return;
- case IMGFMT_BGR16:
- asm volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB1(%%REGBP, %5)
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+ WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+
+ :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ return;
+ case IMGFMT_BGR16:
+ asm volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB1(%%REGBP, %5)
+ /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
- "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
- "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
- "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
+ "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
+ "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
+ "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
- WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
-
- :: "c"(buf0), "d"(buf1), "S"(uvbuf0), "D"(uvbuf1), "m"(dest),
- "a"(&c->redDither)
- );
- return;
- case IMGFMT_YUY2:
- asm volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2PACKED1(%%REGBP, %5)
- WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
-
- :: "c"(buf0), "d"(buf1), "S"(uvbuf0), "D"(uvbuf1), "m"(dest),
- "a"(&c->redDither)
- );
- return;
- }
- }
- else
- {
- switch(dstFormat)
- {
- case IMGFMT_BGR32:
- asm volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB1b(%%REGBP, %5)
- WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
-
- :: "c"(buf0), "d"(buf1), "S"(uvbuf0), "D"(uvbuf1), "m"(dest),
- "a"(&c->redDither)
- );
- return;
- case IMGFMT_BGR24:
- asm volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB1b(%%REGBP, %5)
- WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
-
- :: "c"(buf0), "d"(buf1), "S"(uvbuf0), "D"(uvbuf1), "m"(dest),
- "a"(&c->redDither)
- );
- return;
- case IMGFMT_BGR15:
- asm volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB1b(%%REGBP, %5)
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+ WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+
+ :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ return;
+ case IMGFMT_YUY2:
+ asm volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2PACKED1(%%REGBP, %5)
+ WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+
+ :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ return;
+ }
+ }
+ else
+ {
+ switch(dstFormat)
+ {
+ case IMGFMT_BGR32:
+ asm volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB1b(%%REGBP, %5)
+ WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+
+ :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ return;
+ case IMGFMT_BGR24:
+ asm volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB1b(%%REGBP, %5)
+ WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+
+ :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ return;
+ case IMGFMT_BGR15:
+ asm volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB1b(%%REGBP, %5)
+        /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
- "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
- "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
- "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
+ "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
+ "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
+ "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
- WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
-
- :: "c"(buf0), "d"(buf1), "S"(uvbuf0), "D"(uvbuf1), "m"(dest),
- "a"(&c->redDither)
- );
- return;
- case IMGFMT_BGR16:
- asm volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2RGB1b(%%REGBP, %5)
- /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
+ WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+
+ :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ return;
+ case IMGFMT_BGR16:
+ asm volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2RGB1b(%%REGBP, %5)
+        /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
- "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
- "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
- "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
+ "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
+ "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
+ "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
- WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
-
- :: "c"(buf0), "d"(buf1), "S"(uvbuf0), "D"(uvbuf1), "m"(dest),
- "a"(&c->redDither)
- );
- return;
- case IMGFMT_YUY2:
- asm volatile(
- "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
- "mov %4, %%"REG_b" \n\t"
- "push %%"REG_BP" \n\t"
- YSCALEYUV2PACKED1b(%%REGBP, %5)
- WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
- "pop %%"REG_BP" \n\t"
- "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
-
- :: "c"(buf0), "d"(buf1), "S"(uvbuf0), "D"(uvbuf1), "m"(dest),
- "a"(&c->redDither)
- );
- return;
- }
- }
+ WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+
+ :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ return;
+ case IMGFMT_YUY2:
+ asm volatile(
+ "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
+ "mov %4, %%"REG_b" \n\t"
+ "push %%"REG_BP" \n\t"
+ YSCALEYUV2PACKED1b(%%REGBP, %5)
+ WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
+ "pop %%"REG_BP" \n\t"
+ "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
+
+ :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
+ "a" (&c->redDither)
+ );
+ return;
+ }
+ }
#endif
- if(uvalpha < 2048)
- {
- YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
- }
- else
- {
- YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
- }
- }
+ if( uvalpha < 2048 )
+ {
+ YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
+ }else{
+ YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
+ }
+}
//FIXME yuy2* can read up to 7 samples too many
- static inline void RENAME(yuy2ToY)(uint8_t * dst, uint8_t * src, long width)
- {
+static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
+{
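+    /* YUY2 packs pixels as Y0 U0 Y1 V0 ..., so the luma plane is simply every even byte. */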
#if HAVE_MMX
- asm volatile(
- "movq "MANGLE(bm01010101)", %%mm2\n\t"
- "mov %0, %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1, %%"REG_a",2), %%mm0 \n\t"
- "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
- "pand %%mm2, %%mm0 \n\t"
- "pand %%mm2, %%mm1 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "movq %%mm0, (%2, %%"REG_a") \n\t"
- "add $8, %%"REG_a" \n\t"
- " js 1b \n\t"
- : : "g"((stride_t)-width), "r"(src+width*2), "r"(dst+width)
- : "%"REG_a
- );
+ asm volatile(
+ "movq "MANGLE(bm01010101)", %%mm2\n\t"
+ "mov %0, %%"REG_a" \n\t"
+ "1: \n\t"
+ "movq (%1, %%"REG_a",2), %%mm0 \n\t"
+ "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
+ "pand %%mm2, %%mm0 \n\t"
+ "pand %%mm2, %%mm1 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "movq %%mm0, (%2, %%"REG_a") \n\t"
+ "add $8, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ : : "g" ((stride_t)-width), "r" (src+width*2), "r" (dst+width)
+ : "%"REG_a
+ );
#else
- int i;
- for(i = 0; i < width; i++)
- dst[i] = src[2*i];
+ int i;
+ for(i=0; i<width; i++)
+ dst[i]= src[2*i];
#endif
- }
+}
- static inline void RENAME(yuy2ToUV)(uint8_t * dstU, uint8_t * dstV, uint8_t * src1, uint8_t * src2, long width)
- {
+static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
+{
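+    /* Deinterleave U and V from two YUY2 lines, averaging the lines vertically for 4:2:0 chroma. */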
#if HAVE_MMX2 || HAVE_AMD3DNOW
- asm volatile(
- "movq "MANGLE(bm01010101)", %%mm4\n\t"
- "mov %0, %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1, %%"REG_a",4), %%mm0 \n\t"
- "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
- "movq (%2, %%"REG_a",4), %%mm2 \n\t"
- "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
- PAVGB(%%mm2, %%mm0)
- PAVGB(%%mm3, %%mm1)
- "psrlw $8, %%mm0 \n\t"
- "psrlw $8, %%mm1 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "psrlw $8, %%mm0 \n\t"
- "pand %%mm4, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- "movd %%mm0, (%4, %%"REG_a") \n\t"
- "movd %%mm1, (%3, %%"REG_a") \n\t"
- "add $4, %%"REG_a" \n\t"
- " js 1b \n\t"
- : : "g"((stride_t)-width), "r"(src1+width*4), "r"(src2+width*4), "r"(dstU+width), "r"(dstV+width)
- : "%"REG_a
- );
+ asm volatile(
+ "movq "MANGLE(bm01010101)", %%mm4\n\t"
+ "mov %0, %%"REG_a" \n\t"
+ "1: \n\t"
+ "movq (%1, %%"REG_a",4), %%mm0 \n\t"
+ "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
+ "movq (%2, %%"REG_a",4), %%mm2 \n\t"
+ "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
+ PAVGB(%%mm2, %%mm0)
+ PAVGB(%%mm3, %%mm1)
+ "psrlw $8, %%mm0 \n\t"
+ "psrlw $8, %%mm1 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "psrlw $8, %%mm0 \n\t"
+ "pand %%mm4, %%mm1 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ "movd %%mm0, (%4, %%"REG_a") \n\t"
+ "movd %%mm1, (%3, %%"REG_a") \n\t"
+ "add $4, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ : : "g" ((stride_t)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
+ : "%"REG_a
+ );
#else
- int i;
- for(i = 0; i < width; i++)
- {
- dstU[i] = (src1[4*i + 1] + src2[4*i + 1]) >> 1;
- dstV[i] = (src1[4*i + 3] + src2[4*i + 3]) >> 1;
- }
+ int i;
+ for(i=0; i<width; i++)
+ {
+ dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
+ dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
+ }
#endif
- }
+}
//this is almost identical to the previous one, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
- static inline void RENAME(uyvyToY)(uint8_t * dst, uint8_t * src, long width)
- {
+static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
+{
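+    /* UYVY stores luma in the odd bytes, hence src[2*i+1] (and the psrlw $8 in the MMX path). */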
#if HAVE_MMX
- asm volatile(
- "mov %0, %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1, %%"REG_a",2), %%mm0 \n\t"
- "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
- "psrlw $8, %%mm0 \n\t"
- "psrlw $8, %%mm1 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "movq %%mm0, (%2, %%"REG_a") \n\t"
- "add $8, %%"REG_a" \n\t"
- " js 1b \n\t"
- : : "g"((stride_t)-width), "r"(src+width*2), "r"(dst+width)
- : "%"REG_a
- );
+ asm volatile(
+ "mov %0, %%"REG_a" \n\t"
+ "1: \n\t"
+ "movq (%1, %%"REG_a",2), %%mm0 \n\t"
+ "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
+ "psrlw $8, %%mm0 \n\t"
+ "psrlw $8, %%mm1 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "movq %%mm0, (%2, %%"REG_a") \n\t"
+ "add $8, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ : : "g" ((stride_t)-width), "r" (src+width*2), "r" (dst+width)
+ : "%"REG_a
+ );
#else
- int i;
- for(i = 0; i < width; i++)
- dst[i] = src[2*i+1];
+ int i;
+ for(i=0; i<width; i++)
+ dst[i]= src[2*i+1];
#endif
- }
+}
- static inline void RENAME(uyvyToUV)(uint8_t * dstU, uint8_t * dstV, uint8_t * src1, uint8_t * src2, long width)
- {
+static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
+{
#if HAVE_MMX2 || HAVE_AMD3DNOW
- asm volatile(
- "movq "MANGLE(bm01010101)", %%mm4\n\t"
- "mov %0, %%"REG_a" \n\t"
- "1: \n\t"
- "movq (%1, %%"REG_a",4), %%mm0 \n\t"
- "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
- "movq (%2, %%"REG_a",4), %%mm2 \n\t"
- "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
- PAVGB(%%mm2, %%mm0)
- PAVGB(%%mm3, %%mm1)
- "pand %%mm4, %%mm0 \n\t"
- "pand %%mm4, %%mm1 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "psrlw $8, %%mm0 \n\t"
- "pand %%mm4, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- "movd %%mm0, (%4, %%"REG_a") \n\t"
- "movd %%mm1, (%3, %%"REG_a") \n\t"
- "add $4, %%"REG_a" \n\t"
- " js 1b \n\t"
- : : "g"((stride_t)-width), "r"(src1+width*4), "r"(src2+width*4), "r"(dstU+width), "r"(dstV+width)
- : "%"REG_a
- );
+ asm volatile(
+ "movq "MANGLE(bm01010101)", %%mm4\n\t"
+ "mov %0, %%"REG_a" \n\t"
+ "1: \n\t"
+ "movq (%1, %%"REG_a",4), %%mm0 \n\t"
+ "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
+ "movq (%2, %%"REG_a",4), %%mm2 \n\t"
+ "movq 8(%2, %%"REG_a",4), %%mm3 \n\t"
+ PAVGB(%%mm2, %%mm0)
+ PAVGB(%%mm3, %%mm1)
+ "pand %%mm4, %%mm0 \n\t"
+ "pand %%mm4, %%mm1 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "psrlw $8, %%mm0 \n\t"
+ "pand %%mm4, %%mm1 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ "movd %%mm0, (%4, %%"REG_a") \n\t"
+ "movd %%mm1, (%3, %%"REG_a") \n\t"
+ "add $4, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ : : "g" ((stride_t)-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
+ : "%"REG_a
+ );
#else
- int i;
- for(i = 0; i < width; i++)
- {
- dstU[i] = (src1[4*i + 0] + src2[4*i + 0]) >> 1;
- dstV[i] = (src1[4*i + 2] + src2[4*i + 2]) >> 1;
- }
+ int i;
+ for(i=0; i<width; i++)
+ {
+ dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
+ dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
+ }
#endif
- }
-
- static inline void RENAME(bgr32ToY)(uint8_t * dst, uint8_t * src, int width)
- {
- int i;
- for(i = 0; i < width; i++)
- {
- int b = ((uint32_t*)src)[i] & 0xFF;
- int g = (((uint32_t*)src)[i] >> 8) & 0xFF;
- int r = (((uint32_t*)src)[i] >> 16) & 0xFF;
+}
- dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
- }
- }
+static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
+{
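+    /* Y = ((RY*R + GY*G + BY*B)>>RGB2YUV_SHIFT) + 16; the constant 33<<(RGB2YUV_SHIFT-1)
+       folds the +16 offset and the +0.5 rounding into one term (33 = 2*16 + 1). */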
+ int i;
+ for(i=0; i<width; i++)
+ {
+ int b= ((uint32_t*)src)[i]&0xFF;
+ int g= (((uint32_t*)src)[i]>>8)&0xFF;
+ int r= (((uint32_t*)src)[i]>>16)&0xFF;
+
+ dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
+ }
+}
- static inline void RENAME(bgr32ToUV)(uint8_t * dstU, uint8_t * dstV, uint8_t * src1, uint8_t * src2, int width)
- {
- int i;
- for(i = 0; i < width; i++)
- {
- const int a = ((uint32_t*)src1)[2*i+0];
- const int e = ((uint32_t*)src1)[2*i+1];
- const int c = ((uint32_t*)src2)[2*i+0];
- const int d = ((uint32_t*)src2)[2*i+1];
- const int l = (a & 0xFF00FF) + (e & 0xFF00FF) + (c & 0xFF00FF) + (d & 0xFF00FF);
- const int h = (a & 0x00FF00) + (e & 0x00FF00) + (c & 0x00FF00) + (d & 0x00FF00);
- const int b = l & 0x3FF;
- const int g = h >> 8;
- const int r = l >> 16;
-
- dstU[i] = ((RU * r + GU * g + BU * b) >> (RGB2YUV_SHIFT + 2)) + 128;
- dstV[i] = ((RV * r + GV * g + BV * b) >> (RGB2YUV_SHIFT + 2)) + 128;
- }
- }
+static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
+{
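+    /* Average a 2x2 block of BGR32 pixels: the 0xFF00FF/0x00FF00 masks let the four R/B and
+       G bytes be summed in packed form without unpacking each pixel. */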
+ int i;
+ for(i=0; i<width; i++)
+ {
+ const int a= ((uint32_t*)src1)[2*i+0];
+ const int e= ((uint32_t*)src1)[2*i+1];
+ const int c= ((uint32_t*)src2)[2*i+0];
+ const int d= ((uint32_t*)src2)[2*i+1];
+ const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
+ const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
+ const int b= l&0x3FF;
+ const int g= h>>8;
+ const int r= l>>16;
+
+ dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
+ dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
+ }
+}
- static inline void RENAME(bgr24ToY)(uint8_t * dst, uint8_t * src, long width)
- {
+static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
+{
#if HAVE_MMX
- asm volatile(
- "mov %2, %%"REG_a" \n\t"
- "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
- "movq "MANGLE(w1111)", %%mm5 \n\t"
- "pxor %%mm7, %%mm7 \n\t"
- "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
- ASMALIGN16
- "1: \n\t"
- PREFETCH" 64(%0, %%"REG_b") \n\t"
- "movd (%0, %%"REG_b"), %%mm0 \n\t"
- "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
- "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm3 \n\t"
- "pmaddwd %%mm6, %%mm0 \n\t"
- "pmaddwd %%mm6, %%mm1 \n\t"
- "pmaddwd %%mm6, %%mm2 \n\t"
- "pmaddwd %%mm6, %%mm3 \n\t"
+ asm volatile(
+ "mov %2, %%"REG_a" \n\t"
+ "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
+ "movq "MANGLE(w1111)", %%mm5 \n\t"
+ "pxor %%mm7, %%mm7 \n\t"
+ "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
+ ASMALIGN16
+ "1: \n\t"
+ PREFETCH" 64(%0, %%"REG_b") \n\t"
+ "movd (%0, %%"REG_b"), %%mm0 \n\t"
+ "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
+ "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "pmaddwd %%mm6, %%mm0 \n\t"
+ "pmaddwd %%mm6, %%mm1 \n\t"
+ "pmaddwd %%mm6, %%mm2 \n\t"
+ "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
- "psrad $8, %%mm0 \n\t"
- "psrad $8, %%mm1 \n\t"
- "psrad $8, %%mm2 \n\t"
- "psrad $8, %%mm3 \n\t"
+ "psrad $8, %%mm0 \n\t"
+ "psrad $8, %%mm1 \n\t"
+ "psrad $8, %%mm2 \n\t"
+ "psrad $8, %%mm3 \n\t"
#endif
- "packssdw %%mm1, %%mm0 \n\t"
- "packssdw %%mm3, %%mm2 \n\t"
- "pmaddwd %%mm5, %%mm0 \n\t"
- "pmaddwd %%mm5, %%mm2 \n\t"
- "packssdw %%mm2, %%mm0 \n\t"
- "psraw $7, %%mm0 \n\t"
-
- "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
- "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
- "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm3 \n\t"
- "pmaddwd %%mm6, %%mm4 \n\t"
- "pmaddwd %%mm6, %%mm1 \n\t"
- "pmaddwd %%mm6, %%mm2 \n\t"
- "pmaddwd %%mm6, %%mm3 \n\t"
+ "packssdw %%mm1, %%mm0 \n\t"
+ "packssdw %%mm3, %%mm2 \n\t"
+ "pmaddwd %%mm5, %%mm0 \n\t"
+ "pmaddwd %%mm5, %%mm2 \n\t"
+ "packssdw %%mm2, %%mm0 \n\t"
+ "psraw $7, %%mm0 \n\t"
+
+ "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
+ "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
+ "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "pmaddwd %%mm6, %%mm4 \n\t"
+ "pmaddwd %%mm6, %%mm1 \n\t"
+ "pmaddwd %%mm6, %%mm2 \n\t"
+ "pmaddwd %%mm6, %%mm3 \n\t"
#ifndef FAST_BGR2YV12
- "psrad $8, %%mm4 \n\t"
- "psrad $8, %%mm1 \n\t"
- "psrad $8, %%mm2 \n\t"
- "psrad $8, %%mm3 \n\t"
+ "psrad $8, %%mm4 \n\t"
+ "psrad $8, %%mm1 \n\t"
+ "psrad $8, %%mm2 \n\t"
+ "psrad $8, %%mm3 \n\t"
#endif
- "packssdw %%mm1, %%mm4 \n\t"
- "packssdw %%mm3, %%mm2 \n\t"
- "pmaddwd %%mm5, %%mm4 \n\t"
- "pmaddwd %%mm5, %%mm2 \n\t"
- "add $24, %%"REG_b" \n\t"
- "packssdw %%mm2, %%mm4 \n\t"
- "psraw $7, %%mm4 \n\t"
-
- "packuswb %%mm4, %%mm0 \n\t"
- "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
-
- "movq %%mm0, (%1, %%"REG_a") \n\t"
- "add $8, %%"REG_a" \n\t"
- " js 1b \n\t"
- : : "r"(src+width*3), "r"(dst+width), "g"((stride_t)-width)
- : "%"REG_a, "%"REG_b
- );
+ "packssdw %%mm1, %%mm4 \n\t"
+ "packssdw %%mm3, %%mm2 \n\t"
+ "pmaddwd %%mm5, %%mm4 \n\t"
+ "pmaddwd %%mm5, %%mm2 \n\t"
+ "add $24, %%"REG_b" \n\t"
+ "packssdw %%mm2, %%mm4 \n\t"
+ "psraw $7, %%mm4 \n\t"
+
+ "packuswb %%mm4, %%mm0 \n\t"
+ "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
+
+ "movq %%mm0, (%1, %%"REG_a") \n\t"
+ "add $8, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ : : "r" (src+width*3), "r" (dst+width), "g" ((stride_t)-width)
+ : "%"REG_a, "%"REG_b
+ );
#else
- int i;
- for(i = 0; i < width; i++)
- {
- int b = src[i*3+0];
- int g = src[i*3+1];
- int r = src[i*3+2];
-
- dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
- }
+ int i;
+ for(i=0; i<width; i++)
+ {
+ int b= src[i*3+0];
+ int g= src[i*3+1];
+ int r= src[i*3+2];
+
+ dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
+ }
#endif
- }
+}
- static inline void RENAME(bgr24ToUV)(uint8_t * dstU, uint8_t * dstV, uint8_t * src1, uint8_t * src2, long width)
- {
+static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
+{
#if HAVE_MMX
- asm volatile(
- "mov %4, %%"REG_a" \n\t"
- "movq "MANGLE(w1111)", %%mm5 \n\t"
- "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
- "pxor %%mm7, %%mm7 \n\t"
- "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b" \n\t"
- "add %%"REG_b", %%"REG_b" \n\t"
- ASMALIGN16
- "1: \n\t"
- PREFETCH" 64(%0, %%"REG_b") \n\t"
- PREFETCH" 64(%1, %%"REG_b") \n\t"
+ asm volatile(
+ "mov %4, %%"REG_a" \n\t"
+ "movq "MANGLE(w1111)", %%mm5 \n\t"
+ "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
+ "pxor %%mm7, %%mm7 \n\t"
+ "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b" \n\t"
+ "add %%"REG_b", %%"REG_b" \n\t"
+ ASMALIGN16
+ "1: \n\t"
+ PREFETCH" 64(%0, %%"REG_b") \n\t"
+ PREFETCH" 64(%1, %%"REG_b") \n\t"
#if HAVE_MMX2 || HAVE_AMD3DNOW
- "movq (%0, %%"REG_b"), %%mm0 \n\t"
- "movq (%1, %%"REG_b"), %%mm1 \n\t"
- "movq 6(%0, %%"REG_b"), %%mm2 \n\t"
- "movq 6(%1, %%"REG_b"), %%mm3 \n\t"
- PAVGB(%%mm1, %%mm0)
- PAVGB(%%mm3, %%mm2)
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "psrlq $24, %%mm0 \n\t"
- "psrlq $24, %%mm2 \n\t"
- PAVGB(%%mm1, %%mm0)
- PAVGB(%%mm3, %%mm2)
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
+ "movq (%0, %%"REG_b"), %%mm0 \n\t"
+ "movq (%1, %%"REG_b"), %%mm1 \n\t"
+ "movq 6(%0, %%"REG_b"), %%mm2 \n\t"
+ "movq 6(%1, %%"REG_b"), %%mm3 \n\t"
+ PAVGB(%%mm1, %%mm0)
+ PAVGB(%%mm3, %%mm2)
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "psrlq $24, %%mm0 \n\t"
+ "psrlq $24, %%mm2 \n\t"
+ PAVGB(%%mm1, %%mm0)
+ PAVGB(%%mm3, %%mm2)
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
#else
- "movd (%0, %%"REG_b"), %%mm0 \n\t"
- "movd (%1, %%"REG_b"), %%mm1 \n\t"
- "movd 3(%0, %%"REG_b"), %%mm2 \n\t"
- "movd 3(%1, %%"REG_b"), %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm3 \n\t"
- "paddw %%mm1, %%mm0 \n\t"
- "paddw %%mm3, %%mm2 \n\t"
- "paddw %%mm2, %%mm0 \n\t"
- "movd 6(%0, %%"REG_b"), %%mm4 \n\t"
- "movd 6(%1, %%"REG_b"), %%mm1 \n\t"
- "movd 9(%0, %%"REG_b"), %%mm2 \n\t"
- "movd 9(%1, %%"REG_b"), %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm3 \n\t"
- "paddw %%mm1, %%mm4 \n\t"
- "paddw %%mm3, %%mm2 \n\t"
- "paddw %%mm4, %%mm2 \n\t"
- "psrlw $2, %%mm0 \n\t"
- "psrlw $2, %%mm2 \n\t"
+ "movd (%0, %%"REG_b"), %%mm0 \n\t"
+ "movd (%1, %%"REG_b"), %%mm1 \n\t"
+ "movd 3(%0, %%"REG_b"), %%mm2 \n\t"
+ "movd 3(%1, %%"REG_b"), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "paddw %%mm3, %%mm2 \n\t"
+ "paddw %%mm2, %%mm0 \n\t"
+ "movd 6(%0, %%"REG_b"), %%mm4 \n\t"
+ "movd 6(%1, %%"REG_b"), %%mm1 \n\t"
+ "movd 9(%0, %%"REG_b"), %%mm2 \n\t"
+ "movd 9(%1, %%"REG_b"), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "paddw %%mm1, %%mm4 \n\t"
+ "paddw %%mm3, %%mm2 \n\t"
+ "paddw %%mm4, %%mm2 \n\t"
+ "psrlw $2, %%mm0 \n\t"
+ "psrlw $2, %%mm2 \n\t"
#endif
- "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
- "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
+ "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
+ "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
- "pmaddwd %%mm0, %%mm1 \n\t"
- "pmaddwd %%mm2, %%mm3 \n\t"
- "pmaddwd %%mm6, %%mm0 \n\t"
- "pmaddwd %%mm6, %%mm2 \n\t"
+ "pmaddwd %%mm0, %%mm1 \n\t"
+ "pmaddwd %%mm2, %%mm3 \n\t"
+ "pmaddwd %%mm6, %%mm0 \n\t"
+ "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
- "psrad $8, %%mm0 \n\t"
- "psrad $8, %%mm1 \n\t"
- "psrad $8, %%mm2 \n\t"
- "psrad $8, %%mm3 \n\t"
+ "psrad $8, %%mm0 \n\t"
+ "psrad $8, %%mm1 \n\t"
+ "psrad $8, %%mm2 \n\t"
+ "psrad $8, %%mm3 \n\t"
#endif
- "packssdw %%mm2, %%mm0 \n\t"
- "packssdw %%mm3, %%mm1 \n\t"
- "pmaddwd %%mm5, %%mm0 \n\t"
- "pmaddwd %%mm5, %%mm1 \n\t"
- "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
- "psraw $7, %%mm0 \n\t"
+ "packssdw %%mm2, %%mm0 \n\t"
+ "packssdw %%mm3, %%mm1 \n\t"
+ "pmaddwd %%mm5, %%mm0 \n\t"
+ "pmaddwd %%mm5, %%mm1 \n\t"
+ "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
+ "psraw $7, %%mm0 \n\t"
#if HAVE_MMX2 || HAVE_AMD3DNOW
- "movq 12(%0, %%"REG_b"), %%mm4 \n\t"
- "movq 12(%1, %%"REG_b"), %%mm1 \n\t"
- "movq 18(%0, %%"REG_b"), %%mm2 \n\t"
- "movq 18(%1, %%"REG_b"), %%mm3 \n\t"
- PAVGB(%%mm1, %%mm4)
- PAVGB(%%mm3, %%mm2)
- "movq %%mm4, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "psrlq $24, %%mm4 \n\t"
- "psrlq $24, %%mm2 \n\t"
- PAVGB(%%mm1, %%mm4)
- PAVGB(%%mm3, %%mm2)
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
+ "movq 12(%0, %%"REG_b"), %%mm4 \n\t"
+ "movq 12(%1, %%"REG_b"), %%mm1 \n\t"
+ "movq 18(%0, %%"REG_b"), %%mm2 \n\t"
+ "movq 18(%1, %%"REG_b"), %%mm3 \n\t"
+ PAVGB(%%mm1, %%mm4)
+ PAVGB(%%mm3, %%mm2)
+ "movq %%mm4, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "psrlq $24, %%mm4 \n\t"
+ "psrlq $24, %%mm2 \n\t"
+ PAVGB(%%mm1, %%mm4)
+ PAVGB(%%mm3, %%mm2)
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
#else
- "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
- "movd 12(%1, %%"REG_b"), %%mm1 \n\t"
- "movd 15(%0, %%"REG_b"), %%mm2 \n\t"
- "movd 15(%1, %%"REG_b"), %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm3 \n\t"
- "paddw %%mm1, %%mm4 \n\t"
- "paddw %%mm3, %%mm2 \n\t"
- "paddw %%mm2, %%mm4 \n\t"
- "movd 18(%0, %%"REG_b"), %%mm5 \n\t"
- "movd 18(%1, %%"REG_b"), %%mm1 \n\t"
- "movd 21(%0, %%"REG_b"), %%mm2 \n\t"
- "movd 21(%1, %%"REG_b"), %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm1 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm3 \n\t"
- "paddw %%mm1, %%mm5 \n\t"
- "paddw %%mm3, %%mm2 \n\t"
- "paddw %%mm5, %%mm2 \n\t"
- "movq "MANGLE(w1111)", %%mm5 \n\t"
- "psrlw $2, %%mm4 \n\t"
- "psrlw $2, %%mm2 \n\t"
+ "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
+ "movd 12(%1, %%"REG_b"), %%mm1 \n\t"
+ "movd 15(%0, %%"REG_b"), %%mm2 \n\t"
+ "movd 15(%1, %%"REG_b"), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "paddw %%mm1, %%mm4 \n\t"
+ "paddw %%mm3, %%mm2 \n\t"
+ "paddw %%mm2, %%mm4 \n\t"
+ "movd 18(%0, %%"REG_b"), %%mm5 \n\t"
+ "movd 18(%1, %%"REG_b"), %%mm1 \n\t"
+ "movd 21(%0, %%"REG_b"), %%mm2 \n\t"
+ "movd 21(%1, %%"REG_b"), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "paddw %%mm1, %%mm5 \n\t"
+ "paddw %%mm3, %%mm2 \n\t"
+ "paddw %%mm5, %%mm2 \n\t"
+ "movq "MANGLE(w1111)", %%mm5 \n\t"
+ "psrlw $2, %%mm4 \n\t"
+ "psrlw $2, %%mm2 \n\t"
#endif
- "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
- "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
+ "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
+ "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
- "pmaddwd %%mm4, %%mm1 \n\t"
- "pmaddwd %%mm2, %%mm3 \n\t"
- "pmaddwd %%mm6, %%mm4 \n\t"
- "pmaddwd %%mm6, %%mm2 \n\t"
+ "pmaddwd %%mm4, %%mm1 \n\t"
+ "pmaddwd %%mm2, %%mm3 \n\t"
+ "pmaddwd %%mm6, %%mm4 \n\t"
+ "pmaddwd %%mm6, %%mm2 \n\t"
#ifndef FAST_BGR2YV12
- "psrad $8, %%mm4 \n\t"
- "psrad $8, %%mm1 \n\t"
- "psrad $8, %%mm2 \n\t"
- "psrad $8, %%mm3 \n\t"
+ "psrad $8, %%mm4 \n\t"
+ "psrad $8, %%mm1 \n\t"
+ "psrad $8, %%mm2 \n\t"
+ "psrad $8, %%mm3 \n\t"
#endif
- "packssdw %%mm2, %%mm4 \n\t"
- "packssdw %%mm3, %%mm1 \n\t"
- "pmaddwd %%mm5, %%mm4 \n\t"
- "pmaddwd %%mm5, %%mm1 \n\t"
- "add $24, %%"REG_b" \n\t"
- "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
- "psraw $7, %%mm4 \n\t"
-
- "movq %%mm0, %%mm1 \n\t"
- "punpckldq %%mm4, %%mm0 \n\t"
- "punpckhdq %%mm4, %%mm1 \n\t"
- "packsswb %%mm1, %%mm0 \n\t"
- "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
-
- "movd %%mm0, (%2, %%"REG_a") \n\t"
- "punpckhdq %%mm0, %%mm0 \n\t"
- "movd %%mm0, (%3, %%"REG_a") \n\t"
- "add $4, %%"REG_a" \n\t"
- " js 1b \n\t"
- : : "r"(src1+width*6), "r"(src2+width*6), "r"(dstU+width), "r"(dstV+width), "g"((stride_t)-width)
- : "%"REG_a, "%"REG_b
- );
+ "packssdw %%mm2, %%mm4 \n\t"
+ "packssdw %%mm3, %%mm1 \n\t"
+ "pmaddwd %%mm5, %%mm4 \n\t"
+ "pmaddwd %%mm5, %%mm1 \n\t"
+ "add $24, %%"REG_b" \n\t"
+ "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
+ "psraw $7, %%mm4 \n\t"
+
+ "movq %%mm0, %%mm1 \n\t"
+ "punpckldq %%mm4, %%mm0 \n\t"
+ "punpckhdq %%mm4, %%mm1 \n\t"
+ "packsswb %%mm1, %%mm0 \n\t"
+ "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
+
+ "movd %%mm0, (%2, %%"REG_a") \n\t"
+ "punpckhdq %%mm0, %%mm0 \n\t"
+ "movd %%mm0, (%3, %%"REG_a") \n\t"
+ "add $4, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" ((stride_t)-width)
+ : "%"REG_a, "%"REG_b
+ );
#else
- int i;
- for(i = 0; i < width; i++)
- {
- int b = src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
- int g = src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
- int r = src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
-
- dstU[i] = ((RU * r + GU * g + BU * b) >> (RGB2YUV_SHIFT + 2)) + 128;
- dstV[i] = ((RV * r + GV * g + BV * b) >> (RGB2YUV_SHIFT + 2)) + 128;
- }
+ int i;
+ for(i=0; i<width; i++)
+ {
+ int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
+ int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
+ int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
+
+ dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
+ dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
+ }
#endif
- }
-
- static inline void RENAME(bgr16ToY)(uint8_t * dst, uint8_t * src, int width)
- {
- int i;
- for(i = 0; i < width; i++)
- {
- int d = ((uint16_t*)src)[i];
- int b = d & 0x1F;
- int g = (d >> 5) & 0x3F;
- int r = (d >> 11) & 0x1F;
-
- dst[i] = ((2 * RY * r + GY * g + 2 * BY * b) >> (RGB2YUV_SHIFT - 2)) + 16;
- }
- }
-
- static inline void RENAME(bgr16ToUV)(uint8_t * dstU, uint8_t * dstV, uint8_t * src1, uint8_t * src2, int width)
- {
- int i;
- for(i = 0; i < width; i++)
- {
- int d0 = ((uint32_t*)src1)[i];
- int d1 = ((uint32_t*)src2)[i];
-
- int dl = (d0 & 0x07E0F81F) + (d1 & 0x07E0F81F);
- int dh = ((d0 >> 5) & 0x07C0F83F) + ((d1 >> 5) & 0x07C0F83F);
-
- int dh2 = (dh >> 11) + (dh << 21);
- int d = dh2 + dl;
-
- int b = d & 0x7F;
- int r = (d >> 11) & 0x7F;
- int g = d >> 21;
- dstU[i] = ((2 * RU * r + GU * g + 2 * BU * b) >> (RGB2YUV_SHIFT + 2 - 2)) + 128;
- dstV[i] = ((2 * RV * r + GV * g + 2 * BV * b) >> (RGB2YUV_SHIFT + 2 - 2)) + 128;
- }
- }
-
- static inline void RENAME(bgr15ToY)(uint8_t * dst, uint8_t * src, int width)
- {
- int i;
- for(i = 0; i < width; i++)
- {
- int d = ((uint16_t*)src)[i];
- int b = d & 0x1F;
- int g = (d >> 5) & 0x1F;
- int r = (d >> 10) & 0x1F;
-
- dst[i] = ((RY * r + GY * g + BY * b) >> (RGB2YUV_SHIFT - 3)) + 16;
- }
- }
-
- static inline void RENAME(bgr15ToUV)(uint8_t * dstU, uint8_t * dstV, uint8_t * src1, uint8_t * src2, int width)
- {
- int i;
- for(i = 0; i < width; i++)
- {
- int d0 = ((uint32_t*)src1)[i];
- int d1 = ((uint32_t*)src2)[i];
-
- int dl = (d0 & 0x03E07C1F) + (d1 & 0x03E07C1F);
- int dh = ((d0 >> 5) & 0x03E0F81F) + ((d1 >> 5) & 0x03E0F81F);
-
- int dh2 = (dh >> 11) + (dh << 21);
- int d = dh2 + dl;
-
- int b = d & 0x7F;
- int r = (d >> 10) & 0x7F;
- int g = d >> 21;
- dstU[i] = ((RU * r + GU * g + BU * b) >> (RGB2YUV_SHIFT + 2 - 3)) + 128;
- dstV[i] = ((RV * r + GV * g + BV * b) >> (RGB2YUV_SHIFT + 2 - 3)) + 128;
- }
- }
-
-
- static inline void RENAME(rgb32ToY)(uint8_t * dst, uint8_t * src, int width)
- {
- int i;
- for(i = 0; i < width; i++)
- {
- int r = ((uint32_t*)src)[i] & 0xFF;
- int g = (((uint32_t*)src)[i] >> 8) & 0xFF;
- int b = (((uint32_t*)src)[i] >> 16) & 0xFF;
-
- dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
- }
- }
-
- static inline void RENAME(rgb32ToUV)(uint8_t * dstU, uint8_t * dstV, uint8_t * src1, uint8_t * src2, int width)
- {
- int i;
- for(i = 0; i < width; i++)
- {
- const int a = ((uint32_t*)src1)[2*i+0];
- const int e = ((uint32_t*)src1)[2*i+1];
- const int c = ((uint32_t*)src2)[2*i+0];
- const int d = ((uint32_t*)src2)[2*i+1];
- const int l = (a & 0xFF00FF) + (e & 0xFF00FF) + (c & 0xFF00FF) + (d & 0xFF00FF);
- const int h = (a & 0x00FF00) + (e & 0x00FF00) + (c & 0x00FF00) + (d & 0x00FF00);
- const int r = l & 0x3FF;
- const int g = h >> 8;
- const int b = l >> 16;
-
- dstU[i] = ((RU * r + GU * g + BU * b) >> (RGB2YUV_SHIFT + 2)) + 128;
- dstV[i] = ((RV * r + GV * g + BV * b) >> (RGB2YUV_SHIFT + 2)) + 128;
- }
- }
-
- static inline void RENAME(rgb24ToY)(uint8_t * dst, uint8_t * src, int width)
- {
- int i;
- for(i = 0; i < width; i++)
- {
- int r = src[i*3+0];
- int g = src[i*3+1];
- int b = src[i*3+2];
-
- dst[i] = ((RY * r + GY * g + BY * b + (33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
- }
- }
-
- static inline void RENAME(rgb24ToUV)(uint8_t * dstU, uint8_t * dstV, uint8_t * src1, uint8_t * src2, int width)
- {
- int i;
- for(i = 0; i < width; i++)
- {
- int r = src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
- int g = src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
- int b = src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
+}
- dstU[i] = ((RU * r + GU * g + BU * b) >> (RGB2YUV_SHIFT + 2)) + 128;
- dstV[i] = ((RV * r + GV * g + BV * b) >> (RGB2YUV_SHIFT + 2)) + 128;
- }
- }
+static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
+{
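+    /* RGB565 luma: the reduced shift (RGB2YUV_SHIFT-2) and the doubled RY/BY weights roughly
+       rescale the 6-bit G (x4) and the 5-bit R/B (x8) to 8-bit range. */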
+ int i;
+ for(i=0; i<width; i++)
+ {
+ int d= ((uint16_t*)src)[i];
+ int b= d&0x1F;
+ int g= (d>>5)&0x3F;
+ int r= (d>>11)&0x1F;
+
+ dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
+ }
+}
- static inline void RENAME(rgb16ToY)(uint8_t * dst, uint8_t * src, int width)
- {
- int i;
- for(i = 0; i < width; i++)
- {
- int d = ((uint16_t*)src)[i];
- int b = d & 0x1F;
- int g = (d >> 5) & 0x3F;
- int r = (d >> 11) & 0x1F;
+static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
+{
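+    /* Sum a 2x2 block of RGB565 pixels entirely in packed 32-bit arithmetic; the masks leave
+       enough headroom that the per-channel sums never carry into a neighbouring field. */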
+ int i;
+ for(i=0; i<width; i++)
+ {
+ int d0= ((uint32_t*)src1)[i];
+ int d1= ((uint32_t*)src2)[i];
+
+ int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
+ int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
+
+ int dh2= (dh>>11) + (dh<<21);
+ int d= dh2 + dl;
+
+ int b= d&0x7F;
+ int r= (d>>11)&0x7F;
+ int g= d>>21;
+ dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
+ dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
+ }
+}
- dst[i] = ((2 * RY * r + GY * g + 2 * BY * b) >> (RGB2YUV_SHIFT - 2)) + 16;
- }
- }
+static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
+{
+ int i;
+ for(i=0; i<width; i++)
+ {
+ int d= ((uint16_t*)src)[i];
+ int b= d&0x1F;
+ int g= (d>>5)&0x1F;
+ int r= (d>>10)&0x1F;
+
+ dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
+ }
+}
- static inline void RENAME(rgb16ToUV)(uint8_t * dstU, uint8_t * dstV, uint8_t * src1, uint8_t * src2, int width)
- {
- int i;
- for(i = 0; i < width; i++)
- {
- int d0 = ((uint32_t*)src1)[i];
- int d1 = ((uint32_t*)src2)[i];
+static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
+{
+ int i;
+ for(i=0; i<width; i++)
+ {
+ int d0= ((uint32_t*)src1)[i];
+ int d1= ((uint32_t*)src2)[i];
+
+ int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
+ int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
+
+ int dh2= (dh>>11) + (dh<<21);
+ int d= dh2 + dl;
+
+ int b= d&0x7F;
+ int r= (d>>10)&0x7F;
+ int g= d>>21;
+ dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
+ dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
+ }
+}
- int dl = (d0 & 0x07E0F81F) + (d1 & 0x07E0F81F);
- int dh = ((d0 >> 5) & 0x07C0F83F) + ((d1 >> 5) & 0x07C0F83F);
- int dh2 = (dh >> 11) + (dh << 21);
- int d = dh2 + dl;
+static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
+{
+ int i;
+ for(i=0; i<width; i++)
+ {
+ int r= ((uint32_t*)src)[i]&0xFF;
+ int g= (((uint32_t*)src)[i]>>8)&0xFF;
+ int b= (((uint32_t*)src)[i]>>16)&0xFF;
+
+ dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
+ }
+}
- int b = d & 0x7F;
- int r = (d >> 11) & 0x7F;
- int g = d >> 21;
- dstV[i] = ((2 * RU * r + GU * g + 2 * BU * b) >> (RGB2YUV_SHIFT + 2 - 2)) + 128;
- dstU[i] = ((2 * RV * r + GV * g + 2 * BV * b) >> (RGB2YUV_SHIFT + 2 - 2)) + 128;
- }
- }
+static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
+{
+ int i;
+ for(i=0; i<width; i++)
+ {
+ const int a= ((uint32_t*)src1)[2*i+0];
+ const int e= ((uint32_t*)src1)[2*i+1];
+ const int c= ((uint32_t*)src2)[2*i+0];
+ const int d= ((uint32_t*)src2)[2*i+1];
+ const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
+ const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
+ const int r= l&0x3FF;
+ const int g= h>>8;
+ const int b= l>>16;
+
+ dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
+ dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
+ }
+}
- static inline void RENAME(rgb15ToY)(uint8_t * dst, uint8_t * src, int width)
- {
- int i;
- for(i = 0; i < width; i++)
- {
- int d = ((uint16_t*)src)[i];
- int b = d & 0x1F;
- int g = (d >> 5) & 0x1F;
- int r = (d >> 10) & 0x1F;
+static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
+{
+ int i;
+ for(i=0; i<width; i++)
+ {
+ int r= src[i*3+0];
+ int g= src[i*3+1];
+ int b= src[i*3+2];
+
+ dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
+ }
+}
- dst[i] = ((RY * r + GY * g + BY * b) >> (RGB2YUV_SHIFT - 3)) + 16;
- }
- }
+static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
+{
+ int i;
+ for(i=0; i<width; i++)
+ {
+ int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
+ int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
+ int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
+
+ dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
+ dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
+ }
+}
- static inline void RENAME(rgb15ToUV)(uint8_t * dstU, uint8_t * dstV, uint8_t * src1, uint8_t * src2, int width)
- {
- int i;
- for(i = 0; i < width; i++)
- {
- int d0 = ((uint32_t*)src1)[i];
- int d1 = ((uint32_t*)src2)[i];
+static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, int width)
+{
+ int i;
+ for(i=0; i<width; i++)
+ {
+ int d= ((uint16_t*)src)[i];
+ int b= d&0x1F;
+ int g= (d>>5)&0x3F;
+ int r= (d>>11)&0x1F;
+
+ dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
+ }
+}
- int dl = (d0 & 0x03E07C1F) + (d1 & 0x03E07C1F);
- int dh = ((d0 >> 5) & 0x03E0F81F) + ((d1 >> 5) & 0x03E0F81F);
+static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
+{
+ int i;
+ for(i=0; i<width; i++)
+ {
+ int d0= ((uint32_t*)src1)[i];
+ int d1= ((uint32_t*)src2)[i];
+
+ int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
+ int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
+
+ int dh2= (dh>>11) + (dh<<21);
+ int d= dh2 + dl;
+
+ int b= d&0x7F;
+ int r= (d>>11)&0x7F;
+ int g= d>>21;
+ dstV[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
+ dstU[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
+ }
+}
- int dh2 = (dh >> 11) + (dh << 21);
- int d = dh2 + dl;
+static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, int width)
+{
+ int i;
+ for(i=0; i<width; i++)
+ {
+ int d= ((uint16_t*)src)[i];
+ int b= d&0x1F;
+ int g= (d>>5)&0x1F;
+ int r= (d>>10)&0x1F;
+
+ dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
+ }
+}
- int b = d & 0x7F;
- int r = (d >> 10) & 0x7F;
- int g = d >> 21;
- dstV[i] = ((RU * r + GU * g + BU * b) >> (RGB2YUV_SHIFT + 2 - 3)) + 128;
- dstU[i] = ((RV * r + GV * g + BV * b) >> (RGB2YUV_SHIFT + 2 - 3)) + 128;
- }
- }
+static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
+{
+ int i;
+ for(i=0; i<width; i++)
+ {
+ int d0= ((uint32_t*)src1)[i];
+ int d1= ((uint32_t*)src2)[i];
+
+ int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
+ int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
+
+ int dh2= (dh>>11) + (dh<<21);
+ int d= dh2 + dl;
+
+ int b= d&0x7F;
+ int r= (d>>10)&0x7F;
+ int g= d>>21;
+ dstV[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
+ dstU[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
+ }
+}
- static inline void RENAME(nv12ToUV)(uint8_t * dstU, uint8_t * dstV, uint8_t * src1, uint8_t * src2, int width)
- {
- int i;
- for(i = 0; i < width; i++)
- {
- dstU[i] = src1[i<<1];
- dstV[i] = src1[(i<<1)+1];
- }
- }
+static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
+{
+ int i;
+ for(i=0; i<width; i++)
+ {
+ dstU[i]= src1[i<<1];
+ dstV[i]= src1[(i<<1)+1];
+ }
+}
- static inline void RENAME(nv21ToUV)(uint8_t * dstU, uint8_t * dstV, uint8_t * src1, uint8_t * src2, int width)
- {
- int i;
- for(i = 0; i < width; i++)
- {
- dstV[i] = src1[i<<1];
- dstU[i] = src1[(i<<1)+1];
- }
- }
+static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
+{
+ int i;
+ for(i=0; i<width; i++)
+ {
+ dstV[i]= src1[i<<1];
+ dstU[i]= src1[(i<<1)+1];
+ }
+}
// Bilinear / Bicubic scaling
- static inline void RENAME(hScale)(int16_t * dst, int dstW, uint8_t * src, int srcW, int xInc,
- int16_t * filter, int16_t * filterPos, long filterSize0)
- {
- const stride_t filterSize = filterSize0;
+static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
+ int16_t *filter, int16_t *filterPos, long filterSize0)
+{
+ const stride_t filterSize = filterSize0;
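+    /* Generic horizontal FIR scaler:
+       dst[i] = clip((sum over j of src[filterPos[i]+j] * filter[filterSize*i+j]) >> 7, 0, (1<<15)-1). */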
#if HAVE_MMX
- assert(filterSize % 4 == 0 && filterSize > 0);
- if(filterSize == 4) // allways true for upscaling, sometimes for down too
- {
- stride_t counter = -2 * dstW;
- filter -= counter * 2;
- filterPos -= counter / 2;
- dst -= counter / 2;
- asm volatile(
- "pxor %%mm7, %%mm7 \n\t"
- "movq "MANGLE(w02)", %%mm6 \n\t"
- "push %%"REG_BP" \n\t" // we use 7 regs here ...
- "mov %%"REG_a", %%"REG_BP" \n\t"
- ASMALIGN16
- "1: \n\t"
- "movzwl (%2, %%"REG_BP"), %%eax \n\t"
- "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
- "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
- "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
- "movd (%3, %%"REG_a"), %%mm0 \n\t"
- "movd (%3, %%"REG_b"), %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "pmaddwd %%mm1, %%mm0 \n\t"
- "pmaddwd %%mm2, %%mm3 \n\t"
- "psrad $8, %%mm0 \n\t"
- "psrad $8, %%mm3 \n\t"
- "packssdw %%mm3, %%mm0 \n\t"
- "pmaddwd %%mm6, %%mm0 \n\t"
- "packssdw %%mm0, %%mm0 \n\t"
- "movd %%mm0, (%4, %%"REG_BP") \n\t"
- "add $4, %%"REG_BP" \n\t"
- " jnc 1b \n\t"
-
- "pop %%"REG_BP" \n\t"
- : "+a"(counter)
- : "c"(filter), "d"(filterPos), "S"(src), "D"(dst)
- : "%"REG_b
- );
- }
- else if(filterSize == 8)
- {
- stride_t counter = -2 * dstW;
- filter -= counter * 4;
- filterPos -= counter / 2;
- dst -= counter / 2;
- asm volatile(
- "pxor %%mm7, %%mm7 \n\t"
- "movq "MANGLE(w02)", %%mm6 \n\t"
- "push %%"REG_BP" \n\t" // we use 7 regs here ...
- "mov %%"REG_a", %%"REG_BP" \n\t"
- ASMALIGN16
- "1: \n\t"
- "movzwl (%2, %%"REG_BP"), %%eax \n\t"
- "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
- "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
- "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
- "movd (%3, %%"REG_a"), %%mm0 \n\t"
- "movd (%3, %%"REG_b"), %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "pmaddwd %%mm1, %%mm0 \n\t"
- "pmaddwd %%mm2, %%mm3 \n\t"
-
- "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
- "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
- "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
- "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "pmaddwd %%mm1, %%mm4 \n\t"
- "pmaddwd %%mm2, %%mm5 \n\t"
- "paddd %%mm4, %%mm0 \n\t"
- "paddd %%mm5, %%mm3 \n\t"
-
- "psrad $8, %%mm0 \n\t"
- "psrad $8, %%mm3 \n\t"
- "packssdw %%mm3, %%mm0 \n\t"
- "pmaddwd %%mm6, %%mm0 \n\t"
- "packssdw %%mm0, %%mm0 \n\t"
- "movd %%mm0, (%4, %%"REG_BP") \n\t"
- "add $4, %%"REG_BP" \n\t"
- " jnc 1b \n\t"
-
- "pop %%"REG_BP" \n\t"
- : "+a"(counter)
- : "c"(filter), "d"(filterPos), "S"(src), "D"(dst)
- : "%"REG_b
- );
- }
- else
- {
- uint8_t *offset = src + filterSize;
- stride_t counter = -2 * dstW;
+ assert(filterSize % 4 == 0 && filterSize>0);
+    if(filterSize==4) // always true for upscaling, sometimes for downscaling too
+ {
+ stride_t counter= -2*dstW;
+ filter-= counter*2;
+ filterPos-= counter/2;
+ dst-= counter/2;
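+        /* The pointers are biased by the output width so the counter can run from -2*dstW up
+           to 0; "jnc" keeps looping until the add carries past zero. */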
+ asm volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "movq "MANGLE(w02)", %%mm6 \n\t"
+ "push %%"REG_BP" \n\t" // we use 7 regs here ...
+ "mov %%"REG_a", %%"REG_BP" \n\t"
+ ASMALIGN16
+ "1: \n\t"
+ "movzwl (%2, %%"REG_BP"), %%eax \n\t"
+ "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
+ "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
+ "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
+ "movd (%3, %%"REG_a"), %%mm0 \n\t"
+ "movd (%3, %%"REG_b"), %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "pmaddwd %%mm1, %%mm0 \n\t"
+ "pmaddwd %%mm2, %%mm3 \n\t"
+ "psrad $8, %%mm0 \n\t"
+ "psrad $8, %%mm3 \n\t"
+ "packssdw %%mm3, %%mm0 \n\t"
+ "pmaddwd %%mm6, %%mm0 \n\t"
+ "packssdw %%mm0, %%mm0 \n\t"
+ "movd %%mm0, (%4, %%"REG_BP") \n\t"
+ "add $4, %%"REG_BP" \n\t"
+ " jnc 1b \n\t"
+
+ "pop %%"REG_BP" \n\t"
+ : "+a" (counter)
+ : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
+ : "%"REG_b
+ );
+ }
+ else if(filterSize==8)
+ {
+ stride_t counter= -2*dstW;
+ filter-= counter*4;
+ filterPos-= counter/2;
+ dst-= counter/2;
+ asm volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "movq "MANGLE(w02)", %%mm6 \n\t"
+ "push %%"REG_BP" \n\t" // we use 7 regs here ...
+ "mov %%"REG_a", %%"REG_BP" \n\t"
+ ASMALIGN16
+ "1: \n\t"
+ "movzwl (%2, %%"REG_BP"), %%eax \n\t"
+ "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
+ "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
+ "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
+ "movd (%3, %%"REG_a"), %%mm0 \n\t"
+ "movd (%3, %%"REG_b"), %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "pmaddwd %%mm1, %%mm0 \n\t"
+ "pmaddwd %%mm2, %%mm3 \n\t"
+
+ "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
+ "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
+ "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
+ "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "pmaddwd %%mm1, %%mm4 \n\t"
+ "pmaddwd %%mm2, %%mm5 \n\t"
+ "paddd %%mm4, %%mm0 \n\t"
+ "paddd %%mm5, %%mm3 \n\t"
+
+ "psrad $8, %%mm0 \n\t"
+ "psrad $8, %%mm3 \n\t"
+ "packssdw %%mm3, %%mm0 \n\t"
+ "pmaddwd %%mm6, %%mm0 \n\t"
+ "packssdw %%mm0, %%mm0 \n\t"
+ "movd %%mm0, (%4, %%"REG_BP") \n\t"
+ "add $4, %%"REG_BP" \n\t"
+ " jnc 1b \n\t"
+
+ "pop %%"REG_BP" \n\t"
+ : "+a" (counter)
+ : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
+ : "%"REG_b
+ );
+ }
+ else
+ {
+ uint8_t *offset = src+filterSize;
+ stride_t counter= -2*dstW;
// filter-= counter*filterSize/2;
- filterPos -= counter / 2;
- dst -= counter / 2;
- asm volatile(
- "pxor %%mm7, %%mm7 \n\t"
- "movq "MANGLE(w02)", %%mm6 \n\t"
- ASMALIGN16
- "1: \n\t"
- "mov %2, %%"REG_c" \n\t"
- "movzwl (%%"REG_c", %0), %%eax \n\t"
- "movzwl 2(%%"REG_c", %0), %%ebx \n\t"
- "mov %5, %%"REG_c" \n\t"
- "pxor %%mm4, %%mm4 \n\t"
- "pxor %%mm5, %%mm5 \n\t"
- "2: \n\t"
- "movq (%1), %%mm1 \n\t"
- "movq (%1, %6), %%mm3 \n\t"
- "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
- "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "pmaddwd %%mm1, %%mm0 \n\t"
- "pmaddwd %%mm2, %%mm3 \n\t"
- "paddd %%mm3, %%mm5 \n\t"
- "paddd %%mm0, %%mm4 \n\t"
- "add $8, %1 \n\t"
- "add $4, %%"REG_c" \n\t"
- "cmp %4, %%"REG_c" \n\t"
- " jb 2b \n\t"
- "add %6, %1 \n\t"
- "psrad $8, %%mm4 \n\t"
- "psrad $8, %%mm5 \n\t"
- "packssdw %%mm5, %%mm4 \n\t"
- "pmaddwd %%mm6, %%mm4 \n\t"
- "packssdw %%mm4, %%mm4 \n\t"
- "mov %3, %%"REG_a" \n\t"
- "movd %%mm4, (%%"REG_a", %0) \n\t"
- "add $4, %0 \n\t"
- " jnc 1b \n\t"
-
- : "+r"(counter), "+r"(filter)
- : "m"(filterPos), "m"(dst), "m"(offset),
- "m"(src), "r"(filterSize*2)
- : "%"REG_b, "%"REG_a, "%"REG_c
- );
- }
+ filterPos-= counter/2;
+ dst-= counter/2;
+ asm volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "movq "MANGLE(w02)", %%mm6 \n\t"
+ ASMALIGN16
+ "1: \n\t"
+ "mov %2, %%"REG_c" \n\t"
+ "movzwl (%%"REG_c", %0), %%eax \n\t"
+ "movzwl 2(%%"REG_c", %0), %%ebx \n\t"
+ "mov %5, %%"REG_c" \n\t"
+ "pxor %%mm4, %%mm4 \n\t"
+ "pxor %%mm5, %%mm5 \n\t"
+ "2: \n\t"
+ "movq (%1), %%mm1 \n\t"
+ "movq (%1, %6), %%mm3 \n\t"
+ "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
+ "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "pmaddwd %%mm1, %%mm0 \n\t"
+ "pmaddwd %%mm2, %%mm3 \n\t"
+ "paddd %%mm3, %%mm5 \n\t"
+ "paddd %%mm0, %%mm4 \n\t"
+ "add $8, %1 \n\t"
+ "add $4, %%"REG_c" \n\t"
+ "cmp %4, %%"REG_c" \n\t"
+ " jb 2b \n\t"
+ "add %6, %1 \n\t"
+ "psrad $8, %%mm4 \n\t"
+ "psrad $8, %%mm5 \n\t"
+ "packssdw %%mm5, %%mm4 \n\t"
+ "pmaddwd %%mm6, %%mm4 \n\t"
+ "packssdw %%mm4, %%mm4 \n\t"
+ "mov %3, %%"REG_a" \n\t"
+ "movd %%mm4, (%%"REG_a", %0) \n\t"
+ "add $4, %0 \n\t"
+ " jnc 1b \n\t"
+
+ : "+r" (counter), "+r" (filter)
+ : "m" (filterPos), "m" (dst), "m"(offset),
+ "m" (src), "r" (filterSize*2)
+ : "%"REG_b, "%"REG_a, "%"REG_c
+ );
+ }
#else
#ifdef HAVE_ALTIVEC
- hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
+ hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
- int i;
- for(i = 0; i < dstW; i++)
- {
- int j;
- int srcPos = filterPos[i];
- int val = 0;
+ int i;
+ for(i=0; i<dstW; i++)
+ {
+ int j;
+ int srcPos= filterPos[i];
+ int val=0;
// printf("filterPos: %d\n", filterPos[i]);
- for(j = 0; j < filterSize; j++)
- {
+ for(j=0; j<filterSize; j++)
+ {
// printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
- val += ((int)src[srcPos + j]) * filter[filterSize*i + j];
- }
+ val += ((int)src[srcPos + j])*filter[filterSize*i + j];
+ }
// filter += hFilterSize;
- dst[i] = FFMIN(FFMAX(0, val >> 7), (1 << 15) - 1); // the cubic equation does overflow ...
+ dst[i] = FFMIN(FFMAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
// dst[i] = val>>7;
- }
+ }
#endif
#endif
- }
- // *** horizontal scale Y line to temp buffer
- static inline void RENAME(hyscale)(uint16_t * dst, long dstWidth, uint8_t * src, int srcW, int xInc,
- int flags, int canMMX2BeUsed, int16_t * hLumFilter,
- int16_t * hLumFilterPos, int hLumFilterSize, void * funnyYCode,
- int srcFormat, uint8_t * formatConvBuffer, int16_t * mmx2Filter,
- int32_t * mmx2FilterPos)
- {
- if(srcFormat == IMGFMT_YUY2 || srcFormat == IMGFMT_YVYU)
- {
- RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
- src = formatConvBuffer;
- }
- else if(srcFormat == IMGFMT_UYVY || srcFormat == IMGFMT_VYUY)
- {
- RENAME(uyvyToY)(formatConvBuffer, src, srcW);
- src = formatConvBuffer;
- }
- else if(srcFormat == IMGFMT_BGR32)
- {
- RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
- src = formatConvBuffer;
- }
- else if(srcFormat == IMGFMT_BGR24)
- {
- RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
- src = formatConvBuffer;
- }
- else if(srcFormat == IMGFMT_BGR16)
- {
- RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
- src = formatConvBuffer;
- }
- else if(srcFormat == IMGFMT_BGR15)
- {
- RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
- src = formatConvBuffer;
- }
- else if(srcFormat == IMGFMT_RGB32)
- {
- RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
- src = formatConvBuffer;
- }
- else if(srcFormat == IMGFMT_RGB24)
- {
- RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
- src = formatConvBuffer;
- }
- else if(srcFormat == IMGFMT_RGB16)
- {
- RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
- src = formatConvBuffer;
- }
- else if(srcFormat == IMGFMT_RGB15)
- {
- RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
- src = formatConvBuffer;
- }
+}
+ // *** horizontal scale Y line to temp buffer
+static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
+ int flags, int canMMX2BeUsed, int16_t *hLumFilter,
+ int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
+ int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
+ int32_t *mmx2FilterPos)
+{
+ if(srcFormat==IMGFMT_YUY2 || srcFormat==IMGFMT_YVYU)
+ {
+ RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
+ src= formatConvBuffer;
+ }
+ else if(srcFormat==IMGFMT_UYVY || srcFormat==IMGFMT_VYUY)
+ {
+ RENAME(uyvyToY)(formatConvBuffer, src, srcW);
+ src= formatConvBuffer;
+ }
+ else if(srcFormat==IMGFMT_BGR32)
+ {
+ RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
+ src= formatConvBuffer;
+ }
+ else if(srcFormat==IMGFMT_BGR24)
+ {
+ RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
+ src= formatConvBuffer;
+ }
+ else if(srcFormat==IMGFMT_BGR16)
+ {
+ RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
+ src= formatConvBuffer;
+ }
+ else if(srcFormat==IMGFMT_BGR15)
+ {
+ RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
+ src= formatConvBuffer;
+ }
+ else if(srcFormat==IMGFMT_RGB32)
+ {
+ RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
+ src= formatConvBuffer;
+ }
+ else if(srcFormat==IMGFMT_RGB24)
+ {
+ RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
+ src= formatConvBuffer;
+ }
+ else if(srcFormat==IMGFMT_RGB16)
+ {
+ RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
+ src= formatConvBuffer;
+ }
+ else if(srcFormat==IMGFMT_RGB15)
+ {
+ RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
+ src= formatConvBuffer;
+ }
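+    /* From here on src is a plain 8-bit luma line: every packed-YUV or RGB input was converted
+       into formatConvBuffer above. */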
#if HAVE_MMX
- // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
- if(!(flags & SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
+    // use the new MMX scaler if the mmx2 one can't be used (it's faster than the x86 asm one)
+ if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
- if(!(flags & SWS_FAST_BILINEAR))
+ if(!(flags&SWS_FAST_BILINEAR))
#endif
- {
- RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
- }
- else // Fast Bilinear upscale / crap downscale
- {
+ {
+ RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
+ }
+ else // Fast Bilinear upscale / crap downscale
+ {
#if ARCH_X86_32 || ARCH_X86_64
#if HAVE_MMX2
- int i;
- if(canMMX2BeUsed)
- {
- asm volatile(
- "pxor %%mm7, %%mm7 \n\t"
- "mov %0, %%"REG_c" \n\t"
- "mov %1, %%"REG_D" \n\t"
- "mov %2, %%"REG_d" \n\t"
- "mov %3, %%"REG_b" \n\t"
- "xor %%"REG_a", %%"REG_a" \n\t" // i
- PREFETCH" (%%"REG_c") \n\t"
- PREFETCH" 32(%%"REG_c") \n\t"
- PREFETCH" 64(%%"REG_c") \n\t"
+ int i;
+ if(canMMX2BeUsed)
+ {
+ asm volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "mov %0, %%"REG_c" \n\t"
+ "mov %1, %%"REG_D" \n\t"
+ "mov %2, %%"REG_d" \n\t"
+ "mov %3, %%"REG_b" \n\t"
+ "xor %%"REG_a", %%"REG_a" \n\t" // i
+ PREFETCH" (%%"REG_c") \n\t"
+ PREFETCH" 32(%%"REG_c") \n\t"
+ PREFETCH" 64(%%"REG_c") \n\t"
#if ARCH_X86_64
@@ -2575,7 +2557,7 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *
"add %%"REG_S", %%"REG_c" \n\t"\
"add %%"REG_a", %%"REG_D" \n\t"\
"xor %%"REG_a", %%"REG_a" \n\t"\
-
+
#else
#define FUNNY_Y_CODE \
@@ -2584,209 +2566,209 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *
"addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
"add %%"REG_a", %%"REG_D" \n\t"\
"xor %%"REG_a", %%"REG_a" \n\t"\
-
+
#endif
- FUNNY_Y_CODE
- FUNNY_Y_CODE
- FUNNY_Y_CODE
- FUNNY_Y_CODE
- FUNNY_Y_CODE
- FUNNY_Y_CODE
- FUNNY_Y_CODE
- FUNNY_Y_CODE
-
- :: "m"(src), "m"(dst), "m"(mmx2Filter), "m"(mmx2FilterPos),
- "m"(funnyYCode)
- : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
- );
- for(i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--) dst[i] = src[srcW-1] * 128;
- }
- else
- {
+FUNNY_Y_CODE
+FUNNY_Y_CODE
+FUNNY_Y_CODE
+FUNNY_Y_CODE
+FUNNY_Y_CODE
+FUNNY_Y_CODE
+FUNNY_Y_CODE
+FUNNY_Y_CODE
+
+ :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
+ "m" (funnyYCode)
+ : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
+ );
+ for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
+ }
+ else
+ {
#endif
- long xInc_shr16 = xInc >> 16;
- uint16_t xInc_mask = xInc & 0xffff;
- //NO MMX just normal asm ...
- asm volatile(
- "xor %%"REG_a", %%"REG_a" \n\t" // i
- "xor %%"REG_b", %%"REG_b" \n\t" // xx
- "xorl %%ecx, %%ecx \n\t" // 2*xalpha
- ASMALIGN16
- "1: \n\t"
- "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
- "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
- "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
- "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
- "shll $16, %%edi \n\t"
- "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
- "mov %1, %%"REG_D" \n\t"
- "shrl $9, %%esi \n\t"
- "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
- "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
- "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
-
- "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
- "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
- "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
- "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
- "shll $16, %%edi \n\t"
- "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
- "mov %1, %%"REG_D" \n\t"
- "shrl $9, %%esi \n\t"
- "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
- "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
- "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
-
-
- "add $2, %%"REG_a" \n\t"
- "cmp %2, %%"REG_a" \n\t"
- " jb 1b \n\t"
-
-
- :: "r"(src), "m"(dst), "m"(dstWidth), "m"(xInc_shr16), "m"(xInc_mask)
- : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
- );
+ long xInc_shr16 = xInc >> 16;
+ uint16_t xInc_mask = xInc & 0xffff;
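+            /* 16.16 fixed-point DDA: the fractional part accumulates in %cx and its carry
+               ("adc") advances xx, while the fraction drives a 7-bit blend of src[xx] and src[xx+1]. */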
+ //NO MMX just normal asm ...
+ asm volatile(
+ "xor %%"REG_a", %%"REG_a" \n\t" // i
+ "xor %%"REG_b", %%"REG_b" \n\t" // xx
+ "xorl %%ecx, %%ecx \n\t" // 2*xalpha
+ ASMALIGN16
+ "1: \n\t"
+ "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
+ "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
+ "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
+ "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
+ "shll $16, %%edi \n\t"
+ "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
+ "mov %1, %%"REG_D" \n\t"
+ "shrl $9, %%esi \n\t"
+ "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
+ "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
+ "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
+
+ "movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
+ "movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
+ "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
+ "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
+ "shll $16, %%edi \n\t"
+ "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
+ "mov %1, %%"REG_D" \n\t"
+ "shrl $9, %%esi \n\t"
+ "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
+ "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
+ "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
+
+
+ "add $2, %%"REG_a" \n\t"
+ "cmp %2, %%"REG_a" \n\t"
+ " jb 1b \n\t"
+
+
+ :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
+ : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
+ );
#if HAVE_MMX2
- } //if MMX2 can't be used
+ } //if MMX2 can't be used
#endif
#else
- int i;
- unsigned int xpos = 0;
- for(i = 0; i < dstWidth; i++)
- {
- register unsigned int xx = xpos >> 16;
- register unsigned int xalpha = (xpos & 0xFFFF) >> 9;
- dst[i] = (src[xx] << 7) + (src[xx+1] - src[xx]) * xalpha;
- xpos += xInc;
- }
+ int i;
+ unsigned int xpos=0;
+ for(i=0;i<dstWidth;i++)
+ {
+ register unsigned int xx=xpos>>16;
+ register unsigned int xalpha=(xpos&0xFFFF)>>9;
+ dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
+ xpos+=xInc;
+ }
#endif
- }
- }
+ }
+}
- inline static void RENAME(hcscale)(uint16_t * dst, long dstWidth, uint8_t * src1, uint8_t * src2,
- int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t * hChrFilter,
- int16_t * hChrFilterPos, int hChrFilterSize, void * funnyUVCode,
- int srcFormat, uint8_t * formatConvBuffer, int16_t * mmx2Filter,
- int32_t * mmx2FilterPos)
- {
- if(srcFormat == IMGFMT_YUY2)
- {
- RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer + 2048, src1, src2, srcW);
- src1 = formatConvBuffer;
- src2 = formatConvBuffer + 2048;
- }
- else if(srcFormat == IMGFMT_UYVY)
- {
- RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer + 2048, src1, src2, srcW);
- src1 = formatConvBuffer;
- src2 = formatConvBuffer + 2048;
- }
- else if(srcFormat == IMGFMT_YVYU)
- {
- RENAME(yuy2ToUV)(formatConvBuffer + 2048, formatConvBuffer, src1, src2, srcW);
- src1 = formatConvBuffer;
- src2 = formatConvBuffer + 2048;
- }
- else if(srcFormat == IMGFMT_VYUY)
- {
- RENAME(uyvyToUV)(formatConvBuffer + 2048, formatConvBuffer, src1, src2, srcW);
- src1 = formatConvBuffer;
- src2 = formatConvBuffer + 2048;
- }
- else if(srcFormat == IMGFMT_BGR32)
- {
- RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer + 2048, src1, src2, srcW);
- src1 = formatConvBuffer;
- src2 = formatConvBuffer + 2048;
- }
- else if(srcFormat == IMGFMT_BGR24)
- {
- RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer + 2048, src1, src2, srcW);
- src1 = formatConvBuffer;
- src2 = formatConvBuffer + 2048;
- }
- else if(srcFormat == IMGFMT_BGR16)
- {
- RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer + 2048, src1, src2, srcW);
- src1 = formatConvBuffer;
- src2 = formatConvBuffer + 2048;
- }
- else if(srcFormat == IMGFMT_BGR15)
- {
- RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer + 2048, src1, src2, srcW);
- src1 = formatConvBuffer;
- src2 = formatConvBuffer + 2048;
- }
- else if(srcFormat == IMGFMT_RGB32)
- {
- RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer + 2048, src1, src2, srcW);
- src1 = formatConvBuffer;
- src2 = formatConvBuffer + 2048;
- }
- else if(srcFormat == IMGFMT_RGB24)
- {
- RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer + 2048, src1, src2, srcW);
- src1 = formatConvBuffer;
- src2 = formatConvBuffer + 2048;
- }
- else if(srcFormat == IMGFMT_RGB16)
- {
- RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer + 2048, src1, src2, srcW);
- src1 = formatConvBuffer;
- src2 = formatConvBuffer + 2048;
- }
- else if(srcFormat == IMGFMT_RGB15)
- {
- RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer + 2048, src1, src2, srcW);
- src1 = formatConvBuffer;
- src2 = formatConvBuffer + 2048;
- }
- else if(srcFormat == IMGFMT_NV12)
- {
- RENAME(nv12ToUV)(formatConvBuffer, formatConvBuffer + 2048, src1, src2, srcW);
- src1 = formatConvBuffer;
- src2 = formatConvBuffer + 2048;
- }
- else if(srcFormat == IMGFMT_NV21)
- {
- RENAME(nv21ToUV)(formatConvBuffer, formatConvBuffer + 2048, src1, src2, srcW);
- src1 = formatConvBuffer;
- src2 = formatConvBuffer + 2048;
- }
- else if(isGray(srcFormat))
- {
- return;
- }
+inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
+ int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
+ int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
+ int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
+ int32_t *mmx2FilterPos)
+{
+ if(srcFormat==IMGFMT_YUY2)
+ {
+ RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
+ src1= formatConvBuffer;
+ src2= formatConvBuffer+2048;
+ }
+ else if(srcFormat==IMGFMT_UYVY)
+ {
+ RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
+ src1= formatConvBuffer;
+ src2= formatConvBuffer+2048;
+ }
+ else if(srcFormat==IMGFMT_YVYU)
+ {
+ RENAME(yuy2ToUV)(formatConvBuffer+2048, formatConvBuffer, src1, src2, srcW);
+ src1= formatConvBuffer;
+ src2= formatConvBuffer+2048;
+ }
+ else if(srcFormat==IMGFMT_VYUY)
+ {
+ RENAME(uyvyToUV)(formatConvBuffer+2048, formatConvBuffer, src1, src2, srcW);
+ src1= formatConvBuffer;
+ src2= formatConvBuffer+2048;
+ }
+ else if(srcFormat==IMGFMT_BGR32)
+ {
+ RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
+ src1= formatConvBuffer;
+ src2= formatConvBuffer+2048;
+ }
+ else if(srcFormat==IMGFMT_BGR24)
+ {
+ RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
+ src1= formatConvBuffer;
+ src2= formatConvBuffer+2048;
+ }
+ else if(srcFormat==IMGFMT_BGR16)
+ {
+ RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
+ src1= formatConvBuffer;
+ src2= formatConvBuffer+2048;
+ }
+ else if(srcFormat==IMGFMT_BGR15)
+ {
+ RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
+ src1= formatConvBuffer;
+ src2= formatConvBuffer+2048;
+ }
+ else if(srcFormat==IMGFMT_RGB32)
+ {
+ RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
+ src1= formatConvBuffer;
+ src2= formatConvBuffer+2048;
+ }
+ else if(srcFormat==IMGFMT_RGB24)
+ {
+ RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
+ src1= formatConvBuffer;
+ src2= formatConvBuffer+2048;
+ }
+ else if(srcFormat==IMGFMT_RGB16)
+ {
+ RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
+ src1= formatConvBuffer;
+ src2= formatConvBuffer+2048;
+ }
+ else if(srcFormat==IMGFMT_RGB15)
+ {
+ RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
+ src1= formatConvBuffer;
+ src2= formatConvBuffer+2048;
+ }
+ else if(srcFormat==IMGFMT_NV12)
+ {
+ RENAME(nv12ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
+ src1= formatConvBuffer;
+ src2= formatConvBuffer+2048;
+ }
+ else if(srcFormat==IMGFMT_NV21)
+ {
+ RENAME(nv21ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
+ src1= formatConvBuffer;
+ src2= formatConvBuffer+2048;
+ }
+ else if(isGray(srcFormat))
+ {
+ return;
+ }
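
Every branch of the dispatch above funnels packed-YUV, RGB, and NV12/NV21 input through one convention: converted U samples land in the first half of formatConvBuffer and V samples at a fixed 2048-byte offset, with the YVYU/VYUY cases simply swapping the two destinations. A sketch of that convention (illustrative helper, not in the source):

#include <stdint.h>

/* Sketch: the U/V halves of the conversion scratch buffer. */
static void split_uv_halves(uint8_t *formatConvBuffer,
                            uint8_t **uPlane, uint8_t **vPlane)
{
    *uPlane = formatConvBuffer;          /* U: bytes    0..2047 */
    *vPlane = formatConvBuffer + 2048;   /* V: bytes 2048..4095 */
}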
#if HAVE_MMX
-    // use the new MMX scaler if the mmx2 can't be used (it's faster than the x86asm one)
- if(!(flags & SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
+    // use the new MMX scaler if the mmx2 can't be used (it's faster than the x86asm one)
+ if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
- if(!(flags & SWS_FAST_BILINEAR))
+ if(!(flags&SWS_FAST_BILINEAR))
#endif
- {
- RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
- if(src2) // NV12 does not have this.
- RENAME(hScale)(dst + 2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
- }
- else // Fast Bilinear upscale / crap downscale
- {
+ {
+ RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
+ if (src2) // NV12 does not have this.
+ RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
+ }
+ else // Fast Bilinear upscale / crap downscale
+ {
#if ARCH_X86_32 || ARCH_X86_64
#if HAVE_MMX2
- int i;
- if(canMMX2BeUsed)
- {
- asm volatile(
- "pxor %%mm7, %%mm7 \n\t"
- "mov %0, %%"REG_c" \n\t"
- "mov %1, %%"REG_D" \n\t"
- "mov %2, %%"REG_d" \n\t"
- "mov %3, %%"REG_b" \n\t"
- "xor %%"REG_a", %%"REG_a" \n\t" // i
- PREFETCH" (%%"REG_c") \n\t"
- PREFETCH" 32(%%"REG_c") \n\t"
- PREFETCH" 64(%%"REG_c") \n\t"
+ int i;
+ if(canMMX2BeUsed)
+ {
+ asm volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "mov %0, %%"REG_c" \n\t"
+ "mov %1, %%"REG_D" \n\t"
+ "mov %2, %%"REG_d" \n\t"
+ "mov %3, %%"REG_b" \n\t"
+ "xor %%"REG_a", %%"REG_a" \n\t" // i
+ PREFETCH" (%%"REG_c") \n\t"
+ PREFETCH" 32(%%"REG_c") \n\t"
+ PREFETCH" 64(%%"REG_c") \n\t"
#if ARCH_X86_64
@@ -2797,7 +2779,7 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *
"add %%"REG_S", %%"REG_c" \n\t"\
"add %%"REG_a", %%"REG_D" \n\t"\
"xor %%"REG_a", %%"REG_a" \n\t"\
-
+
#else
#define FUNNY_UV_CODE \
@@ -2806,517 +2788,503 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *
"addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
"add %%"REG_a", %%"REG_D" \n\t"\
"xor %%"REG_a", %%"REG_a" \n\t"\
-
+
#endif
- FUNNY_UV_CODE
- FUNNY_UV_CODE
- FUNNY_UV_CODE
- FUNNY_UV_CODE
- "xor %%"REG_a", %%"REG_a" \n\t" // i
- "mov %5, %%"REG_c" \n\t" // src
- "mov %1, %%"REG_D" \n\t" // buf1
- "add $4096, %%"REG_D" \n\t"
- PREFETCH" (%%"REG_c") \n\t"
- PREFETCH" 32(%%"REG_c") \n\t"
- PREFETCH" 64(%%"REG_c") \n\t"
-
- FUNNY_UV_CODE
- FUNNY_UV_CODE
- FUNNY_UV_CODE
- FUNNY_UV_CODE
-
- :: "m"(src1), "m"(dst), "m"(mmx2Filter), "m"(mmx2FilterPos),
- "m"(funnyUVCode), "m"(src2)
- : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
- );
- for(i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--)
- {
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+ "xor %%"REG_a", %%"REG_a" \n\t" // i
+ "mov %5, %%"REG_c" \n\t" // src
+ "mov %1, %%"REG_D" \n\t" // buf1
+ "add $4096, %%"REG_D" \n\t"
+ PREFETCH" (%%"REG_c") \n\t"
+ PREFETCH" 32(%%"REG_c") \n\t"
+ PREFETCH" 64(%%"REG_c") \n\t"
+
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+FUNNY_UV_CODE
+
+ :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
+ "m" (funnyUVCode), "m" (src2)
+ : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
+ );
+ for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
+ {
// printf("%d %d %d\n", dstWidth, i, srcW);
- dst[i] = src1[srcW-1] * 128;
- dst[i+2048] = src2[srcW-1] * 128;
- }
- }
- else
- {
+ dst[i] = src1[srcW-1]*128;
+ dst[i+2048] = src2[srcW-1]*128;
+ }
+ }
+ else
+ {
#endif
- long xInc_shr16 = (long)(xInc >> 16);
- uint16_t xInc_mask = xInc & 0xffff;
- asm volatile(
- "xor %%"REG_a", %%"REG_a" \n\t" // i
- "xor %%"REG_b", %%"REG_b" \n\t" // xx
- "xorl %%ecx, %%ecx \n\t" // 2*xalpha
- ASMALIGN16
- "1: \n\t"
- "mov %0, %%"REG_S" \n\t"
- "movzbl (%%"REG_S", %%"REG_b"), %%edi \n\t" //src[xx]
- "movzbl 1(%%"REG_S", %%"REG_b"), %%esi \n\t" //src[xx+1]
- "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
- "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
- "shll $16, %%edi \n\t"
- "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
- "mov %1, %%"REG_D" \n\t"
- "shrl $9, %%esi \n\t"
- "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
-
- "movzbl (%5, %%"REG_b"), %%edi \n\t" //src[xx]
- "movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
- "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
- "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
- "shll $16, %%edi \n\t"
- "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
- "mov %1, %%"REG_D" \n\t"
- "shrl $9, %%esi \n\t"
- "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
-
- "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
- "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
- "add $1, %%"REG_a" \n\t"
- "cmp %2, %%"REG_a" \n\t"
- " jb 1b \n\t"
-
- /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
- which is needed to support GCC 4.0. */
+ long xInc_shr16 = (long) (xInc >> 16);
+ uint16_t xInc_mask = xInc & 0xffff;
+ asm volatile(
+ "xor %%"REG_a", %%"REG_a" \n\t" // i
+ "xor %%"REG_b", %%"REG_b" \n\t" // xx
+ "xorl %%ecx, %%ecx \n\t" // 2*xalpha
+ ASMALIGN16
+ "1: \n\t"
+ "mov %0, %%"REG_S" \n\t"
+ "movzbl (%%"REG_S", %%"REG_b"), %%edi \n\t" //src[xx]
+ "movzbl 1(%%"REG_S", %%"REG_b"), %%esi \n\t" //src[xx+1]
+ "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
+ "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
+ "shll $16, %%edi \n\t"
+ "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
+ "mov %1, %%"REG_D" \n\t"
+ "shrl $9, %%esi \n\t"
+ "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
+
+ "movzbl (%5, %%"REG_b"), %%edi \n\t" //src[xx]
+ "movzbl 1(%5, %%"REG_b"), %%esi \n\t" //src[xx+1]
+ "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
+ "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
+ "shll $16, %%edi \n\t"
+ "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
+ "mov %1, %%"REG_D" \n\t"
+ "shrl $9, %%esi \n\t"
+ "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
+
+ "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF
+ "adc %3, %%"REG_b" \n\t" //xx+= xInc>>8 + carry
+ "add $1, %%"REG_a" \n\t"
+ "cmp %2, %%"REG_a" \n\t"
+ " jb 1b \n\t"
+
+/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
+ which is needed to support GCC 4.0. */
#if ARCH_X86_64 && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
- :: "m"(src1), "m"(dst), "g"((stride_t)dstWidth), "m"(xInc_shr16), "m"(xInc_mask),
+ :: "m" (src1), "m" (dst), "g" ((stride_t)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else
- :: "m"(src1), "m"(dst), "m"((long)dstWidth), "m"(xInc_shr16), "m"(xInc_mask),
+ :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#endif
- "r"(src2)
- : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
- );
+ "r" (src2)
+ : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
+ );
#if HAVE_MMX2
- } //if MMX2 can't be used
+ } //if MMX2 can't be used
#endif
#else
- int i;
- unsigned int xpos = 0;
- for(i = 0; i < dstWidth; i++)
- {
- register unsigned int xx = xpos >> 16;
- register unsigned int xalpha = (xpos & 0xFFFF) >> 9;
- dst[i] = (src1[xx] * (xalpha ^ 127) + src1[xx+1] * xalpha);
- dst[i+2048] = (src2[xx] * (xalpha ^ 127) + src2[xx+1] * xalpha);
- /* slower
- dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
- dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
- */
- xpos += xInc;
- }
+ int i;
+ unsigned int xpos=0;
+ for(i=0;i<dstWidth;i++)
+ {
+ register unsigned int xx=xpos>>16;
+ register unsigned int xalpha=(xpos&0xFFFF)>>9;
+ dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
+ dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
+/* slower
+ dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
+ dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
+*/
+ xpos+=xInc;
+ }
#endif
- }
- }
+ }
+}
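
The plain-C chroma path above forms its weight pair with an XOR: because xalpha is confined to 7 bits, xalpha ^ 127 flips exactly those bits and therefore equals 127 - xalpha, making the expression an ordinary linear blend dst = u0*(127 - a) + u1*a (total weight 127, versus 128 in the luma path's (src<<7) form). A small self-check of the identity:

#include <assert.h>

/* Sketch: the 7-bit complement identity behind "xalpha ^ 127". */
static void demo_xor_weight(void)
{
    for (int a = 0; a <= 127; a++)
        assert((a ^ 127) == 127 - a);
}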
- static int RENAME(swScaleI)(SwsContext * c, uint8_t * src[], stride_t srcStride[], int srcSliceY,
- int srcSliceH, uint8_t * dst[], stride_t dstStride[], int dstYstart, int dstYend)
- {
-
- /* load a few things into local vars to make the code more readable? and faster */
- const int srcW = c->srcW;
- const int dstW = c->dstW;
- const int dstH = c->dstH;
- const int chrDstW = c->chrDstW;
- const int chrSrcW = c->chrSrcW;
- const int lumXInc = c->lumXInc;
- const int chrXInc = c->chrXInc;
- const int dstFormat = c->dstFormat;
- const int srcFormat = c->srcFormat;
- const SwsParams params = c->params;
- const int canMMX2BeUsed = c->canMMX2BeUsed;
- int16_t *vLumFilterPos = c->vLumFilterPos;
- int16_t *vChrFilterPos = c->vChrFilterPos;
- int16_t *hLumFilterPos = c->hLumFilterPos;
- int16_t *hChrFilterPos = c->hChrFilterPos;
- int16_t *vLumFilter = c->vLumFilter;
- int16_t *vChrFilter = c->vChrFilter;
- int16_t *hLumFilter = c->hLumFilter;
- int16_t *hChrFilter = c->hChrFilter;
- int32_t *lumMmxFilter = c->lumMmxFilter;
- int32_t *chrMmxFilter = c->chrMmxFilter;
- const int vLumFilterSize = c->vLumFilterSize;
- const int vChrFilterSize = c->vChrFilterSize;
- const int hLumFilterSize = c->hLumFilterSize;
- const int hChrFilterSize = c->hChrFilterSize;
- int16_t **lumPixBuf = c->lumPixBuf;
- int16_t **chrPixBuf = c->chrPixBuf;
- const int vLumBufSize = c->vLumBufSize;
- const int vChrBufSize = c->vChrBufSize;
- uint8_t *funnyYCode = c->funnyYCode;
- uint8_t *funnyUVCode = c->funnyUVCode;
- uint8_t *formatConvBuffer = c->formatConvBuffer;
- const int chrSrcSliceY = srcSliceY >> c->chrSrcVSubSample;
- const int chrSrcSliceH = -((-srcSliceH) >> c->chrSrcVSubSample);
- int lastDstY;
-
- /* vars which will change and which we need to store back in the context */
- int dstY = dstYstart;
- int lumBufIndex = c->lumBufIndex;
- int chrBufIndex = c->chrBufIndex;
- int lastInLumBuf = c->lastInLumBuf;
- int lastInChrBuf = c->lastInChrBuf;
-
- if(isPacked(c->srcFormat))
- {
- src[0] =
- src[1] =
- src[2] = src[0];
- srcStride[0] =
- srcStride[1] =
- srcStride[2] = srcStride[0];
- }
- srcStride[1] <<= c->vChrDrop;
- srcStride[2] <<= c->vChrDrop;
+static int RENAME(swScaleI)(SwsContext *c, uint8_t* src[], stride_t srcStride[], int srcSliceY,
+ int srcSliceH, uint8_t* dst[], stride_t dstStride[], int dstYstart, int dstYend){
+
+ /* load a few things into local vars to make the code more readable? and faster */
+ const int srcW= c->srcW;
+ const int dstW= c->dstW;
+ const int dstH= c->dstH;
+ const int chrDstW= c->chrDstW;
+ const int chrSrcW= c->chrSrcW;
+ const int lumXInc= c->lumXInc;
+ const int chrXInc= c->chrXInc;
+ const int dstFormat= c->dstFormat;
+ const int srcFormat= c->srcFormat;
+ const SwsParams params= c->params;
+ const int canMMX2BeUsed= c->canMMX2BeUsed;
+ int16_t *vLumFilterPos= c->vLumFilterPos;
+ int16_t *vChrFilterPos= c->vChrFilterPos;
+ int16_t *hLumFilterPos= c->hLumFilterPos;
+ int16_t *hChrFilterPos= c->hChrFilterPos;
+ int16_t *vLumFilter= c->vLumFilter;
+ int16_t *vChrFilter= c->vChrFilter;
+ int16_t *hLumFilter= c->hLumFilter;
+ int16_t *hChrFilter= c->hChrFilter;
+ int32_t *lumMmxFilter= c->lumMmxFilter;
+ int32_t *chrMmxFilter= c->chrMmxFilter;
+ const int vLumFilterSize= c->vLumFilterSize;
+ const int vChrFilterSize= c->vChrFilterSize;
+ const int hLumFilterSize= c->hLumFilterSize;
+ const int hChrFilterSize= c->hChrFilterSize;
+ int16_t **lumPixBuf= c->lumPixBuf;
+ int16_t **chrPixBuf= c->chrPixBuf;
+ const int vLumBufSize= c->vLumBufSize;
+ const int vChrBufSize= c->vChrBufSize;
+ uint8_t *funnyYCode= c->funnyYCode;
+ uint8_t *funnyUVCode= c->funnyUVCode;
+ uint8_t *formatConvBuffer= c->formatConvBuffer;
+ const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
+ const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
+ int lastDstY;
+
+ /* vars which will change and which we need to store back in the context */
+ int dstY= dstYstart;
+ int lumBufIndex= c->lumBufIndex;
+ int chrBufIndex= c->chrBufIndex;
+ int lastInLumBuf= c->lastInLumBuf;
+ int lastInChrBuf= c->lastInChrBuf;
+
+ if(isPacked(c->srcFormat)){
+ src[0]=
+ src[1]=
+ src[2]= src[0];
+ srcStride[0]=
+ srcStride[1]=
+ srcStride[2]= srcStride[0];
+ }
+ srcStride[1]<<= c->vChrDrop;
+ srcStride[2]<<= c->vChrDrop;
// printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
// (int)dst[0], (int)dst[1], (int)dst[2]);
#if 0 //self test FIXME move to a vfilter or something
- {
- static volatile int i = 0;
- i++;
- if(srcFormat == IMGFMT_YV12 && i == 1 && srcSliceH >= c->srcH)
- selfTest(src, srcStride, c->srcW, c->srcH);
- i--;
- }
+{
+static volatile int i=0;
+i++;
+if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
+ selfTest(src, srcStride, c->srcW, c->srcH);
+i--;
+}
#endif
//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
//dstStride[0],dstStride[1],dstStride[2]);
- if(dstStride[0] % 8 != 0 || dstStride[1] % 8 != 0 || dstStride[2] % 8 != 0)
- {
- static int firstTime = 1; //FIXME move this into the context perhaps
- if(params.debug & SWS_PRINT_INFO && firstTime)
- {
- MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
-                 "SwScaler: ->cannot do aligned memory accesses anymore\n");
- firstTime = 0;
- }
- }
-
-    /* Note: the user might start scaling the picture in the middle, so this will not get executed.
-       This is not really intended but currently works, so people might do it. */
- if(srcSliceY == 0)
- {
- lumBufIndex = 0;
- chrBufIndex = 0;
- // dstY=0; moved to RENAME(swScale)
- lastInLumBuf = -1;
- lastInChrBuf = -1;
- }
- lastDstY = dstY;
-
- if(vLumFilterPos[dstYend] < srcSliceY || vLumFilterPos[dstYstart] > srcSliceY + srcSliceH)
- {
- goto ret0;
- }
-
- for(; dstY < dstYend; dstY++)
- {
- unsigned char *dest = dst[0] + dstStride[0] * dstY;
- const int chrDstY = dstY >> c->chrDstVSubSample;
- unsigned char *uDest = dst[1] + dstStride[1] * chrDstY;
- unsigned char *vDest = dst[2] + dstStride[2] * chrDstY;
-
- const int firstLumSrcY = vLumFilterPos[dstY]; //First line needed as input
- const int firstChrSrcY = vChrFilterPos[chrDstY]; //First line needed as input
- const int lastLumSrcY = firstLumSrcY + vLumFilterSize - 1; // Last line needed as input
- const int lastChrSrcY = firstChrSrcY + vChrFilterSize - 1; // Last line needed as input
+ if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
+ {
+ static int firstTime=1; //FIXME move this into the context perhaps
+ if(params.debug & SWS_PRINT_INFO && firstTime)
+ {
+ MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
+                     "SwScaler: ->cannot do aligned memory accesses anymore\n");
+ firstTime=0;
+ }
+ }
+
+    /* Note: the user might start scaling the picture in the middle, so this will not get executed.
+       This is not really intended but currently works, so people might do it. */
+ if(srcSliceY ==0){
+ lumBufIndex=0;
+ chrBufIndex=0;
+ // dstY=0; moved to RENAME(swScale)
+ lastInLumBuf= -1;
+ lastInChrBuf= -1;
+ }
+ lastDstY= dstY;
+
+ if(vLumFilterPos[dstYend]<srcSliceY || vLumFilterPos[dstYstart]>srcSliceY+srcSliceH){
+ goto ret0;
+ }
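
vLumFilterPos[y] is the first source line the vertical filter needs in order to produce output line y, so the early exit above is an interval-overlap test: if the slice [srcSliceY, srcSliceY+srcSliceH) cannot supply any line needed for outputs [dstYstart, dstYend), there is nothing to do. A hypothetical helper (not in the source) restating the condition:

#include <stdint.h>

/* Hypothetical restatement of the goto-ret0 test above. */
static int slice_contributes(const int16_t *vLumFilterPos,
                             int dstYstart, int dstYend,
                             int srcSliceY, int srcSliceH)
{
    return !(vLumFilterPos[dstYend]   < srcSliceY ||
             vLumFilterPos[dstYstart] > srcSliceY + srcSliceH);
}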
+
+ for(;dstY < dstYend; dstY++){
+ unsigned char *dest =dst[0]+dstStride[0]*dstY;
+ const int chrDstY= dstY>>c->chrDstVSubSample;
+ unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
+ unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
+
+ const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
+ const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
+ const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
+ const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
//printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
// dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample);
- //handle holes (FAST_BILINEAR & weird filters)
- if(firstLumSrcY > lastInLumBuf) lastInLumBuf = firstLumSrcY - 1;
- if(firstChrSrcY > lastInChrBuf) lastInChrBuf = firstChrSrcY - 1;
+ //handle holes (FAST_BILINEAR & weird filters)
+ if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
+ if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
- ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
- ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
-
- // Do we have enough lines in this slice to output the dstY line
- if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH) >> c->chrSrcVSubSample))
- {
- //Do horizontal scaling
- while(lastInLumBuf < lastLumSrcY)
- {
- uint8_t *s = src[0] + (lastInLumBuf + 1 - srcSliceY) * srcStride[0];
- lumBufIndex++;
+ ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
+ ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
+
+ // Do we have enough lines in this slice to output the dstY line
+ if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
+ {
+ //Do horizontal scaling
+ while(lastInLumBuf < lastLumSrcY)
+ {
+ uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
+ lumBufIndex++;
// printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY);
- ASSERT(lumBufIndex < 2 * vLumBufSize)
- ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
- ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
+ ASSERT(lumBufIndex < 2*vLumBufSize)
+ ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
+ ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
// printf("%d %d\n", lumBufIndex, vLumBufSize);
- RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
- params.methodLuma.method, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
- funnyYCode, c->srcFormat, formatConvBuffer,
- c->lumMmx2Filter, c->lumMmx2FilterPos);
- lastInLumBuf++;
- }
- while(lastInChrBuf < lastChrSrcY)
- {
- uint8_t *src1 = src[1] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[1];
- uint8_t *src2 = src[2] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[2];
- chrBufIndex++;
- ASSERT(chrBufIndex < 2 * vChrBufSize)
- ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
- ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
- //FIXME replace parameters through context struct (some at least)
-
- if(!(isGray(srcFormat) || isGray(dstFormat)))
- RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
- params.methodChroma.method, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
- funnyUVCode, c->srcFormat, formatConvBuffer,
- c->chrMmx2Filter, c->chrMmx2FilterPos);
- lastInChrBuf++;
- }
- //wrap buf index around to stay inside the ring buffer
- if(lumBufIndex >= vLumBufSize) lumBufIndex -= vLumBufSize;
- if(chrBufIndex >= vChrBufSize) chrBufIndex -= vChrBufSize;
- }
- else // not enough lines left in this slice -> load the rest in the buffer
- {
- /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
- firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
- lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
- vChrBufSize, vLumBufSize);*/
-
- //Do horizontal scaling
- while(lastInLumBuf + 1 < srcSliceY + srcSliceH)
- {
- uint8_t *s = src[0] + (lastInLumBuf + 1 - srcSliceY) * srcStride[0];
- lumBufIndex++;
- ASSERT(lumBufIndex < 2 * vLumBufSize)
- ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
- ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
- RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
- params.methodLuma.method, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
- funnyYCode, c->srcFormat, formatConvBuffer,
- c->lumMmx2Filter, c->lumMmx2FilterPos);
- lastInLumBuf++;
- }
- while(lastInChrBuf + 1 < (chrSrcSliceY + chrSrcSliceH))
- {
- uint8_t *src1 = src[1] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[1];
- uint8_t *src2 = src[2] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[2];
- chrBufIndex++;
- ASSERT(chrBufIndex < 2 * vChrBufSize)
- ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
- ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
-
- if(!(isGray(srcFormat) || isGray(dstFormat)))
- RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
- params.methodChroma.method, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
- funnyUVCode, c->srcFormat, formatConvBuffer,
- c->chrMmx2Filter, c->chrMmx2FilterPos);
- lastInChrBuf++;
- }
- //wrap buf index around to stay inside the ring buffer
- if(lumBufIndex >= vLumBufSize) lumBufIndex -= vLumBufSize;
- if(chrBufIndex >= vChrBufSize) chrBufIndex -= vChrBufSize;
- break; //we can't output a dstY line so let's try with the next slice
- }
+ RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
+ params.methodLuma.method, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
+ funnyYCode, c->srcFormat, formatConvBuffer,
+ c->lumMmx2Filter, c->lumMmx2FilterPos);
+ lastInLumBuf++;
+ }
+ while(lastInChrBuf < lastChrSrcY)
+ {
+ uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
+ uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
+ chrBufIndex++;
+ ASSERT(chrBufIndex < 2*vChrBufSize)
+ ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
+ ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
+ //FIXME replace parameters through context struct (some at least)
+
+ if(!(isGray(srcFormat) || isGray(dstFormat)))
+ RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
+ params.methodChroma.method, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
+ funnyUVCode, c->srcFormat, formatConvBuffer,
+ c->chrMmx2Filter, c->chrMmx2FilterPos);
+ lastInChrBuf++;
+ }
+ //wrap buf index around to stay inside the ring buffer
+ if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
+ if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
+ }
+ else // not enough lines left in this slice -> load the rest in the buffer
+ {
+/* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
+ firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
+ lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
+ vChrBufSize, vLumBufSize);*/
+
+ //Do horizontal scaling
+ while(lastInLumBuf+1 < srcSliceY + srcSliceH)
+ {
+ uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
+ lumBufIndex++;
+ ASSERT(lumBufIndex < 2*vLumBufSize)
+ ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
+ ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
+ RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
+ params.methodLuma.method, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
+ funnyYCode, c->srcFormat, formatConvBuffer,
+ c->lumMmx2Filter, c->lumMmx2FilterPos);
+ lastInLumBuf++;
+ }
+ while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
+ {
+ uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
+ uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
+ chrBufIndex++;
+ ASSERT(chrBufIndex < 2*vChrBufSize)
+ ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
+ ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
+
+ if(!(isGray(srcFormat) || isGray(dstFormat)))
+ RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
+ params.methodChroma.method, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
+ funnyUVCode, c->srcFormat, formatConvBuffer,
+ c->chrMmx2Filter, c->chrMmx2FilterPos);
+ lastInChrBuf++;
+ }
+ //wrap buf index around to stay inside the ring buffer
+ if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
+ if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
+ break; //we can't output a dstY line so let's try with the next slice
+ }
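
Both branches above fill lumPixBuf/chrPixBuf as ring buffers of horizontally scaled lines, and the vertical stage below indexes them with a "+ vLumBufSize" shift. In swscale these pointer arrays are allocated with twice vLumBufSize entries, the second half mirroring the first, so a window of vLumFilterSize consecutive entries never has to wrap mid-read; a sketch of the lookup (names mirror the locals):

#include <stdint.h>

/* Sketch: the raw offset below can be negative by up to vLumBufSize-1;
 * adding vLumBufSize lands in the mirrored half of the 2x-sized array,
 * so the whole filter window stays in bounds without wrapping.        */
static int16_t **window_start(int16_t **lumPixBuf, int vLumBufSize,
                              int lumBufIndex, int firstLumSrcY,
                              int lastInLumBuf)
{
    return lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
}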
#if HAVE_MMX
- b5Dither = dither8[dstY&1];
- g6Dither = dither4[dstY&1];
- g5Dither = dither8[dstY&1];
- r5Dither = dither8[(dstY+1)&1];
+ b5Dither= dither8[dstY&1];
+ g6Dither= dither4[dstY&1];
+ g5Dither= dither8[dstY&1];
+ r5Dither= dither8[(dstY+1)&1];
#endif
- if(dstY < dstH - 2)
- {
- int16_t **lumSrcPtr = lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
- int16_t **chrSrcPtr = chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
+ if(dstY < dstH-2)
+ {
+ int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
+ int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
#if HAVE_MMX
- int i;
- if(params.subsampling & SWS_ACCURATE_RND)
- {
- int s = APCK_SIZE / 8;
- for(i = 0; i < vLumFilterSize; i += 2)
- {
- *(void**)&lumMmxFilter[s*i+0 ] = lumSrcPtr[i ];
- *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ] = lumSrcPtr[i+(vLumFilterSize>1)];
- lumMmxFilter[s*i+APCK_COEF/4 ] =
- lumMmxFilter[s*i+APCK_COEF/4+1] = vLumFilter[dstY*vLumFilterSize + i ]
- + (vLumFilterSize > 1 ? vLumFilter[dstY*vLumFilterSize + i + 1] << 16 : 0);
- }
- for(i = 0; i < vChrFilterSize; i += 2)
- {
- *(void**)&chrMmxFilter[s*i+0 ] = chrSrcPtr[i ];
- *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ] = chrSrcPtr[i+(vChrFilterSize>1)];
- chrMmxFilter[s*i+APCK_COEF/4 ] =
- chrMmxFilter[s*i+APCK_COEF/4+1] = vChrFilter[chrDstY*vChrFilterSize + i ]
- + (vChrFilterSize > 1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1] << 16 : 0);
- }
- }
- else
- {
- for(i = 0; i < vLumFilterSize; i++)
- {
- lumMmxFilter[4*i+0] = (int32_t)lumSrcPtr[i];
- lumMmxFilter[4*i+2] =
- lumMmxFilter[4*i+3] =
- ((uint16_t)vLumFilter[dstY*vLumFilterSize + i]) * 0x10001;
- }
- for(i = 0; i < vChrFilterSize; i++)
- {
- chrMmxFilter[4*i+0] = (int32_t)chrSrcPtr[i];
- chrMmxFilter[4*i+2] =
- chrMmxFilter[4*i+3] =
- ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i]) * 0x10001;
- }
- }
-#endif
- if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21)
- {
- const int chrSkipMask = (1 << c->chrDstVSubSample) - 1;
- if(dstY & chrSkipMask) uDest = NULL; //FIXME split functions in lumi / chromi
- RENAME(yuv2nv12X)(c,
- vLumFilter + dstY * vLumFilterSize , lumSrcPtr, vLumFilterSize,
- vChrFilter + chrDstY * vChrFilterSize, chrSrcPtr, vChrFilterSize,
- dest, /*uDest*/dstY & chrSkipMask ? NULL : dst[1] + dstStride[1]*chrDstY, dstW, chrDstW, dstFormat);
- }
- else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
- {
- const int chrSkipMask = (1 << c->chrDstVSubSample) - 1;
- if((dstY & chrSkipMask) || isGray(dstFormat)) uDest = vDest = NULL; //FIXME split functions in lumi / chromi
- if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
- {
- int16_t *lumBuf = lumPixBuf[0];
- int16_t *chrBuf = chrPixBuf[0];
- RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
- }
- else //General YV12
- {
- RENAME(yuv2yuvX)(c,
- vLumFilter + dstY * vLumFilterSize , lumSrcPtr, vLumFilterSize,
- vChrFilter + chrDstY * vChrFilterSize, chrSrcPtr, vChrFilterSize,
- dest, uDest, vDest, dstW, chrDstW);
- }
- }
- else
- {
- ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize * 2);
- ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize * 2);
- if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
- {
- int chrAlpha = vChrFilter[2*dstY+1];
- RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr + 1),
- dest, dstW, chrAlpha, dstFormat, params.v_chr_drop, dstY);
- }
- else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
- {
- int lumAlpha = vLumFilter[2*dstY+1];
- int chrAlpha = vChrFilter[2*dstY+1];
- lumMmxFilter[2] =
- lumMmxFilter[3] = vLumFilter[2*dstY ] * 0x10001;
- chrMmxFilter[2] =
- chrMmxFilter[3] = vChrFilter[2*chrDstY] * 0x10001;
- RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr + 1), *chrSrcPtr, *(chrSrcPtr + 1),
- dest, dstW, lumAlpha, chrAlpha, dstY);
- }
- else //General RGB
- {
- RENAME(yuv2packedX)(c,
- vLumFilter + dstY * vLumFilterSize, lumSrcPtr, vLumFilterSize,
- vChrFilter + dstY * vChrFilterSize, chrSrcPtr, vChrFilterSize,
- dest, dstW, dstY);
- }
+ int i;
+ if(params.subsampling & SWS_ACCURATE_RND){
+ int s= APCK_SIZE / 8;
+ for(i=0; i<vLumFilterSize; i+=2){
+ *(void**)&lumMmxFilter[s*i+0 ]= lumSrcPtr[i ];
+ *(void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
+ lumMmxFilter[s*i+APCK_COEF/4 ]=
+ lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
+ + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
}
- }
- else // hmm looks like we can't use MMX here without overwriting this array's tail
- {
- int16_t **lumSrcPtr = lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
- int16_t **chrSrcPtr = chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
- if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21)
- {
- const int chrSkipMask = (1 << c->chrDstVSubSample) - 1;
- if(dstY & chrSkipMask) uDest = NULL; //FIXME split functions in lumi / chromi
- yuv2nv12XinC(
- vLumFilter + dstY * vLumFilterSize , lumSrcPtr, vLumFilterSize,
- vChrFilter + chrDstY * vChrFilterSize, chrSrcPtr, vChrFilterSize,
- dest, /*uDest*/dstY & chrSkipMask ? NULL : dst[1] + dstStride[1]*chrDstY, dstW, chrDstW, dstFormat);
+ for(i=0; i<vChrFilterSize; i+=2){
+ *(void**)&chrMmxFilter[s*i+0 ]= chrSrcPtr[i ];
+ *(void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrSrcPtr[i+(vChrFilterSize>1)];
+ chrMmxFilter[s*i+APCK_COEF/4 ]=
+ chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
+ + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
}
- else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
+ }else{
+ for(i=0; i<vLumFilterSize; i++)
{
- const int chrSkipMask = (1 << c->chrDstVSubSample) - 1;
- if((dstY & chrSkipMask) || isGray(dstFormat)) uDest = vDest = NULL; //FIXME split functions in lumi / chromi
- yuv2yuvXinC(
- vLumFilter + dstY * vLumFilterSize , lumSrcPtr, vLumFilterSize,
- vChrFilter + chrDstY * vChrFilterSize, chrSrcPtr, vChrFilterSize,
- dest, uDest, vDest, dstW, chrDstW);
+ lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
+ lumMmxFilter[4*i+2]=
+ lumMmxFilter[4*i+3]=
+ ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
}
- else
+ for(i=0; i<vChrFilterSize; i++)
{
- ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize * 2);
- ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize * 2);
- yuv2packedXinC(c,
- vLumFilter + dstY * vLumFilterSize, lumSrcPtr, vLumFilterSize,
- vChrFilter + dstY * vChrFilterSize, chrSrcPtr, vChrFilterSize,
- dest, dstW, dstY);
+ chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
+ chrMmxFilter[4*i+2]=
+ chrMmxFilter[4*i+3]=
+ ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
}
- }
}
+#endif
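
The fast (non-SWS_ACCURATE_RND) packing above leans on a small arithmetic identity: multiplying a 16-bit coefficient by 0x10001 copies it into both halves of a 32-bit word, and writing that word into two adjacent int32 slots gives the MMX vertical filter four identical 16-bit copies per 64-bit register. A quick check:

#include <stdint.h>
#include <assert.h>

/* Sketch: 0x10001 * c == (c << 16) | c for any 16-bit c. */
static void demo_coeff_packing(void)
{
    uint16_t coeff  = 0x1234;
    uint32_t packed = (uint32_t)coeff * 0x10001u;
    assert(packed == 0x12341234u);
}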
+ if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
+ const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
+ if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
+ RENAME(yuv2nv12X)(c,
+ vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
+ vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
+ dest, /*uDest*/dstY&chrSkipMask?NULL:dst[1]+dstStride[1]*chrDstY, dstW, chrDstW, dstFormat);
+ }
+ else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
+ {
+ const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
+ if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
+ if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
+ {
+ int16_t *lumBuf = lumPixBuf[0];
+ int16_t *chrBuf= chrPixBuf[0];
+ RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
+ }
+ else //General YV12
+ {
+ RENAME(yuv2yuvX)(c,
+ vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
+ vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
+ dest, uDest, vDest, dstW, chrDstW);
+ }
+ }
+ else
+ {
+ ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
+ ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
+ if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
+ {
+ int chrAlpha= vChrFilter[2*dstY+1];
+ RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
+ dest, dstW, chrAlpha, dstFormat, params.v_chr_drop, dstY);
+ }
+ else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
+ {
+ int lumAlpha= vLumFilter[2*dstY+1];
+ int chrAlpha= vChrFilter[2*dstY+1];
+ lumMmxFilter[2]=
+ lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001;
+ chrMmxFilter[2]=
+ chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
+ RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
+ dest, dstW, lumAlpha, chrAlpha, dstY);
+ }
+ else //General RGB
+ {
+ RENAME(yuv2packedX)(c,
+ vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
+ vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
+ dest, dstW, dstY);
+ }
+ }
+ }
+ else // hmm looks like we can't use MMX here without overwriting this array's tail
+ {
+ int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
+ int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
+ if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
+ const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
+ if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
+ yuv2nv12XinC(
+ vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
+ vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
+ dest, /*uDest*/dstY&chrSkipMask?NULL:dst[1]+dstStride[1]*chrDstY, dstW, chrDstW, dstFormat);
+ }
+ else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
+ {
+ const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
+ if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
+ yuv2yuvXinC(
+ vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize,
+ vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
+ dest, uDest, vDest, dstW, chrDstW);
+ }
+ else
+ {
+ ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
+ ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
+ yuv2packedXinC(c,
+ vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
+ vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
+ dest, dstW, dstY);
+ }
+ }
+ }
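
Both the MMX and C output paths above gate chroma writes with the same mask: chrSkipMask = (1 << chrDstVSubSample) - 1 NULLs uDest/vDest on luma rows that have no line at the vertically subsampled chroma resolution. For 4:2:0 output, chrDstVSubSample is 1, so chroma is emitted on every other luma row; a sketch of the predicate:

/* Sketch: does output row dstY carry a chroma line? */
static int writes_chroma(int dstY, int chrDstVSubSample)
{
    const int chrSkipMask = (1 << chrDstVSubSample) - 1;
    return (dstY & chrSkipMask) == 0;   /* e.g. even rows only for 4:2:0 */
}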
#if HAVE_MMX
- __asm __volatile(SFENCE:::"memory");
- __asm __volatile(EMMS:::"memory");
+ __asm __volatile(SFENCE:::"memory");
+ __asm __volatile(EMMS:::"memory");
#endif
ret0:
- /* store changed local vars back in the context */
- c->dstY = dstY;
- c->lumBufIndex = lumBufIndex;
- c->chrBufIndex = chrBufIndex;
- c->lastInLumBuf = lastInLumBuf;
- c->lastInChrBuf = lastInChrBuf;
- return dstY - lastDstY;
- }
-
- int RENAME(sws_thread_work)(SwsContext * c) // Thread func
- {
- SwsThreadParam *stp = &c->stp;
- return RENAME(swScaleI)(c, stp->src, stp->srcStride, stp->srcSliceY,
- stp->srcSliceH, stp->dst, stp->dstStride, stp->dstYstart, stp->dstYend);
- }
-
- static int RENAME(swScale)(SwsContext * c, uint8_t * src[], stride_t srcStride[], int srcSliceY,
- int srcSliceH, uint8_t * dst[], stride_t dstStride[])
- {
- int dstLines;
- int i;
- int lastDstY;
- int processedLines = 0;
+ /* store changed local vars back in the context */
+ c->dstY= dstY;
+ c->lumBufIndex= lumBufIndex;
+ c->chrBufIndex= chrBufIndex;
+ c->lastInLumBuf= lastInLumBuf;
+ c->lastInChrBuf= lastInChrBuf;
+ return dstY - lastDstY;
+}
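
swScaleI is written to be re-entered slice by slice: the ring-buffer cursors are stored back into the context at ret0, and the return value is the number of destination lines completed for this slice. A hypothetical driver loop (sws_scale_slice is an illustrative wrapper name, not a real entry point):

#include <stdint.h>

/* Sketch: feed the image in horizontal slices; the context carries the
 * ring-buffer state between calls.                                    */
static int scale_in_slices(SwsContext *c, uint8_t *src[], stride_t srcStride[],
                           uint8_t *dst[], stride_t dstStride[],
                           int srcH, int sliceH)
{
    int done = 0;
    for (int y = 0; y < srcH; y += sliceH) {
        int h = (srcH - y < sliceH) ? srcH - y : sliceH;
        done += sws_scale_slice(c, src, srcStride, y, h, dst, dstStride);
    }
    return done;   /* should equal c->dstH once the last slice is in */
}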
- if(srcSliceY == 0) c->dstY = 0;
+int RENAME(sws_thread_work)(SwsContext *c) // Thread func
+{
+ SwsThreadParam *stp= &c->stp;
+ return RENAME(swScaleI)(c, stp->src, stp->srcStride, stp->srcSliceY,
+ stp->srcSliceH, stp->dst, stp->dstStride, stp->dstYstart, stp->dstYend);
+}
- if(c->thread_count == 1)
- {
- return RENAME(swScaleI)(c, src, srcStride, srcSliceY,
- srcSliceH, dst, dstStride, c->dstY, c->dstH);
- }
- else
- {
- lastDstY = c->dstY;
- c[0].stp.dstYstart = c->dstY;
- dstLines = (c->dstH - c->dstY) / c->thread_count;
- c[0].stp.dstYend = c->dstY + dstLines;
- for(i = 0; i < c->thread_count; i++)
- {
- c[i].stp.c = &c[i];
- c[i].stp.src = src;
- c[i].stp.srcStride = srcStride;
- c[i].stp.srcSliceY = srcSliceY;
- c[i].stp.srcSliceH = srcSliceH;
- c[i].stp.dst = dst;
- c[i].stp.dstStride = dstStride;
-            c[i].stp.dstYstart = c[0].stp.dstYstart + dstLines * i; // +2*i; //(+2*i makes a green or black line in the middle of the screen; test item to see multithreading)
- c[i].stp.dstYend = c[0].stp.dstYend + dstLines * i;
- }
- c[c->thread_count-1].stp.dstYend = c->dstH;
- c->execute(c, RENAME(sws_thread_work), c->ret, c->thread_count);
- for(i = 0; i < c->thread_count; i++)
- {
- processedLines += c->ret[i];
- }
- c->dstY = lastDstY + processedLines;
- return processedLines;
- }
- }
+static int RENAME(swScale)(SwsContext *c, uint8_t* src[], stride_t srcStride[], int srcSliceY,
+ int srcSliceH, uint8_t* dst[], stride_t dstStride[])
+{
+ int dstLines;
+ int i;
+ int lastDstY;
+ int processedLines=0;
+
+ if(srcSliceY==0) c->dstY= 0;
+
+ if (c->thread_count==1)
+ {
+ return RENAME(swScaleI)(c,src,srcStride,srcSliceY,
+ srcSliceH,dst,dstStride,c->dstY,c->dstH);
+ }
+ else
+ {
+ lastDstY= c->dstY;
+ c[0].stp.dstYstart= c->dstY;
+ dstLines= (c->dstH-c->dstY)/c->thread_count;
+ c[0].stp.dstYend= c->dstY + dstLines;
+ for (i=0; i<c->thread_count; i++){
+ c[i].stp.c= &c[i];
+ c[i].stp.src= src;
+ c[i].stp.srcStride= srcStride;
+ c[i].stp.srcSliceY= srcSliceY;
+ c[i].stp.srcSliceH= srcSliceH;
+ c[i].stp.dst= dst;
+ c[i].stp.dstStride= dstStride;
+        c[i].stp.dstYstart= c[0].stp.dstYstart+dstLines*i;// +2*i; //(+2*i makes a green or black line in the middle of the screen; test item to see multithreading)
+ c[i].stp.dstYend = c[0].stp.dstYend+ dstLines*i;
+ }
+ c[c->thread_count-1].stp.dstYend= c->dstH;
+ c->execute(c, RENAME(sws_thread_work), c->ret, c->thread_count);
+ for (i=0; i<c->thread_count; i++){
+ processedLines +=c->ret[i];
+ }
+ c->dstY= lastDstY+processedLines;
+ return processedLines;
+ }
+}
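
The threaded branch above splits the remaining destination rows evenly across the per-thread contexts and lets the last thread absorb the integer-division remainder. The same partition, pulled out into a sketch (the start/end arrays are illustrative; the source stores these in each context's stp):

/* Sketch: thread i handles rows [start[i], end[i]). */
static void partition_rows(int dstY, int dstH, int thread_count,
                           int *start, int *end)
{
    int dstLines = (dstH - dstY) / thread_count;  /* may drop a remainder */
    for (int i = 0; i < thread_count; i++) {
        start[i] = dstY + i * dstLines;
        end[i]   = start[i] + dstLines;
    }
    end[thread_count - 1] = dstH;    /* the last thread picks it back up */
}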
diff --git a/src/filters/transform/MpaDecFilter/libflac/src/libFLAC/ia32/nasm.h b/src/filters/transform/MpaDecFilter/libflac/src/libFLAC/ia32/nasm.h
index feca50b48..b7cb2d257 100644
--- a/src/filters/transform/MpaDecFilter/libflac/src/libFLAC/ia32/nasm.h
+++ b/src/filters/transform/MpaDecFilter/libflac/src/libFLAC/ia32/nasm.h
@@ -1,96 +1,75 @@
-;
-libFLAC - Free Lossless Audio Codec library
-;
-Copyright(C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 Josh Coalson
-;
-;
-Redistribution and use in source and binary forms, with or without
-;
-modification, are permitted provided that the following conditions
-;
-are met:
-;
-;
-- Redistributions of source code must retain the above copyright
-;
-notice, this list of conditions and the following disclaimer.
-;
-;
-- Redistributions in binary form must reproduce the above copyright
-;
-notice, this list of conditions and the following disclaimer in the
-;
-documentation and / or other materials provided with the distribution.
-;
-;
-- Neither the name of the Xiph.org Foundation nor the names of its
-;
-contributors may be used to endorse or promote products derived from
-;
-this software without specific prior written permission.
-;
-;
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;
-``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-;
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
-;
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- ;
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES(INCLUDING, BUT NOT LIMITED TO,
- ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-;
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT(INCLUDING
- ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-;
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+; libFLAC - Free Lossless Audio Codec library
+; Copyright (C) 2001,2002,2003,2004,2005,2006,2007,2008,2009 Josh Coalson
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+;
+; - Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+;
+; - Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in the
+; documentation and/or other materials provided with the distribution.
+;
+; - Neither the name of the Xiph.org Foundation nor the names of its
+; contributors may be used to endorse or promote products derived from
+; this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-bits 32
+ bits 32
-% ifdef OBJ_FORMAT_win32
-% define FLAC__PUBLIC_NEEDS_UNDERSCORE
-% idefine code_section section .text align = 16
- % idefine data_section section .data align = 32
- % idefine bss_section section .bss align = 32
- % elifdef OBJ_FORMAT_aout
- % define FLAC__PUBLIC_NEEDS_UNDERSCORE
- % idefine code_section section .text
- % idefine data_section section .data
- % idefine bss_section section .bss
- % elifdef OBJ_FORMAT_aoutb
- % define FLAC__PUBLIC_NEEDS_UNDERSCORE
- % idefine code_section section .text
- % idefine data_section section .data
- % idefine bss_section section .bss
- % elifdef OBJ_FORMAT_elf
- % idefine code_section section .text align = 16
- % idefine data_section section .data align = 32
- % idefine bss_section section .bss align = 32
- % else
- % error unsupported object format!
- % endif
+%ifdef OBJ_FORMAT_win32
+ %define FLAC__PUBLIC_NEEDS_UNDERSCORE
+ %idefine code_section section .text align=16
+ %idefine data_section section .data align=32
+ %idefine bss_section section .bss align=32
+%elifdef OBJ_FORMAT_aout
+ %define FLAC__PUBLIC_NEEDS_UNDERSCORE
+ %idefine code_section section .text
+ %idefine data_section section .data
+ %idefine bss_section section .bss
+%elifdef OBJ_FORMAT_aoutb
+ %define FLAC__PUBLIC_NEEDS_UNDERSCORE
+ %idefine code_section section .text
+ %idefine data_section section .data
+ %idefine bss_section section .bss
+%elifdef OBJ_FORMAT_elf
+ %idefine code_section section .text align=16
+ %idefine data_section section .data align=32
+ %idefine bss_section section .bss align=32
+%else
+ %error unsupported object format!
+%endif
- % imacro cglobal 1
- % ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
- global _ % 1
- % else
- global % 1
- % endif
- % endmacro
+%imacro cglobal 1
+ %ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
+ global _%1
+ %else
+ global %1
+ %endif
+%endmacro
- % imacro cextern 1
- % ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
- extern _ % 1
- % else
- extern % 1
- % endif
- % endmacro
+%imacro cextern 1
+ %ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
+ extern _%1
+ %else
+ extern %1
+ %endif
+%endmacro
- % imacro cident 1
- _ % 1:
- % 1:
- % endmacro
+%imacro cident 1
+_%1:
+%1:
+%endmacro
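
The three macros above hide the platform's leading-underscore symbol convention behind one spelling: cglobal exports a symbol, cextern imports one, and cident defines both the underscored and plain labels so either name resolves. A minimal usage sketch in the same NASM dialect (the routine name is hypothetical):

	code_section                ; section .text (align=16 on win32/ELF)
cglobal	my_routine                  ; "global _my_routine" or "global my_routine"
cident	my_routine                  ; emits both "_my_routine:" and "my_routine:"
	ret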