From 7f3a7b5c40b7fda09cbba4bb53e1ced133970930 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Mon, 23 May 2011 17:45:51 +0200
Subject: ac3enc: add channel coupling support

Channel coupling is an optional AC-3 feature that increases quality by
combining high frequency information from multiple channels into a
single channel. The per-channel high frequency information is sent with
less accuracy in both the frequency and time domains. This allows more
bits to be used for lower frequencies while preserving enough
information to reconstruct the high frequencies.
---
 doc/encoders.texi         |  43 +++
 libavcodec/ac3.h          |   3 +-
 libavcodec/ac3dec.h       |   5 -
 libavcodec/ac3dec_data.c  |   6 -
 libavcodec/ac3dec_data.h  |   1 -
 libavcodec/ac3enc.c       | 892 +++++++++++++++++++++++++++++++++++++---------
 libavcodec/ac3enc_fixed.c |   2 +-
 libavcodec/ac3enc_float.c |   6 +-
 libavcodec/ac3tab.c       |   7 +
 libavcodec/ac3tab.h       |   1 +
 10 files changed, 787 insertions(+), 179 deletions(-)

diff --git a/doc/encoders.texi b/doc/encoders.texi
index 2f3cecde66..760ec4bad9 100644
--- a/doc/encoders.texi
+++ b/doc/encoders.texi
@@ -365,4 +365,47 @@ is highly recommended that it be left as enabled except for testing purposes.
 
 @end table
 
+@subheading Floating-Point-Only AC-3 Encoding Options
+
+These options are only valid for the floating-point encoder and do not exist
+for the fixed-point encoder due to the corresponding features not being
+implemented in fixed-point.
+
+@table @option
+
+@item -channel_coupling @var{boolean}
+Enables/Disables use of channel coupling, which is an optional AC-3 feature
+that increases quality by combining high frequency information from multiple
+channels into a single channel. The per-channel high frequency information is
+sent with less accuracy in both the frequency and time domains. This allows
+more bits to be used for lower frequencies while preserving enough information
+to reconstruct the high frequencies. This option is enabled by default for the
+floating-point encoder and should generally be left as enabled except for
+testing purposes or to increase encoding speed.
+@table @option
+@item -1
+@itemx auto
+Selected by Encoder (default)
+@item 0
+@itemx off
+Disable Channel Coupling
+@item 1
+@itemx on
+Enable Channel Coupling
+@end table
+
+@item -cpl_start_band @var{number}
+Coupling Start Band. Sets the channel coupling start band, from 1 to 15. If a
+value higher than the bandwidth is used, it will be reduced to 1 less than the
+coupling end band. If @var{auto} is used, the start band will be determined by
+the encoder based on the bit rate, sample rate, and channel layout. This option
+has no effect if channel coupling is disabled.
+@table @option
+@item -1
+@itemx auto
+Selected by Encoder (default)
+@end table
+
+@end table
+
 @c man end ENCODERS
diff --git a/libavcodec/ac3.h b/libavcodec/ac3.h
index 4ed8c2523b..c06f3d542d 100644
--- a/libavcodec/ac3.h
+++ b/libavcodec/ac3.h
@@ -28,7 +28,8 @@
 #define AVCODEC_AC3_H
 
 #define AC3_MAX_CODED_FRAME_SIZE 3840 /* in bytes */
-#define AC3_MAX_CHANNELS 6 /* including LFE channel */
+#define AC3_MAX_CHANNELS 7            /**< maximum number of channels, including coupling channel */
+#define CPL_CH 0                      /**< coupling channel index */
 
 #define AC3_MAX_COEFS   256
 #define AC3_BLOCK_SIZE  256
diff --git a/libavcodec/ac3dec.h b/libavcodec/ac3dec.h
index 6cba95b495..f0ab75ae98 100644
--- a/libavcodec/ac3dec.h
+++ b/libavcodec/ac3dec.h
@@ -58,11 +58,6 @@
 #include "fft.h"
 #include "fmtconvert.h"
 
-/* override ac3.h to include coupling channel */
-#undef AC3_MAX_CHANNELS
-#define AC3_MAX_CHANNELS 7
-#define CPL_CH 0
-
 #define AC3_OUTPUT_LFEON  8
 
 #define SPX_MAX_BANDS    17
diff --git a/libavcodec/ac3dec_data.c b/libavcodec/ac3dec_data.c
index ba3cbd30ef..272a963f08 100644
--- a/libavcodec/ac3dec_data.c
+++ b/libavcodec/ac3dec_data.c
@@ -53,12 +53,6 @@ const uint8_t ff_eac3_hebap_tab[64] = {
     19, 19, 19, 19,
 };
 
-/**
- * Table E2.16 Default Coupling Banding Structure
- */
-const uint8_t ff_eac3_default_cpl_band_struct[18] =
-{ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1 };
-
 /**
  * Table E2.15 Default Spectral Extension Banding Structure
  */
diff --git a/libavcodec/ac3dec_data.h b/libavcodec/ac3dec_data.h
index a758f8b3d3..c0a584e7b3 100644
--- a/libavcodec/ac3dec_data.h
+++ b/libavcodec/ac3dec_data.h
@@ -27,7 +27,6 @@
 extern const uint8_t ff_ac3_ungroup_3_in_5_bits_tab[32][3];
 
 extern const uint8_t ff_eac3_hebap_tab[64];
-extern const uint8_t ff_eac3_default_cpl_band_struct[18];
 extern const uint8_t ff_eac3_default_spx_band_struct[17];
 
 #endif /* AVCODEC_AC3DEC_DATA_H */
diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
index 95bdc58f16..5014fdb753 100644
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -99,6 +99,8 @@ typedef struct AC3EncOptions {
     /* other encoding options */
     int allow_per_frame_metadata;
     int stereo_rematrixing;
+    int channel_coupling;
+    int cpl_start;
 } AC3EncOptions;
 
 /**
@@ -114,10 +116,22 @@ typedef struct AC3Block {
     int16_t  **band_psd;                        ///< psd per critical band
     int16_t  **mask;                            ///< masking curve
     uint16_t **qmant;                           ///< quantized mantissas
+    uint8_t  **cpl_coord_exp;                   ///< coupling coord exponents           (cplcoexp)
+    uint8_t  **cpl_coord_mant;                  ///< coupling coord mantissas           (cplcomant)
     uint8_t  coeff_shift[AC3_MAX_CHANNELS];     ///< fixed-point coefficient shift values
     uint8_t  new_rematrixing_strategy;          ///< send new rematrixing flags in this block
+    int      num_rematrixing_bands;             ///< number of rematrixing bands
     uint8_t  rematrixing_flags[4];              ///< rematrixing flags
     struct AC3Block *exp_ref_block[AC3_MAX_CHANNELS]; ///< reference blocks for EXP_REUSE
+    int      new_cpl_strategy;                  ///< send new coupling strategy
+    int      cpl_in_use;                        ///< coupling in use for this block     (cplinu)
+    uint8_t  channel_in_cpl[AC3_MAX_CHANNELS];  ///< channel in coupling                (chincpl)
+    int      num_cpl_channels;                  ///< number of channels in coupling
+    uint8_t  new_cpl_coords;                    ///< send new coupling coordinates      (cplcoe)
+    uint8_t  cpl_master_exp[AC3_MAX_CHANNELS];  ///< coupling coord master exponents    (mstrcplco)
+    int      new_snr_offsets;                   ///< send new SNR offsets
+    int      new_cpl_leak;                      ///< send new coupling leak info
+    int      end_freq[AC3_MAX_CHANNELS];        ///< end frequency bin                  (endmant)
 } AC3Block;
 
 /**
@@ -164,10 +178,16 @@ typedef struct AC3EncodeContext {
 
     int cutoff;                             ///< user-specified cutoff frequency, in Hz
     int bandwidth_code;                     ///< bandwidth code (0 to 60)               (chbwcod)
-    int nb_coefs[AC3_MAX_CHANNELS];
+    int start_freq[AC3_MAX_CHANNELS];       ///< start frequency bin                    (strtmant)
+    int cpl_end_freq;                       ///< coupling channel end frequency bin
+
+    int cpl_on;                             ///< coupling turned on for this frame
+    int cpl_enabled;                        ///< coupling enabled for all frames
+    int num_cpl_subbands;                   ///< number of coupling subbands            (ncplsubnd)
+    int num_cpl_bands;                      ///< number of coupling bands               (ncplbnd)
+    uint8_t cpl_band_sizes[AC3_MAX_CPL_BANDS];  ///< number of coeffs in each coupling band
 
     int rematrixing_enabled;                ///< stereo rematrixing enabled
-    int num_rematrixing_bands;              ///< number of rematrixing bands
 
     /* bitrate allocation control */
     int slow_gain_code;                     ///< slow gain code                         (sgaincod)
@@ -194,6 +214,8 @@ typedef struct AC3EncodeContext {
     int16_t *band_psd_buffer;
     int16_t *mask_buffer;
     uint16_t *qmant_buffer;
+    uint8_t *cpl_coord_exp_buffer;
+    uint8_t *cpl_coord_mant_buffer;
 
     uint8_t exp_strategy[AC3_MAX_CHANNELS][AC3_MAX_BLOCKS]; ///< exponent strategies
 
@@ -267,6 +289,12 @@ static const AVOption options[] = {
     {"hdcd",     "HDCD",               0, FF_OPT_TYPE_CONST, {.dbl = 1 }, INT_MIN, INT_MAX, AC3ENC_PARAM, "ad_conv_type"},
 /* Other Encoding Options */
 {"stereo_rematrixing", "Stereo Rematrixing", OFFSET(stereo_rematrixing), FF_OPT_TYPE_INT, {.dbl = 1 }, 0, 1, AC3ENC_PARAM},
+#if CONFIG_AC3ENC_FLOAT
+{"channel_coupling",   "Channel Coupling",   OFFSET(channel_coupling),   FF_OPT_TYPE_INT, {.dbl = 1 }, 0, 1, AC3ENC_PARAM, "channel_coupling"},
+    {"auto", "Selected by the Encoder", 0, FF_OPT_TYPE_CONST, {.dbl = -1 }, INT_MIN, INT_MAX, AC3ENC_PARAM, "channel_coupling"},
+{"cpl_start_band", "Coupling Start Band", OFFSET(cpl_start), FF_OPT_TYPE_INT, {.dbl = -1 }, -1, 15, AC3ENC_PARAM, "cpl_start_band"},
+    {"auto", "Selected by the Encoder", 0, FF_OPT_TYPE_CONST, {.dbl = -1 }, INT_MIN, INT_MAX, AC3ENC_PARAM, "cpl_start_band"},
+#endif
 {NULL}
 };
 
@@ -296,9 +324,9 @@ static void scale_coefficients(AC3EncodeContext *s);
 
 /**
  * LUT for number of exponent groups.
- * exponent_group_tab[exponent strategy-1][number of coefficients]
+ * exponent_group_tab[coupling][exponent strategy-1][number of coefficients]
  */
-static uint8_t exponent_group_tab[3][256];
+static uint8_t exponent_group_tab[2][3][256];
 
 
 /**
@@ -357,6 +385,49 @@ static const uint8_t ac3_bandwidth_tab[5][3][19] = {
 };
 
 
+/**
+ * LUT to select the coupling start band based on the bit rate, sample rate, and
+ * number of full-bandwidth channels. -1 = coupling off
+ * ac3_coupling_start_tab[channel_mode-2][sample rate code][bit rate code]
+ *
+ * TODO: more testing for optimal parameters.
+ *       multi-channel tests at 44.1kHz and 32kHz.
+ */
+static const int8_t ac3_coupling_start_tab[6][3][19] = {
+//      32  40  48  56  64  80  96 112 128 160 192 224 256 320 384 448 512 576 640
+
+    // 2/0
+    { {  0,  0,  0,  0,  0,  0,  0,  1,  1,  7,  8, 11, 12, -1, -1, -1, -1, -1, -1 },
+      {  0,  0,  0,  0,  0,  0,  1,  3,  5,  7, 10, 12, 13, -1, -1, -1, -1, -1, -1 },
+      {  0,  0,  0,  0,  1,  2,  2,  9, 13, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1 } },
+
+    // 3/0
+    { {  0,  0,  0,  0,  0,  0,  0,  0,  2,  2,  6,  9, 11, 12, 13, -1, -1, -1, -1 },
+      {  0,  0,  0,  0,  0,  0,  0,  0,  2,  2,  6,  9, 11, 12, 13, -1, -1, -1, -1 },
+      { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 } },
+
+    // 2/1 - untested
+    { {  0,  0,  0,  0,  0,  0,  0,  0,  2,  2,  6,  9, 11, 12, 13, -1, -1, -1, -1 },
+      {  0,  0,  0,  0,  0,  0,  0,  0,  2,  2,  6,  9, 11, 12, 13, -1, -1, -1, -1 },
+      { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 } },
+
+    // 3/1
+    { {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  2, 10, 11, 11, 12, 12, 14, -1 },
+      {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  2, 10, 11, 11, 12, 12, 14, -1 },
+      { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 } },
+
+    // 2/2 - untested
+    { {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  2, 10, 11, 11, 12, 12, 14, -1 },
+      {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  2, 10, 11, 11, 12, 12, 14, -1 },
+      { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 } },
+
+    // 3/2
+    { {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  6,  8, 11, 12, 12, -1, -1 },
+      {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  6,  8, 11, 12, 12, -1, -1 },
+      { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 } },
+};
+
+
 /**
  * Adjust the frame size to make the average bit rate match the target bit rate.
  * This is only needed for 11025, 22050, and 44100 sample rates.
@@ -419,15 +490,297 @@ static void apply_mdct(AC3EncodeContext *s)
 
             apply_window(&s->dsp, s->windowed_samples, input_samples, s->mdct.window, AC3_WINDOW_SIZE);
 
-            block->coeff_shift[ch] = normalize_samples(s);
+            block->coeff_shift[ch+1] = normalize_samples(s);
 
-            s->mdct.fft.mdct_calcw(&s->mdct.fft, block->mdct_coef[ch],
+            s->mdct.fft.mdct_calcw(&s->mdct.fft, block->mdct_coef[ch+1],
                                    s->windowed_samples);
         }
     }
 }
 
 
+static void compute_coupling_strategy(AC3EncodeContext *s)
+{
+    int blk, ch;
+    int got_cpl_snr;
+
+    /* set coupling use flags for each block/channel */
+    /* TODO: turn coupling on/off and adjust start band based on bit usage */
+    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
+        AC3Block *block = &s->blocks[blk];
+        for (ch = 1; ch <= s->fbw_channels; ch++)
+            block->channel_in_cpl[ch] = s->cpl_on;
+    }
+
+    /* enable coupling for each block if at least 2 channels have coupling
+       enabled for that block */
+    got_cpl_snr = 0;
+    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
+        AC3Block *block = &s->blocks[blk];
+        block->num_cpl_channels = 0;
+        for (ch = 1; ch <= s->fbw_channels; ch++)
+            block->num_cpl_channels += block->channel_in_cpl[ch];
+        block->cpl_in_use = block->num_cpl_channels > 1;
+        if (!block->cpl_in_use) {
+            block->num_cpl_channels = 0;
+            for (ch = 1; ch <= s->fbw_channels; ch++)
+                block->channel_in_cpl[ch] = 0;
+        }
+
+        block->new_cpl_strategy = !blk;
+        if (blk) {
+            for (ch = 1; ch <= s->fbw_channels; ch++) {
+                if (block->channel_in_cpl[ch] != s->blocks[blk-1].channel_in_cpl[ch]) {
+                    block->new_cpl_strategy = 1;
+                    break;
+                }
+            }
+        }
+        block->new_cpl_leak = block->new_cpl_strategy;
+
+        if (!blk || (block->cpl_in_use && !got_cpl_snr)) {
+            block->new_snr_offsets = 1;
+            if (block->cpl_in_use)
+                got_cpl_snr = 1;
+        } else {
+            block->new_snr_offsets = 0;
+        }
+    }
+
+    /* set bandwidth for each channel */
+    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
+        AC3Block *block = &s->blocks[blk];
+        for (ch = 1; ch <= s->fbw_channels; ch++) {
+            if (block->channel_in_cpl[ch])
+                block->end_freq[ch] = s->start_freq[CPL_CH];
+            else
+                block->end_freq[ch] = s->bandwidth_code * 3 + 73;
+        }
+    }
+}
+
+
+/**
+ * Calculate a single coupling coordinate.
+ */
+static inline float calc_cpl_coord(float energy_ch, float energy_cpl)
+{
+    float coord = 0.125;
+    if (energy_cpl > 0)
+        coord *= sqrtf(energy_ch / energy_cpl);
+    return coord;
+}
+
+
+/**
+ * Calculate coupling channel and coupling coordinates.
+ * TODO: Currently this is only used for the floating-point encoder. I was
+ *       able to make it work for the fixed-point encoder, but quality was
+ *       generally lower in most cases than not using coupling. If a more
+ *       adaptive coupling strategy were to be implemented it might be useful
+ *       at that time to use coupling for the fixed-point encoder as well.
+ */
+static void apply_channel_coupling(AC3EncodeContext *s)
+{
+#if CONFIG_AC3ENC_FLOAT
+    DECLARE_ALIGNED(16, float,   cpl_coords)      [AC3_MAX_BLOCKS][AC3_MAX_CHANNELS][16];
+    DECLARE_ALIGNED(16, int32_t, fixed_cpl_coords)[AC3_MAX_BLOCKS][AC3_MAX_CHANNELS][16];
+    int blk, ch, bnd, i, j;
+    CoefSumType energy[AC3_MAX_BLOCKS][AC3_MAX_CHANNELS][16] = {{{0}}};
+    int num_cpl_coefs = s->num_cpl_subbands * 12;
+
+    /* calculate coupling channel from fbw channels */
+    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
+        AC3Block *block = &s->blocks[blk];
+        CoefType *cpl_coef = &block->mdct_coef[CPL_CH][s->start_freq[CPL_CH]];
+        if (!block->cpl_in_use)
+            continue;
+        memset(cpl_coef-1, 0, (num_cpl_coefs+4) * sizeof(*cpl_coef));
+        for (ch = 1; ch <= s->fbw_channels; ch++) {
+            CoefType *ch_coef = &block->mdct_coef[ch][s->start_freq[CPL_CH]];
+            if (!block->channel_in_cpl[ch])
+                continue;
+            for (i = 0; i < num_cpl_coefs; i++)
+                cpl_coef[i] += ch_coef[i];
+        }
+        /* note: coupling start bin % 4 will always be 1 and num_cpl_coefs
+                 will always be a multiple of 12, so we need to subtract 1 from
+                 the start and add 4 to the length when using optimized
+                 functions which require 16-byte alignment. */
+
+        /* coefficients must be clipped to +/- 1.0 in order to be encoded */
+        s->dsp.vector_clipf(cpl_coef-1, cpl_coef-1, -1.0f, 1.0f, num_cpl_coefs+4);
+
+        /* scale coupling coefficients from float to 24-bit fixed-point */
+        s->ac3dsp.float_to_fixed24(&block->fixed_coef[CPL_CH][s->start_freq[CPL_CH]-1],
+                                   cpl_coef-1, num_cpl_coefs+4);
+    }
+
+    /* calculate energy in each band in coupling channel and each fbw channel */
+    /* TODO: possibly use SIMD to speed up energy calculation */
+    bnd = 0;
+    i = s->start_freq[CPL_CH];
+    while (i < s->cpl_end_freq) {
+        int band_size = s->cpl_band_sizes[bnd];
+        for (ch = CPL_CH; ch <= s->fbw_channels; ch++) {
+            for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
+                AC3Block *block = &s->blocks[blk];
+                if (!block->cpl_in_use || (ch > CPL_CH && !block->channel_in_cpl[ch]))
+                    continue;
+                for (j = 0; j < band_size; j++) {
+                    CoefType v = block->mdct_coef[ch][i+j];
+                    MAC_COEF(energy[blk][ch][bnd], v, v);
+                }
+            }
+        }
+        i += band_size;
+        bnd++;
+    }
+
+    /* determine which blocks to send new coupling coordinates for */
+    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
+        AC3Block *block  = &s->blocks[blk];
+        AC3Block *block0 = blk ? &s->blocks[blk-1] : NULL;
+        int new_coords = 0;
+        CoefSumType coord_diff[AC3_MAX_CHANNELS] = {0,};
+
+        if (block->cpl_in_use) {
+            /* calculate coupling coordinates for all blocks and calculate the
+               average difference between coordinates in successive blocks */
+            for (ch = 1; ch <= s->fbw_channels; ch++) {
+                if (!block->channel_in_cpl[ch])
+                    continue;
+
+                for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
+                    cpl_coords[blk][ch][bnd] = calc_cpl_coord(energy[blk][ch][bnd],
+                                                              energy[blk][CPL_CH][bnd]);
+                    if (blk > 0 && block0->cpl_in_use &&
+                        block0->channel_in_cpl[ch]) {
+                        coord_diff[ch] += fabs(cpl_coords[blk-1][ch][bnd] -
+                                               cpl_coords[blk  ][ch][bnd]);
+                    }
+                }
+                coord_diff[ch] /= s->num_cpl_bands;
+            }
+
+            /* send new coordinates if this is the first block, if previous
+             * block did not use coupling but this block does, the channels
+             * using coupling has changed from the previous block, or the
+             * coordinate difference from the last block for any channel is
+             * greater than a threshold value. */
+            if (blk == 0) {
+                new_coords = 1;
+            } else if (!block0->cpl_in_use) {
+                new_coords = 1;
+            } else {
+                for (ch = 1; ch <= s->fbw_channels; ch++) {
+                    if (block->channel_in_cpl[ch] && !block0->channel_in_cpl[ch]) {
+                        new_coords = 1;
+                        break;
+                    }
+                }
+                if (!new_coords) {
+                    for (ch = 1; ch <= s->fbw_channels; ch++) {
+                        if (block->channel_in_cpl[ch] && coord_diff[ch] > 0.04) {
+                            new_coords = 1;
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+        block->new_cpl_coords = new_coords;
+    }
+
+    /* calculate final coupling coordinates, taking into account reusing of
+       coordinates in successive blocks */
+    for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
+        blk = 0;
+        while (blk < AC3_MAX_BLOCKS) {
+            int blk1;
+            CoefSumType energy_cpl;
+            AC3Block *block  = &s->blocks[blk];
+
+            if (!block->cpl_in_use) {
+                blk++;
+                continue;
+            }
+
+            energy_cpl = energy[blk][CPL_CH][bnd];
+            blk1 = blk+1;
+            while (!s->blocks[blk1].new_cpl_coords && blk1 < AC3_MAX_BLOCKS) {
+                if (s->blocks[blk1].cpl_in_use)
+                    energy_cpl += energy[blk1][CPL_CH][bnd];
+                blk1++;
+            }
+
+            for (ch = 1; ch <= s->fbw_channels; ch++) {
+                CoefType energy_ch;
+                if (!block->channel_in_cpl[ch])
+                    continue;
+                energy_ch = energy[blk][ch][bnd];
+                blk1 = blk+1;
+                while (!s->blocks[blk1].new_cpl_coords && blk1 < AC3_MAX_BLOCKS) {
+                    if (s->blocks[blk1].cpl_in_use)
+                        energy_ch += energy[blk1][ch][bnd];
+                    blk1++;
+                }
+                cpl_coords[blk][ch][bnd] = calc_cpl_coord(energy_ch, energy_cpl);
+            }
+            blk = blk1;
+        }
+    }
+
+    /* calculate exponents/mantissas for coupling coordinates */
+    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
+        AC3Block *block = &s->blocks[blk];
+        if (!block->cpl_in_use || !block->new_cpl_coords)
+            continue;
+
+        s->ac3dsp.float_to_fixed24(fixed_cpl_coords[blk][1],
+                                   cpl_coords[blk][1],
+                                   s->fbw_channels * 16);
+        s->ac3dsp.extract_exponents(block->cpl_coord_exp[1],
+                                    fixed_cpl_coords[blk][1],
+                                    s->fbw_channels * 16);
+
+        for (ch = 1; ch <= s->fbw_channels; ch++) {
+            int bnd, min_exp, max_exp, master_exp;
+
+            /* determine master exponent */
+            min_exp = max_exp = block->cpl_coord_exp[ch][0];
+            for (bnd = 1; bnd < s->num_cpl_bands; bnd++) {
+                int exp = block->cpl_coord_exp[ch][bnd];
+                min_exp = FFMIN(exp, min_exp);
+                max_exp = FFMAX(exp, max_exp);
+            }
+            master_exp = ((max_exp - 15) + 2) / 3;
+            master_exp = FFMAX(master_exp, 0);
+            while (min_exp < master_exp * 3)
+                master_exp--;
+            for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
+                block->cpl_coord_exp[ch][bnd] = av_clip(block->cpl_coord_exp[ch][bnd] -
+                                                        master_exp * 3, 0, 15);
+            }
+            block->cpl_master_exp[ch] = master_exp;
+
+            /* quantize mantissas */
+            for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
+                int cpl_exp  = block->cpl_coord_exp[ch][bnd];
+                int cpl_mant = (fixed_cpl_coords[blk][ch][bnd] << (5 + cpl_exp + master_exp * 3)) >> 24;
+                if (cpl_exp == 15)
+                    cpl_mant >>= 1;
+                else
+                    cpl_mant -= 16;
+
+                block->cpl_coord_mant[ch][bnd] = cpl_mant;
+            }
+        }
+    }
+#endif /* CONFIG_AC3ENC_FLOAT */
+}
+
+
 /**
  * Determine rematrixing flags for each block and band.
  */
@@ -440,23 +793,32 @@ static void compute_rematrixing_strategy(AC3EncodeContext *s)
     if (s->channel_mode != AC3_CHMODE_STEREO)
         return;
 
-    s->num_rematrixing_bands = 4;
-
-    nb_coefs = FFMIN(s->nb_coefs[0], s->nb_coefs[1]);
-
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
         block = &s->blocks[blk];
         block->new_rematrixing_strategy = !blk;
-        if (!s->rematrixing_enabled)
+
+        if (!s->rematrixing_enabled) {
+            block0 = block;
             continue;
-        for (bnd = 0; bnd < s->num_rematrixing_bands; bnd++) {
+        }
+
+        block->num_rematrixing_bands = 4;
+        if (block->cpl_in_use) {
+            block->num_rematrixing_bands -= (s->start_freq[CPL_CH] <= 61);
+            block->num_rematrixing_bands -= (s->start_freq[CPL_CH] == 37);
+            if (blk && block->num_rematrixing_bands != block0->num_rematrixing_bands)
+                block->new_rematrixing_strategy = 1;
+        }
+        nb_coefs = FFMIN(block->end_freq[1], block->end_freq[2]);
+
+        for (bnd = 0; bnd < block->num_rematrixing_bands; bnd++) {
             /* calculate calculate sum of squared coeffs for one band in one block */
             int start = ff_ac3_rematrix_band_tab[bnd];
             int end   = FFMIN(nb_coefs, ff_ac3_rematrix_band_tab[bnd+1]);
             CoefSumType sum[4] = {0,};
             for (i = start; i < end; i++) {
-                CoefType lt = block->mdct_coef[0][i];
-                CoefType rt = block->mdct_coef[1][i];
+                CoefType lt = block->mdct_coef[1][i];
+                CoefType rt = block->mdct_coef[2][i];
                 CoefType md = lt + rt;
                 CoefType sd = lt - rt;
                 MAC_COEF(sum[0], lt, lt);
@@ -495,21 +857,20 @@ static void apply_rematrixing(AC3EncodeContext *s)
     if (!s->rematrixing_enabled)
         return;
 
-    nb_coefs = FFMIN(s->nb_coefs[0], s->nb_coefs[1]);
-
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
         AC3Block *block = &s->blocks[blk];
         if (block->new_rematrixing_strategy)
             flags = block->rematrixing_flags;
-        for (bnd = 0; bnd < s->num_rematrixing_bands; bnd++) {
+        nb_coefs = FFMIN(block->end_freq[1], block->end_freq[2]);
+        for (bnd = 0; bnd < block->num_rematrixing_bands; bnd++) {
             if (flags[bnd]) {
                 start = ff_ac3_rematrix_band_tab[bnd];
                 end   = FFMIN(nb_coefs, ff_ac3_rematrix_band_tab[bnd+1]);
                 for (i = start; i < end; i++) {
-                    int32_t lt = block->fixed_coef[0][i];
-                    int32_t rt = block->fixed_coef[1][i];
-                    block->fixed_coef[0][i] = (lt + rt) >> 1;
-                    block->fixed_coef[1][i] = (lt - rt) >> 1;
+                    int32_t lt = block->fixed_coef[1][i];
+                    int32_t rt = block->fixed_coef[2][i];
+                    block->fixed_coef[1][i] = (lt + rt) >> 1;
+                    block->fixed_coef[2][i] = (lt - rt) >> 1;
                 }
             }
         }
@@ -526,12 +887,13 @@ static av_cold void exponent_init(AC3EncodeContext *s)
 
     for (expstr = EXP_D15-1; expstr <= EXP_D45-1; expstr++) {
         grpsize = 3 << expstr;
-        for (i = 73; i < 256; i++) {
-            exponent_group_tab[expstr][i] = (i + grpsize - 4) / grpsize;
+        for (i = 12; i < 256; i++) {
+            exponent_group_tab[0][expstr][i] = (i + grpsize - 4) / grpsize;
+            exponent_group_tab[1][expstr][i] = (i              ) / grpsize;
         }
     }
     /* LFE */
-    exponent_group_tab[0][7] = 2;
+    exponent_group_tab[0][0][7] = 2;
 }
 
 
@@ -544,7 +906,7 @@ static void extract_exponents(AC3EncodeContext *s)
 {
     int blk, ch;
 
-    for (ch = 0; ch < s->channels; ch++) {
+    for (ch = !s->cpl_on; ch <= s->channels; ch++) {
         for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
             AC3Block *block = &s->blocks[blk];
             s->ac3dsp.extract_exponents(block->exp[ch], block->fixed_coef[ch],
@@ -569,7 +931,7 @@ static void compute_exp_strategy(AC3EncodeContext *s)
 {
     int ch, blk, blk1;
 
-    for (ch = 0; ch < s->fbw_channels; ch++) {
+    for (ch = !s->cpl_on; ch <= s->fbw_channels; ch++) {
         uint8_t *exp_strategy = s->exp_strategy[ch];
         uint8_t *exp          = s->blocks[0].exp[ch];
         int exp_diff;
@@ -578,13 +940,18 @@ static void compute_exp_strategy(AC3EncodeContext *s)
            reused in the next frame */
         exp_strategy[0] = EXP_NEW;
         exp += AC3_MAX_COEFS;
-        for (blk = 1; blk < AC3_MAX_BLOCKS; blk++) {
+        for (blk = 1; blk < AC3_MAX_BLOCKS; blk++, exp += AC3_MAX_COEFS) {
+            if ((ch == CPL_CH && (!s->blocks[blk].cpl_in_use || !s->blocks[blk-1].cpl_in_use)) ||
+                (ch  > CPL_CH && (s->blocks[blk].channel_in_cpl[ch] != s->blocks[blk-1].channel_in_cpl[ch]))) {
+                exp_strategy[blk] = EXP_NEW;
+                continue;
+            }
             exp_diff = s->dsp.sad[0](NULL, exp, exp - AC3_MAX_COEFS, 16, 16);
-            if (exp_diff > EXP_DIFF_THRESHOLD)
+            exp_strategy[blk] = EXP_REUSE;
+            if (ch == CPL_CH && exp_diff > (EXP_DIFF_THRESHOLD * (s->blocks[blk].end_freq[ch] - s->start_freq[ch]) / AC3_MAX_COEFS))
+                exp_strategy[blk] = EXP_NEW;
+            else if (ch > CPL_CH && exp_diff > EXP_DIFF_THRESHOLD)
                 exp_strategy[blk] = EXP_NEW;
-            else
-                exp_strategy[blk] = EXP_REUSE;
-            exp += AC3_MAX_COEFS;
         }
 
         /* now select the encoding strategy type : if exponents are often
@@ -615,25 +982,26 @@ static void compute_exp_strategy(AC3EncodeContext *s)
 /**
  * Update the exponents so that they are the ones the decoder will decode.
  */
-static void encode_exponents_blk_ch(uint8_t *exp, int nb_exps, int exp_strategy)
+static void encode_exponents_blk_ch(uint8_t *exp, int nb_exps, int exp_strategy,
+                                    int cpl)
 {
     int nb_groups, i, k;
 
-    nb_groups = exponent_group_tab[exp_strategy-1][nb_exps] * 3;
+    nb_groups = exponent_group_tab[cpl][exp_strategy-1][nb_exps] * 3;
 
     /* for each group, compute the minimum exponent */
     switch(exp_strategy) {
     case EXP_D25:
-        for (i = 1, k = 1; i <= nb_groups; i++) {
+        for (i = 1, k = 1-cpl; i <= nb_groups; i++) {
             uint8_t exp_min = exp[k];
             if (exp[k+1] < exp_min)
                 exp_min = exp[k+1];
-            exp[i] = exp_min;
+            exp[i-cpl] = exp_min;
             k += 2;
         }
         break;
     case EXP_D45:
-        for (i = 1, k = 1; i <= nb_groups; i++) {
+        for (i = 1, k = 1-cpl; i <= nb_groups; i++) {
             uint8_t exp_min = exp[k];
             if (exp[k+1] < exp_min)
                 exp_min = exp[k+1];
@@ -641,14 +1009,14 @@ static void encode_exponents_blk_ch(uint8_t *exp, int nb_exps, int exp_strategy)
                 exp_min = exp[k+2];
             if (exp[k+3] < exp_min)
                 exp_min = exp[k+3];
-            exp[i] = exp_min;
+            exp[i-cpl] = exp_min;
             k += 4;
         }
         break;
     }
 
     /* constraint for DC exponent */
-    if (exp[0] > 15)
+    if (!cpl && exp[0] > 15)
         exp[0] = 15;
 
     /* decrease the delta between each groups to within 2 so that they can be
@@ -659,18 +1027,21 @@ static void encode_exponents_blk_ch(uint8_t *exp, int nb_exps, int exp_strategy)
     while (--i >= 0)
         exp[i] = FFMIN(exp[i], exp[i+1] + 2);
 
+    if (cpl)
+        exp[-1] = exp[0] & ~1;
+
     /* now we have the exponent values the decoder will see */
     switch (exp_strategy) {
     case EXP_D25:
-        for (i = nb_groups, k = nb_groups * 2; i > 0; i--) {
-            uint8_t exp1 = exp[i];
+        for (i = nb_groups, k = (nb_groups * 2)-cpl; i > 0; i--) {
+            uint8_t exp1 = exp[i-cpl];
             exp[k--] = exp1;
             exp[k--] = exp1;
         }
         break;
     case EXP_D45:
-        for (i = nb_groups, k = nb_groups * 4; i > 0; i--) {
-            exp[k] = exp[k-1] = exp[k-2] = exp[k-3] = exp[i];
+        for (i = nb_groups, k = (nb_groups * 4)-cpl; i > 0; i--) {
+            exp[k] = exp[k-1] = exp[k-2] = exp[k-3] = exp[i-cpl];
             k -= 4;
         }
         break;
@@ -686,32 +1057,40 @@ static void encode_exponents_blk_ch(uint8_t *exp, int nb_exps, int exp_strategy)
  */
 static void encode_exponents(AC3EncodeContext *s)
 {
-    int blk, blk1, ch;
+    int blk, blk1, ch, cpl;
     uint8_t *exp, *exp_strategy;
     int nb_coefs, num_reuse_blocks;
 
-    for (ch = 0; ch < s->channels; ch++) {
-        exp          = s->blocks[0].exp[ch];
+    for (ch = !s->cpl_on; ch <= s->channels; ch++) {
+        exp          = s->blocks[0].exp[ch] + s->start_freq[ch];
         exp_strategy = s->exp_strategy[ch];
-        nb_coefs     = s->nb_coefs[ch];
 
+        cpl = (ch == CPL_CH);
         blk = 0;
         while (blk < AC3_MAX_BLOCKS) {
+            AC3Block *block = &s->blocks[blk];
+            if (cpl && !block->cpl_in_use) {
+                exp += AC3_MAX_COEFS;
+                blk++;
+                continue;
+            }
+            nb_coefs = block->end_freq[ch] - s->start_freq[ch];
             blk1 = blk + 1;
 
             /* count the number of EXP_REUSE blocks after the current block
                and set exponent reference block pointers */
-            s->blocks[blk].exp_ref_block[ch] = &s->blocks[blk];
+            block->exp_ref_block[ch] = block;
             while (blk1 < AC3_MAX_BLOCKS && exp_strategy[blk1] == EXP_REUSE) {
-                s->blocks[blk1].exp_ref_block[ch] = &s->blocks[blk];
+                s->blocks[blk1].exp_ref_block[ch] = block;
                 blk1++;
             }
             num_reuse_blocks = blk1 - blk - 1;
 
             /* for the EXP_REUSE case we select the min of the exponents */
-            s->ac3dsp.ac3_exponent_min(exp, num_reuse_blocks, nb_coefs);
+            s->ac3dsp.ac3_exponent_min(exp-s->start_freq[ch], num_reuse_blocks,
+                                       AC3_MAX_COEFS);
 
-            encode_exponents_blk_ch(exp, nb_coefs, exp_strategy[blk]);
+            encode_exponents_blk_ch(exp, nb_coefs, exp_strategy[blk], cpl);
 
             exp += AC3_MAX_COEFS * (num_reuse_blocks + 1);
             blk = blk1;
@@ -727,7 +1106,7 @@ static void encode_exponents(AC3EncodeContext *s)
  */
 static void group_exponents(AC3EncodeContext *s)
 {
-    int blk, ch, i;
+    int blk, ch, i, cpl;
     int group_size, nb_groups, bit_count;
     uint8_t *p;
     int delta0, delta1, delta2;
@@ -736,14 +1115,15 @@ static void group_exponents(AC3EncodeContext *s)
     bit_count = 0;
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
         AC3Block *block = &s->blocks[blk];
-        for (ch = 0; ch < s->channels; ch++) {
+        for (ch = !block->cpl_in_use; ch <= s->channels; ch++) {
             int exp_strategy = s->exp_strategy[ch][blk];
             if (exp_strategy == EXP_REUSE)
                 continue;
+            cpl = (ch == CPL_CH);
             group_size = exp_strategy + (exp_strategy == EXP_D45);
-            nb_groups = exponent_group_tab[exp_strategy-1][s->nb_coefs[ch]];
+            nb_groups = exponent_group_tab[cpl][exp_strategy-1][block->end_freq[ch]-s->start_freq[ch]];
             bit_count += 4 + (nb_groups * 7);
-            p = block->exp[ch];
+            p = block->exp[ch] + s->start_freq[ch] - cpl;
 
             /* DC exponent */
             exp1 = *p++;
@@ -810,9 +1190,7 @@ static void count_frame_bits_fixed(AC3EncodeContext *s)
 
     /* assumptions:
      *   no dynamic range codes
-     *   no channel coupling
      *   bit allocation parameters do not change between blocks
-     *   SNR offsets do not change between blocks
      *   no delta bit allocation
      *   no skipped data
      *   no auxilliary data
@@ -833,11 +1211,6 @@ static void count_frame_bits_fixed(AC3EncodeContext *s)
         /* dynamic range */
         frame_bits++;
 
-        /* coupling strategy */
-        frame_bits++;
-        if (!blk)
-            frame_bits++;
-
         /* exponent strategy */
         frame_bits += 2 * s->fbw_channels;
         if (s->lfe_on)
@@ -848,11 +1221,6 @@ static void count_frame_bits_fixed(AC3EncodeContext *s)
         if (!blk)
             frame_bits += 2 + 2 + 2 + 2 + 3;
 
-        /* snr offsets and fast gain codes */
-        frame_bits++;
-        if (!blk)
-            frame_bits += 6 + s->channels * (4 + 3);
-
         /* delta bit allocation */
         frame_bits++;
 
@@ -884,7 +1252,7 @@ static void bit_alloc_init(AC3EncodeContext *s)
     s->slow_gain_code  = 1;
     s->db_per_bit_code = 3;
     s->floor_code      = 7;
-    for (ch = 0; ch < s->channels; ch++)
+    for (ch = 0; ch <= s->channels; ch++)
         s->fast_gain_code[ch] = 4;
 
     /* initial snr offset */
@@ -898,6 +1266,8 @@ static void bit_alloc_init(AC3EncodeContext *s)
     s->bit_alloc.slow_gain  = ff_ac3_slow_gain_tab[s->slow_gain_code];
     s->bit_alloc.db_per_bit = ff_ac3_db_per_bit_tab[s->db_per_bit_code];
     s->bit_alloc.floor      = ff_ac3_floor_tab[s->floor_code];
+    s->bit_alloc.cpl_fast_leak = 0;
+    s->bit_alloc.cpl_slow_leak = 0;
 
     count_frame_bits_fixed(s);
 }
@@ -926,17 +1296,64 @@ static void count_frame_bits(AC3EncodeContext *s)
 
     /* audio blocks */
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
+        AC3Block *block = &s->blocks[blk];
+
+        /* coupling strategy */
+        frame_bits++;
+        if (block->new_cpl_strategy) {
+            frame_bits++;
+            if (block->cpl_in_use) {
+                frame_bits += s->fbw_channels;
+                if (s->channel_mode == AC3_CHMODE_STEREO)
+                    frame_bits++;
+                frame_bits += 4 + 4;
+                frame_bits += s->num_cpl_subbands - 1;
+            }
+        }
+
+        /* coupling coordinates */
+        if (block->cpl_in_use) {
+            for (ch = 1; ch <= s->fbw_channels; ch++) {
+                if (block->channel_in_cpl[ch]) {
+                    frame_bits++;
+                    if (block->new_cpl_coords) {
+                        frame_bits += 2;
+                        frame_bits += (4 + 4) * s->num_cpl_bands;
+                    }
+                }
+            }
+        }
+
         /* stereo rematrixing */
         if (s->channel_mode == AC3_CHMODE_STEREO) {
             frame_bits++;
             if (s->blocks[blk].new_rematrixing_strategy)
-                frame_bits += s->num_rematrixing_bands;
+                frame_bits += block->num_rematrixing_bands;
         }
 
         /* bandwidth codes & gain range */
-        for (ch = 0; ch < s->fbw_channels; ch++) {
-            if (s->exp_strategy[ch][blk] != EXP_REUSE)
-                frame_bits += 6 + 2;
+        for (ch = 1; ch <= s->fbw_channels; ch++) {
+            if (s->exp_strategy[ch][blk] != EXP_REUSE) {
+                if (!block->channel_in_cpl[ch])
+                    frame_bits += 6;
+                frame_bits += 2;
+            }
+        }
+
+        /* coupling exponent strategy */
+        if (block->cpl_in_use)
+            frame_bits += 2;
+
+        /* snr offsets and fast gain codes */
+        frame_bits++;
+        if (block->new_snr_offsets)
+            frame_bits += 6 + (s->channels + block->cpl_in_use) * (4 + 3);
+
+        /* coupling leak info */
+        if (block->cpl_in_use) {
+            frame_bits++;
+            if (block->new_cpl_leak)
+                frame_bits += 3 + 3;
         }
     }
 
@@ -970,16 +1387,16 @@ static void bit_alloc_masking(AC3EncodeContext *s)
 
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
         AC3Block *block = &s->blocks[blk];
-        for (ch = 0; ch < s->channels; ch++) {
+        for (ch = !block->cpl_in_use; ch <= s->channels; ch++) {
             /* We only need psd and mask for calculating bap.
                Since we currently do not calculate bap when exponent
                strategy is EXP_REUSE we do not need to calculate psd or mask. */
             if (s->exp_strategy[ch][blk] != EXP_REUSE) {
-                ff_ac3_bit_alloc_calc_psd(block->exp[ch], 0,
-                                          s->nb_coefs[ch],
-                                          block->psd[ch], block->band_psd[ch]);
+                ff_ac3_bit_alloc_calc_psd(block->exp[ch], s->start_freq[ch],
+                                          block->end_freq[ch], block->psd[ch],
+                                          block->band_psd[ch]);
                 ff_ac3_bit_alloc_calc_mask(&s->bit_alloc, block->band_psd[ch],
-                                           0, s->nb_coefs[ch],
+                                           s->start_freq[ch], block->end_freq[ch],
                                            ff_ac3_fast_gain_tab[s->fast_gain_code[ch]],
                                            ch == s->lfe_channel,
                                            DBA_NONE, 0, NULL, NULL, NULL,
@@ -997,11 +1414,12 @@ static void bit_alloc_masking(AC3EncodeContext *s)
 static void reset_block_bap(AC3EncodeContext *s)
 {
     int blk, ch;
+    int channels = s->channels + 1;
     if (s->blocks[0].bap[0] == s->bap_buffer)
         return;
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
-        for (ch = 0; ch < s->channels; ch++) {
-            s->blocks[blk].bap[ch] = &s->bap_buffer[AC3_MAX_COEFS * (blk * s->channels + ch)];
+        for (ch = 0; ch < channels; ch++) {
+            s->blocks[blk].bap[ch] = &s->bap_buffer[AC3_MAX_COEFS * (blk * channels + ch)];
         }
     }
 }
@@ -1027,28 +1445,37 @@ static int bit_alloc(AC3EncodeContext *s, int snr_offset)
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
         AC3Block *block = &s->blocks[blk];
         AC3Block *ref_block;
+        int av_uninit(ch0);
+        int got_cpl = !block->cpl_in_use;
         // initialize grouped mantissa counts. these are set so that they are
         // padded to the next whole group size when bits are counted in
         // compute_mantissa_size_final
         mant_cnt[0] = mant_cnt[3] = 0;
         mant_cnt[1] = mant_cnt[2] = 2;
         mant_cnt[4] = 1;
-        for (ch = 0; ch < s->channels; ch++) {
+        for (ch = 1; ch <= s->channels; ch++) {
+            if (!got_cpl && ch > 1 && block->channel_in_cpl[ch-1]) {
+                ch0     = ch - 1;
+                ch      = CPL_CH;
+                got_cpl = 1;
+            }
+
             /* Currently the only bit allocation parameters which vary across
                blocks within a frame are the exponent values.  We can take
                advantage of that by reusing the bit allocation pointers
                whenever we reuse exponents. */
             ref_block = block->exp_ref_block[ch];
             if (s->exp_strategy[ch][blk] != EXP_REUSE) {
-                s->ac3dsp.bit_alloc_calc_bap(ref_block->mask[ch],
-                                             ref_block->psd[ch], 0,
-                                             s->nb_coefs[ch], snr_offset,
-                                             s->bit_alloc.floor, ff_ac3_bap_tab,
-                                             ref_block->bap[ch]);
+                s->ac3dsp.bit_alloc_calc_bap(ref_block->mask[ch], ref_block->psd[ch],
+                                             s->start_freq[ch], block->end_freq[ch],
+                                             snr_offset, s->bit_alloc.floor,
+                                             ff_ac3_bap_tab, ref_block->bap[ch]);
             }
             mantissa_bits += s->ac3dsp.compute_mantissa_size(mant_cnt,
-                                                             ref_block->bap[ch],
-                                                             s->nb_coefs[ch]);
+                                                             ref_block->bap[ch]+s->start_freq[ch],
+                                                             block->end_freq[ch]-s->start_freq[ch]);
+            if (ch == CPL_CH)
+                ch = ch0;
         }
         mantissa_bits += compute_mantissa_size_final(mant_cnt);
     }
@@ -1074,7 +1501,7 @@ static int cbr_bit_allocation(AC3EncodeContext *s)
 
     /* if previous frame SNR offset was 1023, check if current frame can also
        use SNR offset of 1023. if so, skip the search. */
-    if ((snr_offset | s->fine_snr_offset[0]) == 1023) {
+    if ((snr_offset | s->fine_snr_offset[1]) == 1023) {
         if (bit_alloc(s, 1023) <= bits_left)
             return 0;
     }
@@ -1098,7 +1525,7 @@ static int cbr_bit_allocation(AC3EncodeContext *s)
     reset_block_bap(s);
 
     s->coarse_snr_offset = snr_offset >> 4;
-    for (ch = 0; ch < s->channels; ch++)
+    for (ch = !s->cpl_on; ch <= s->channels; ch++)
         s->fine_snr_offset[ch] = snr_offset & 0xF;
 
     return 0;
@@ -1116,26 +1543,26 @@ static int downgrade_exponents(AC3EncodeContext *s)
 {
     int ch, blk;
 
-    for (ch = 0; ch < s->fbw_channels; ch++) {
-        for (blk = AC3_MAX_BLOCKS-1; blk >= 0; blk--) {
+    for (blk = AC3_MAX_BLOCKS-1; blk >= 0; blk--) {
+        for (ch = !s->blocks[blk].cpl_in_use; ch <= s->fbw_channels; ch++) {
             if (s->exp_strategy[ch][blk] == EXP_D15) {
                 s->exp_strategy[ch][blk] = EXP_D25;
                 return 0;
             }
         }
     }
-    for (ch = 0; ch < s->fbw_channels; ch++) {
-        for (blk = AC3_MAX_BLOCKS-1; blk >= 0; blk--) {
+    for (blk = AC3_MAX_BLOCKS-1; blk >= 0; blk--) {
+        for (ch = !s->blocks[blk].cpl_in_use; ch <= s->fbw_channels; ch++) {
             if (s->exp_strategy[ch][blk] == EXP_D25) {
                 s->exp_strategy[ch][blk] = EXP_D45;
                 return 0;
             }
         }
     }
-    for (ch = 0; ch < s->fbw_channels; ch++) {
-        /* block 0 cannot reuse exponents, so only downgrade D45 to REUSE if
-           the block number > 0 */
-        for (blk = AC3_MAX_BLOCKS-1; blk > 0; blk--) {
+    /* block 0 cannot reuse exponents, so only downgrade D45 to REUSE if
+       the block number > 0 */
+    for (blk = AC3_MAX_BLOCKS-1; blk > 0; blk--) {
+        for (ch = !s->blocks[blk].cpl_in_use; ch <= s->fbw_channels; ch++) {
             if (s->exp_strategy[ch][blk] > EXP_REUSE) {
                 s->exp_strategy[ch][blk] = EXP_REUSE;
                 return 0;
@@ -1162,7 +1589,18 @@ static int compute_bit_allocation(AC3EncodeContext *s)
 
     ret = cbr_bit_allocation(s);
     while (ret) {
-        /* fallback 1: downgrade exponents */
+        /* fallback 1: disable channel coupling */
+        if (s->cpl_on) {
+            s->cpl_on = 0;
+            compute_coupling_strategy(s);
+            compute_rematrixing_strategy(s);
+            apply_rematrixing(s);
+            process_exponents(s);
+            ret = compute_bit_allocation(s);
+            continue;
+        }
+
+        /* fallback 2: downgrade exponents */
         if (!downgrade_exponents(s)) {
             extract_exponents(s);
             encode_exponents(s);
@@ -1216,12 +1654,13 @@ static inline int asym_quant(int c, int e, int qbits)
  * Quantize a set of mantissas for a single channel in a single block.
  */
 static void quantize_mantissas_blk_ch(AC3Mant *s, int32_t *fixed_coef,
-                                      uint8_t *exp,
-                                      uint8_t *bap, uint16_t *qmant, int n)
+                                      uint8_t *exp, uint8_t *bap,
+                                      uint16_t *qmant, int start_freq,
+                                      int end_freq)
 {
     int i;
 
-    for (i = 0; i < n; i++) {
+    for (i = start_freq; i < end_freq; i++) {
         int v;
         int c = fixed_coef[i];
         int e = exp[i];
@@ -1311,19 +1750,27 @@ static void quantize_mantissas_blk_ch(AC3Mant *s, int32_t *fixed_coef,
  */
 static void quantize_mantissas(AC3EncodeContext *s)
 {
-    int blk, ch;
-
+    int blk, ch, ch0=0, got_cpl;
 
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
         AC3Block *block = &s->blocks[blk];
         AC3Block *ref_block;
         AC3Mant m = { 0 };
 
-        for (ch = 0; ch < s->channels; ch++) {
+        got_cpl = !block->cpl_in_use;
+        for (ch = 1; ch <= s->channels; ch++) {
+            if (!got_cpl && ch > 1 && block->channel_in_cpl[ch-1]) {
+                ch0     = ch - 1;
+                ch      = CPL_CH;
+                got_cpl = 1;
+            }
             ref_block = block->exp_ref_block[ch];
             quantize_mantissas_blk_ch(&m, block->fixed_coef[ch],
-                                      ref_block->exp[ch], ref_block->bap[ch],
-                                      block->qmant[ch], s->nb_coefs[ch]);
+                                      ref_block->exp[ch],
+                                      ref_block->bap[ch], block->qmant[ch],
+                                      s->start_freq[ch], block->end_freq[ch]);
+            if (ch == CPL_CH)
+                ch = ch0;
         }
     }
 }
@@ -1390,7 +1837,8 @@ static void output_frame_header(AC3EncodeContext *s)
  */
 static void output_audio_block(AC3EncodeContext *s, int blk)
 {
-    int ch, i, baie, rbnd;
+    int ch, i, baie, bnd, got_cpl;
+    int av_uninit(ch0);
     AC3Block *block = &s->blocks[blk];
 
     /* block switching */
@@ -1405,11 +1853,38 @@ static void output_audio_block(AC3EncodeContext *s, int blk)
     put_bits(&s->pb, 1, 0);
 
     /* channel coupling */
-    if (!blk) {
-        put_bits(&s->pb, 1, 1); /* coupling strategy present */
-        put_bits(&s->pb, 1, 0); /* no coupling strategy */
-    } else {
-        put_bits(&s->pb, 1, 0); /* no new coupling strategy */
+    put_bits(&s->pb, 1, block->new_cpl_strategy);
+    if (block->new_cpl_strategy) {
+        put_bits(&s->pb, 1, block->cpl_in_use);
+        if (block->cpl_in_use) {
+            int start_sub, end_sub;
+            for (ch = 1; ch <= s->fbw_channels; ch++)
+                put_bits(&s->pb, 1, block->channel_in_cpl[ch]);
+            if (s->channel_mode == AC3_CHMODE_STEREO)
+                put_bits(&s->pb, 1, 0); /* phase flags in use */
+            start_sub = (s->start_freq[CPL_CH] - 37) / 12;
+            end_sub   = (s->cpl_end_freq       - 37) / 12;
+            put_bits(&s->pb, 4, start_sub);
+            put_bits(&s->pb, 4, end_sub - 3);
+            for (bnd = start_sub+1; bnd < end_sub; bnd++)
+                put_bits(&s->pb, 1, ff_eac3_default_cpl_band_struct[bnd]);
+        }
+    }
+
+    /* coupling coordinates */
+    if (block->cpl_in_use) {
+        for (ch = 1; ch <= s->fbw_channels; ch++) {
+            if (block->channel_in_cpl[ch]) {
+                put_bits(&s->pb, 1, block->new_cpl_coords);
+                if (block->new_cpl_coords) {
+                    put_bits(&s->pb, 2, block->cpl_master_exp[ch]);
+                    for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
+                        put_bits(&s->pb, 4, block->cpl_coord_exp [ch][bnd]);
+                        put_bits(&s->pb, 4, block->cpl_coord_mant[ch][bnd]);
+                    }
+                }
+            }
+        }
     }
 
     /* stereo rematrixing */
@@ -1417,40 +1892,41 @@ static void output_audio_block(AC3EncodeContext *s, int blk)
         put_bits(&s->pb, 1, block->new_rematrixing_strategy);
         if (block->new_rematrixing_strategy) {
             /* rematrixing flags */
-            for (rbnd = 0; rbnd < s->num_rematrixing_bands; rbnd++)
-                put_bits(&s->pb, 1, block->rematrixing_flags[rbnd]);
+            for (bnd = 0; bnd < block->num_rematrixing_bands; bnd++)
+                put_bits(&s->pb, 1, block->rematrixing_flags[bnd]);
         }
     }
 
     /* exponent strategy */
-    for (ch = 0; ch < s->fbw_channels; ch++)
+    for (ch = !block->cpl_in_use; ch <= s->fbw_channels; ch++)
         put_bits(&s->pb, 2, s->exp_strategy[ch][blk]);
     if (s->lfe_on)
         put_bits(&s->pb, 1, s->exp_strategy[s->lfe_channel][blk]);
 
     /* bandwidth */
-    for (ch = 0; ch < s->fbw_channels; ch++) {
-        if (s->exp_strategy[ch][blk] != EXP_REUSE)
+    for (ch = 1; ch <= s->fbw_channels; ch++) {
+        if (s->exp_strategy[ch][blk] != EXP_REUSE && !block->channel_in_cpl[ch])
             put_bits(&s->pb, 6, s->bandwidth_code);
     }
 
     /* exponents */
-    for (ch = 0; ch < s->channels; ch++) {
+    for (ch = !block->cpl_in_use; ch <= s->channels; ch++) {
         int nb_groups;
+        int cpl = (ch == CPL_CH);
 
         if (s->exp_strategy[ch][blk] == EXP_REUSE)
             continue;
 
         /* DC exponent */
-        put_bits(&s->pb, 4, block->grouped_exp[ch][0]);
+        put_bits(&s->pb, 4, block->grouped_exp[ch][0] >> cpl);
 
         /* exponent groups */
-        nb_groups = exponent_group_tab[s->exp_strategy[ch][blk]-1][s->nb_coefs[ch]];
+        nb_groups = exponent_group_tab[cpl][s->exp_strategy[ch][blk]-1][block->end_freq[ch]-s->start_freq[ch]];
         for (i = 1; i <= nb_groups; i++)
             put_bits(&s->pb, 7, block->grouped_exp[ch][i]);
 
         /* gain range info */
-        if (ch != s->lfe_channel)
+        if (ch != s->lfe_channel && !cpl)
             put_bits(&s->pb, 2, 0);
     }
 
@@ -1466,23 +1942,40 @@ static void output_audio_block(AC3EncodeContext *s, int blk)
     }
 
     /* snr offset */
-    put_bits(&s->pb, 1, baie);
-    if (baie) {
+    put_bits(&s->pb, 1, block->new_snr_offsets);
+    if (block->new_snr_offsets) {
         put_bits(&s->pb, 6, s->coarse_snr_offset);
-        for (ch = 0; ch < s->channels; ch++) {
+        for (ch = !block->cpl_in_use; ch <= s->channels; ch++) {
             put_bits(&s->pb, 4, s->fine_snr_offset[ch]);
             put_bits(&s->pb, 3, s->fast_gain_code[ch]);
         }
     }
 
+    /* coupling leak */
+    if (block->cpl_in_use) {
+        put_bits(&s->pb, 1, block->new_cpl_leak);
+        if (block->new_cpl_leak) {
+            put_bits(&s->pb, 3, s->bit_alloc.cpl_fast_leak);
+            put_bits(&s->pb, 3, s->bit_alloc.cpl_slow_leak);
+        }
+    }
+
     put_bits(&s->pb, 1, 0); /* no delta bit allocation */
     put_bits(&s->pb, 1, 0); /* no data to skip */
 
     /* mantissas */
-    for (ch = 0; ch < s->channels; ch++) {
+    got_cpl = !block->cpl_in_use;
+    for (ch = 1; ch <= s->channels; ch++) {
         int b, q;
-        AC3Block *ref_block = block->exp_ref_block[ch];
-        for (i = 0; i < s->nb_coefs[ch]; i++) {
+        AC3Block *ref_block;
+
+        if (!got_cpl && ch > 1 && block->channel_in_cpl[ch-1]) {
+            ch0     = ch - 1;
+            ch      = CPL_CH;
+            got_cpl = 1;
+        }
+        ref_block = block->exp_ref_block[ch];
+        for (i = s->start_freq[ch]; i < block->end_freq[ch]; i++) {
             q = block->qmant[ch][i];
             b = ref_block->bap[ch][i];
             switch (b) {
@@ -1496,6 +1989,8 @@ static void output_audio_block(AC3EncodeContext *s, int blk)
             default:              put_bits(&s->pb, b-1, q); break;
             }
         }
+        if (ch == CPL_CH)
+            ch = ch0;
     }
 }
 
@@ -1881,6 +2376,12 @@ static int ac3_encode_frame(AVCodecContext *avctx, unsigned char *frame,
 
     scale_coefficients(s);
 
+    s->cpl_on = s->cpl_enabled;
+    compute_coupling_strategy(s);
+
+    if (s->cpl_on)
+        apply_channel_coupling(s);
+
     compute_rematrixing_strategy(s);
 
     apply_rematrixing(s);
@@ -1961,7 +2462,7 @@ static av_cold int set_channel_info(AC3EncodeContext *s, int channels,
     s->lfe_on       = !!(ch_layout & AV_CH_LOW_FREQUENCY);
     s->channels     = channels;
     s->fbw_channels = channels - s->lfe_on;
-    s->lfe_channel  = s->lfe_on ? s->fbw_channels : -1;
+    s->lfe_channel  = s->lfe_on ? s->fbw_channels + 1 : -1;
     if (s->lfe_on)
         ch_layout -= AV_CH_LOW_FREQUENCY;
 
@@ -2060,6 +2561,10 @@ static av_cold int validate_options(AVCodecContext *avctx, AC3EncodeContext *s)
     s->rematrixing_enabled = s->options.stereo_rematrixing &&
                              (s->channel_mode == AC3_CHMODE_STEREO);
 
+    s->cpl_enabled = s->options.channel_coupling &&
+                     s->channel_mode >= AC3_CHMODE_STEREO &&
+                     CONFIG_AC3ENC_FLOAT;
+
     return 0;
 }
 
@@ -2071,7 +2576,8 @@ static av_cold int validate_options(AVCodecContext *avctx, AC3EncodeContext *s)
  */
 static av_cold void set_bandwidth(AC3EncodeContext *s)
 {
-    int ch;
+    int blk, ch;
+    int av_uninit(cpl_start);
 
     if (s->cutoff) {
         /* calculate bandwidth based on user-specified cutoff frequency */
@@ -2084,11 +2590,54 @@ static av_cold void set_bandwidth(AC3EncodeContext *s)
     }
 
     /* set number of coefficients for each channel */
-    for (ch = 0; ch < s->fbw_channels; ch++) {
-        s->nb_coefs[ch] = s->bandwidth_code * 3 + 73;
+    for (ch = 1; ch <= s->fbw_channels; ch++) {
+        s->start_freq[ch] = 0;
+        for (blk = 0; blk < AC3_MAX_BLOCKS; blk++)
+            s->blocks[blk].end_freq[ch] = s->bandwidth_code * 3 + 73;
+    }
+    /* LFE channel always has 7 coefs */
+    if (s->lfe_on) {
+        s->start_freq[s->lfe_channel] = 0;
+        for (blk = 0; blk < AC3_MAX_BLOCKS; blk++)
+            s->blocks[blk].end_freq[ch] = 7;
+    }
+
+    /* initialize coupling strategy */
+    if (s->cpl_enabled) {
+        if (s->options.cpl_start >= 0) {
+            cpl_start = s->options.cpl_start;
+        } else {
+            cpl_start = ac3_coupling_start_tab[s->channel_mode-2][s->bit_alloc.sr_code][s->frame_size_code/2];
+            if (cpl_start < 0)
+                s->cpl_enabled = 0;
+        }
+    }
+    if (s->cpl_enabled) {
+        int i, cpl_start_band, cpl_end_band;
+        uint8_t *cpl_band_sizes = s->cpl_band_sizes;
+
+        cpl_end_band   = s->bandwidth_code / 4 + 3;
+        cpl_start_band = av_clip(cpl_start, 0, FFMIN(cpl_end_band-1, 15));
+
+        s->num_cpl_subbands = cpl_end_band - cpl_start_band;
+
+        s->num_cpl_bands = 1;
+        *cpl_band_sizes  = 12;
+        for (i = cpl_start_band + 1; i < cpl_end_band; i++) {
+            if (ff_eac3_default_cpl_band_struct[i]) {
+                *cpl_band_sizes += 12;
+            } else {
+                s->num_cpl_bands++;
+                cpl_band_sizes++;
+                *cpl_band_sizes = 12;
+            }
+        }
+
+        s->start_freq[CPL_CH] = cpl_start_band * 12 + 37;
+        s->cpl_end_freq       = cpl_end_band   * 12 + 37;
+        for (blk = 0; blk < AC3_MAX_BLOCKS; blk++)
+            s->blocks[blk].end_freq[CPL_CH] = s->cpl_end_freq;
     }
-    if (s->lfe_on)
-        s->nb_coefs[s->lfe_channel] = 7; /* LFE channel always has 7 coefs */
 }
 
 
@@ -2096,6 +2645,7 @@ static av_cold int allocate_buffers(AVCodecContext *avctx)
 {
     int blk, ch;
     AC3EncodeContext *s = avctx->priv_data;
+    int channels = s->channels + 1; /* includes coupling channel */
 
     FF_ALLOC_OR_GOTO(avctx, s->planar_samples, s->channels * sizeof(*s->planar_samples),
                      alloc_fail);
@@ -2104,74 +2654,90 @@ static av_cold int allocate_buffers(AVCodecContext *avctx)
                           (AC3_FRAME_SIZE+AC3_BLOCK_SIZE) * sizeof(**s->planar_samples),
                           alloc_fail);
     }
-    FF_ALLOC_OR_GOTO(avctx, s->bap_buffer,  AC3_MAX_BLOCKS * s->channels *
+    FF_ALLOC_OR_GOTO(avctx, s->bap_buffer,  AC3_MAX_BLOCKS * channels *
                      AC3_MAX_COEFS * sizeof(*s->bap_buffer),  alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->bap1_buffer, AC3_MAX_BLOCKS * s->channels *
+    FF_ALLOC_OR_GOTO(avctx, s->bap1_buffer, AC3_MAX_BLOCKS * channels *
                      AC3_MAX_COEFS * sizeof(*s->bap1_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->mdct_coef_buffer, AC3_MAX_BLOCKS * s->channels *
+    FF_ALLOC_OR_GOTO(avctx, s->mdct_coef_buffer, AC3_MAX_BLOCKS * channels *
                      AC3_MAX_COEFS * sizeof(*s->mdct_coef_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->exp_buffer, AC3_MAX_BLOCKS * s->channels *
+    FF_ALLOC_OR_GOTO(avctx, s->exp_buffer, AC3_MAX_BLOCKS * channels *
                      AC3_MAX_COEFS * sizeof(*s->exp_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->grouped_exp_buffer, AC3_MAX_BLOCKS * s->channels *
+    FF_ALLOC_OR_GOTO(avctx, s->grouped_exp_buffer, AC3_MAX_BLOCKS * channels *
                      128 * sizeof(*s->grouped_exp_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->psd_buffer, AC3_MAX_BLOCKS * s->channels *
+    FF_ALLOC_OR_GOTO(avctx, s->psd_buffer, AC3_MAX_BLOCKS * channels *
                      AC3_MAX_COEFS * sizeof(*s->psd_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->band_psd_buffer, AC3_MAX_BLOCKS * s->channels *
+    FF_ALLOC_OR_GOTO(avctx, s->band_psd_buffer, AC3_MAX_BLOCKS * channels *
                      64 * sizeof(*s->band_psd_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->mask_buffer, AC3_MAX_BLOCKS * s->channels *
+    FF_ALLOC_OR_GOTO(avctx, s->mask_buffer, AC3_MAX_BLOCKS * channels *
                      64 * sizeof(*s->mask_buffer), alloc_fail);
-    FF_ALLOC_OR_GOTO(avctx, s->qmant_buffer, AC3_MAX_BLOCKS * s->channels *
+    FF_ALLOC_OR_GOTO(avctx, s->qmant_buffer, AC3_MAX_BLOCKS * channels *
                      AC3_MAX_COEFS * sizeof(*s->qmant_buffer), alloc_fail);
+    if (s->cpl_enabled) {
+        FF_ALLOC_OR_GOTO(avctx, s->cpl_coord_exp_buffer, AC3_MAX_BLOCKS * channels *
+                         16 * sizeof(*s->cpl_coord_exp_buffer), alloc_fail);
+        FF_ALLOC_OR_GOTO(avctx, s->cpl_coord_mant_buffer, AC3_MAX_BLOCKS * channels *
+                         16 * sizeof(*s->cpl_coord_mant_buffer), alloc_fail);
+    }
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
         AC3Block *block = &s->blocks[blk];
-        FF_ALLOC_OR_GOTO(avctx, block->bap, s->channels * sizeof(*block->bap),
+        FF_ALLOC_OR_GOTO(avctx, block->bap, channels * sizeof(*block->bap),
                          alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->mdct_coef, s->channels * sizeof(*block->mdct_coef),
+        FF_ALLOCZ_OR_GOTO(avctx, block->mdct_coef, channels * sizeof(*block->mdct_coef),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->exp, s->channels * sizeof(*block->exp),
+        FF_ALLOCZ_OR_GOTO(avctx, block->exp, channels * sizeof(*block->exp),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->grouped_exp, s->channels * sizeof(*block->grouped_exp),
+        FF_ALLOCZ_OR_GOTO(avctx, block->grouped_exp, channels * sizeof(*block->grouped_exp),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->psd, s->channels * sizeof(*block->psd),
+        FF_ALLOCZ_OR_GOTO(avctx, block->psd, channels * sizeof(*block->psd),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->band_psd, s->channels * sizeof(*block->band_psd),
+        FF_ALLOCZ_OR_GOTO(avctx, block->band_psd, channels * sizeof(*block->band_psd),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->mask, s->channels * sizeof(*block->mask),
+        FF_ALLOCZ_OR_GOTO(avctx, block->mask, channels * sizeof(*block->mask),
                           alloc_fail);
-        FF_ALLOCZ_OR_GOTO(avctx, block->qmant, s->channels * sizeof(*block->qmant),
+        FF_ALLOCZ_OR_GOTO(avctx, block->qmant, channels * sizeof(*block->qmant),
                           alloc_fail);
+        if (s->cpl_enabled) {
+            FF_ALLOCZ_OR_GOTO(avctx, block->cpl_coord_exp, channels * sizeof(*block->cpl_coord_exp),
+                              alloc_fail);
+            FF_ALLOCZ_OR_GOTO(avctx, block->cpl_coord_mant, channels * sizeof(*block->cpl_coord_mant),
+                              alloc_fail);
+        }
 
-        for (ch = 0; ch < s->channels; ch++) {
+        for (ch = 0; ch < channels; ch++) {
             /* arrangement: block, channel, coeff */
-            block->bap[ch]         = &s->bap_buffer        [AC3_MAX_COEFS * (blk * s->channels + ch)];
-            block->mdct_coef[ch]   = &s->mdct_coef_buffer  [AC3_MAX_COEFS * (blk * s->channels + ch)];
-            block->grouped_exp[ch] = &s->grouped_exp_buffer[128           * (blk * s->channels + ch)];
-            block->psd[ch]         = &s->psd_buffer        [AC3_MAX_COEFS * (blk * s->channels + ch)];
-            block->band_psd[ch]    = &s->band_psd_buffer   [64            * (blk * s->channels + ch)];
-            block->mask[ch]        = &s->mask_buffer       [64            * (blk * s->channels + ch)];
-            block->qmant[ch]       = &s->qmant_buffer      [AC3_MAX_COEFS * (blk * s->channels + ch)];
+            block->bap[ch]         = &s->bap_buffer        [AC3_MAX_COEFS * (blk * channels + ch)];
+            block->grouped_exp[ch] = &s->grouped_exp_buffer[128           * (blk * channels + ch)];
+            block->psd[ch]         = &s->psd_buffer        [AC3_MAX_COEFS * (blk * channels + ch)];
+            block->band_psd[ch]    = &s->band_psd_buffer   [64            * (blk * channels + ch)];
+            block->mask[ch]        = &s->mask_buffer       [64            * (blk * channels + ch)];
+            block->qmant[ch]       = &s->qmant_buffer      [AC3_MAX_COEFS * (blk * channels + ch)];
+            if (s->cpl_enabled) {
+                block->cpl_coord_exp[ch]  = &s->cpl_coord_exp_buffer [16  * (blk * channels + ch)];
+                block->cpl_coord_mant[ch] = &s->cpl_coord_mant_buffer[16  * (blk * channels + ch)];
+            }
 
             /* arrangement: channel, block, coeff */
             block->exp[ch]         = &s->exp_buffer        [AC3_MAX_COEFS * (AC3_MAX_BLOCKS * ch + blk)];
+            block->mdct_coef[ch]   = &s->mdct_coef_buffer  [AC3_MAX_COEFS * (AC3_MAX_BLOCKS * ch + blk)];
         }
     }
 
     if (CONFIG_AC3ENC_FLOAT) {
-        FF_ALLOC_OR_GOTO(avctx, s->fixed_coef_buffer, AC3_MAX_BLOCKS * s->channels *
+        FF_ALLOC_OR_GOTO(avctx, s->fixed_coef_buffer, AC3_MAX_BLOCKS * channels *
                          AC3_MAX_COEFS * sizeof(*s->fixed_coef_buffer), alloc_fail);
         for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
             AC3Block *block = &s->blocks[blk];
-            FF_ALLOCZ_OR_GOTO(avctx, block->fixed_coef, s->channels *
+            FF_ALLOCZ_OR_GOTO(avctx, block->fixed_coef, channels *
                               sizeof(*block->fixed_coef), alloc_fail);
-            for (ch = 0; ch < s->channels; ch++)
-                block->fixed_coef[ch] = &s->fixed_coef_buffer[AC3_MAX_COEFS * (blk * s->channels + ch)];
+            for (ch = 0; ch < channels; ch++)
+                block->fixed_coef[ch] = &s->fixed_coef_buffer[AC3_MAX_COEFS * (AC3_MAX_BLOCKS * ch + blk)];
         }
     } else {
         for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
             AC3Block *block = &s->blocks[blk];
-            FF_ALLOCZ_OR_GOTO(avctx, block->fixed_coef, s->channels *
+            FF_ALLOCZ_OR_GOTO(avctx, block->fixed_coef, channels *
                               sizeof(*block->fixed_coef), alloc_fail);
-            for (ch = 0; ch < s->channels; ch++)
+            for (ch = 0; ch < channels; ch++)
                 block->fixed_coef[ch] = (int32_t *)block->mdct_coef[ch];
         }
     }
diff --git a/libavcodec/ac3enc_fixed.c b/libavcodec/ac3enc_fixed.c
index 800ef8f92c..035ebb3de9 100644
--- a/libavcodec/ac3enc_fixed.c
+++ b/libavcodec/ac3enc_fixed.c
@@ -101,7 +101,7 @@ static void scale_coefficients(AC3EncodeContext *s)
 
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
         AC3Block *block = &s->blocks[blk];
-        for (ch = 0; ch < s->channels; ch++) {
+        for (ch = 1; ch <= s->channels; ch++) {
             s->ac3dsp.ac3_rshift_int32(block->mdct_coef[ch], AC3_MAX_COEFS,
                                        block->coeff_shift[ch]);
         }
diff --git a/libavcodec/ac3enc_float.c b/libavcodec/ac3enc_float.c
index d2435dee15..4f61440b52 100644
--- a/libavcodec/ac3enc_float.c
+++ b/libavcodec/ac3enc_float.c
@@ -93,8 +93,10 @@ static int normalize_samples(AC3EncodeContext *s)
  */
 static void scale_coefficients(AC3EncodeContext *s)
 {
-    s->ac3dsp.float_to_fixed24(s->fixed_coef_buffer, s->mdct_coef_buffer,
-                               AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels);
+    int chan_size = AC3_MAX_COEFS * AC3_MAX_BLOCKS;
+    s->ac3dsp.float_to_fixed24(s->fixed_coef_buffer + chan_size,
+                               s->mdct_coef_buffer  + chan_size,
+                               chan_size * s->channels);
 }
 
 
diff --git a/libavcodec/ac3tab.c b/libavcodec/ac3tab.c
index 6a4d8cd0a2..7df3d828fb 100644
--- a/libavcodec/ac3tab.c
+++ b/libavcodec/ac3tab.c
@@ -138,6 +138,13 @@ const uint16_t ff_ac3_bitrate_tab[19] = {
  */
 const uint8_t ff_ac3_rematrix_band_tab[5] = { 13, 25, 37, 61, 253 };
 
+/**
+ * Table E2.16 Default Coupling Banding Structure
+ */
+const uint8_t ff_eac3_default_cpl_band_struct[18] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1
+};
+
 /* AC-3 MDCT window */
 
 /* MDCT window */
diff --git a/libavcodec/ac3tab.h b/libavcodec/ac3tab.h
index 292ce0d32f..e5cd368bb7 100644
--- a/libavcodec/ac3tab.h
+++ b/libavcodec/ac3tab.h
@@ -39,6 +39,7 @@ extern const uint8_t  ff_ac3_dec_channel_map[8][2][6];
 extern const uint16_t ff_ac3_sample_rate_tab[3];
 extern const uint16_t ff_ac3_bitrate_tab[19];
 extern const uint8_t  ff_ac3_rematrix_band_tab[5];
+extern const uint8_t  ff_eac3_default_cpl_band_struct[18];
 extern const int16_t  ff_ac3_window[AC3_WINDOW_SIZE/2];
 extern const uint8_t  ff_ac3_log_add_tab[260];
 extern const uint16_t ff_ac3_hearing_threshold_tab[AC3_CRITICAL_BANDS][3];
-- 
cgit v1.2.3


From 57aa765971878ff678abbe3bf1bfd9407b9e5a00 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Fri, 13 May 2011 08:03:36 +0200
Subject: lavc: remove msmpeg4v1 encoder.

The encoder has never produced files that could be decoded
with any software and there should be no reason to create
such files anyway.
---
 libavcodec/Makefile        |  2 --
 libavcodec/allcodecs.c     |  2 +-
 libavcodec/mpegvideo_enc.c | 21 ---------------------
 libavcodec/msmpeg4.c       |  8 --------
 libavcodec/msmpeg4.h       |  3 +--
 5 files changed, 2 insertions(+), 34 deletions(-)

diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 6cb59a3c43..81981d76b6 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -262,8 +262,6 @@ OBJS-$(CONFIG_MPEG2VIDEO_ENCODER)      += mpeg12enc.o mpegvideo_enc.o \
                                           mpegvideo.o error_resilience.o
 OBJS-$(CONFIG_MPEG4_VAAPI_HWACCEL)     += vaapi_mpeg4.o
 OBJS-$(CONFIG_MSMPEG4V1_DECODER)       += msmpeg4.o msmpeg4data.o
-OBJS-$(CONFIG_MSMPEG4V1_ENCODER)       += msmpeg4.o msmpeg4data.o h263dec.o \
-                                          h263.o ituh263dec.o mpeg4videodec.o
 OBJS-$(CONFIG_MSMPEG4V2_DECODER)       += msmpeg4.o msmpeg4data.o h263dec.o \
                                           h263.o ituh263dec.o mpeg4videodec.o
 OBJS-$(CONFIG_MSMPEG4V2_ENCODER)       += msmpeg4.o msmpeg4data.o h263dec.o \
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index 3466ad94fd..f063369dfd 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -147,7 +147,7 @@ void avcodec_register_all(void)
     REGISTER_DECODER (MPEGVIDEO, mpegvideo);
     REGISTER_DECODER (MPEG_VDPAU, mpeg_vdpau);
     REGISTER_DECODER (MPEG1_VDPAU, mpeg1_vdpau);
-    REGISTER_ENCDEC  (MSMPEG4V1, msmpeg4v1);
+    REGISTER_DECODER (MSMPEG4V1, msmpeg4v1);
     REGISTER_ENCDEC  (MSMPEG4V2, msmpeg4v2);
     REGISTER_ENCDEC  (MSMPEG4V3, msmpeg4v3);
     REGISTER_DECODER (MSRLE, msrle);
diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c
index 44ecba2524..8df05c4c7a 100644
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -637,15 +637,6 @@ av_cold int MPV_encode_init(AVCodecContext *avctx)
         s->low_delay= s->max_b_frames ? 0 : 1;
         avctx->delay= s->low_delay ? 0 : (s->max_b_frames + 1);
         break;
-    case CODEC_ID_MSMPEG4V1:
-        s->out_format = FMT_H263;
-        s->h263_msmpeg4 = 1;
-        s->h263_pred = 1;
-        s->unrestricted_mv = 1;
-        s->msmpeg4_version= 1;
-        avctx->delay=0;
-        s->low_delay=1;
-        break;
     case CODEC_ID_MSMPEG4V2:
         s->out_format = FMT_H263;
         s->h263_msmpeg4 = 1;
@@ -3805,18 +3796,6 @@ AVCodec ff_h263p_encoder = {
     .long_name= NULL_IF_CONFIG_SMALL("H.263+ / H.263-1998 / H.263 version 2"),
 };
 
-AVCodec ff_msmpeg4v1_encoder = {
-    "msmpeg4v1",
-    AVMEDIA_TYPE_VIDEO,
-    CODEC_ID_MSMPEG4V1,
-    sizeof(MpegEncContext),
-    MPV_encode_init,
-    MPV_encode_picture,
-    MPV_encode_end,
-    .pix_fmts= (const enum PixelFormat[]){PIX_FMT_YUV420P, PIX_FMT_NONE},
-    .long_name= NULL_IF_CONFIG_SMALL("MPEG-4 part 2 Microsoft variant version 1"),
-};
-
 AVCodec ff_msmpeg4v2_encoder = {
     "msmpeg4v2",
     AVMEDIA_TYPE_VIDEO,
diff --git a/libavcodec/msmpeg4.c b/libavcodec/msmpeg4.c
index 6c9c2096a2..84ba1fa76a 100644
--- a/libavcodec/msmpeg4.c
+++ b/libavcodec/msmpeg4.c
@@ -846,13 +846,6 @@ static void msmpeg4_encode_dc(MpegEncContext * s, int level, int n, int *dir_ptr
     int pred, extquant;
     int extrabits = 0;
 
-    if(s->msmpeg4_version==1){
-        int32_t *dc_val;
-        pred = msmpeg4v1_pred_dc(s, n, &dc_val);
-
-        /* update predictor */
-        *dc_val= level;
-    }else{
         int16_t *dc_val;
         pred = msmpeg4_pred_dc(s, n, &dc_val, dir_ptr);
 
@@ -862,7 +855,6 @@ static void msmpeg4_encode_dc(MpegEncContext * s, int level, int n, int *dir_ptr
         } else {
             *dc_val = level * s->c_dc_scale;
         }
-    }
 
     /* do the prediction */
     level -= pred;
diff --git a/libavcodec/msmpeg4.h b/libavcodec/msmpeg4.h
index 0570bf9feb..8a0a066e48 100644
--- a/libavcodec/msmpeg4.h
+++ b/libavcodec/msmpeg4.h
@@ -54,8 +54,7 @@ int ff_wmv2_decode_mb(MpegEncContext *s, DCTELEM block[6][64]);
                                 CONFIG_MSMPEG4V3_DECODER || \
                                 CONFIG_WMV2_DECODER      || \
                                 CONFIG_VC1_DECODER)
-#define CONFIG_MSMPEG4_ENCODER (CONFIG_MSMPEG4V1_ENCODER || \
-                                CONFIG_MSMPEG4V2_ENCODER || \
+#define CONFIG_MSMPEG4_ENCODER (CONFIG_MSMPEG4V2_ENCODER || \
                                 CONFIG_MSMPEG4V3_ENCODER || \
                                 CONFIG_WMV2_ENCODER)
 
-- 
cgit v1.2.3


From b2893ee2f8b204f3d636c25a05d1dc1dd81dfdba Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Sat, 21 May 2011 12:23:34 +0200
Subject: msmpeg4: reindent.

---
 libavcodec/msmpeg4.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/libavcodec/msmpeg4.c b/libavcodec/msmpeg4.c
index 84ba1fa76a..c575a2f206 100644
--- a/libavcodec/msmpeg4.c
+++ b/libavcodec/msmpeg4.c
@@ -846,15 +846,15 @@ static void msmpeg4_encode_dc(MpegEncContext * s, int level, int n, int *dir_ptr
     int pred, extquant;
     int extrabits = 0;
 
-        int16_t *dc_val;
-        pred = msmpeg4_pred_dc(s, n, &dc_val, dir_ptr);
+    int16_t *dc_val;
+    pred = msmpeg4_pred_dc(s, n, &dc_val, dir_ptr);
 
-        /* update predictor */
-        if (n < 4) {
-            *dc_val = level * s->y_dc_scale;
-        } else {
-            *dc_val = level * s->c_dc_scale;
-        }
+    /* update predictor */
+    if (n < 4) {
+        *dc_val = level * s->y_dc_scale;
+    } else {
+        *dc_val = level * s->c_dc_scale;
+    }
 
     /* do the prediction */
     level -= pred;
-- 
cgit v1.2.3


From f7053dc41a29fdd2592f57ced97420ac917d3ca9 Mon Sep 17 00:00:00 2001
From: Stefano Sabatini <stefano.sabatini-lala@poste.it>
Date: Sun, 3 Apr 2011 16:48:33 +0200
Subject: vsrc_buffer: tweak error message in init()

Change:
Expected 7 arguments, but only %d found in '%s'\n
to:
Expected 7 arguments, but %d found in '%s'\n

as the user may provide more than 7 arguments, in that case the error
is not misleading.

Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavfilter/vsrc_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavfilter/vsrc_buffer.c b/libavfilter/vsrc_buffer.c
index 6567279667..1f0233e3e3 100644
--- a/libavfilter/vsrc_buffer.c
+++ b/libavfilter/vsrc_buffer.c
@@ -73,7 +73,7 @@ static av_cold int init(AVFilterContext *ctx, const char *args, void *opaque)
         (n = sscanf(args, "%d:%d:%127[^:]:%d:%d:%d:%d", &c->w, &c->h, pix_fmt_str,
                     &c->time_base.num, &c->time_base.den,
                     &c->pixel_aspect.num, &c->pixel_aspect.den)) != 7) {
-        av_log(ctx, AV_LOG_ERROR, "Expected 7 arguments, but only %d found in '%s'\n", n, args);
+        av_log(ctx, AV_LOG_ERROR, "Expected 7 arguments, but %d found in '%s'\n", n, args);
         return AVERROR(EINVAL);
     }
     if ((c->pix_fmt = av_get_pix_fmt(pix_fmt_str)) == PIX_FMT_NONE) {
-- 
cgit v1.2.3


From 75abcdb3915e3abb2dc6b5f7d101c177dcfdb626 Mon Sep 17 00:00:00 2001
From: Stefano Sabatini <stefano.sabatini-lala@poste.it>
Date: Mon, 11 Apr 2011 11:29:35 +0200
Subject: vsrc_buffer.h: add file doxy

Signed-off-by: Stefano Sabatini <stefano.sabatini-lala@poste.it>
Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavfilter/vsrc_buffer.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/libavfilter/vsrc_buffer.h b/libavfilter/vsrc_buffer.h
index 6867f81e1c..cfaf7919ac 100644
--- a/libavfilter/vsrc_buffer.h
+++ b/libavfilter/vsrc_buffer.h
@@ -22,6 +22,11 @@
 #ifndef AVFILTER_VSRC_BUFFER_H
 #define AVFILTER_VSRC_BUFFER_H
 
+/**
+ * @file
+ * memory buffer source API for video
+ */
+
 #include "libavcodec/avcodec.h" /* AVFrame */
 #include "avfilter.h"
 
-- 
cgit v1.2.3


From e66149e714006d099d1ebfcca3f22ca74fc7dcf4 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Tue, 24 May 2011 10:03:26 -0400
Subject: swscale: force --enable-runtime-cpudetect and remove SWS_CPU_CAPS_*.

---
 libswscale/bfin/swscale_bfin.c    |  20 +++---
 libswscale/colorspace-test.c      |  29 +--------
 libswscale/options.c              |   6 --
 libswscale/ppc/yuv2rgb_altivec.c  |   3 +-
 libswscale/rgb2rgb.c              |   7 +-
 libswscale/rgb2rgb.h              |   4 +-
 libswscale/rgb2rgb_template.c     |  19 ------
 libswscale/swscale.c              | 134 +++++++-------------------------------
 libswscale/swscale.h              |   7 --
 libswscale/swscale_internal.h     |   5 --
 libswscale/utils.c                | 120 +++++++++++++++-------------------
 libswscale/x86/rgb2rgb.c          |  21 +++---
 libswscale/x86/swscale_template.c |   7 +-
 libswscale/x86/yuv2rgb_mmx.c      |  70 ++++++++++----------
 libswscale/x86/yuv2rgb_template.c |  31 ++++-----
 libswscale/yuv2rgb.c              |  28 ++++----
 16 files changed, 171 insertions(+), 340 deletions(-)

diff --git a/libswscale/bfin/swscale_bfin.c b/libswscale/bfin/swscale_bfin.c
index fa3c03b4ea..60b7f8310b 100644
--- a/libswscale/bfin/swscale_bfin.c
+++ b/libswscale/bfin/swscale_bfin.c
@@ -79,15 +79,13 @@ static int yuyvtoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], i
 void ff_bfin_get_unscaled_swscale(SwsContext *c)
 {
     SwsFunc swScale = c->swScale;
-    if (c->flags & SWS_CPU_CAPS_BFIN)
-        if (c->dstFormat == PIX_FMT_YUV420P)
-            if (c->srcFormat == PIX_FMT_UYVY422) {
-                av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized uyvytoyv12_unscaled\n");
-                c->swScale = uyvytoyv12_unscaled;
-            }
-        if (c->dstFormat == PIX_FMT_YUV420P)
-            if (c->srcFormat == PIX_FMT_YUYV422) {
-                av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized yuyvtoyv12_unscaled\n");
-                c->swScale = yuyvtoyv12_unscaled;
-            }
+
+    if (c->dstFormat == PIX_FMT_YUV420P && c->srcFormat == PIX_FMT_UYVY422) {
+        av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized uyvytoyv12_unscaled\n");
+        c->swScale = uyvytoyv12_unscaled;
+    }
+    if (c->dstFormat == PIX_FMT_YUV420P && c->srcFormat == PIX_FMT_YUYV422) {
+        av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized yuyvtoyv12_unscaled\n");
+        c->swScale = yuyvtoyv12_unscaled;
+    }
 }
diff --git a/libswscale/colorspace-test.c b/libswscale/colorspace-test.c
index 4e7116f5dc..4ed9164323 100644
--- a/libswscale/colorspace-test.c
+++ b/libswscale/colorspace-test.c
@@ -33,31 +33,6 @@
 
 #define FUNC(s,d,n) {s,d,#n,n}
 
-static int cpu_caps;
-
-static char *args_parse(int argc, char *argv[])
-{
-    int o;
-
-    while ((o = getopt(argc, argv, "m23")) != -1) {
-        switch (o) {
-        case 'm':
-            cpu_caps |= SWS_CPU_CAPS_MMX;
-            break;
-        case '2':
-            cpu_caps |= SWS_CPU_CAPS_MMX2;
-            break;
-        case '3':
-            cpu_caps |= SWS_CPU_CAPS_3DNOW;
-            break;
-        default:
-            av_log(NULL, AV_LOG_ERROR, "Unknown option %c\n", o);
-        }
-    }
-
-    return argv[optind];
-}
-
 int main(int argc, char **argv)
 {
     int i, funcNum;
@@ -70,9 +45,7 @@ int main(int argc, char **argv)
         return -1;
 
     av_log(NULL, AV_LOG_INFO, "memory corruption test ...\n");
-    args_parse(argc, argv);
-    av_log(NULL, AV_LOG_INFO, "CPU capabilities forced to %x\n", cpu_caps);
-    sws_rgb2rgb_init(cpu_caps);
+    sws_rgb2rgb_init();
 
     for(funcNum=0; ; funcNum++) {
         struct func_info_s {
diff --git a/libswscale/options.c b/libswscale/options.c
index f80735b261..ecd0ecd53a 100644
--- a/libswscale/options.c
+++ b/libswscale/options.c
@@ -48,12 +48,6 @@ static const AVOption options[] = {
     { "spline", "natural bicubic spline", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_SPLINE }, INT_MIN, INT_MAX, VE, "sws_flags" },
     { "print_info", "print info", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_PRINT_INFO }, INT_MIN, INT_MAX, VE, "sws_flags" },
     { "accurate_rnd", "accurate rounding", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_ACCURATE_RND }, INT_MIN, INT_MAX, VE, "sws_flags" },
-    { "mmx", "MMX SIMD acceleration", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_CPU_CAPS_MMX }, INT_MIN, INT_MAX, VE, "sws_flags" },
-    { "mmx2", "MMX2 SIMD acceleration", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_CPU_CAPS_MMX2 }, INT_MIN, INT_MAX, VE, "sws_flags" },
-    { "sse2", "SSE2 SIMD acceleration", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_CPU_CAPS_SSE2 }, INT_MIN, INT_MAX, VE, "sws_flags" },
-    { "3dnow", "3DNOW SIMD acceleration", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_CPU_CAPS_3DNOW }, INT_MIN, INT_MAX, VE, "sws_flags" },
-    { "altivec", "AltiVec SIMD acceleration", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_CPU_CAPS_ALTIVEC }, INT_MIN, INT_MAX, VE, "sws_flags" },
-    { "bfin", "Blackfin SIMD acceleration", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_CPU_CAPS_BFIN }, INT_MIN, INT_MAX, VE, "sws_flags" },
     { "full_chroma_int", "full chroma interpolation", 0 , FF_OPT_TYPE_CONST, {.dbl = SWS_FULL_CHR_H_INT }, INT_MIN, INT_MAX, VE, "sws_flags" },
     { "full_chroma_inp", "full chroma input", 0 , FF_OPT_TYPE_CONST, {.dbl = SWS_FULL_CHR_H_INP }, INT_MIN, INT_MAX, VE, "sws_flags" },
     { "bitexact", "", 0 , FF_OPT_TYPE_CONST, {.dbl = SWS_BITEXACT }, INT_MIN, INT_MAX, VE, "sws_flags" },
diff --git a/libswscale/ppc/yuv2rgb_altivec.c b/libswscale/ppc/yuv2rgb_altivec.c
index 0113c8ddd5..96c208a074 100644
--- a/libswscale/ppc/yuv2rgb_altivec.c
+++ b/libswscale/ppc/yuv2rgb_altivec.c
@@ -94,6 +94,7 @@ adjustment.
 #include "libswscale/rgb2rgb.h"
 #include "libswscale/swscale.h"
 #include "libswscale/swscale_internal.h"
+#include "libavutil/cpu.h"
 
 #undef PROFILE_THE_BEAST
 #undef INC_SCALING
@@ -692,7 +693,7 @@ static int altivec_uyvy_rgb32 (SwsContext *c,
 */
 SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
 {
-    if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
+    if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
         return NULL;
 
     /*
diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c
index 74cc42e3b0..e18cd51011 100644
--- a/libswscale/rgb2rgb.c
+++ b/libswscale/rgb2rgb.c
@@ -116,12 +116,11 @@ void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t
  32-bit C version, and and&add trick by Michael Niedermayer
 */
 
-void sws_rgb2rgb_init(int flags)
+void sws_rgb2rgb_init(void)
 {
     rgb2rgb_init_c();
-#if HAVE_MMX2 || HAVE_AMD3DNOW || HAVE_MMX
-    rgb2rgb_init_x86(flags);
-#endif /* HAVE_MMX2 || HAVE_AMD3DNOW || HAVE_MMX */
+    if (HAVE_MMX)
+        rgb2rgb_init_x86();
 }
 
 void rgb32to24(const uint8_t *src, uint8_t *dst, long src_size)
diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h
index 158d9a9a87..a8d5531cb8 100644
--- a/libswscale/rgb2rgb.h
+++ b/libswscale/rgb2rgb.h
@@ -156,8 +156,8 @@ extern void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const u
                             long width, long height,
                             long lumStride, long chromStride, long srcStride);
 
-void sws_rgb2rgb_init(int flags);
+void sws_rgb2rgb_init(void);
 
-void rgb2rgb_init_x86(int flags);
+void rgb2rgb_init_x86(void);
 
 #endif /* SWSCALE_RGB2RGB_H */
diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c
index 7537419d83..ea39be4f13 100644
--- a/libswscale/rgb2rgb_template.c
+++ b/libswscale/rgb2rgb_template.c
@@ -278,25 +278,6 @@ static inline void rgb16tobgr24_c(const uint8_t *src, uint8_t *dst, long src_siz
     }
 }
 
-/*
- * mm0 = 00 B3 00 B2 00 B1 00 B0
- * mm1 = 00 G3 00 G2 00 G1 00 G0
- * mm2 = 00 R3 00 R2 00 R1 00 R0
- * mm6 = FF FF FF FF FF FF FF FF
- * mm7 = 00 00 00 00 00 00 00 00
- */
-#define PACK_RGB32 \
-    "packuswb   %%mm7, %%mm0    \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
-    "packuswb   %%mm7, %%mm1    \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
-    "packuswb   %%mm7, %%mm2    \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
-    "punpcklbw  %%mm1, %%mm0    \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
-    "punpcklbw  %%mm6, %%mm2    \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
-    "movq       %%mm0, %%mm3    \n\t"                               \
-    "punpcklwd  %%mm2, %%mm0    \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
-    "punpckhwd  %%mm2, %%mm3    \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
-    MOVNTQ"     %%mm0,  %0      \n\t"                               \
-    MOVNTQ"     %%mm3, 8%0      \n\t"                               \
-
 static inline void rgb15to32_c(const uint8_t *src, uint8_t *dst, long src_size)
 {
     const uint16_t *end;
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 3943aa0b72..749a276e5a 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -62,6 +62,7 @@ untested special converters
 #include "rgb2rgb.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/x86_cpu.h"
+#include "libavutil/cpu.h"
 #include "libavutil/avutil.h"
 #include "libavutil/mathematics.h"
 #include "libavutil/bswap.h"
@@ -70,10 +71,6 @@ untested special converters
 #undef MOVNTQ
 #undef PAVGB
 
-//#undef HAVE_MMX2
-//#define HAVE_AMD3DNOW
-//#undef HAVE_MMX
-//#undef ARCH_X86
 #define DITHER1XBPP
 
 #define isPacked(x)         (       \
@@ -1179,57 +1176,14 @@ static inline void monoblack2Y(uint8_t *dst, const uint8_t *src, long width, uin
 
 //Note: we have C, MMX, MMX2, 3DNOW versions, there is no 3DNOW+MMX2 one
 //Plain C versions
-#if CONFIG_RUNTIME_CPUDETECT
-#  define COMPILE_C 1
-#  if   ARCH_X86
-#    define COMPILE_MMX     HAVE_MMX
-#    define COMPILE_MMX2    HAVE_MMX2
-#    define COMPILE_3DNOW   HAVE_AMD3DNOW
-#  elif ARCH_PPC
-#    define COMPILE_ALTIVEC HAVE_ALTIVEC
-#  endif
-#else /* CONFIG_RUNTIME_CPUDETECT */
-#  if   ARCH_X86
-#    if   HAVE_MMX2
-#      define COMPILE_MMX2  1
-#    elif HAVE_AMD3DNOW
-#      define COMPILE_3DNOW 1
-#    elif HAVE_MMX
-#      define COMPILE_MMX   1
-#    else
-#      define COMPILE_C     1
-#    endif
-#  elif ARCH_PPC && HAVE_ALTIVEC
-#    define COMPILE_ALTIVEC 1
-#  else
-#    define COMPILE_C       1
-#  endif
-#endif
-
-#ifndef COMPILE_C
-#  define COMPILE_C 0
-#endif
-#ifndef COMPILE_MMX
-#  define COMPILE_MMX 0
-#endif
-#ifndef COMPILE_MMX2
-#  define COMPILE_MMX2 0
-#endif
-#ifndef COMPILE_3DNOW
-#  define COMPILE_3DNOW 0
-#endif
-#ifndef COMPILE_ALTIVEC
-#  define COMPILE_ALTIVEC 0
-#endif
 
-#define COMPILE_TEMPLATE_MMX 0
 #define COMPILE_TEMPLATE_MMX2 0
 #define COMPILE_TEMPLATE_AMD3DNOW 0
 #define COMPILE_TEMPLATE_ALTIVEC 0
 
 #include "swscale_template.c"
 
-#if COMPILE_ALTIVEC
+#if HAVE_ALTIVEC
 #undef RENAME
 #undef COMPILE_TEMPLATE_ALTIVEC
 #define COMPILE_TEMPLATE_ALTIVEC 1
@@ -1237,15 +1191,11 @@ static inline void monoblack2Y(uint8_t *dst, const uint8_t *src, long width, uin
 #include "ppc/swscale_template.c"
 #endif
 
-#if ARCH_X86
-
 //MMX versions
-#if COMPILE_MMX
+#if HAVE_MMX
 #undef RENAME
-#undef COMPILE_TEMPLATE_MMX
 #undef COMPILE_TEMPLATE_MMX2
 #undef COMPILE_TEMPLATE_AMD3DNOW
-#define COMPILE_TEMPLATE_MMX 1
 #define COMPILE_TEMPLATE_MMX2 0
 #define COMPILE_TEMPLATE_AMD3DNOW 0
 #define RENAME(a) a ## _MMX
@@ -1253,12 +1203,10 @@ static inline void monoblack2Y(uint8_t *dst, const uint8_t *src, long width, uin
 #endif
 
 //MMX2 versions
-#if COMPILE_MMX2
+#if HAVE_MMX2
 #undef RENAME
-#undef COMPILE_TEMPLATE_MMX
 #undef COMPILE_TEMPLATE_MMX2
 #undef COMPILE_TEMPLATE_AMD3DNOW
-#define COMPILE_TEMPLATE_MMX 1
 #define COMPILE_TEMPLATE_MMX2 1
 #define COMPILE_TEMPLATE_AMD3DNOW 0
 #define RENAME(a) a ## _MMX2
@@ -1266,61 +1214,47 @@ static inline void monoblack2Y(uint8_t *dst, const uint8_t *src, long width, uin
 #endif
 
 //3DNOW versions
-#if COMPILE_3DNOW
+#if HAVE_AMD3DNOW
 #undef RENAME
-#undef COMPILE_TEMPLATE_MMX
 #undef COMPILE_TEMPLATE_MMX2
 #undef COMPILE_TEMPLATE_AMD3DNOW
-#define COMPILE_TEMPLATE_MMX 1
 #define COMPILE_TEMPLATE_MMX2 0
 #define COMPILE_TEMPLATE_AMD3DNOW 1
 #define RENAME(a) a ## _3DNow
 #include "x86/swscale_template.c"
 #endif
 
-#endif //ARCH_X86
-
 SwsFunc ff_getSwsFunc(SwsContext *c)
 {
+    int cpu_flags = av_get_cpu_flags();
+
     sws_init_swScale_c(c);
 
-#if CONFIG_RUNTIME_CPUDETECT
-#if ARCH_X86
+#if HAVE_MMX2
     // ordered per speed fastest first
-    if (c->flags & SWS_CPU_CAPS_MMX2) {
+    if (cpu_flags & AV_CPU_FLAG_MMX2) {
         sws_init_swScale_MMX2(c);
         return swScale_MMX2;
-    } else if (c->flags & SWS_CPU_CAPS_3DNOW) {
+    } else
+#endif
+#if HAVE_AMD3DNOW
+    if (cpu_flags & AV_CPU_FLAG_3DNOW) {
         sws_init_swScale_3DNow(c);
         return swScale_3DNow;
-    } else if (c->flags & SWS_CPU_CAPS_MMX) {
+    } else
+#endif
+#if HAVE_MMX
+    if (cpu_flags & AV_CPU_FLAG_MMX) {
         sws_init_swScale_MMX(c);
         return swScale_MMX;
-    }
-
-#else
-#if COMPILE_ALTIVEC
-    if (c->flags & SWS_CPU_CAPS_ALTIVEC) {
+    } else
+#endif
+#if HAVE_ALTIVEC
+    if (cpu_flags & AV_CPU_FLAG_ALTIVEC) {
         sws_init_swScale_altivec(c);
         return swScale_altivec;
-    }
-#endif
-#endif /* ARCH_X86 */
-#else //CONFIG_RUNTIME_CPUDETECT
-#if   COMPILE_TEMPLATE_MMX2
-    sws_init_swScale_MMX2(c);
-    return swScale_MMX2;
-#elif COMPILE_TEMPLATE_AMD3DNOW
-    sws_init_swScale_3DNow(c);
-    return swScale_3DNow;
-#elif COMPILE_TEMPLATE_MMX
-    sws_init_swScale_MMX(c);
-    return swScale_MMX;
-#elif COMPILE_TEMPLATE_ALTIVEC
-    sws_init_swScale_altivec(c);
-    return swScale_altivec;
+    } else
 #endif
-#endif //!CONFIG_RUNTIME_CPUDETECT
 
     return swScale_c;
 }
@@ -1864,23 +1798,6 @@ static int planarCopyWrapper(SwsContext *c, const uint8_t* src[], int srcStride[
     return srcSliceH;
 }
 
-int ff_hardcodedcpuflags(void)
-{
-    int flags = 0;
-#if   COMPILE_TEMPLATE_MMX2
-    flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2;
-#elif COMPILE_TEMPLATE_AMD3DNOW
-    flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_3DNOW;
-#elif COMPILE_TEMPLATE_MMX
-    flags |= SWS_CPU_CAPS_MMX;
-#elif COMPILE_TEMPLATE_ALTIVEC
-    flags |= SWS_CPU_CAPS_ALTIVEC;
-#elif ARCH_BFIN
-    flags |= SWS_CPU_CAPS_BFIN;
-#endif
-    return flags;
-}
-
 void ff_get_unscaled_swscale(SwsContext *c)
 {
     const enum PixelFormat srcFormat = c->srcFormat;
@@ -1964,8 +1881,8 @@ void ff_get_unscaled_swscale(SwsContext *c)
     if(srcFormat == PIX_FMT_UYVY422 && dstFormat == PIX_FMT_YUV422P)
         c->swScale= uyvyToYuv422Wrapper;
 
-#if COMPILE_ALTIVEC
-    if ((c->flags & SWS_CPU_CAPS_ALTIVEC) &&
+#if HAVE_ALTIVEC
+    if ((av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) &&
         !(c->flags & SWS_BITEXACT) &&
         srcFormat == PIX_FMT_YUV420P) {
         // unscaled YV12 -> packed YUV, we want speed
@@ -1995,8 +1912,7 @@ void ff_get_unscaled_swscale(SwsContext *c)
             c->swScale= planarCopyWrapper;
     }
 #if ARCH_BFIN
-    if (flags & SWS_CPU_CAPS_BFIN)
-        ff_bfin_get_unscaled_swscale (c);
+    ff_bfin_get_unscaled_swscale (c);
 #endif
 }
 
diff --git a/libswscale/swscale.h b/libswscale/swscale.h
index dd4de76b0a..1babced737 100644
--- a/libswscale/swscale.h
+++ b/libswscale/swscale.h
@@ -95,13 +95,6 @@ const char *swscale_license(void);
 #define SWS_ACCURATE_RND      0x40000
 #define SWS_BITEXACT          0x80000
 
-#define SWS_CPU_CAPS_MMX      0x80000000
-#define SWS_CPU_CAPS_MMX2     0x20000000
-#define SWS_CPU_CAPS_3DNOW    0x40000000
-#define SWS_CPU_CAPS_ALTIVEC  0x10000000
-#define SWS_CPU_CAPS_BFIN     0x01000000
-#define SWS_CPU_CAPS_SSE2     0x02000000
-
 #define SWS_MAX_REDUCE_CUTOFF 0.002
 
 #define SWS_CS_ITU709         1
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 2369546cb7..5f2ff94691 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -475,11 +475,6 @@ extern const AVClass sws_context_class;
  */
 void ff_get_unscaled_swscale(SwsContext *c);
 
-/**
- * Returns the SWS_CPU_CAPS for the optimized code compiled into swscale.
- */
-int ff_hardcodedcpuflags(void);
-
 /**
  * Returns function pointer to fastest main scaler path function depending
  * on architecture and available optimizations.
diff --git a/libswscale/utils.c b/libswscale/utils.c
index e9319fa67b..b49ec89be8 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -39,6 +39,7 @@
 #include "rgb2rgb.h"
 #include "libavutil/intreadwrite.h"
 #include "libavutil/x86_cpu.h"
+#include "libavutil/cpu.h"
 #include "libavutil/avutil.h"
 #include "libavutil/bswap.h"
 #include "libavutil/opt.h"
@@ -180,7 +181,7 @@ static double getSplineCoeff(double a, double b, double c, double d, double dist
 }
 
 static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
-                      int srcW, int dstW, int filterAlign, int one, int flags,
+                      int srcW, int dstW, int filterAlign, int one, int flags, int cpu_flags,
                       SwsVector *srcFilter, SwsVector *dstFilter, double param[2])
 {
     int i;
@@ -191,10 +192,9 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
     int64_t *filter2=NULL;
     const int64_t fone= 1LL<<54;
     int ret= -1;
-#if ARCH_X86
-    if (flags & SWS_CPU_CAPS_MMX)
+
+    if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX)
         __asm__ volatile("emms\n\t"::: "memory"); //FIXME this should not be required but it IS (even for non-MMX versions)
-#endif
 
     // NOTE: the +1 is for the MMX scaler which reads over the end
     FF_ALLOC_OR_GOTO(NULL, *filterPos, (dstW+1)*sizeof(int16_t), fail);
@@ -411,7 +411,7 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
         if (min>minFilterSize) minFilterSize= min;
     }
 
-    if (flags & SWS_CPU_CAPS_ALTIVEC) {
+    if (HAVE_ALTIVEC && cpu_flags & AV_CPU_FLAG_ALTIVEC) {
         // we can handle the special case 4,
         // so we don't want to go to the full 8
         if (minFilterSize < 5)
@@ -426,7 +426,7 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
             filterAlign = 1;
     }
 
-    if (flags & SWS_CPU_CAPS_MMX) {
+    if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) {
         // special case for unscaled vertical filtering
         if (minFilterSize == 1 && filterAlign == 2)
             filterAlign= 1;
@@ -516,7 +516,7 @@ fail:
     return ret;
 }
 
-#if ARCH_X86 && (HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT)
+#if HAVE_MMX2
 static int initMMX2HScaler(int dstW, int xInc, uint8_t *filterCode, int16_t *filter, int32_t *filterPos, int numSplits)
 {
     uint8_t *fragmentA;
@@ -674,7 +674,7 @@ static int initMMX2HScaler(int dstW, int xInc, uint8_t *filterCode, int16_t *fil
 
     return fragmentPos + 1;
 }
-#endif /* ARCH_X86 && (HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT) */
+#endif /* HAVE_MMX2 */
 
 static void getSubSampleFactors(int *h, int *v, enum PixelFormat format)
 {
@@ -682,8 +682,6 @@ static void getSubSampleFactors(int *h, int *v, enum PixelFormat format)
     *v = av_pix_fmt_descriptors[format].log2_chroma_h;
 }
 
-static int update_flags_cpu(int flags);
-
 int sws_setColorspaceDetails(SwsContext *c, const int inv_table[4], int srcRange, const int table[4], int dstRange, int brightness, int contrast, int saturation)
 {
     memcpy(c->srcColorspaceTable, inv_table, sizeof(int)*4);
@@ -698,15 +696,12 @@ int sws_setColorspaceDetails(SwsContext *c, const int inv_table[4], int srcRange
 
     c->dstFormatBpp = av_get_bits_per_pixel(&av_pix_fmt_descriptors[c->dstFormat]);
     c->srcFormatBpp = av_get_bits_per_pixel(&av_pix_fmt_descriptors[c->srcFormat]);
-    c->flags = update_flags_cpu(c->flags);
 
     ff_yuv2rgb_c_init_tables(c, inv_table, srcRange, brightness, contrast, saturation);
     //FIXME factorize
 
-#if HAVE_ALTIVEC
-    if (c->flags & SWS_CPU_CAPS_ALTIVEC)
+    if (HAVE_ALTIVEC && av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)
         ff_yuv2rgb_init_tables_altivec(c, inv_table, brightness, contrast, saturation);
-#endif
     return 0;
 }
 
@@ -736,20 +731,6 @@ static int handle_jpeg(enum PixelFormat *format)
     }
 }
 
-static int update_flags_cpu(int flags)
-{
-#if !CONFIG_RUNTIME_CPUDETECT //ensure that the flags match the compiled variant if cpudetect is off
-    flags &= ~( SWS_CPU_CAPS_MMX
-               |SWS_CPU_CAPS_MMX2
-               |SWS_CPU_CAPS_3DNOW
-               |SWS_CPU_CAPS_SSE2
-               |SWS_CPU_CAPS_ALTIVEC
-               |SWS_CPU_CAPS_BFIN);
-    flags |= ff_hardcodedcpuflags();
-#endif /* CONFIG_RUNTIME_CPUDETECT */
-    return flags;
-}
-
 SwsContext *sws_alloc_context(void)
 {
     SwsContext *c= av_mallocz(sizeof(SwsContext));
@@ -770,16 +751,15 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
     int srcH= c->srcH;
     int dstW= c->dstW;
     int dstH= c->dstH;
-    int flags;
+    int flags, cpu_flags;
     enum PixelFormat srcFormat= c->srcFormat;
     enum PixelFormat dstFormat= c->dstFormat;
 
-    flags= c->flags = update_flags_cpu(c->flags);
-#if ARCH_X86
-    if (flags & SWS_CPU_CAPS_MMX)
+    cpu_flags = av_get_cpu_flags();
+    flags     = c->flags;
+    if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX)
         __asm__ volatile("emms\n\t"::: "memory");
-#endif
-    if (!rgb15to16) sws_rgb2rgb_init(flags);
+    if (!rgb15to16) sws_rgb2rgb_init();
 
     unscaled = (srcW == dstW && srcH == dstH);
 
@@ -872,7 +852,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
         }
     }
 
-    if (flags & SWS_CPU_CAPS_MMX2) {
+    if (HAVE_MMX2 && cpu_flags & AV_CPU_FLAG_MMX2) {
         c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
         if (!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR)) {
             if (flags&SWS_PRINT_INFO)
@@ -898,7 +878,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
             c->chrXInc+= 20;
         }
         //we don't use the x86 asm scaler if MMX is available
-        else if (flags & SWS_CPU_CAPS_MMX) {
+        else if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) {
             c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
             c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
         }
@@ -906,7 +886,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
 
     /* precalculate horizontal scaler filter coefficients */
     {
-#if ARCH_X86 && (HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT)
+#if HAVE_MMX2
 // can't downscale !!!
         if (c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR)) {
             c->lumMmx2FilterCodeSize = initMMX2HScaler(      dstW, c->lumXInc, NULL, NULL, NULL, 8);
@@ -938,21 +918,21 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
             mprotect(c->chrMmx2FilterCode, c->chrMmx2FilterCodeSize, PROT_EXEC | PROT_READ);
 #endif
         } else
-#endif /* ARCH_X86 && (HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT) */
+#endif /* HAVE_MMX2 */
         {
             const int filterAlign=
-                (flags & SWS_CPU_CAPS_MMX) ? 4 :
-                (flags & SWS_CPU_CAPS_ALTIVEC) ? 8 :
+                (HAVE_MMX     && cpu_flags & AV_CPU_FLAG_MMX) ? 4 :
+                (HAVE_ALTIVEC && cpu_flags & AV_CPU_FLAG_ALTIVEC) ? 8 :
                 1;
 
             if (initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
                            srcW      ,       dstW, filterAlign, 1<<14,
-                           (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags,
+                           (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags, cpu_flags,
                            srcFilter->lumH, dstFilter->lumH, c->param) < 0)
                 goto fail;
             if (initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
                            c->chrSrcW, c->chrDstW, filterAlign, 1<<14,
-                           (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
+                           (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags, cpu_flags,
                            srcFilter->chrH, dstFilter->chrH, c->param) < 0)
                 goto fail;
         }
@@ -961,18 +941,18 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
     /* precalculate vertical scaler filter coefficients */
     {
         const int filterAlign=
-            (flags & SWS_CPU_CAPS_MMX) && (flags & SWS_ACCURATE_RND) ? 2 :
-            (flags & SWS_CPU_CAPS_ALTIVEC) ? 8 :
+            (HAVE_MMX     && cpu_flags & AV_CPU_FLAG_MMX) && (flags & SWS_ACCURATE_RND) ? 2 :
+            (HAVE_ALTIVEC && cpu_flags & AV_CPU_FLAG_ALTIVEC) ? 8 :
             1;
 
         if (initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
                        srcH      ,        dstH, filterAlign, (1<<12),
-                       (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags,
+                       (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags, cpu_flags,
                        srcFilter->lumV, dstFilter->lumV, c->param) < 0)
             goto fail;
         if (initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
                        c->chrSrcH, c->chrDstH, filterAlign, (1<<12),
-                       (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
+                       (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags, cpu_flags,
                        srcFilter->chrV, dstFilter->chrV, c->param) < 0)
             goto fail;
 
@@ -1066,13 +1046,13 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
 #endif
                sws_format_name(dstFormat));
 
-        if      (flags & SWS_CPU_CAPS_MMX2)    av_log(c, AV_LOG_INFO, "using MMX2\n");
-        else if (flags & SWS_CPU_CAPS_3DNOW)   av_log(c, AV_LOG_INFO, "using 3DNOW\n");
-        else if (flags & SWS_CPU_CAPS_MMX)     av_log(c, AV_LOG_INFO, "using MMX\n");
-        else if (flags & SWS_CPU_CAPS_ALTIVEC) av_log(c, AV_LOG_INFO, "using AltiVec\n");
+        if      (HAVE_MMX2     && cpu_flags & AV_CPU_FLAG_MMX2)    av_log(c, AV_LOG_INFO, "using MMX2\n");
+        else if (HAVE_AMD3DNOW && cpu_flags & AV_CPU_FLAG_3DNOW)   av_log(c, AV_LOG_INFO, "using 3DNOW\n");
+        else if (HAVE_MMX      && cpu_flags & AV_CPU_FLAG_MMX)     av_log(c, AV_LOG_INFO, "using MMX\n");
+        else if (HAVE_ALTIVEC  && cpu_flags & AV_CPU_FLAG_ALTIVEC) av_log(c, AV_LOG_INFO, "using AltiVec\n");
         else                                   av_log(c, AV_LOG_INFO, "using C\n");
 
-        if (flags & SWS_CPU_CAPS_MMX) {
+        if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) {
             if (c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
                 av_log(c, AV_LOG_VERBOSE, "using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
             else {
@@ -1091,7 +1071,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
                     av_log(c, AV_LOG_VERBOSE, "using n-tap MMX scaler for horizontal chrominance scaling\n");
             }
         } else {
-#if ARCH_X86
+#if HAVE_MMX
             av_log(c, AV_LOG_VERBOSE, "using x86 asm scaler for horizontal scaling\n");
 #else
             if (flags & SWS_FAST_BILINEAR)
@@ -1102,31 +1082,41 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
         }
         if (isPlanarYUV(dstFormat)) {
             if (c->vLumFilterSize==1)
-                av_log(c, AV_LOG_VERBOSE, "using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
+                av_log(c, AV_LOG_VERBOSE, "using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n",
+                       (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
             else
-                av_log(c, AV_LOG_VERBOSE, "using n-tap %s scaler for vertical scaling (YV12 like)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
+                av_log(c, AV_LOG_VERBOSE, "using n-tap %s scaler for vertical scaling (YV12 like)\n",
+                       (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
         } else {
             if (c->vLumFilterSize==1 && c->vChrFilterSize==2)
                 av_log(c, AV_LOG_VERBOSE, "using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
-                       "      2-tap scaler for vertical chrominance scaling (BGR)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
+                       "      2-tap scaler for vertical chrominance scaling (BGR)\n",
+                       (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
             else if (c->vLumFilterSize==2 && c->vChrFilterSize==2)
-                av_log(c, AV_LOG_VERBOSE, "using 2-tap linear %s scaler for vertical scaling (BGR)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
+                av_log(c, AV_LOG_VERBOSE, "using 2-tap linear %s scaler for vertical scaling (BGR)\n",
+                       (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
             else
-                av_log(c, AV_LOG_VERBOSE, "using n-tap %s scaler for vertical scaling (BGR)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
+                av_log(c, AV_LOG_VERBOSE, "using n-tap %s scaler for vertical scaling (BGR)\n",
+                       (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
         }
 
         if (dstFormat==PIX_FMT_BGR24)
             av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR24 converter\n",
-                   (flags & SWS_CPU_CAPS_MMX2) ? "MMX2" : ((flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C"));
+                   (HAVE_MMX2 && cpu_flags & AV_CPU_FLAG_MMX2) ? "MMX2" :
+                   ((HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C"));
         else if (dstFormat==PIX_FMT_RGB32)
-            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR32 converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
+            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR32 converter\n",
+                   (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
         else if (dstFormat==PIX_FMT_BGR565)
-            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR16 converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
+            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR16 converter\n",
+                   (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
         else if (dstFormat==PIX_FMT_BGR555)
-            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR15 converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
+            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR15 converter\n",
+                   (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
         else if (dstFormat == PIX_FMT_RGB444BE || dstFormat == PIX_FMT_RGB444LE ||
                  dstFormat == PIX_FMT_BGR444BE || dstFormat == PIX_FMT_BGR444LE)
-            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR12 converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
+            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR12 converter\n",
+                   (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
 
         av_log(c, AV_LOG_VERBOSE, "%dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
         av_log(c, AV_LOG_DEBUG, "lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
@@ -1504,7 +1494,7 @@ void sws_freeContext(SwsContext *c)
     av_freep(&c->hLumFilterPos);
     av_freep(&c->hChrFilterPos);
 
-#if ARCH_X86
+#if HAVE_MMX
 #ifdef MAP_ANONYMOUS
     if (c->lumMmx2FilterCode) munmap(c->lumMmx2FilterCode, c->lumMmx2FilterCodeSize);
     if (c->chrMmx2FilterCode) munmap(c->chrMmx2FilterCode, c->chrMmx2FilterCodeSize);
@@ -1517,7 +1507,7 @@ void sws_freeContext(SwsContext *c)
 #endif
     c->lumMmx2FilterCode=NULL;
     c->chrMmx2FilterCode=NULL;
-#endif /* ARCH_X86 */
+#endif /* HAVE_MMX */
 
     av_freep(&c->yuvTable);
 
@@ -1534,8 +1524,6 @@ struct SwsContext *sws_getCachedContext(struct SwsContext *context,
     if (!param)
         param = default_param;
 
-    flags = update_flags_cpu(flags);
-
     if (context &&
         (context->srcW      != srcW      ||
          context->srcH      != srcH      ||
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index e84bc1bcc9..cf901affe6 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -27,6 +27,7 @@
 
 #include "config.h"
 #include "libavutil/x86_cpu.h"
+#include "libavutil/cpu.h"
 #include "libavutil/bswap.h"
 #include "libswscale/rgb2rgb.h"
 #include "libswscale/swscale.h"
@@ -122,16 +123,16 @@ DECLARE_ASM_CONST(8, uint64_t, blue_15mask)  = 0x0000001f0000001fULL;
  32-bit C version, and and&add trick by Michael Niedermayer
 */
 
-void rgb2rgb_init_x86(int flags)
+void rgb2rgb_init_x86(void)
 {
-#if HAVE_MMX2 || HAVE_AMD3DNOW || HAVE_MMX
-    if (flags & SWS_CPU_CAPS_SSE2)
-        rgb2rgb_init_SSE2();
-    else if (flags & SWS_CPU_CAPS_MMX2)
-        rgb2rgb_init_MMX2();
-    else if (flags & SWS_CPU_CAPS_3DNOW)
-        rgb2rgb_init_3DNOW();
-    else if (flags & SWS_CPU_CAPS_MMX)
+    int cpu_flags = av_get_cpu_flags();
+
+    if (HAVE_MMX      && cpu_flags & AV_CPU_FLAG_MMX)
         rgb2rgb_init_MMX();
-#endif /* HAVE_MMX2 || HAVE_AMD3DNOW || HAVE_MMX */
+    if (HAVE_AMD3DNOW && cpu_flags & AV_CPU_FLAG_3DNOW)
+        rgb2rgb_init_3DNOW();
+    if (HAVE_MMX2     && cpu_flags & AV_CPU_FLAG_MMX2)
+        rgb2rgb_init_MMX2();
+    if (HAVE_SSE      && cpu_flags & AV_CPU_FLAG_SSE2)
+        rgb2rgb_init_SSE2();
 }
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index d719721693..e9e093780d 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -2721,10 +2721,11 @@ static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[],
     if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
 
-    if (flags & SWS_CPU_CAPS_MMX2 )  __asm__ volatile("sfence":::"memory");
+    if (COMPILE_TEMPLATE_MMX2)      __asm__ volatile("sfence":::"memory");
     /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
-    if (flags & SWS_CPU_CAPS_3DNOW)  __asm__ volatile("femms" :::"memory");
-    else                             __asm__ volatile("emms"  :::"memory");
+    if (COMPILE_TEMPLATE_AMD3DNOW)  __asm__ volatile("femms" :::"memory");
+    else                            __asm__ volatile("emms"  :::"memory");
+
     /* store changed local vars back in the context */
     c->dstY= dstY;
     c->lumBufIndex= lumBufIndex;
diff --git a/libswscale/x86/yuv2rgb_mmx.c b/libswscale/x86/yuv2rgb_mmx.c
index ff3a93db36..23d4c42700 100644
--- a/libswscale/x86/yuv2rgb_mmx.c
+++ b/libswscale/x86/yuv2rgb_mmx.c
@@ -34,6 +34,7 @@
 #include "libswscale/swscale.h"
 #include "libswscale/swscale_internal.h"
 #include "libavutil/x86_cpu.h"
+#include "libavutil/cpu.h"
 
 #define DITHER1XBPP // only for MMX
 
@@ -46,57 +47,58 @@ DECLARE_ASM_CONST(8, uint64_t, pb_03) = 0x0303030303030303ULL;
 DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL;
 
 //MMX versions
+#if HAVE_MMX
 #undef RENAME
-#undef HAVE_MMX2
-#undef HAVE_AMD3DNOW
-#define HAVE_MMX2 0
-#define HAVE_AMD3DNOW 0
+#undef COMPILE_TEMPLATE_MMX2
+#define COMPILE_TEMPLATE_MMX2 0
 #define RENAME(a) a ## _MMX
 #include "yuv2rgb_template.c"
+#endif /* HAVE_MMX */
 
 //MMX2 versions
+#if HAVE_MMX2
 #undef RENAME
-#undef HAVE_MMX2
-#define HAVE_MMX2 1
+#undef COMPILE_TEMPLATE_MMX2
+#define COMPILE_TEMPLATE_MMX2 1
 #define RENAME(a) a ## _MMX2
 #include "yuv2rgb_template.c"
+#endif /* HAVE_MMX2 */
 
 SwsFunc ff_yuv2rgb_init_mmx(SwsContext *c)
 {
-    if (c->flags & SWS_CPU_CAPS_MMX2) {
+    int cpu_flags = av_get_cpu_flags();
+
+    if (c->srcFormat != PIX_FMT_YUV420P &&
+        c->srcFormat != PIX_FMT_YUVA420P)
+        return NULL;
+
+    if (HAVE_MMX2 && cpu_flags & AV_CPU_FLAG_MMX2) {
         switch (c->dstFormat) {
-        case PIX_FMT_RGB32:
-            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
-                if (HAVE_7REGS) return yuva420_rgb32_MMX2;
-                break;
-            } else return yuv420_rgb32_MMX2;
-        case PIX_FMT_BGR32:
-            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
-                if (HAVE_7REGS) return yuva420_bgr32_MMX2;
-                break;
-            } else return yuv420_bgr32_MMX2;
         case PIX_FMT_RGB24:  return yuv420_rgb24_MMX2;
         case PIX_FMT_BGR24:  return yuv420_bgr24_MMX2;
-        case PIX_FMT_RGB565: return yuv420_rgb16_MMX2;
-        case PIX_FMT_RGB555: return yuv420_rgb15_MMX2;
         }
     }
-    if (c->flags & SWS_CPU_CAPS_MMX) {
+
+    if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) {
         switch (c->dstFormat) {
-        case PIX_FMT_RGB32:
-            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
-                if (HAVE_7REGS) return yuva420_rgb32_MMX;
-                break;
-            } else return yuv420_rgb32_MMX;
-        case PIX_FMT_BGR32:
-            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
-                if (HAVE_7REGS) return yuva420_bgr32_MMX;
-                break;
-            } else return yuv420_bgr32_MMX;
-        case PIX_FMT_RGB24:  return yuv420_rgb24_MMX;
-        case PIX_FMT_BGR24:  return yuv420_bgr24_MMX;
-        case PIX_FMT_RGB565: return yuv420_rgb16_MMX;
-        case PIX_FMT_RGB555: return yuv420_rgb15_MMX;
+            case PIX_FMT_RGB32:
+                if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
+#if HAVE_7REGS
+                    return yuva420_rgb32_MMX;
+#endif
+                    break;
+                } else return yuv420_rgb32_MMX;
+            case PIX_FMT_BGR32:
+                if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
+#if HAVE_7REGS
+                    return yuva420_bgr32_MMX;
+#endif
+                    break;
+                } else return yuv420_bgr32_MMX;
+            case PIX_FMT_RGB24:  return yuv420_rgb24_MMX;
+            case PIX_FMT_BGR24:  return yuv420_bgr24_MMX;
+            case PIX_FMT_RGB565: return yuv420_rgb16_MMX;
+            case PIX_FMT_RGB555: return yuv420_rgb15_MMX;
         }
     }
 
diff --git a/libswscale/x86/yuv2rgb_template.c b/libswscale/x86/yuv2rgb_template.c
index cf8f0d3cfb..5d1fa5b309 100644
--- a/libswscale/x86/yuv2rgb_template.c
+++ b/libswscale/x86/yuv2rgb_template.c
@@ -25,14 +25,7 @@
 #undef EMMS
 #undef SFENCE
 
-#if HAVE_AMD3DNOW
-/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
-#define EMMS   "femms"
-#else
-#define EMMS   "emms"
-#endif
-
-#if HAVE_MMX2
+#if COMPILE_TEMPLATE_MMX2
 #define MOVNTQ "movntq"
 #define SFENCE "sfence"
 #else
@@ -159,7 +152,8 @@
     }                                                             \
 
 #define YUV2RGB_ENDFUNC                          \
-    __asm__ volatile (SFENCE"\n\t"EMMS);         \
+    __asm__ volatile (SFENCE"\n\t"               \
+                    "emms    \n\t");             \
     return srcSliceH;                            \
 
 #define IF0(x)
@@ -188,6 +182,7 @@
     "paddusb "GREEN_DITHER"(%4), %%mm2\n\t"      \
     "paddusb "RED_DITHER"(%4),   %%mm1\n\t"      \
 
+#if !COMPILE_TEMPLATE_MMX2
 static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
                                        int srcStride[],
                                        int srcSliceY, int srcSliceH,
@@ -243,6 +238,7 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
     YUV2RGB_OPERANDS
     YUV2RGB_ENDFUNC
 }
+#endif /* !COMPILE_TEMPLATE_MMX2 */
 
 #define RGB_PACK24(blue, red)\
     "packuswb  %%mm3,      %%mm0 \n" /* R0 R2 R4 R6 R1 R3 R5 R7 */\
@@ -259,7 +255,7 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
     "punpckhwd %%mm6,      %%mm5 \n" /* R4 G4 B4 R5 R6 G6 B6 R7 */\
     RGB_PACK24_B
 
-#if HAVE_MMX2
+#if COMPILE_TEMPLATE_MMX2
 DECLARE_ASM_CONST(8, int16_t, mask1101[4]) = {-1,-1, 0,-1};
 DECLARE_ASM_CONST(8, int16_t, mask0010[4]) = { 0, 0,-1, 0};
 DECLARE_ASM_CONST(8, int16_t, mask0110[4]) = { 0,-1,-1, 0};
@@ -366,6 +362,7 @@ static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
     MOVNTQ "   %%mm5,       16(%1)\n\t"      \
     MOVNTQ "   %%mm"alpha", 24(%1)\n\t"      \
 
+#if !COMPILE_TEMPLATE_MMX2
 static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
                                        int srcStride[],
                                        int srcSliceY, int srcSliceH,
@@ -386,12 +383,12 @@ static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
     YUV2RGB_ENDFUNC
 }
 
+#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
 static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
                                         int srcStride[],
                                         int srcSliceY, int srcSliceH,
                                         uint8_t *dst[], int dstStride[])
 {
-#if HAVE_7REGS
     int y, h_size;
 
     YUV2RGB_LOOP(4)
@@ -406,10 +403,8 @@ static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
     YUV2RGB_ENDLOOP(4)
     YUV2RGB_OPERANDS_ALPHA
     YUV2RGB_ENDFUNC
-#else
-    return 0;
-#endif
 }
+#endif
 
 static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
                                        int srcStride[],
@@ -431,12 +426,12 @@ static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
     YUV2RGB_ENDFUNC
 }
 
+#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
 static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
                                         int srcStride[],
                                         int srcSliceY, int srcSliceH,
                                         uint8_t *dst[], int dstStride[])
 {
-#if HAVE_7REGS
     int y, h_size;
 
     YUV2RGB_LOOP(4)
@@ -451,7 +446,7 @@ static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
     YUV2RGB_ENDLOOP(4)
     YUV2RGB_OPERANDS_ALPHA
     YUV2RGB_ENDFUNC
-#else
-    return 0;
-#endif
 }
+#endif
+
+#endif /* !COMPILE_TEMPLATE_MMX2 */
diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
index 74262c6b7f..a502f654ed 100644
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -32,7 +32,7 @@
 #include "rgb2rgb.h"
 #include "swscale.h"
 #include "swscale_internal.h"
-#include "libavutil/x86_cpu.h"
+#include "libavutil/cpu.h"
 #include "libavutil/bswap.h"
 
 extern const uint8_t dither_4x4_16[4][8];
@@ -579,24 +579,18 @@ CLOSEYUV2RGBFUNC(1)
 SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
 {
     SwsFunc t = NULL;
-#if HAVE_MMX
-     t = ff_yuv2rgb_init_mmx(c);
-#endif
-#if HAVE_VIS
-    t = ff_yuv2rgb_init_vis(c);
-#endif
-#if CONFIG_MLIB
-    t = ff_yuv2rgb_init_mlib(c);
-#endif
-#if HAVE_ALTIVEC
-    if (c->flags & SWS_CPU_CAPS_ALTIVEC)
-        t = ff_yuv2rgb_init_altivec(c);
-#endif
 
-#if ARCH_BFIN
-    if (c->flags & SWS_CPU_CAPS_BFIN)
+    if (HAVE_MMX) {
+        t = ff_yuv2rgb_init_mmx(c);
+    } else if (HAVE_VIS) {
+        t = ff_yuv2rgb_init_vis(c);
+    } else if (CONFIG_MLIB) {
+        t = ff_yuv2rgb_init_mlib(c);
+    } else if (HAVE_ALTIVEC) {
+        t = ff_yuv2rgb_init_altivec(c);
+    } else if (ARCH_BFIN) {
         t = ff_yuv2rgb_get_func_ptr_bfin(c);
-#endif
+    }
 
     if (t)
         return t;
-- 
cgit v1.2.3


From 11ffefefdbf92016e764e148e41ec08f9bc1a2db Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Tue, 24 May 2011 10:11:26 -0400
Subject: swscale: remove duplicated x86/ functions.

---
 libswscale/x86/swscale_template.c | 212 +++-----------------------------------
 1 file changed, 16 insertions(+), 196 deletions(-)

diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index e9e093780d..d0671315ae 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -925,19 +925,9 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, con
                 alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
 }
 
-static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
-                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
-                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
-{
-    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
-                 chrFilter, chrSrc, chrFilterSize,
-                 dest, uDest, dstW, chrDstW, dstFormat);
-}
-
 static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
 {
-    int i;
     if(!(c->flags & SWS_BITEXACT)) {
         long p= 4;
         const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
@@ -969,38 +959,8 @@ static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const
         }
         return;
     }
-    for (i=0; i<dstW; i++) {
-        int val= (lumSrc[i]+64)>>7;
-
-        if (val&256) {
-            if (val<0) val=0;
-            else       val=255;
-        }
-
-        dest[i]= val;
-    }
-
-    if (uDest)
-        for (i=0; i<chrDstW; i++) {
-            int u=(chrSrc[i       ]+64)>>7;
-            int v=(chrSrc[i + VOFW]+64)>>7;
-
-            if ((u|v)&256) {
-                if (u<0)        u=0;
-                else if (u>255) u=255;
-                if (v<0)        v=0;
-                else if (v>255) v=255;
-            }
-
-            uDest[i]= u;
-            vDest[i]= v;
-        }
-
-    if (CONFIG_SWSCALE_ALPHA && aDest)
-        for (i=0; i<dstW; i++) {
-            int val= (alpSrc[i]+64)>>7;
-            aDest[i]= av_clip_uint8(val);
-        }
+    yuv2yuv1_c(c, lumSrc, chrSrc, alpSrc, dest, uDest, vDest, aDest,
+               dstW, chrDstW);
 }
 
 
@@ -1182,10 +1142,6 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
 static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                           const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
 {
-    int  yalpha1=4095- yalpha;
-    int uvalpha1=4095-uvalpha;
-    int i;
-
     if(!(c->flags & SWS_BITEXACT)) {
         switch(c->dstFormat) {
         //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
@@ -1320,7 +1276,8 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, cons
         default: break;
         }
     }
-    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
+    yuv2packed2_c(c, buf0, buf1, uvbuf0, uvbuf1, abuf0, abuf1,
+                  dest, dstW, yalpha, uvalpha, y);
 }
 
 /**
@@ -1329,18 +1286,14 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, cons
 static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                           const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
 {
-    const int yalpha1=0;
-    int i;
-
-    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
-    const int yalpha= 4096; //FIXME ...
+    if(!(flags & SWS_BITEXACT)) {
+        const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
 
-    if (flags&SWS_FULL_CHR_H_INT) {
-        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
-        return;
-    }
+        if (flags&SWS_FULL_CHR_H_INT) {
+            c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
+            return;
+        }
 
-    if(!(flags & SWS_BITEXACT)) {
         if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
             switch(dstFormat) {
             case PIX_FMT_RGB32:
@@ -1555,11 +1508,8 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, cons
             }
         }
     }
-    if (uvalpha < 2048) {
-        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
-    } else {
-        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
-    }
+    yuv2packed1_c(c, buf0, uvbuf0, uvbuf1, abuf0, dest,
+                  dstW, uvalpha, dstFormat, flags, y);
 }
 
 //FIXME yuy2* can read up to 7 samples too much
@@ -1866,20 +1816,6 @@ static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t
     assert(src1 == src2);
 }
 
-static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++) {
-        int b= src1[6*i + 0] + src1[6*i + 3];
-        int g= src1[6*i + 1] + src1[6*i + 4];
-        int r= src1[6*i + 2] + src1[6*i + 5];
-
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
-    }
-    assert(src1 == src2);
-}
-
 static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
 {
     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
@@ -1891,20 +1827,6 @@ static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t
     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
 }
 
-static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
-{
-    int i;
-    assert(src1==src2);
-    for (i=0; i<width; i++) {
-        int r= src1[6*i + 0] + src1[6*i + 3];
-        int g= src1[6*i + 1] + src1[6*i + 4];
-        int b= src1[6*i + 2] + src1[6*i + 5];
-
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
-    }
-}
-
 
 // bilinear / bicubic scaling
 static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
@@ -2061,37 +1983,6 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, in
     }
 }
 
-//FIXME all pal and rgb srcFormats could do this convertion as well
-//FIXME all scalers more complex than bilinear could do half of this transform
-static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
-{
-    int i;
-    for (i = 0; i < width; i++) {
-        dst[i     ] = (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
-        dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
-    }
-}
-static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
-{
-    int i;
-    for (i = 0; i < width; i++) {
-        dst[i     ] = (dst[i     ]*1799 + 4081085)>>11; //1469
-        dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
-    }
-}
-static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
-{
-    int i;
-    for (i = 0; i < width; i++)
-        dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
-}
-static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
-{
-    int i;
-    for (i = 0; i < width; i++)
-        dst[i] = (dst[i]*14071 + 33561947)>>14;
-}
-
 #define FAST_BILINEAR_X86 \
     "subl    %%edi, %%esi    \n\t" /*  src[xx+1] - src[xx] */                   \
     "imull   %%ecx, %%esi    \n\t" /* (src[xx+1] - src[xx])*xalpha */           \
@@ -2212,33 +2103,6 @@ static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
 #endif
 }
 
-      // *** horizontal scale Y line to temp buffer
-static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
-                                   const int16_t *hLumFilter,
-                                   const int16_t *hLumFilterPos, int hLumFilterSize,
-                                   uint8_t *formatConvBuffer,
-                                   uint32_t *pal, int isAlpha)
-{
-    void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
-    void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
-
-    src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
-
-    if (toYV12) {
-        toYV12(formatConvBuffer, src, srcW, pal);
-        src= formatConvBuffer;
-    }
-
-    if (!c->hyscale_fast) {
-        c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
-    } else { // fast bilinear upscale / crap downscale
-        c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
-    }
-
-    if (convertRange)
-        convertRange(dst, dstWidth);
-}
-
 static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                         long dstWidth, const uint8_t *src1,
                                         const uint8_t *src2, int srcW, int xInc)
@@ -2345,33 +2209,6 @@ which is needed to support GCC 4.0. */
 #endif
 }
 
-inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
-                                   int srcW, int xInc, const int16_t *hChrFilter,
-                                   const int16_t *hChrFilterPos, int hChrFilterSize,
-                                   uint8_t *formatConvBuffer,
-                                   uint32_t *pal)
-{
-
-    src1 += c->chrSrcOffset;
-    src2 += c->chrSrcOffset;
-
-    if (c->chrToYV12) {
-        c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
-        src1= formatConvBuffer;
-        src2= formatConvBuffer+VOFW;
-    }
-
-    if (!c->hcscale_fast) {
-        c->hScale(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
-        c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
-    } else { // fast bilinear upscale / crap downscale
-        c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
-    }
-
-    if (c->chrConvertRange)
-        c->chrConvertRange(dst, dstWidth);
-}
-
 #define DEBUG_SWSCALE_BUFFERS 0
 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
 
@@ -2509,12 +2346,12 @@ static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[],
             assert(lumBufIndex < 2*vLumBufSize);
             assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
             assert(lastInLumBuf + 1 - srcSliceY >= 0);
-            RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
+            hyscale_c(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                             hLumFilter, hLumFilterPos, hLumFilterSize,
                             formatConvBuffer,
                             pal, 0);
             if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
-                RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
+                hyscale_c(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
                                 hLumFilter, hLumFilterPos, hLumFilterSize,
                                 formatConvBuffer,
                                 pal, 1);
@@ -2532,7 +2369,7 @@ static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[],
             //FIXME replace parameters through context struct (some at least)
 
             if (c->needs_hcscale)
-                RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
+                hcscale_c(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                 hChrFilter, hChrFilterPos, hChrFilterSize,
                                 formatConvBuffer,
                                 pal);
@@ -2740,7 +2577,6 @@ static void RENAME(sws_init_swScale)(SwsContext *c)
 {
     enum PixelFormat srcFormat = c->srcFormat;
 
-    c->yuv2nv12X    = RENAME(yuv2nv12X   );
     c->yuv2yuv1     = RENAME(yuv2yuv1    );
     c->yuv2yuvX     = RENAME(yuv2yuvX    );
     c->yuv2packed1  = RENAME(yuv2packed1 );
@@ -2772,13 +2608,7 @@ static void RENAME(sws_init_swScale)(SwsContext *c)
         case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
         default: break;
     }
-    if (c->chrSrcHSubSample) {
-        switch(srcFormat) {
-        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
-        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
-        default: break;
-        }
-    } else {
+    if (!c->chrSrcHSubSample) {
         switch(srcFormat) {
         case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
         case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV); break;
@@ -2808,14 +2638,4 @@ static void RENAME(sws_init_swScale)(SwsContext *c)
         default: break;
         }
     }
-
-    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
-        if (c->srcRange) {
-            c->lumConvertRange = RENAME(lumRangeFromJpeg);
-            c->chrConvertRange = RENAME(chrRangeFromJpeg);
-        } else {
-            c->lumConvertRange = RENAME(lumRangeToJpeg);
-            c->chrConvertRange = RENAME(chrRangeToJpeg);
-        }
-    }
 }
-- 
cgit v1.2.3


From fe43d5d71e1b272e1176232a34074c8c9bc0c61d Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Sat, 14 May 2011 14:45:27 -0400
Subject: swscale: remove duplicate code in ppc/ subdirectory.

---
 libswscale/ppc/swscale_template.c | 793 +-------------------------------------
 libswscale/swscale.c              |   4 +-
 2 files changed, 3 insertions(+), 794 deletions(-)

diff --git a/libswscale/ppc/swscale_template.c b/libswscale/ppc/swscale_template.c
index 3a569f1b78..7968177b52 100644
--- a/libswscale/ppc/swscale_template.c
+++ b/libswscale/ppc/swscale_template.c
@@ -23,69 +23,16 @@
 #include "swscale_altivec_template.c"
 #endif
 
+#if COMPILE_TEMPLATE_ALTIVEC
 static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
 {
-#if COMPILE_TEMPLATE_ALTIVEC
     yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                           chrFilter, chrSrc, chrFilterSize,
                           dest, uDest, vDest, dstW, chrDstW);
-#else //COMPILE_TEMPLATE_ALTIVEC
-    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
-                chrFilter, chrSrc, chrFilterSize,
-                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
-#endif //!COMPILE_TEMPLATE_ALTIVEC
-}
-
-static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
-                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
-                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
-{
-    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
-                 chrFilter, chrSrc, chrFilterSize,
-                 dest, uDest, dstW, chrDstW, dstFormat);
 }
 
-static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
-                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
-{
-    int i;
-    for (i=0; i<dstW; i++) {
-        int val= (lumSrc[i]+64)>>7;
-
-        if (val&256) {
-            if (val<0) val=0;
-            else       val=255;
-        }
-
-        dest[i]= val;
-    }
-
-    if (uDest)
-        for (i=0; i<chrDstW; i++) {
-            int u=(chrSrc[i       ]+64)>>7;
-            int v=(chrSrc[i + VOFW]+64)>>7;
-
-            if ((u|v)&256) {
-                if (u<0)        u=0;
-                else if (u>255) u=255;
-                if (v<0)        v=0;
-                else if (v>255) v=255;
-            }
-
-            uDest[i]= u;
-            vDest[i]= v;
-        }
-
-    if (CONFIG_SWSCALE_ALPHA && aDest)
-        for (i=0; i<dstW; i++) {
-            int val= (alpSrc[i]+64)>>7;
-            aDest[i]= av_clip_uint8(val);
-        }
-}
-
-
 /**
  * vertical scale YV12 to RGB
  */
@@ -93,7 +40,6 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
                                        const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                        const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
 {
-#if COMPILE_TEMPLATE_ALTIVEC
     /* The following list of supported dstFormat values should
        match what's found in the body of ff_yuv2packedX_altivec() */
     if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
@@ -104,751 +50,16 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
                                    chrFilter, chrSrc, chrFilterSize,
                                    dest, dstW, dstY);
     else
-#endif
         yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                        chrFilter, chrSrc, chrFilterSize,
                        alpSrc, dest, dstW, dstY);
 }
-
-/**
- * vertical bilinear scale YV12 to RGB
- */
-static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
-                          const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
-{
-    int  yalpha1=4095- yalpha;
-    int uvalpha1=4095-uvalpha;
-    int i;
-
-    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
-}
-
-/**
- * YV12 to RGB without scaling or interpolating
- */
-static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
-                          const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
-{
-    const int yalpha1=0;
-    int i;
-
-    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
-    const int yalpha= 4096; //FIXME ...
-
-    if (flags&SWS_FULL_CHR_H_INT) {
-        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
-        return;
-    }
-
-    if (uvalpha < 2048) {
-        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
-    } else {
-        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
-    }
-}
-
-//FIXME yuy2* can read up to 7 samples too much
-
-static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++)
-        dst[i]= src[2*i];
-}
-
-static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++) {
-        dstU[i]= src1[4*i + 1];
-        dstV[i]= src1[4*i + 3];
-    }
-    assert(src1 == src2);
-}
-
-static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++) {
-        dstU[i]= src1[2*i + 1];
-        dstV[i]= src2[2*i + 1];
-    }
-}
-
-/* This is almost identical to the previous, end exists only because
- * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
-static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++)
-        dst[i]= src[2*i+1];
-}
-
-static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++) {
-        dstU[i]= src1[4*i + 0];
-        dstV[i]= src1[4*i + 2];
-    }
-    assert(src1 == src2);
-}
-
-static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++) {
-        dstU[i]= src1[2*i];
-        dstV[i]= src2[2*i];
-    }
-}
-
-static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
-                                    const uint8_t *src, long width)
-{
-    int i;
-    for (i = 0; i < width; i++) {
-        dst1[i] = src[2*i+0];
-        dst2[i] = src[2*i+1];
-    }
-}
-
-static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
-                                    const uint8_t *src1, const uint8_t *src2,
-                                    long width, uint32_t *unused)
-{
-    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
-}
-
-static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
-                                    const uint8_t *src1, const uint8_t *src2,
-                                    long width, uint32_t *unused)
-{
-    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
-}
-
-
-static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++) {
-        int b= src[i*3+0];
-        int g= src[i*3+1];
-        int r= src[i*3+2];
-
-        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
-    }
-}
-
-static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++) {
-        int b= src1[3*i + 0];
-        int g= src1[3*i + 1];
-        int r= src1[3*i + 2];
-
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
-    }
-    assert(src1 == src2);
-}
-
-static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++) {
-        int b= src1[6*i + 0] + src1[6*i + 3];
-        int g= src1[6*i + 1] + src1[6*i + 4];
-        int r= src1[6*i + 2] + src1[6*i + 5];
-
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
-    }
-    assert(src1 == src2);
-}
-
-static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
-{
-    int i;
-    for (i=0; i<width; i++) {
-        int r= src[i*3+0];
-        int g= src[i*3+1];
-        int b= src[i*3+2];
-
-        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
-    }
-}
-
-static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
-{
-    int i;
-    assert(src1==src2);
-    for (i=0; i<width; i++) {
-        int r= src1[3*i + 0];
-        int g= src1[3*i + 1];
-        int b= src1[3*i + 2];
-
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
-    }
-}
-
-static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
-{
-    int i;
-    assert(src1==src2);
-    for (i=0; i<width; i++) {
-        int r= src1[6*i + 0] + src1[6*i + 3];
-        int g= src1[6*i + 1] + src1[6*i + 4];
-        int b= src1[6*i + 2] + src1[6*i + 5];
-
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
-    }
-}
-
-
-// bilinear / bicubic scaling
-static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
-                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
-{
-#if COMPILE_TEMPLATE_ALTIVEC
-    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
-#else
-    int i;
-    for (i=0; i<dstW; i++) {
-        int j;
-        int srcPos= filterPos[i];
-        int val=0;
-        //printf("filterPos: %d\n", filterPos[i]);
-        for (j=0; j<filterSize; j++) {
-            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
-            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
-        }
-        //filter += hFilterSize;
-        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
-        //dst[i] = val>>7;
-    }
-#endif /* COMPILE_TEMPLATE_ALTIVEC */
-}
-
-//FIXME all pal and rgb srcFormats could do this convertion as well
-//FIXME all scalers more complex than bilinear could do half of this transform
-static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
-{
-    int i;
-    for (i = 0; i < width; i++) {
-        dst[i     ] = (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
-        dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
-    }
-}
-static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
-{
-    int i;
-    for (i = 0; i < width; i++) {
-        dst[i     ] = (dst[i     ]*1799 + 4081085)>>11; //1469
-        dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
-    }
-}
-static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
-{
-    int i;
-    for (i = 0; i < width; i++)
-        dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
-}
-static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
-{
-    int i;
-    for (i = 0; i < width; i++)
-        dst[i] = (dst[i]*14071 + 33561947)>>14;
-}
-
-static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
-                                        long dstWidth, const uint8_t *src, int srcW,
-                                        int xInc)
-{
-    int i;
-    unsigned int xpos=0;
-    for (i=0;i<dstWidth;i++) {
-        register unsigned int xx=xpos>>16;
-        register unsigned int xalpha=(xpos&0xFFFF)>>9;
-        dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
-        xpos+=xInc;
-    }
-}
-
-      // *** horizontal scale Y line to temp buffer
-static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
-                                   const int16_t *hLumFilter,
-                                   const int16_t *hLumFilterPos, int hLumFilterSize,
-                                   uint8_t *formatConvBuffer,
-                                   uint32_t *pal, int isAlpha)
-{
-    void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
-    void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
-
-    src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
-
-    if (toYV12) {
-        toYV12(formatConvBuffer, src, srcW, pal);
-        src= formatConvBuffer;
-    }
-
-    if (!c->hyscale_fast) {
-        c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
-    } else { // fast bilinear upscale / crap downscale
-        c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
-    }
-
-    if (convertRange)
-        convertRange(dst, dstWidth);
-}
-
-static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
-                                        long dstWidth, const uint8_t *src1,
-                                        const uint8_t *src2, int srcW, int xInc)
-{
-    int i;
-    unsigned int xpos=0;
-    for (i=0;i<dstWidth;i++) {
-        register unsigned int xx=xpos>>16;
-        register unsigned int xalpha=(xpos&0xFFFF)>>9;
-        dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
-        dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
-        /* slower
-        dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
-        dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
-        */
-        xpos+=xInc;
-    }
-}
-
-inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
-                                   int srcW, int xInc, const int16_t *hChrFilter,
-                                   const int16_t *hChrFilterPos, int hChrFilterSize,
-                                   uint8_t *formatConvBuffer,
-                                   uint32_t *pal)
-{
-
-    src1 += c->chrSrcOffset;
-    src2 += c->chrSrcOffset;
-
-    if (c->chrToYV12) {
-        c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
-        src1= formatConvBuffer;
-        src2= formatConvBuffer+VOFW;
-    }
-
-    if (!c->hcscale_fast) {
-        c->hScale(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
-        c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
-    } else { // fast bilinear upscale / crap downscale
-        c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
-    }
-
-    if (c->chrConvertRange)
-        c->chrConvertRange(dst, dstWidth);
-}
-
-#define DEBUG_SWSCALE_BUFFERS 0
-#define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
-
-static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
-                           int srcSliceH, uint8_t* dst[], int dstStride[])
-{
-    /* load a few things into local vars to make the code more readable? and faster */
-    const int srcW= c->srcW;
-    const int dstW= c->dstW;
-    const int dstH= c->dstH;
-    const int chrDstW= c->chrDstW;
-    const int chrSrcW= c->chrSrcW;
-    const int lumXInc= c->lumXInc;
-    const int chrXInc= c->chrXInc;
-    const enum PixelFormat dstFormat= c->dstFormat;
-    const int flags= c->flags;
-    int16_t *vLumFilterPos= c->vLumFilterPos;
-    int16_t *vChrFilterPos= c->vChrFilterPos;
-    int16_t *hLumFilterPos= c->hLumFilterPos;
-    int16_t *hChrFilterPos= c->hChrFilterPos;
-    int16_t *vLumFilter= c->vLumFilter;
-    int16_t *vChrFilter= c->vChrFilter;
-    int16_t *hLumFilter= c->hLumFilter;
-    int16_t *hChrFilter= c->hChrFilter;
-    int32_t *lumMmxFilter= c->lumMmxFilter;
-    int32_t *chrMmxFilter= c->chrMmxFilter;
-    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
-    const int vLumFilterSize= c->vLumFilterSize;
-    const int vChrFilterSize= c->vChrFilterSize;
-    const int hLumFilterSize= c->hLumFilterSize;
-    const int hChrFilterSize= c->hChrFilterSize;
-    int16_t **lumPixBuf= c->lumPixBuf;
-    int16_t **chrPixBuf= c->chrPixBuf;
-    int16_t **alpPixBuf= c->alpPixBuf;
-    const int vLumBufSize= c->vLumBufSize;
-    const int vChrBufSize= c->vChrBufSize;
-    uint8_t *formatConvBuffer= c->formatConvBuffer;
-    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
-    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
-    int lastDstY;
-    uint32_t *pal=c->pal_yuv;
-
-    /* vars which will change and which we need to store back in the context */
-    int dstY= c->dstY;
-    int lumBufIndex= c->lumBufIndex;
-    int chrBufIndex= c->chrBufIndex;
-    int lastInLumBuf= c->lastInLumBuf;
-    int lastInChrBuf= c->lastInChrBuf;
-
-    if (isPacked(c->srcFormat)) {
-        src[0]=
-        src[1]=
-        src[2]=
-        src[3]= src[0];
-        srcStride[0]=
-        srcStride[1]=
-        srcStride[2]=
-        srcStride[3]= srcStride[0];
-    }
-    srcStride[1]<<= c->vChrDrop;
-    srcStride[2]<<= c->vChrDrop;
-
-    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
-                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
-                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
-    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
-                   srcSliceY,    srcSliceH,    dstY,    dstH);
-    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
-                   vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
-
-    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
-        static int warnedAlready=0; //FIXME move this into the context perhaps
-        if (flags & SWS_PRINT_INFO && !warnedAlready) {
-            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
-                   "         ->cannot do aligned memory accesses anymore\n");
-            warnedAlready=1;
-        }
-    }
-
-    /* Note the user might start scaling the picture in the middle so this
-       will not get executed. This is not really intended but works
-       currently, so people might do it. */
-    if (srcSliceY ==0) {
-        lumBufIndex=-1;
-        chrBufIndex=-1;
-        dstY=0;
-        lastInLumBuf= -1;
-        lastInChrBuf= -1;
-    }
-
-    lastDstY= dstY;
-
-    for (;dstY < dstH; dstY++) {
-        unsigned char *dest =dst[0]+dstStride[0]*dstY;
-        const int chrDstY= dstY>>c->chrDstVSubSample;
-        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
-        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
-        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
-
-        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
-        const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
-        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
-        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
-        int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
-        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
-        int enough_lines;
-
-        //handle holes (FAST_BILINEAR & weird filters)
-        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
-        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
-        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
-        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
-
-        DEBUG_BUFFERS("dstY: %d\n", dstY);
-        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
-                         firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
-        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
-                         firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
-
-        // Do we have enough lines in this slice to output the dstY line
-        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
-
-        if (!enough_lines) {
-            lastLumSrcY = srcSliceY + srcSliceH - 1;
-            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
-            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
-                                            lastLumSrcY, lastChrSrcY);
-        }
-
-        //Do horizontal scaling
-        while(lastInLumBuf < lastLumSrcY) {
-            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
-            const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
-            lumBufIndex++;
-            assert(lumBufIndex < 2*vLumBufSize);
-            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
-            assert(lastInLumBuf + 1 - srcSliceY >= 0);
-            RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
-                            hLumFilter, hLumFilterPos, hLumFilterSize,
-                            formatConvBuffer,
-                            pal, 0);
-            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
-                RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
-                                hLumFilter, hLumFilterPos, hLumFilterSize,
-                                formatConvBuffer,
-                                pal, 1);
-            lastInLumBuf++;
-            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
-                               lumBufIndex,    lastInLumBuf);
-        }
-        while(lastInChrBuf < lastChrSrcY) {
-            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
-            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
-            chrBufIndex++;
-            assert(chrBufIndex < 2*vChrBufSize);
-            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
-            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
-            //FIXME replace parameters through context struct (some at least)
-
-            if (c->needs_hcscale)
-                RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
-                                hChrFilter, hChrFilterPos, hChrFilterSize,
-                                formatConvBuffer,
-                                pal);
-            lastInChrBuf++;
-            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
-                               chrBufIndex,    lastInChrBuf);
-        }
-        //wrap buf index around to stay inside the ring buffer
-        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
-        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
-        if (!enough_lines)
-            break; //we can't output a dstY line so let's try with the next slice
-
-        if (dstY < dstH-2) {
-            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
-            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
-            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
-            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
-                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
-                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
-                c->yuv2nv12X(c,
-                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
-                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                             dest, uDest, dstW, chrDstW, dstFormat);
-            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
-                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
-                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
-                if (is16BPS(dstFormat) || is9_OR_10BPS(dstFormat)) {
-                    yuv2yuvX16inC(
-                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
-                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
-                                  dstFormat);
-                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
-                    const int16_t *lumBuf = lumSrcPtr[0];
-                    const int16_t *chrBuf= chrSrcPtr[0];
-                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
-                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
-                } else { //General YV12
-                    c->yuv2yuvX(c,
-                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
-                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
-                }
-            } else {
-                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
-                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
-                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
-                    int chrAlpha= vChrFilter[2*dstY+1];
-                    if(flags & SWS_FULL_CHR_H_INT) {
-                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
-                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
-                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                                         alpSrcPtr, dest, dstW, dstY);
-                    } else {
-                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
-                                       alpPixBuf ? *alpSrcPtr : NULL,
-                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
-                    }
-                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
-                    int lumAlpha= vLumFilter[2*dstY+1];
-                    int chrAlpha= vChrFilter[2*dstY+1];
-                    lumMmxFilter[2]=
-                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
-                    chrMmxFilter[2]=
-                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
-                    if(flags & SWS_FULL_CHR_H_INT) {
-                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
-                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
-                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                                         alpSrcPtr, dest, dstW, dstY);
-                    } else {
-                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
-                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
-                                       dest, dstW, lumAlpha, chrAlpha, dstY);
-                    }
-                } else { //general RGB
-                    if(flags & SWS_FULL_CHR_H_INT) {
-                        yuv2rgbXinC_full(c,
-                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
-                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                                         alpSrcPtr, dest, dstW, dstY);
-                    } else {
-                        c->yuv2packedX(c,
-                                       vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
-                                       vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                                       alpSrcPtr, dest, dstW, dstY);
-                    }
-                }
-            }
-        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
-            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
-            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
-            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
-            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
-                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
-                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
-                yuv2nv12XinC(
-                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
-                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                             dest, uDest, dstW, chrDstW, dstFormat);
-            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
-                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
-                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
-                if (is16BPS(dstFormat) || is9_OR_10BPS(dstFormat)) {
-                    yuv2yuvX16inC(
-                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
-                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
-                                  dstFormat);
-                } else {
-                    yuv2yuvXinC(
-                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
-                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
-                }
-            } else {
-                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
-                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
-                if(flags & SWS_FULL_CHR_H_INT) {
-                    yuv2rgbXinC_full(c,
-                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
-                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                                     alpSrcPtr, dest, dstW, dstY);
-                } else {
-                    yuv2packedXinC(c,
-                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
-                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                                   alpSrcPtr, dest, dstW, dstY);
-                }
-            }
-        }
-    }
-
-    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
-        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
-
-    /* store changed local vars back in the context */
-    c->dstY= dstY;
-    c->lumBufIndex= lumBufIndex;
-    c->chrBufIndex= chrBufIndex;
-    c->lastInLumBuf= lastInLumBuf;
-    c->lastInChrBuf= lastInChrBuf;
-
-    return dstY - lastDstY;
-}
+#endif
 
 static void RENAME(sws_init_swScale)(SwsContext *c)
 {
     enum PixelFormat srcFormat = c->srcFormat;
 
-    c->yuv2nv12X    = RENAME(yuv2nv12X   );
-    c->yuv2yuv1     = RENAME(yuv2yuv1    );
     c->yuv2yuvX     = RENAME(yuv2yuvX    );
-    c->yuv2packed1  = RENAME(yuv2packed1 );
-    c->yuv2packed2  = RENAME(yuv2packed2 );
     c->yuv2packedX  = RENAME(yuv2packedX );
-
-    c->hScale       = RENAME(hScale      );
-
-    if (c->flags & SWS_FAST_BILINEAR)
-    {
-        c->hyscale_fast = RENAME(hyscale_fast);
-        c->hcscale_fast = RENAME(hcscale_fast);
-    }
-
-    switch(srcFormat) {
-        case PIX_FMT_YUYV422  : c->chrToYV12 = RENAME(yuy2ToUV); break;
-        case PIX_FMT_UYVY422  : c->chrToYV12 = RENAME(uyvyToUV); break;
-        case PIX_FMT_NV12     : c->chrToYV12 = RENAME(nv12ToUV); break;
-        case PIX_FMT_NV21     : c->chrToYV12 = RENAME(nv21ToUV); break;
-        case PIX_FMT_YUV420P16BE:
-        case PIX_FMT_YUV422P16BE:
-        case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
-        case PIX_FMT_YUV420P16LE:
-        case PIX_FMT_YUV422P16LE:
-        case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
-    }
-    if (c->chrSrcHSubSample) {
-        switch(srcFormat) {
-        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
-        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
-        }
-    } else {
-        switch(srcFormat) {
-        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
-        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV); break;
-        }
-    }
-
-    switch (srcFormat) {
-    case PIX_FMT_YUYV422  :
-    case PIX_FMT_YUV420P16BE:
-    case PIX_FMT_YUV422P16BE:
-    case PIX_FMT_YUV444P16BE:
-    case PIX_FMT_Y400A    :
-    case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
-    case PIX_FMT_UYVY422  :
-    case PIX_FMT_YUV420P16LE:
-    case PIX_FMT_YUV422P16LE:
-    case PIX_FMT_YUV444P16LE:
-    case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
-    case PIX_FMT_BGR24    : c->lumToYV12 = RENAME(bgr24ToY); break;
-    case PIX_FMT_RGB24    : c->lumToYV12 = RENAME(rgb24ToY); break;
-    }
-    if (c->alpPixBuf) {
-        switch (srcFormat) {
-        case PIX_FMT_Y400A  : c->alpToYV12 = RENAME(yuy2ToY); break;
-        }
-    }
-
-    switch (srcFormat) {
-    case PIX_FMT_Y400A  :
-        c->alpSrcOffset = 1;
-        break;
-    case PIX_FMT_RGB32  :
-    case PIX_FMT_BGR32  :
-        c->alpSrcOffset = 3;
-        break;
-    case PIX_FMT_RGB48LE:
-        c->lumSrcOffset = 1;
-        c->chrSrcOffset = 1;
-        c->alpSrcOffset = 1;
-        break;
-    }
-
-    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
-        if (c->srcRange) {
-            c->lumConvertRange = RENAME(lumRangeFromJpeg);
-            c->chrConvertRange = RENAME(chrRangeFromJpeg);
-        } else {
-            c->lumConvertRange = RENAME(lumRangeToJpeg);
-            c->chrConvertRange = RENAME(chrRangeToJpeg);
-        }
-    }
-
-    if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
-          srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
-        c->needs_hcscale = 1;
 }
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 749a276e5a..4977a3a736 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -1250,10 +1250,8 @@ SwsFunc ff_getSwsFunc(SwsContext *c)
     } else
 #endif
 #if HAVE_ALTIVEC
-    if (cpu_flags & AV_CPU_FLAG_ALTIVEC) {
+    if (cpu_flags & AV_CPU_FLAG_ALTIVEC)
         sws_init_swScale_altivec(c);
-        return swScale_altivec;
-    } else
 #endif
 
     return swScale_c;
-- 
cgit v1.2.3


From 1dd4f4be5a82409a162b42b120fc5054fef9d899 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Tue, 24 May 2011 10:19:41 -0400
Subject: swscale: remove AMD3DNOW "optimizations".

The functions are identical to their MMX counterparts. Thus,
pretending that swscale is highly optimized for AMD3DNOW
extensions is a poorly executed practical joke at best.
---
 libswscale/swscale.c              | 22 ----------------------
 libswscale/x86/swscale_template.c | 15 ++-------------
 2 files changed, 2 insertions(+), 35 deletions(-)

diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 4977a3a736..5b71a6f1c0 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -1178,7 +1178,6 @@ static inline void monoblack2Y(uint8_t *dst, const uint8_t *src, long width, uin
 //Plain C versions
 
 #define COMPILE_TEMPLATE_MMX2 0
-#define COMPILE_TEMPLATE_AMD3DNOW 0
 #define COMPILE_TEMPLATE_ALTIVEC 0
 
 #include "swscale_template.c"
@@ -1195,9 +1194,7 @@ static inline void monoblack2Y(uint8_t *dst, const uint8_t *src, long width, uin
 #if HAVE_MMX
 #undef RENAME
 #undef COMPILE_TEMPLATE_MMX2
-#undef COMPILE_TEMPLATE_AMD3DNOW
 #define COMPILE_TEMPLATE_MMX2 0
-#define COMPILE_TEMPLATE_AMD3DNOW 0
 #define RENAME(a) a ## _MMX
 #include "x86/swscale_template.c"
 #endif
@@ -1206,24 +1203,11 @@ static inline void monoblack2Y(uint8_t *dst, const uint8_t *src, long width, uin
 #if HAVE_MMX2
 #undef RENAME
 #undef COMPILE_TEMPLATE_MMX2
-#undef COMPILE_TEMPLATE_AMD3DNOW
 #define COMPILE_TEMPLATE_MMX2 1
-#define COMPILE_TEMPLATE_AMD3DNOW 0
 #define RENAME(a) a ## _MMX2
 #include "x86/swscale_template.c"
 #endif
 
-//3DNOW versions
-#if HAVE_AMD3DNOW
-#undef RENAME
-#undef COMPILE_TEMPLATE_MMX2
-#undef COMPILE_TEMPLATE_AMD3DNOW
-#define COMPILE_TEMPLATE_MMX2 0
-#define COMPILE_TEMPLATE_AMD3DNOW 1
-#define RENAME(a) a ## _3DNow
-#include "x86/swscale_template.c"
-#endif
-
 SwsFunc ff_getSwsFunc(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -1237,12 +1221,6 @@ SwsFunc ff_getSwsFunc(SwsContext *c)
         return swScale_MMX2;
     } else
 #endif
-#if HAVE_AMD3DNOW
-    if (cpu_flags & AV_CPU_FLAG_3DNOW) {
-        sws_init_swScale_3DNow(c);
-        return swScale_3DNow;
-    } else
-#endif
 #if HAVE_MMX
     if (cpu_flags & AV_CPU_FLAG_MMX) {
         sws_init_swScale_MMX(c);
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index d0671315ae..63e0f72cb2 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -22,23 +22,14 @@
 
 #undef REAL_MOVNTQ
 #undef MOVNTQ
-#undef PAVGB
 #undef PREFETCH
 
-#if COMPILE_TEMPLATE_AMD3DNOW
-#define PREFETCH  "prefetch"
-#elif COMPILE_TEMPLATE_MMX2
+#if COMPILE_TEMPLATE_MMX2
 #define PREFETCH "prefetchnta"
 #else
 #define PREFETCH  " # nop"
 #endif
 
-#if COMPILE_TEMPLATE_MMX2
-#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
-#elif COMPILE_TEMPLATE_AMD3DNOW
-#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
-#endif
-
 #if COMPILE_TEMPLATE_MMX2
 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
 #else
@@ -2559,9 +2550,7 @@ static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[],
         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
 
     if (COMPILE_TEMPLATE_MMX2)      __asm__ volatile("sfence":::"memory");
-    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
-    if (COMPILE_TEMPLATE_AMD3DNOW)  __asm__ volatile("femms" :::"memory");
-    else                            __asm__ volatile("emms"  :::"memory");
+    __asm__ volatile("emms"  :::"memory");
 
     /* store changed local vars back in the context */
     c->dstY= dstY;
-- 
cgit v1.2.3


From b9eb2136af4e9b6700437d996eae731ffca07702 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Tue, 24 May 2011 10:46:40 -0400
Subject: swscale: remove dead macro WRITEBGR24OLD.

---
 libswscale/x86/swscale_template.c | 56 ---------------------------------------
 1 file changed, 56 deletions(-)

diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index 63e0f72cb2..e03fbd44e2 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -700,62 +700,6 @@
     " jb             1b             \n\t"
 #define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
 
-#define WRITEBGR24OLD(dst, dstw, index) \
-    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
-    "movq      %%mm2, %%mm1             \n\t" /* B */\
-    "movq      %%mm5, %%mm6             \n\t" /* R */\
-    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
-    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
-    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
-    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
-    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
-    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
-    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
-    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
-    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
-    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
-\
-    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
-    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
-    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
-    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
-    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
-    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
-    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
-    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
-\
-    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
-    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
-    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
-    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
-    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
-    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
-    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
-    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
-    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
-    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
-    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
-    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
-    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
-\
-    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
-    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
-    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
-    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
-    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
-    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
-    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
-    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
-\
-    MOVNTQ(%%mm0,   (dst))\
-    MOVNTQ(%%mm2,  8(dst))\
-    MOVNTQ(%%mm3, 16(dst))\
-    "add         $24, "#dst"            \n\t"\
-\
-    "add          $8, "#index"          \n\t"\
-    "cmp     "#dstw", "#index"          \n\t"\
-    " jb          1b                    \n\t"
-
 #define WRITEBGR24MMX(dst, dstw, index) \
     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
     "movq      %%mm2, %%mm1     \n\t" /* B */\
-- 
cgit v1.2.3


From ab088f7d2842b9c658a8906062a96be37fbc9981 Mon Sep 17 00:00:00 2001
From: Carl Eugen Hoyos <cehoyos@ag.or.at>
Date: Mon, 21 Mar 2011 12:04:10 +0100
Subject: ape: Allow demuxing of files with metadata tags.

Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavformat/ape.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/libavformat/ape.c b/libavformat/ape.c
index 956036d5a8..0bc7737fde 100644
--- a/libavformat/ape.c
+++ b/libavformat/ape.c
@@ -159,8 +159,8 @@ static int ape_read_header(AVFormatContext * s, AVFormatParameters * ap)
     int total_blocks;
     int64_t pts;
 
-    /* TODO: Skip any leading junk such as id3v2 tags */
-    ape->junklength = 0;
+    /* Skip any leading junk such as id3v2 tags */
+    ape->junklength = avio_tell(pb);
 
     tag = avio_rl32(pb);
     if (tag != MKTAG('M', 'A', 'C', ' '))
@@ -276,7 +276,7 @@ static int ape_read_header(AVFormatContext * s, AVFormatParameters * ap)
     ape->frames[0].nblocks = ape->blocksperframe;
     ape->frames[0].skip    = 0;
     for (i = 1; i < ape->totalframes; i++) {
-        ape->frames[i].pos      = ape->seektable[i]; //ape->frames[i-1].pos + ape->blocksperframe;
+        ape->frames[i].pos      = ape->seektable[i] + ape->junklength;
         ape->frames[i].nblocks  = ape->blocksperframe;
         ape->frames[i - 1].size = ape->frames[i].pos - ape->frames[i - 1].pos;
         ape->frames[i].skip     = (ape->frames[i].pos - ape->frames[0].pos) & 3;
-- 
cgit v1.2.3


From 676eaf84335fa91ee6a699fc03207762b8a7d6ae Mon Sep 17 00:00:00 2001
From: Alexandre Colucci <alexandre@elgato.com>
Date: Fri, 25 Mar 2011 11:25:02 +0100
Subject: dvdsubdec: fix incorrect colors.

On DVD and HD-DVD colors are stored in the order YCrCb (and not YCbCr) as mentioned in the specifications:
see DVD Specifications for Read-Only Disc / Part 3, 4.3 Program Chain Information (7) PGC_SP_PLT
see DVD Specifications for High Definition Disc, 5.2 Navigation for Standard Content (11) PGC_SDSP_PLT
see DVD Specifications for High Definition Disc, 5.2 Navigation for Standard Content (12) PGC_HDSP_PLT
see DVD Specifications for High Definition Disc, 5.5 Presentation Data (4) SET_COLOR2

When decoding a DVD or HD-DVD subtitle, the colors were incorrectly set.

Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavcodec/dvdsubdec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/dvdsubdec.c b/libavcodec/dvdsubdec.c
index 87eb53baaa..8f3ba63229 100644
--- a/libavcodec/dvdsubdec.c
+++ b/libavcodec/dvdsubdec.c
@@ -34,8 +34,8 @@ static void yuv_a_to_rgba(const uint8_t *ycbcr, const uint8_t *alpha, uint32_t *
 
     for (i = num_values; i > 0; i--) {
         y = *ycbcr++;
-        cb = *ycbcr++;
         cr = *ycbcr++;
+        cb = *ycbcr++;
         YUV_TO_RGB1_CCIR(cb, cr);
         YUV_TO_RGB2_CCIR(r, g, b, y);
         *rgba++ = (*alpha++ << 24) | (r << 16) | (g << 8) | b;
-- 
cgit v1.2.3


From d980d7b1295290cedd978ef53118513838aa1484 Mon Sep 17 00:00:00 2001
From: Alexandre Colucci <alexandre@elgato.com>
Date: Fri, 25 Mar 2011 17:31:28 +0100
Subject: pgssubdec: fix incorrect colors.

On Blu-ray colors are stored in the order YCrCb (and not YCbCr) as mentioned in the specifications:
see System Description Blu-ray Disc Read-Only Format, 9.14.4.2.2.1 Palette Definition Segment

When decoding a Blu-ray subtitle, the colors were incorrectly set.

Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavcodec/pgssubdec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/pgssubdec.c b/libavcodec/pgssubdec.c
index ea53e55b35..a480da1168 100644
--- a/libavcodec/pgssubdec.c
+++ b/libavcodec/pgssubdec.c
@@ -246,8 +246,8 @@ static void parse_palette_segment(AVCodecContext *avctx,
     while (buf < buf_end) {
         color_id  = bytestream_get_byte(&buf);
         y         = bytestream_get_byte(&buf);
-        cb        = bytestream_get_byte(&buf);
         cr        = bytestream_get_byte(&buf);
+        cb        = bytestream_get_byte(&buf);
         alpha     = bytestream_get_byte(&buf);
 
         YUV_TO_RGB1(cb, cr);
-- 
cgit v1.2.3


From 4515f9b58a51eb0af81c1ed1fd99889fcdc5ae91 Mon Sep 17 00:00:00 2001
From: Ilya <ilyaDOTilba@gmail.com>
Date: Sat, 26 Mar 2011 17:13:36 +0100
Subject: rtsp: use strtoul to parse rtptime and seq values.

strtol could return negative values, leading to various error messages,
mainly "non-monotonically increasing dts".

Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavformat/rtsp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavformat/rtsp.c b/libavformat/rtsp.c
index 2ebf7e0510..2d1438d3ed 100644
--- a/libavformat/rtsp.c
+++ b/libavformat/rtsp.c
@@ -744,9 +744,9 @@ static void rtsp_parse_rtp_info(RTSPState *rt, const char *p)
         if (!strcmp(key, "url"))
             av_strlcpy(url, value, sizeof(url));
         else if (!strcmp(key, "seq"))
-            seq = strtol(value, NULL, 10);
+            seq = strtoul(value, NULL, 10);
         else if (!strcmp(key, "rtptime"))
-            rtptime = strtol(value, NULL, 10);
+            rtptime = strtoul(value, NULL, 10);
         if (*p == ',') {
             handle_rtp_info(rt, url, seq, rtptime);
             url[0] = '\0';
-- 
cgit v1.2.3


From 7ca5338b49aa9967c9b18503490ca656e3bd6c5d Mon Sep 17 00:00:00 2001
From: Stefano Sabatini <stefano.sabatini-lala@poste.it>
Date: Tue, 24 May 2011 01:17:25 +0200
Subject: tiff: set palette in the context when specified in TIFF_PAL tag

Since image initialization was moved after tag parsing, the
palette needs to be specified in the context and then copied
to the allocated image in init_image().

Fixes a regression with TIFF images that have palette data,
trac issue #230, file Test_Flate_8bpp.tif.

Signed-off-by: Diego Biurrun <diego@biurrun.de>
---
 libavcodec/tiff.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/libavcodec/tiff.c b/libavcodec/tiff.c
index ffa5c6837a..bea353275f 100644
--- a/libavcodec/tiff.c
+++ b/libavcodec/tiff.c
@@ -41,6 +41,8 @@ typedef struct TiffContext {
 
     int width, height;
     unsigned int bpp, bppcount;
+    uint32_t palette[256];
+    int palette_is_set;
     int le;
     enum TiffCompr compr;
     int invert;
@@ -257,11 +259,15 @@ static int init_image(TiffContext *s)
         av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
         return ret;
     }
-    if (s->bpp == 8 && s->picture.data[1]){
-        /* make default grayscale pal */
-        pal = (uint32_t *) s->picture.data[1];
-        for (i = 0; i < 256; i++)
-            pal[i] = i * 0x010101;
+    if (s->avctx->pix_fmt == PIX_FMT_PAL8) {
+        if (s->palette_is_set) {
+            memcpy(s->picture.data[1], s->palette, sizeof(s->palette));
+        } else {
+            /* make default grayscale pal */
+            pal = (uint32_t *) s->picture.data[1];
+            for (i = 0; i < 256; i++)
+                pal[i] = i * 0x010101;
+        }
     }
     return 0;
 }
@@ -444,11 +450,7 @@ static int tiff_decode_tag(TiffContext *s, const uint8_t *start, const uint8_t *
         s->fill_order = value - 1;
         break;
     case TIFF_PAL:
-        if(s->avctx->pix_fmt != PIX_FMT_PAL8){
-            av_log(s->avctx, AV_LOG_ERROR, "Palette met but this is not palettized format\n");
-            return -1;
-        }
-        pal = (uint32_t *) s->picture.data[1];
+        pal = (uint32_t *) s->palette;
         off = type_sizes[type];
         rp = buf;
         gp = buf + count / 3 * off;
@@ -460,6 +462,7 @@ static int tiff_decode_tag(TiffContext *s, const uint8_t *start, const uint8_t *
             j |= tget(&bp, type, s->le) >> off;
             pal[i] = j;
         }
+        s->palette_is_set = 1;
         break;
     case TIFF_PLANAR:
         if(value == 2){
-- 
cgit v1.2.3


From e9735572113ab903b92cc0a7931eb894e7177f6e Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Tue, 24 May 2011 13:04:38 -0400
Subject: Move emms_c() from libavcodec to libavutil.

---
 libavcodec/dsputil.h | 20 +-------------------
 libavutil/internal.h | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 78d21527a5..74230cadbb 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -630,13 +630,6 @@ static inline int get_penalty_factor(int lambda, int lambda2, int type){
     }
 }
 
-/**
- * Empty mmx state.
- * this must be called between any dsp function and float/double code.
- * for example sin(); dsp->idct_put(); emms_c(); cos()
- */
-#define emms_c()
-
 void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx);
 void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx);
 void dsputil_init_bfin(DSPContext* c, AVCodecContext *avctx);
@@ -654,18 +647,7 @@ void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
 void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);
 void ff_mlp_init_x86(DSPContext* c, AVCodecContext *avctx);
 
-#if HAVE_MMX
-
-#undef emms_c
-
-static inline void emms(void)
-{
-    __asm__ volatile ("emms;":::"memory");
-}
-
-#define emms_c() emms()
-
-#elif ARCH_ARM
+#if ARCH_ARM
 
 #if HAVE_NEON
 #   define STRIDE_ALIGN 16
diff --git a/libavutil/internal.h b/libavutil/internal.h
index e890ae7083..51e449d3fe 100644
--- a/libavutil/internal.h
+++ b/libavutil/internal.h
@@ -223,4 +223,18 @@
 #   define ONLY_IF_THREADS_ENABLED(x) NULL
 #endif
 
+#if HAVE_MMX
+/**
+ * Empty mmx state.
+ * this must be called between any dsp function and float/double code.
+ * for example sin(); dsp->idct_put(); emms_c(); cos()
+ */
+static av_always_inline void emms_c(void)
+{
+    __asm__ volatile ("emms" ::: "memory");
+}
+#else /* HAVE_MMX */
+#define emms_c()
+#endif /* HAVE_MMX */
+
 #endif /* AVUTIL_INTERNAL_H */
-- 
cgit v1.2.3


From c4fd283a467031fdbde5bca0963b20d5911eca91 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Tue, 24 May 2011 13:04:46 -0400
Subject: swscale: use emms_c().

---
 libswscale/utils.c                | 6 ++----
 libswscale/x86/swscale_template.c | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/libswscale/utils.c b/libswscale/utils.c
index b49ec89be8..29fc975046 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -193,8 +193,7 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
     const int64_t fone= 1LL<<54;
     int ret= -1;
 
-    if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX)
-        __asm__ volatile("emms\n\t"::: "memory"); //FIXME this should not be required but it IS (even for non-MMX versions)
+    emms_c(); //FIXME this should not be required but it IS (even for non-MMX versions)
 
     // NOTE: the +1 is for the MMX scaler which reads over the end
     FF_ALLOC_OR_GOTO(NULL, *filterPos, (dstW+1)*sizeof(int16_t), fail);
@@ -757,8 +756,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
 
     cpu_flags = av_get_cpu_flags();
     flags     = c->flags;
-    if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX)
-        __asm__ volatile("emms\n\t"::: "memory");
+    emms_c();
     if (!rgb15to16) sws_rgb2rgb_init();
 
     unscaled = (srcW == dstW && srcH == dstH);
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index e03fbd44e2..576e22c082 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -2494,7 +2494,7 @@ static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[],
         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
 
     if (COMPILE_TEMPLATE_MMX2)      __asm__ volatile("sfence":::"memory");
-    __asm__ volatile("emms"  :::"memory");
+    emms_c();
 
     /* store changed local vars back in the context */
     c->dstY= dstY;
-- 
cgit v1.2.3


From aaca69c130edf219110c616d2e236a3a27caf706 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Tue, 24 May 2011 12:43:29 -0400
Subject: swscale: remove swScale_{c,MMX,MMX2} duplication.

---
 libswscale/swscale.c              |  15 +-
 libswscale/swscale_template.c     |  10 ++
 libswscale/x86/swscale_template.c | 305 ++------------------------------------
 3 files changed, 28 insertions(+), 302 deletions(-)

diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 5b71a6f1c0..36b676bf06 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -1214,18 +1214,13 @@ SwsFunc ff_getSwsFunc(SwsContext *c)
 
     sws_init_swScale_c(c);
 
-#if HAVE_MMX2
-    // ordered per speed fastest first
-    if (cpu_flags & AV_CPU_FLAG_MMX2) {
-        sws_init_swScale_MMX2(c);
-        return swScale_MMX2;
-    } else
-#endif
 #if HAVE_MMX
-    if (cpu_flags & AV_CPU_FLAG_MMX) {
+    if (cpu_flags & AV_CPU_FLAG_MMX)
         sws_init_swScale_MMX(c);
-        return swScale_MMX;
-    } else
+#endif
+#if HAVE_MMX2
+    if (cpu_flags & AV_CPU_FLAG_MMX2)
+        sws_init_swScale_MMX2(c);
 #endif
 #if HAVE_ALTIVEC
     if (cpu_flags & AV_CPU_FLAG_ALTIVEC)
diff --git a/libswscale/swscale_template.c b/libswscale/swscale_template.c
index fe872561cb..0ff402806e 100644
--- a/libswscale/swscale_template.c
+++ b/libswscale/swscale_template.c
@@ -485,6 +485,11 @@ inline static void hcscale_c(SwsContext *c, uint16_t *dst, long dstWidth,
 #define DEBUG_SWSCALE_BUFFERS 0
 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
 
+#if HAVE_MMX
+static void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
+                                  int lastInLumBuf, int lastInChrBuf);
+#endif
+
 static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[],
                      int srcSliceY, int srcSliceH, uint8_t* dst[], int dstStride[])
 {
@@ -656,6 +661,7 @@ static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[],
         if (!enough_lines)
             break; //we can't output a dstY line so let's try with the next slice
 
+        if (HAVE_MMX) updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
         if (dstY < dstH-2) {
             const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
             const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
@@ -780,6 +786,10 @@ static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[],
     if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
 
+    if (HAVE_MMX2 && av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
+        __asm__ volatile("sfence":::"memory");
+    emms_c();
+
     /* store changed local vars back in the context */
     c->dstY= dstY;
     c->lumBufIndex= lumBufIndex;
diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index 576e22c082..37a6bb5a0d 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -2144,179 +2144,29 @@ which is needed to support GCC 4.0. */
 #endif
 }
 
-#define DEBUG_SWSCALE_BUFFERS 0
-#define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
-
-static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
-                           int srcSliceH, uint8_t* dst[], int dstStride[])
+#if !COMPILE_TEMPLATE_MMX2
+static void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
+                                  int lastInLumBuf, int lastInChrBuf)
 {
-    /* load a few things into local vars to make the code more readable? and faster */
-    const int srcW= c->srcW;
-    const int dstW= c->dstW;
     const int dstH= c->dstH;
-    const int chrDstW= c->chrDstW;
-    const int chrSrcW= c->chrSrcW;
-    const int lumXInc= c->lumXInc;
-    const int chrXInc= c->chrXInc;
-    const enum PixelFormat dstFormat= c->dstFormat;
     const int flags= c->flags;
+    int16_t **lumPixBuf= c->lumPixBuf;
+    int16_t **chrPixBuf= c->chrPixBuf;
+    int16_t **alpPixBuf= c->alpPixBuf;
+    const int vLumBufSize= c->vLumBufSize;
+    const int vChrBufSize= c->vChrBufSize;
     int16_t *vLumFilterPos= c->vLumFilterPos;
     int16_t *vChrFilterPos= c->vChrFilterPos;
-    int16_t *hLumFilterPos= c->hLumFilterPos;
-    int16_t *hChrFilterPos= c->hChrFilterPos;
     int16_t *vLumFilter= c->vLumFilter;
     int16_t *vChrFilter= c->vChrFilter;
-    int16_t *hLumFilter= c->hLumFilter;
-    int16_t *hChrFilter= c->hChrFilter;
     int32_t *lumMmxFilter= c->lumMmxFilter;
     int32_t *chrMmxFilter= c->chrMmxFilter;
     int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
     const int vLumFilterSize= c->vLumFilterSize;
     const int vChrFilterSize= c->vChrFilterSize;
-    const int hLumFilterSize= c->hLumFilterSize;
-    const int hChrFilterSize= c->hChrFilterSize;
-    int16_t **lumPixBuf= c->lumPixBuf;
-    int16_t **chrPixBuf= c->chrPixBuf;
-    int16_t **alpPixBuf= c->alpPixBuf;
-    const int vLumBufSize= c->vLumBufSize;
-    const int vChrBufSize= c->vChrBufSize;
-    uint8_t *formatConvBuffer= c->formatConvBuffer;
-    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
-    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
-    int lastDstY;
-    uint32_t *pal=c->pal_yuv;
-
-    /* vars which will change and which we need to store back in the context */
-    int dstY= c->dstY;
-    int lumBufIndex= c->lumBufIndex;
-    int chrBufIndex= c->chrBufIndex;
-    int lastInLumBuf= c->lastInLumBuf;
-    int lastInChrBuf= c->lastInChrBuf;
-
-    if (isPacked(c->srcFormat)) {
-        src[0]=
-        src[1]=
-        src[2]=
-        src[3]= src[0];
-        srcStride[0]=
-        srcStride[1]=
-        srcStride[2]=
-        srcStride[3]= srcStride[0];
-    }
-    srcStride[1]<<= c->vChrDrop;
-    srcStride[2]<<= c->vChrDrop;
-
-    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
-                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
-                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
-    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
-                   srcSliceY,    srcSliceH,    dstY,    dstH);
-    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
-                   vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
-
-    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
-        static int warnedAlready=0; //FIXME move this into the context perhaps
-        if (flags & SWS_PRINT_INFO && !warnedAlready) {
-            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
-                   "         ->cannot do aligned memory accesses anymore\n");
-            warnedAlready=1;
-        }
-    }
-
-    /* Note the user might start scaling the picture in the middle so this
-       will not get executed. This is not really intended but works
-       currently, so people might do it. */
-    if (srcSliceY ==0) {
-        lumBufIndex=-1;
-        chrBufIndex=-1;
-        dstY=0;
-        lastInLumBuf= -1;
-        lastInChrBuf= -1;
-    }
-
-    lastDstY= dstY;
-
-    for (;dstY < dstH; dstY++) {
-        unsigned char *dest =dst[0]+dstStride[0]*dstY;
-        const int chrDstY= dstY>>c->chrDstVSubSample;
-        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
-        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
-        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
-
-        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
-        const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
-        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
-        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
-        int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
-        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
-        int enough_lines;
-
-        //handle holes (FAST_BILINEAR & weird filters)
-        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
-        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
-        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
-        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
-
-        DEBUG_BUFFERS("dstY: %d\n", dstY);
-        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
-                         firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
-        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
-                         firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
-
-        // Do we have enough lines in this slice to output the dstY line
-        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
-
-        if (!enough_lines) {
-            lastLumSrcY = srcSliceY + srcSliceH - 1;
-            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
-            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
-                                            lastLumSrcY, lastChrSrcY);
-        }
-
-        //Do horizontal scaling
-        while(lastInLumBuf < lastLumSrcY) {
-            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
-            const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
-            lumBufIndex++;
-            assert(lumBufIndex < 2*vLumBufSize);
-            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
-            assert(lastInLumBuf + 1 - srcSliceY >= 0);
-            hyscale_c(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
-                            hLumFilter, hLumFilterPos, hLumFilterSize,
-                            formatConvBuffer,
-                            pal, 0);
-            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
-                hyscale_c(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
-                                hLumFilter, hLumFilterPos, hLumFilterSize,
-                                formatConvBuffer,
-                                pal, 1);
-            lastInLumBuf++;
-            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
-                               lumBufIndex,    lastInLumBuf);
-        }
-        while(lastInChrBuf < lastChrSrcY) {
-            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
-            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
-            chrBufIndex++;
-            assert(chrBufIndex < 2*vChrBufSize);
-            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
-            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
-            //FIXME replace parameters through context struct (some at least)
-
-            if (c->needs_hcscale)
-                hcscale_c(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
-                                hChrFilter, hChrFilterPos, hChrFilterSize,
-                                formatConvBuffer,
-                                pal);
-            lastInChrBuf++;
-            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
-                               chrBufIndex,    lastInChrBuf);
-        }
-        //wrap buf index around to stay inside the ring buffer
-        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
-        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
-        if (!enough_lines)
-            break; //we can't output a dstY line so let's try with the next slice
+    const int chrDstY= dstY>>c->chrDstVSubSample;
+    const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
+    const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
 
         c->blueDither= ff_dither8[dstY&1];
         if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
@@ -2324,7 +2174,7 @@ static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[],
         else
             c->greenDither= ff_dither4[dstY&1];
         c->redDither= ff_dither8[(dstY+1)&1];
-        if (dstY < dstH-2) {
+        if (dstY < dstH - 2) {
             const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
             const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
@@ -2373,138 +2223,9 @@ static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[],
                         ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                 }
             }
-            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
-                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
-                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
-                c->yuv2nv12X(c,
-                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
-                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                             dest, uDest, dstW, chrDstW, dstFormat);
-            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
-                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
-                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
-                if (is16BPS(dstFormat) || is9_OR_10BPS(dstFormat)) {
-                    yuv2yuvX16inC(
-                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
-                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
-                                  dstFormat);
-                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
-                    const int16_t *lumBuf = lumSrcPtr[0];
-                    const int16_t *chrBuf= chrSrcPtr[0];
-                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
-                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
-                } else { //General YV12
-                    c->yuv2yuvX(c,
-                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
-                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
-                }
-            } else {
-                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
-                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
-                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
-                    int chrAlpha= vChrFilter[2*dstY+1];
-                    if(flags & SWS_FULL_CHR_H_INT) {
-                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
-                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
-                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                                         alpSrcPtr, dest, dstW, dstY);
-                    } else {
-                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
-                                       alpPixBuf ? *alpSrcPtr : NULL,
-                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
-                    }
-                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
-                    int lumAlpha= vLumFilter[2*dstY+1];
-                    int chrAlpha= vChrFilter[2*dstY+1];
-                    lumMmxFilter[2]=
-                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
-                    chrMmxFilter[2]=
-                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
-                    if(flags & SWS_FULL_CHR_H_INT) {
-                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
-                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
-                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                                         alpSrcPtr, dest, dstW, dstY);
-                    } else {
-                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
-                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
-                                       dest, dstW, lumAlpha, chrAlpha, dstY);
-                    }
-                } else { //general RGB
-                    if(flags & SWS_FULL_CHR_H_INT) {
-                        yuv2rgbXinC_full(c,
-                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
-                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                                         alpSrcPtr, dest, dstW, dstY);
-                    } else {
-                        c->yuv2packedX(c,
-                                       vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
-                                       vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                                       alpSrcPtr, dest, dstW, dstY);
-                    }
-                }
-            }
-        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
-            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
-            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
-            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
-            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
-                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
-                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
-                yuv2nv12XinC(
-                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
-                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                             dest, uDest, dstW, chrDstW, dstFormat);
-            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
-                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
-                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
-                if (is16BPS(dstFormat) || is9_OR_10BPS(dstFormat)) {
-                    yuv2yuvX16inC(
-                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
-                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
-                                  dstFormat);
-                } else {
-                    yuv2yuvXinC(
-                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
-                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
-                }
-            } else {
-                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
-                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
-                if(flags & SWS_FULL_CHR_H_INT) {
-                    yuv2rgbXinC_full(c,
-                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
-                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                                     alpSrcPtr, dest, dstW, dstY);
-                } else {
-                    yuv2packedXinC(c,
-                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
-                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
-                                   alpSrcPtr, dest, dstW, dstY);
-                }
-            }
         }
-    }
-
-    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
-        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
-
-    if (COMPILE_TEMPLATE_MMX2)      __asm__ volatile("sfence":::"memory");
-    emms_c();
-
-    /* store changed local vars back in the context */
-    c->dstY= dstY;
-    c->lumBufIndex= lumBufIndex;
-    c->chrBufIndex= chrBufIndex;
-    c->lastInLumBuf= lastInLumBuf;
-    c->lastInChrBuf= lastInChrBuf;
-
-    return dstY - lastDstY;
 }
+#endif /* !COMPILE_TEMPLATE_MMX2 */
 
 static void RENAME(sws_init_swScale)(SwsContext *c)
 {
-- 
cgit v1.2.3


From 566b5fbbb3753f71b60c4da491236a62232a8cf9 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Tue, 24 May 2011 12:15:14 -0400
Subject: swscale: remove if(canMMX2BeUsed) conditional.

Instead, set function pointers conditionally during init. This
patch also reveals a whole branch of dead assembly code that is
therefore also removed.
---
 libswscale/x86/swscale_template.c | 99 ++++-----------------------------------
 1 file changed, 8 insertions(+), 91 deletions(-)

diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index 37a6bb5a0d..0bee76e664 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -1918,19 +1918,11 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, in
     }
 }
 
-#define FAST_BILINEAR_X86 \
-    "subl    %%edi, %%esi    \n\t" /*  src[xx+1] - src[xx] */                   \
-    "imull   %%ecx, %%esi    \n\t" /* (src[xx+1] - src[xx])*xalpha */           \
-    "shll      $16, %%edi    \n\t"                                              \
-    "addl    %%edi, %%esi    \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */  \
-    "mov        %1, %%"REG_D"\n\t"                                              \
-    "shrl       $9, %%esi    \n\t"                                              \
-
+#if COMPILE_TEMPLATE_MMX2
 static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                         long dstWidth, const uint8_t *src, int srcW,
                                         int xInc)
 {
-#if COMPILE_TEMPLATE_MMX2
     int32_t *filterPos = c->hLumFilterPos;
     int16_t *filter    = c->hLumFilter;
     int     canMMX2BeUsed  = c->canMMX2BeUsed;
@@ -1939,7 +1931,7 @@ static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
 #if defined(PIC)
     DECLARE_ALIGNED(8, uint64_t, ebxsave);
 #endif
-    if (canMMX2BeUsed) {
+
         __asm__ volatile(
 #if defined(PIC)
             "mov               %%"REG_b", %5        \n\t"
@@ -1998,51 +1990,12 @@ static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
 #endif
         );
         for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
-    } else {
-#endif /* COMPILE_TEMPLATE_MMX2 */
-    x86_reg xInc_shr16 = xInc >> 16;
-    uint16_t xInc_mask = xInc & 0xffff;
-    x86_reg dstWidth_reg = dstWidth;
-    //NO MMX just normal asm ...
-    __asm__ volatile(
-        "xor %%"REG_a", %%"REG_a"            \n\t" // i
-        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
-        "xorl    %%ecx, %%ecx                \n\t" // xalpha
-        ".p2align                4           \n\t"
-        "1:                                  \n\t"
-        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
-        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
-        FAST_BILINEAR_X86
-        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
-        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
-        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry
-
-        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
-        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
-        FAST_BILINEAR_X86
-        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
-        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
-        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry
-
-
-        "add        $2, %%"REG_a"            \n\t"
-        "cmp        %2, %%"REG_a"            \n\t"
-        " jb        1b                       \n\t"
-
-
-        :: "r" (src), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask)
-        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
-    );
-#if COMPILE_TEMPLATE_MMX2
-    } //if MMX2 can't be used
-#endif
 }
 
 static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                         long dstWidth, const uint8_t *src1,
                                         const uint8_t *src2, int srcW, int xInc)
 {
-#if COMPILE_TEMPLATE_MMX2
     int32_t *filterPos = c->hChrFilterPos;
     int16_t *filter    = c->hChrFilter;
     int     canMMX2BeUsed  = c->canMMX2BeUsed;
@@ -2051,7 +2004,7 @@ static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
 #if defined(PIC)
     DECLARE_ALIGNED(8, uint64_t, ebxsave);
 #endif
-    if (canMMX2BeUsed) {
+
         __asm__ volatile(
 #if defined(PIC)
             "mov          %%"REG_b", %6         \n\t"
@@ -2101,48 +2054,8 @@ static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
             dst[i] = src1[srcW-1]*128;
             dst[i+VOFW] = src2[srcW-1]*128;
         }
-    } else {
-#endif /* COMPILE_TEMPLATE_MMX2 */
-        x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
-        uint16_t xInc_mask = xInc & 0xffff;
-        x86_reg dstWidth_reg = dstWidth;
-        __asm__ volatile(
-            "xor %%"REG_a", %%"REG_a"               \n\t" // i
-            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
-            "xorl    %%ecx, %%ecx                   \n\t" // xalpha
-            ".p2align    4                          \n\t"
-            "1:                                     \n\t"
-            "mov        %0, %%"REG_S"               \n\t"
-            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
-            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
-            FAST_BILINEAR_X86
-            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
-
-            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
-            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
-            FAST_BILINEAR_X86
-            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"
-
-            "addw       %4, %%cx                    \n\t" //xalpha += xInc&0xFFFF
-            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>16 + carry
-            "add        $1, %%"REG_a"               \n\t"
-            "cmp        %2, %%"REG_a"               \n\t"
-            " jb        1b                          \n\t"
-
-/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
-which is needed to support GCC 4.0. */
-#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
-            :: "m" (src1), "m" (dst), "g" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
-#else
-            :: "m" (src1), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
-#endif
-            "r" (src2)
-            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
-        );
-#if COMPILE_TEMPLATE_MMX2
-    } //if MMX2 can't be used
-#endif
 }
+#endif /* COMPILE_TEMPLATE_MMX2 */
 
 #if !COMPILE_TEMPLATE_MMX2
 static void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
@@ -2240,14 +2153,18 @@ static void RENAME(sws_init_swScale)(SwsContext *c)
     c->hScale       = RENAME(hScale      );
 
     // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
+#if COMPILE_TEMPLATE_MMX2
     if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
     {
         c->hyscale_fast = RENAME(hyscale_fast);
         c->hcscale_fast = RENAME(hcscale_fast);
     } else {
+#endif /* COMPILE_TEMPLATE_MMX2 */
         c->hyscale_fast = NULL;
         c->hcscale_fast = NULL;
+#if COMPILE_TEMPLATE_MMX2
     }
+#endif /* COMPILE_TEMPLATE_MMX2 */
 
     switch(srcFormat) {
         case PIX_FMT_YUYV422  : c->chrToYV12 = RENAME(yuy2ToUV); break;
-- 
cgit v1.2.3


From 1bb0f0c925d616375b3b991d9c645e170b90c0a4 Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Tue, 24 May 2011 12:30:55 -0400
Subject: swscale: remove if(bitexact) branch from functions.

Instead, only set the function pointers if bitexact flag is
not set during initialization. Since a change in flags triggers
a re-init anyway, this doesn't situations where flag values
change during runtime.
---
 libswscale/x86/swscale_template.c | 34 +++++++++++-----------------------
 1 file changed, 11 insertions(+), 23 deletions(-)

diff --git a/libswscale/x86/swscale_template.c b/libswscale/x86/swscale_template.c
index 0bee76e664..678060f3de 100644
--- a/libswscale/x86/swscale_template.c
+++ b/libswscale/x86/swscale_template.c
@@ -831,7 +831,6 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, con
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
 {
-    if(!(c->flags & SWS_BITEXACT)) {
         if (c->flags & SWS_ACCURATE_RND) {
             if (uDest) {
                 YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
@@ -853,17 +852,11 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, con
 
             YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
         }
-        return;
-    }
-    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
-                chrFilter, chrSrc, chrFilterSize,
-                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
 }
 
 static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
 {
-    if(!(c->flags & SWS_BITEXACT)) {
         long p= 4;
         const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
         uint8_t *dst[4]= {aDest, dest, uDest, vDest};
@@ -892,10 +885,6 @@ static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const
                 }
             }
         }
-        return;
-    }
-    yuv2yuv1_c(c, lumSrc, chrSrc, alpSrc, dest, uDest, vDest, aDest,
-               dstW, chrDstW);
 }
 
 
@@ -908,7 +897,7 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
 {
     x86_reg dummy=0;
     x86_reg dstW_reg = dstW;
-    if(!(c->flags & SWS_BITEXACT)) {
+
         if (c->flags & SWS_ACCURATE_RND) {
             switch(c->dstFormat) {
             case PIX_FMT_RGB32:
@@ -1065,7 +1054,7 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
                 return;
             }
         }
-    }
+
     yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                    chrFilter, chrSrc, chrFilterSize,
                    alpSrc, dest, dstW, dstY);
@@ -1077,7 +1066,6 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
 static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                           const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
 {
-    if(!(c->flags & SWS_BITEXACT)) {
         switch(c->dstFormat) {
         //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
         case PIX_FMT_RGB32:
@@ -1208,9 +1196,8 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, cons
                 "a" (&c->redDither)
             );
             return;
-        default: break;
         }
-    }
+
     yuv2packed2_c(c, buf0, buf1, uvbuf0, uvbuf1, abuf0, abuf1,
                   dest, dstW, yalpha, uvalpha, y);
 }
@@ -1221,7 +1208,6 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, cons
 static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                           const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
 {
-    if(!(flags & SWS_BITEXACT)) {
         const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
 
         if (flags&SWS_FULL_CHR_H_INT) {
@@ -1442,7 +1428,7 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, cons
                 return;
             }
         }
-    }
+
     yuv2packed1_c(c, buf0, uvbuf0, uvbuf1, abuf0, dest,
                   dstW, uvalpha, dstFormat, flags, y);
 }
@@ -2144,11 +2130,13 @@ static void RENAME(sws_init_swScale)(SwsContext *c)
 {
     enum PixelFormat srcFormat = c->srcFormat;
 
-    c->yuv2yuv1     = RENAME(yuv2yuv1    );
-    c->yuv2yuvX     = RENAME(yuv2yuvX    );
-    c->yuv2packed1  = RENAME(yuv2packed1 );
-    c->yuv2packed2  = RENAME(yuv2packed2 );
-    c->yuv2packedX  = RENAME(yuv2packedX );
+    if (!(c->flags & SWS_BITEXACT)) {
+        c->yuv2yuv1     = RENAME(yuv2yuv1    );
+        c->yuv2yuvX     = RENAME(yuv2yuvX    );
+        c->yuv2packed1  = RENAME(yuv2packed1 );
+        c->yuv2packed2  = RENAME(yuv2packed2 );
+        c->yuv2packedX  = RENAME(yuv2packedX );
+    }
 
     c->hScale       = RENAME(hScale      );
 
-- 
cgit v1.2.3


From a724ee6265bbf0acf4c43cd948dede661459727d Mon Sep 17 00:00:00 2001
From: "Ronald S. Bultje" <rsbultje@gmail.com>
Date: Tue, 24 May 2011 15:32:03 -0400
Subject: swscale: unbreak the build on non-x86 systems.

---
 libswscale/swscale_template.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/libswscale/swscale_template.c b/libswscale/swscale_template.c
index 0ff402806e..cd6c8c7a65 100644
--- a/libswscale/swscale_template.c
+++ b/libswscale/swscale_template.c
@@ -661,7 +661,9 @@ static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[],
         if (!enough_lines)
             break; //we can't output a dstY line so let's try with the next slice
 
-        if (HAVE_MMX) updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
+#if HAVE_MMX
+        updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
+#endif
         if (dstY < dstH-2) {
             const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
             const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
@@ -786,8 +788,10 @@ static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[],
     if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
 
-    if (HAVE_MMX2 && av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
+#if HAVE_MMX2
+    if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
         __asm__ volatile("sfence":::"memory");
+#endif
     emms_c();
 
     /* store changed local vars back in the context */
-- 
cgit v1.2.3


From 9bbd6a4cd89da4bfc9fd36fea5777a539a542b40 Mon Sep 17 00:00:00 2001
From: Mans Rullgard <mans@mansr.com>
Date: Tue, 24 May 2011 20:11:53 +0100
Subject: configure: enable memalign_hack automatically when needed

Signed-off-by: Mans Rullgard <mans@mansr.com>
---
 configure | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/configure b/configure
index 5e3d6fbe47..c628ff65bb 100755
--- a/configure
+++ b/configure
@@ -2823,11 +2823,6 @@ check_header X11/extensions/XvMClib.h
 
 check_struct dxva2api.h DXVA_PictureParameters wDecodedPictureIndex
 
-if ! enabled_any memalign memalign_hack posix_memalign malloc_aligned &&
-     enabled_any $need_memalign ; then
-    die "Error, no aligned memory allocator but SSE enabled, disable it or use --enable-memalign-hack."
-fi
-
 disabled  zlib || check_lib   zlib.h      zlibVersion -lz   || disable  zlib
 disabled bzlib || check_lib2 bzlib.h BZ2_bzlibVersion -lbz2 || disable bzlib
 
@@ -3091,6 +3086,9 @@ check_deps $CONFIG_LIST       \
 
 enabled asm || { arch=c; disable $ARCH_LIST $ARCH_EXT_LIST; }
 
+! enabled_any memalign posix_memalign malloc_aligned &&
+    enabled_any $need_memalign && enable memalign_hack
+
 echo "install prefix            $prefix"
 echo "source path               $source_path"
 echo "C compiler                $cc"
-- 
cgit v1.2.3