github.com/mpc-hc/mpc-hc.git
author    XhmikosR <xhmikosr@users.sourceforge.net>  2010-07-23 15:03:50 +0400
committer XhmikosR <xhmikosr@users.sourceforge.net>  2010-07-23 15:03:50 +0400
commit    d8cb0bd04f30184e2622b50b056f7664aea2d814 (patch)
tree      5bcce552fbfccaba2ac8b5f024980c5b1910ca0c /src/filters/transform/MPCVideoDec
parent    7c67a6e4045516f0fa222e29b03aed8fe8189b7f (diff)
Updated ffmpeg
git-svn-id: https://mpc-hc.svn.sourceforge.net/svnroot/mpc-hc/trunk@2144 10f7b99b-c216-0410-bff0-8a66a9350fd8
Diffstat (limited to 'src/filters/transform/MPCVideoDec')
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.h                    |  13
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8.c                     | 539
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8data.h                 |   2
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8dsp.c                  |  28
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8dsp.h                  |   2
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.c         |   4
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock_sse2.asm |  24
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_yasm.asm       | 660
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp-init.c         | 125
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp.asm            | 420
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/x86inc.asm            |  35
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavutil/mathematics.h              |   3
12 files changed, 1080 insertions(+), 775 deletions(-)
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.h
index dd541f22e..1e069dc4d 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.h
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.h
@@ -32,7 +32,11 @@
#include "vp56dsp.h"
typedef struct vp56_context VP56Context;
-typedef struct vp56_mv VP56mv;
+
+typedef struct {
+    DECLARE_ALIGNED(4, int16_t, x);
+    int16_t y;
+} VP56mv;
typedef void (*VP56ParseVectorAdjustment)(VP56Context *s,
VP56mv *vect);
@@ -61,11 +65,6 @@ typedef struct {
DCTELEM dc_coeff;
} VP56RefDc;
-struct vp56_mv {
- int x;
- int y;
-};
-
typedef struct {
uint8_t type;
VP56mv mv;
@@ -175,7 +174,7 @@ void vp56_init(AVCodecContext *avctx, int flip, int has_alpha);
int vp56_free(AVCodecContext *avctx);
void vp56_init_dequant(VP56Context *s, int quantizer);
int vp56_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
- const uint8_t *buf, int buf_size);
+ AVPacket *avpkt);
/**
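Annotation: the VP56mv change above (two int16_t in a 4-byte-aligned struct, replacing the old pair of ints) is what enables the 32-bit whole-vector tricks used throughout the vp8.c diff below. A minimal standalone sketch of the idea, using a hypothetical Mv stand-in type:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    typedef struct {
        int16_t x;
        int16_t y;
    } Mv;                           /* stand-in for VP56mv; alignment omitted */

    /* one aligned 32-bit load replaces two member reads (AV_RN32A's job) */
    static uint32_t mv_as_u32(const Mv *mv)
    {
        uint32_t v;
        memcpy(&v, mv, sizeof(v));  /* compilers fold this into one load */
        return v;
    }

    int main(void)
    {
        Mv a = { 3, -7 }, b = { 3, -7 };
        /* one word compare replaces (a.x ^ b.x) | (a.y ^ b.y) */
        printf("equal: %d\n", mv_as_u32(&a) == mv_as_u32(&b));
        return 0;
    }

Zeroing a vector becomes a single 32-bit store the same way, which is what the AV_WN32A(&mb->mv, 0) calls below do.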
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8.c
index 264fe72e7..90e873f6d 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8.c
@@ -29,7 +29,12 @@
#include "rectangle.h"
typedef struct {
- uint8_t segment;
+ uint8_t filter_level;
+ uint8_t inner_limit;
+ uint8_t inner_filter;
+} VP8FilterStrength;
+
+typedef struct {
uint8_t skip;
// todo: make it possible to check for at least (i4x4 or split_mv)
// in one op. are others needed?
@@ -79,10 +84,12 @@ typedef struct {
VP8Macroblock *macroblocks;
VP8Macroblock *macroblocks_base;
+ VP8FilterStrength *filter_strength;
int mb_stride;
uint8_t *intra4x4_pred_mode;
uint8_t *intra4x4_pred_mode_base;
+ uint8_t *segmentation_map;
int b4_stride;
/**
@@ -109,11 +116,14 @@ typedef struct {
*/
DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4];
DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16];
+ uint8_t intra4x4_pred_mode_mb[16];
int chroma_pred_mode; ///< 8x8c pred mode of the current macroblock
+ int segment; ///< segment of the current macroblock
int mbskip_enabled;
int sign_bias[4]; ///< one state [0, 1] per ref frame type
+ int ref_count[3];
/**
* Base parameters for segmentation, i.e. per-macroblock parameters.
@@ -205,6 +215,7 @@ static void vp8_decode_flush(AVCodecContext *avctx)
av_freep(&s->top_nnz);
av_freep(&s->edge_emu_buffer);
av_freep(&s->top_border);
+ av_freep(&s->segmentation_map);
s->macroblocks = NULL;
s->intra4x4_pred_mode = NULL;
@@ -229,15 +240,18 @@ static int update_dimensions(VP8Context *s, int width, int height)
s->mb_stride = s->mb_width+1;
s->b4_stride = 4*s->mb_stride;
- s->macroblocks_base = av_mallocz(s->mb_stride*(s->mb_height+1)*sizeof(*s->macroblocks));
+ s->macroblocks_base = av_mallocz((s->mb_stride+s->mb_height*2+2)*sizeof(*s->macroblocks));
+ s->filter_strength = av_mallocz(s->mb_stride*sizeof(*s->filter_strength));
s->intra4x4_pred_mode_base = av_mallocz(s->b4_stride*(4*s->mb_height+1));
s->top_nnz = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
s->top_border = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
+ s->segmentation_map = av_mallocz(s->mb_stride*s->mb_height);
- if (!s->macroblocks_base || !s->intra4x4_pred_mode_base || !s->top_nnz || !s->top_border)
+ if (!s->macroblocks_base || !s->filter_strength || !s->intra4x4_pred_mode_base ||
+ !s->top_nnz || !s->top_border || !s->segmentation_map)
return AVERROR(ENOMEM);
- s->macroblocks = s->macroblocks_base + 1 + s->mb_stride;
+ s->macroblocks = s->macroblocks_base + 1;
s->intra4x4_pred_mode = s->intra4x4_pred_mode_base + 4 + s->b4_stride;
memset(s->intra4x4_pred_mode_base, DC_PRED, s->b4_stride);
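Annotation: the smaller macroblocks_base allocation and the +1 base offset above go with a new interleaved two-row layout, which is why later hunks address the top neighbour as mb + 2 and the top-left as mb + 1 instead of using mb_stride. A sketch of the indexing as reconstructed from this patch (the mb_index helper is hypothetical):

    #include <assert.h>

    /* macroblock (x, y) appears to live at (mb_height - y - 1)*2 + x,
     * matching "mb = s->macroblocks + (s->mb_height - mb_y - 1)*2" in
     * the decode loop further down in this diff */
    static int mb_index(int x, int y, int mb_height)
    {
        return (mb_height - y - 1) * 2 + x;
    }

    int main(void)
    {
        int h = 5, x = 3, y = 2, cur = mb_index(x, y, h);
        assert(mb_index(x - 1, y,     h) == cur - 1); /* left     */
        assert(mb_index(x - 1, y - 1, h) == cur + 1); /* top-left */
        assert(mb_index(x,     y - 1, h) == cur + 2); /* top      */
        return 0;
    }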
@@ -520,39 +534,45 @@ static inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src,
}
static void find_near_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
- VP56mv near[2], VP56mv *best, int cnt[4])
+ VP56mv near[2], VP56mv *best, uint8_t cnt[4])
{
- VP8Macroblock *mb_edge[3] = { mb - s->mb_stride /* top */,
- mb - 1 /* left */,
- mb - s->mb_stride - 1 /* top-left */ };
+ VP8Macroblock *mb_edge[3] = { mb + 2 /* top */,
+ mb - 1 /* left */,
+ mb + 1 /* top-left */ };
enum { EDGE_TOP, EDGE_LEFT, EDGE_TOPLEFT };
VP56mv near_mv[4] = {{ 0 }};
enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
- int idx = CNT_ZERO, n;
+ int idx = CNT_ZERO;
int best_idx = CNT_ZERO;
+ int cur_sign_bias = s->sign_bias[mb->ref_frame];
+ int *sign_bias = s->sign_bias;
/* Process MB on top, left and top-left */
- for (n = 0; n < 3; n++) {
- VP8Macroblock *edge = mb_edge[n];
- if (edge->ref_frame != VP56_FRAME_CURRENT) {
- if (edge->mv.x | edge->mv.y) {
- VP56mv tmp = edge->mv;
- if (s->sign_bias[mb->ref_frame] != s->sign_bias[edge->ref_frame]) {
- tmp.x *= -1;
- tmp.y *= -1;
- }
- if ((tmp.x ^ near_mv[idx].x) | (tmp.y ^ near_mv[idx].y))
- near_mv[++idx] = tmp;
- cnt[idx] += 1 + (n != 2);
- } else
- cnt[CNT_ZERO] += 1 + (n != 2);
- }
+ #define MV_EDGE_CHECK(n)\
+ {\
+ VP8Macroblock *edge = mb_edge[n];\
+ int edge_ref = edge->ref_frame;\
+ if (edge_ref != VP56_FRAME_CURRENT) {\
+ uint32_t mv = AV_RN32A(&edge->mv);\
+ if (mv) {\
+ if (cur_sign_bias != sign_bias[edge_ref]) {\
+ /* SWAR negate of the values in mv. */\
+ mv = ~mv;\
+ mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
+ }\
+ if (!n || mv != AV_RN32A(&near_mv[idx]))\
+ AV_WN32A(&near_mv[++idx], mv);\
+ cnt[idx] += 1 + (n != 2);\
+ } else\
+ cnt[CNT_ZERO] += 1 + (n != 2);\
+ }\
}
+ MV_EDGE_CHECK(0)
+ MV_EDGE_CHECK(1)
+ MV_EDGE_CHECK(2)
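Annotation: the "SWAR negate" in MV_EDGE_CHECK flips the sign of both packed int16_t lanes in one 32-bit operation. A standalone check of the expression:

    #include <stdint.h>
    #include <assert.h>

    static uint32_t swar_negate16x2(uint32_t mv)
    {
        mv = ~mv;
        /* per-lane "+1" of two's-complement negation: the masked add
         * cannot carry across bit 15; the xor folds the sign bits back */
        return ((mv & 0x7fff7fff) + 0x00010001) ^ (mv & 0x80008000);
    }

    int main(void)
    {
        int16_t x = 13, y = -200;
        uint32_t packed = (uint16_t)x | ((uint32_t)(uint16_t)y << 16);
        uint32_t neg = swar_negate16x2(packed);
        assert((int16_t)(neg & 0xffff) == -x);
        assert((int16_t)(neg >> 16)    == -y);
        return 0;
    }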
- /* If we have three distinct MV's, merge first and last if they're the same */
- if (cnt[CNT_SPLITMV] &&
- !((near_mv[1+EDGE_TOP].x ^ near_mv[1+EDGE_TOPLEFT].x) |
- (near_mv[1+EDGE_TOP].y ^ near_mv[1+EDGE_TOPLEFT].y)))
+ /* If we have three distinct MVs, merge first and last if they're the same */
+ if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1+EDGE_TOP]) == AV_RN32A(&near_mv[1+EDGE_TOPLEFT]))
cnt[CNT_NEAREST] += 1;
cnt[CNT_SPLITMV] = ((mb_edge[EDGE_LEFT]->mode == VP8_MVMODE_SPLIT) +
@@ -561,8 +581,8 @@ static void find_near_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
/* Swap near and nearest if necessary */
if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
- FFSWAP(int, cnt[CNT_NEAREST], cnt[CNT_NEAR]);
- FFSWAP(VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
+ FFSWAP(uint8_t, cnt[CNT_NEAREST], cnt[CNT_NEAR]);
+ FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
}
/* Choose the best mv out of 0,0 and the nearest mv */
@@ -596,17 +616,13 @@ static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
}
-static const uint8_t *get_submv_prob(const VP56mv *left, const VP56mv *top)
+static const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
{
- int l_is_zero = !(left->x | left->y);
- int t_is_zero = !(top->x | top->y);
- int equal = !((left->x ^ top->x) | (left->y ^ top->y));
-
- if (equal)
- return l_is_zero ? vp8_submv_prob[4] : vp8_submv_prob[3];
- if (t_is_zero)
+ if (left == top)
+ return vp8_submv_prob[4-!!left];
+ if (!top)
return vp8_submv_prob[2];
- return l_is_zero ? vp8_submv_prob[1] : vp8_submv_prob[0];
+ return vp8_submv_prob[1-!!left];
}
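Annotation: the rewritten get_submv_prob folds the old four-way branch into table-index arithmetic; since left and top are now whole MVs packed into uint32_t, "is zero" is just !value. A quick equivalence check of the two formulations:

    #include <assert.h>
    #include <stdint.h>

    static int old_idx(uint32_t left, uint32_t top)
    {
        if (left == top) return left ? 3 : 4;
        if (!top)        return 2;
        return left ? 0 : 1;
    }

    static int new_idx(uint32_t left, uint32_t top)
    {
        if (left == top) return 4 - !!left;
        if (!top)        return 2;
        return 1 - !!left;
    }

    int main(void)
    {
        uint32_t vals[3] = { 0, 1, 0xdeadbeef };
        for (int i = 0; i < 3; i++)
            for (int j = 0; j < 3; j++)
                assert(old_idx(vals[i], vals[j]) == new_idx(vals[i], vals[j]));
        return 0;
    }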
/**
@@ -619,24 +635,29 @@ static int decode_splitmvs(VP8Context *s, VP56RangeCoder *c,
int part_idx = mb->partitioning =
vp8_rac_get_tree(c, vp8_mbsplit_tree, vp8_mbsplit_prob);
int n, num = vp8_mbsplit_count[part_idx];
- const uint8_t *mbsplits = vp8_mbsplits[part_idx],
+ VP8Macroblock *top_mb = &mb[2];
+ VP8Macroblock *left_mb = &mb[-1];
+ const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
+ *mbsplits_top = vp8_mbsplits[top_mb->partitioning],
+ *mbsplits_cur = vp8_mbsplits[part_idx],
*firstidx = vp8_mbfirstidx[part_idx];
+ VP56mv *top_mv = top_mb->bmv;
+ VP56mv *left_mv = left_mb->bmv;
+ VP56mv *cur_mv = mb->bmv;
for (n = 0; n < num; n++) {
int k = firstidx[n];
- const VP56mv *left, *above;
+ uint32_t left, above;
const uint8_t *submv_prob;
- if (!(k & 3)) {
- VP8Macroblock *left_mb = &mb[-1];
- left = &left_mb->bmv[vp8_mbsplits[left_mb->partitioning][k + 3]];
- } else
- left = &mb->bmv[mbsplits[k - 1]];
- if (k <= 3) {
- VP8Macroblock *above_mb = &mb[-s->mb_stride];
- above = &above_mb->bmv[vp8_mbsplits[above_mb->partitioning][k + 12]];
- } else
- above = &mb->bmv[mbsplits[k - 4]];
+ if (!(k & 3))
+ left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
+ else
+ left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
+ if (k <= 3)
+ above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
+ else
+ above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
submv_prob = get_submv_prob(left, above);
@@ -646,14 +667,13 @@ static int decode_splitmvs(VP8Context *s, VP56RangeCoder *c,
mb->bmv[n].x = base_mv->x + read_mv_component(c, s->prob->mvc[1]);
break;
case VP8_SUBMVMODE_ZERO4X4:
- mb->bmv[n].x = 0;
- mb->bmv[n].y = 0;
+ AV_WN32A(&mb->bmv[n], 0);
break;
case VP8_SUBMVMODE_LEFT4X4:
- mb->bmv[n] = *left;
+ AV_WN32A(&mb->bmv[n], left);
break;
case VP8_SUBMVMODE_TOP4X4:
- mb->bmv[n] = *above;
+ AV_WN32A(&mb->bmv[n], above);
break;
}
}
@@ -664,30 +684,33 @@ static int decode_splitmvs(VP8Context *s, VP56RangeCoder *c,
static inline void decode_intra4x4_modes(VP56RangeCoder *c, uint8_t *intra4x4,
int stride, int keyframe)
{
- int x, y, t, l;
- const uint8_t *ctx = vp8_pred4x4_prob_inter;
+ int x, y, t, l, i;
- for (y = 0; y < 4; y++) {
- for (x = 0; x < 4; x++) {
- if (keyframe) {
+ if (keyframe) {
+ const uint8_t *ctx;
+ for (y = 0; y < 4; y++) {
+ for (x = 0; x < 4; x++) {
t = intra4x4[x - stride];
l = intra4x4[x - 1];
ctx = vp8_pred4x4_prob_intra[t][l];
+ intra4x4[x] = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
}
- intra4x4[x] = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
+ intra4x4 += stride;
}
- intra4x4 += stride;
+ } else {
+ for (i = 0; i < 16; i++)
+ intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
}
}
static void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
- uint8_t *intra4x4)
+ uint8_t *intra4x4, uint8_t *segment)
{
VP56RangeCoder *c = &s->c;
- int n;
if (s->segmentation.update_map)
- mb->segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
+ *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
+ s->segment = *segment;
mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
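Annotation: intra4x4 modes are now stored two ways: keyframes keep the full strided plane (neighbouring macroblocks need the contexts), while inter frames decode into the new flat 16-byte intra4x4_pred_mode_mb scratch buffer. intra_predict below therefore walks the grid with stride = s->keyframe ? s->b4_stride : 4. A toy illustration of the shared addressing, with hypothetical buffer sizes:

    #include <stdint.h>
    #include <stdio.h>

    /* bx, by index the 4x4 sub-blocks of one macroblock */
    static uint8_t get_mode(const uint8_t *modes, int stride, int bx, int by)
    {
        return modes[by * stride + bx];
    }

    int main(void)
    {
        uint8_t per_mb[16]    = { 0 };  /* inter frame: stride 4        */
        uint8_t plane[4 * 64] = { 0 };  /* keyframe: stride = b4_stride */
        per_mb[1 * 4 + 2] = 5;
        plane[1 * 64 + 2] = 5;
        printf("%d %d\n", get_mode(per_mb, 4, 2, 1), get_mode(plane, 64, 2, 1));
        return 0;
    }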
@@ -703,7 +726,7 @@ static void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
mb->ref_frame = VP56_FRAME_CURRENT;
} else if (vp56_rac_get_prob(c, s->prob->intra)) {
VP56mv near[2], best;
- int cnt[4] = { 0 };
+ uint8_t cnt[4] = { 0 };
uint8_t p[4];
// inter MB, 16.2
@@ -712,19 +735,21 @@ static void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
else
mb->ref_frame = VP56_FRAME_PREVIOUS;
+ s->ref_count[mb->ref_frame-1]++;
// motion vectors, 16.3
find_near_mvs(s, mb, mb_x, mb_y, near, &best, cnt);
- for (n = 0; n < 4; n++)
- p[n] = vp8_mode_contexts[cnt[n]][n];
+ p[0] = vp8_mode_contexts[cnt[0]][0];
+ p[1] = vp8_mode_contexts[cnt[1]][1];
+ p[2] = vp8_mode_contexts[cnt[2]][2];
+ p[3] = vp8_mode_contexts[cnt[3]][3];
mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_mvinter, p);
switch (mb->mode) {
case VP8_MVMODE_SPLIT:
mb->mv = mb->bmv[decode_splitmvs(s, c, mb, &best) - 1];
break;
case VP8_MVMODE_ZERO:
- mb->mv.x = 0;
- mb->mv.y = 0;
+ AV_WN32A(&mb->mv, 0);
break;
case VP8_MVMODE_NEAREST:
clamp_mv(s, &mb->mv, &near[0], mb_x, mb_y);
@@ -745,13 +770,13 @@ static void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
// intra MB, 16.1
mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
- if (mb->mode == MODE_I4x4) {
- decode_intra4x4_modes(c, intra4x4, s->b4_stride, 0);
- } else
- fill_rectangle(intra4x4, 4, 4, s->b4_stride, vp8_pred4x4_mode[mb->mode], 1);
+ if (mb->mode == MODE_I4x4)
+ decode_intra4x4_modes(c, intra4x4, 4, 0);
s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
mb->ref_frame = VP56_FRAME_CURRENT;
+ mb->partitioning = VP8_SPLITMVMODE_NONE;
+ AV_WN32A(&mb->bmv[0], 0);
}
}
@@ -781,7 +806,7 @@ static int decode_block_coeffs(VP56RangeCoder *c, DCTELEM block[16],
else if (token >= DCT_CAT1) {
int cat = token-DCT_CAT1;
token = vp8_rac_get_coeff(c, vp8_dct_cat_prob[cat]);
- token += vp8_dct_cat_offset[cat];
+ token += 3 + (2<<cat);
}
// after the first token, the non-zero prediction context becomes
@@ -809,9 +834,7 @@ static void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb
LOCAL_ALIGNED_16(DCTELEM, dc,[16]);
int i, x, y, luma_start = 0, luma_ctx = 3;
int nnz_pred, nnz, nnz_total = 0;
- int segment = s->segmentation.enabled ? mb->segment : 0;
-
- s->dsp.clear_blocks((DCTELEM *)s->block);
+ int segment = s->segment;
if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
AV_ZERO128(dc);
@@ -917,7 +940,7 @@ static int check_intra_pred_mode(int mode, int mb_x, int mb_y)
}
static void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
- uint8_t *bmode, int mb_x, int mb_y)
+ uint8_t *intra4x4, int mb_x, int mb_y)
{
int x, y, mode, nnz, tr;
@@ -933,6 +956,7 @@ static void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
s->hpc.pred16x16[mode](dst[0], s->linesize);
} else {
uint8_t *ptr = dst[0];
+ int stride = s->keyframe ? s->b4_stride : 4;
// all blocks on the right edge of the macroblock use bottom edge
// the top macroblock for their topright edge
@@ -945,13 +969,16 @@ static void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
tr_right = (uint8_t *)&tr;
}
+ if (mb->skip)
+ AV_ZERO128(s->non_zero_count_cache);
+
for (y = 0; y < 4; y++) {
uint8_t *topright = ptr + 4 - s->linesize;
for (x = 0; x < 4; x++) {
if (x == 3)
topright = tr_right;
- s->hpc.pred4x4[bmode[x]](ptr+4*x, topright, s->linesize);
+ s->hpc.pred4x4[intra4x4[x]](ptr+4*x, topright, s->linesize);
nnz = s->non_zero_count_cache[y][x];
if (nnz) {
@@ -964,7 +991,7 @@ static void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
}
ptr += 4*s->linesize;
- bmode += s->b4_stride;
+ intra4x4 += stride;
}
}
@@ -1001,24 +1028,26 @@ static inline void vp8_mc(VP8Context *s, int luma,
int width, int height, int linesize,
vp8_mc_func mc_func[3][3])
{
- static const uint8_t idx[8] = { 0, 1, 2, 1, 2, 1, 2, 1 };
- int mx = (mv->x << luma)&7, mx_idx = idx[mx];
- int my = (mv->y << luma)&7, my_idx = idx[my];
-
- x_off += mv->x >> (3 - luma);
- y_off += mv->y >> (3 - luma);
-
- // edge emulation
- src += y_off * linesize + x_off;
- if (x_off < 2 || x_off >= width - block_w - 3 ||
- y_off < 2 || y_off >= height - block_h - 3) {
- ff_emulated_edge_mc(s->edge_emu_buffer, src - 2 * linesize - 2, linesize,
- block_w + 5, block_h + 5,
- x_off - 2, y_off - 2, width, height);
- src = s->edge_emu_buffer + 2 + linesize * 2;
- }
-
- mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
+ if (AV_RN32A(mv)) {
+ static const uint8_t idx[8] = { 0, 1, 2, 1, 2, 1, 2, 1 };
+ int mx = (mv->x << luma)&7, mx_idx = idx[mx];
+ int my = (mv->y << luma)&7, my_idx = idx[my];
+
+ x_off += mv->x >> (3 - luma);
+ y_off += mv->y >> (3 - luma);
+
+ // edge emulation
+ src += y_off * linesize + x_off;
+ if (x_off < 2 || x_off >= width - block_w - 3 ||
+ y_off < 2 || y_off >= height - block_h - 3) {
+ ff_emulated_edge_mc(s->edge_emu_buffer, src - 2 * linesize - 2, linesize,
+ block_w + 5, block_h + 5,
+ x_off - 2, y_off - 2, width, height);
+ src = s->edge_emu_buffer + 2 + linesize * 2;
+ }
+ mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
+ } else
+ mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
}
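Annotation: wrapping vp8_mc's body in if (AV_RN32A(mv)) gives zero vectors a fast path: no fractional-pel index math, no edge-emulation test, just the full-pel copy mc_func[0][0]. A stripped-down model of the dispatch (stand-in functions, not the real DSP table):

    #include <stdint.h>
    #include <stdio.h>

    /* hypothetical stand-ins; the real code dispatches through
     * mc_func[my_idx][mx_idx] from s->put_pixels_tab */
    static void subpel_filter(void) { puts("subpel + edge emulation path"); }
    static void copy_fullpel(void)  { puts("plain copy, mc_func[0][0]"); }

    static void mc(int16_t mvx, int16_t mvy)
    {
        uint32_t packed = (uint16_t)mvx | ((uint32_t)(uint16_t)mvy << 16);
        if (packed)             /* the AV_RN32A(mv) test from the patch */
            subpel_filter();
        else
            copy_fullpel();
    }

    int main(void)
    {
        mc(0, 0);   /* fast path     */
        mc(3, -1);  /* filtered path */
        return 0;
    }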
static inline void vp8_mc_part(VP8Context *s, uint8_t *dst[3],
@@ -1054,6 +1083,23 @@ static inline void vp8_mc_part(VP8Context *s, uint8_t *dst[3],
s->put_pixels_tab[1 + (block_w == 4)]);
}
+/* Fetch pixels for estimated mv 4 macroblocks ahead.
+ * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
+static inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
+{
+ /* Don't prefetch refs that haven't been used very often this frame. */
+ if (s->ref_count[ref-1] > (mb_xy >> 5)) {
+ int x_off = mb_x << 4, y_off = mb_y << 4;
+ int mx = mb->mv.x + x_off + 8;
+ int my = mb->mv.y + y_off;
+ uint8_t **src= s->framep[ref]->data;
+ int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
+ s->dsp.prefetch(src[0]+off, s->linesize, 4);
+ off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
+ s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
+ }
+}
+
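Annotation: the guard in prefetch_motion skips reference frames that have been used in fewer than one of every 32 macroblocks decoded so far this frame, so rarely used refs do not pollute the cache. The heuristic in isolation:

    #include <stdio.h>

    static int should_prefetch(int uses_so_far, int mbs_decoded)
    {
        return uses_so_far > (mbs_decoded >> 5);
    }

    int main(void)
    {
        printf("%d\n", should_prefetch(10, 64));  /* 10 > 2 -> prefetch */
        printf("%d\n", should_prefetch(1, 256));  /* 1 > 8  -> skip     */
        return 0;
    }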
/**
* Apply motion vectors to prediction buffer, chapter 18.
*/
@@ -1062,9 +1108,11 @@ static void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
{
int x_off = mb_x << 4, y_off = mb_y << 4;
int width = 16*s->mb_width, height = 16*s->mb_height;
+ AVFrame *ref = s->framep[mb->ref_frame];
+ VP56mv *bmv = mb->bmv;
if (mb->mode < VP8_MVMODE_SPLIT) {
- vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
+ vp8_mc_part(s, dst, ref, x_off, y_off,
0, 0, 16, 16, width, height, &mb->mv);
} else switch (mb->partitioning) {
case VP8_SPLITMVMODE_4x4: {
@@ -1075,7 +1123,7 @@ static void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
for (y = 0; y < 4; y++) {
for (x = 0; x < 4; x++) {
vp8_mc(s, 1, dst[0] + 4*y*s->linesize + x*4,
- s->framep[mb->ref_frame]->data[0], &mb->bmv[4*y + x],
+ ref->data[0], &bmv[4*y + x],
4*x + x_off, 4*y + y_off, 4, 4,
width, height, s->linesize,
s->put_pixels_tab[2]);
@@ -1101,12 +1149,12 @@ static void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
uvmv.y &= ~7;
}
vp8_mc(s, 0, dst[1] + 4*y*s->uvlinesize + x*4,
- s->framep[mb->ref_frame]->data[1], &uvmv,
+ ref->data[1], &uvmv,
4*x + x_off, 4*y + y_off, 4, 4,
width, height, s->uvlinesize,
s->put_pixels_tab[2]);
vp8_mc(s, 0, dst[2] + 4*y*s->uvlinesize + x*4,
- s->framep[mb->ref_frame]->data[2], &uvmv,
+ ref->data[2], &uvmv,
4*x + x_off, 4*y + y_off, 4, 4,
width, height, s->uvlinesize,
s->put_pixels_tab[2]);
@@ -1115,78 +1163,87 @@ static void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
break;
}
case VP8_SPLITMVMODE_16x8:
- vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
- 0, 0, 16, 8, width, height, &mb->bmv[0]);
- vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
- 0, 8, 16, 8, width, height, &mb->bmv[1]);
+ vp8_mc_part(s, dst, ref, x_off, y_off,
+ 0, 0, 16, 8, width, height, &bmv[0]);
+ vp8_mc_part(s, dst, ref, x_off, y_off,
+ 0, 8, 16, 8, width, height, &bmv[1]);
break;
case VP8_SPLITMVMODE_8x16:
- vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
- 0, 0, 8, 16, width, height, &mb->bmv[0]);
- vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
- 8, 0, 8, 16, width, height, &mb->bmv[1]);
+ vp8_mc_part(s, dst, ref, x_off, y_off,
+ 0, 0, 8, 16, width, height, &bmv[0]);
+ vp8_mc_part(s, dst, ref, x_off, y_off,
+ 8, 0, 8, 16, width, height, &bmv[1]);
break;
case VP8_SPLITMVMODE_8x8:
- vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
- 0, 0, 8, 8, width, height, &mb->bmv[0]);
- vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
- 8, 0, 8, 8, width, height, &mb->bmv[1]);
- vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
- 0, 8, 8, 8, width, height, &mb->bmv[2]);
- vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
- 8, 8, 8, 8, width, height, &mb->bmv[3]);
+ vp8_mc_part(s, dst, ref, x_off, y_off,
+ 0, 0, 8, 8, width, height, &bmv[0]);
+ vp8_mc_part(s, dst, ref, x_off, y_off,
+ 8, 0, 8, 8, width, height, &bmv[1]);
+ vp8_mc_part(s, dst, ref, x_off, y_off,
+ 0, 8, 8, 8, width, height, &bmv[2]);
+ vp8_mc_part(s, dst, ref, x_off, y_off,
+ 8, 8, 8, 8, width, height, &bmv[3]);
break;
}
}
-static void idct_mb(VP8Context *s, uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_dst,
- VP8Macroblock *mb)
+static void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb)
{
- int x, y, nnz;
+ int x, y, ch;
- if (mb->mode != MODE_I4x4)
+ if (mb->mode != MODE_I4x4) {
+ uint8_t *y_dst = dst[0];
for (y = 0; y < 4; y++) {
- for (x = 0; x < 4; x++) {
- nnz = s->non_zero_count_cache[y][x];
- if (nnz) {
- if (nnz == 1)
- s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
- else
- s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
+ uint32_t nnz4 = AV_RN32A(s->non_zero_count_cache[y]);
+ if (nnz4) {
+ if (nnz4&~0x01010101) {
+ for (x = 0; x < 4; x++) {
+ int nnz = s->non_zero_count_cache[y][x];
+ if (nnz) {
+ if (nnz == 1)
+ s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
+ else
+ s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
+ }
+ }
+ } else {
+ s->vp8dsp.vp8_idct_dc_add4y(y_dst, s->block[y], s->linesize);
}
}
y_dst += 4*s->linesize;
}
+ }
- for (y = 0; y < 2; y++) {
- for (x = 0; x < 2; x++) {
- nnz = s->non_zero_count_cache[4][(y<<1)+x];
- if (nnz) {
- if (nnz == 1)
- s->vp8dsp.vp8_idct_dc_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize);
- else
- s->vp8dsp.vp8_idct_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize);
- }
-
- nnz = s->non_zero_count_cache[5][(y<<1)+x];
- if (nnz) {
- if (nnz == 1)
- s->vp8dsp.vp8_idct_dc_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize);
- else
- s->vp8dsp.vp8_idct_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize);
+ for (ch = 0; ch < 2; ch++) {
+ uint32_t nnz4 = AV_RN32A(s->non_zero_count_cache[4+ch]);
+ if (nnz4) {
+ uint8_t *ch_dst = dst[1+ch];
+ if (nnz4&~0x01010101) {
+ for (y = 0; y < 2; y++) {
+ for (x = 0; x < 2; x++) {
+ int nnz = s->non_zero_count_cache[4+ch][(y<<1)+x];
+ if (nnz) {
+ if (nnz == 1)
+ s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
+ else
+ s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
+ }
+ }
+ ch_dst += 4*s->uvlinesize;
+ }
+ } else {
+ s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, s->block[4+ch], s->uvlinesize);
}
}
- u_dst += 4*s->uvlinesize;
- v_dst += 4*s->uvlinesize;
}
}
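Annotation: idct_mb now reads four per-block nonzero counts as one 32-bit word; nnz4 == 0 skips the row entirely, and nnz4 & ~0x01010101 separates "some block needs the full IDCT" from "every block is DC-only", which batches into the new vp8_idct_dc_add4y/4uv calls. A standalone check of the byte-wise test:

    #include <stdint.h>
    #include <assert.h>
    #include <string.h>

    /* one count per byte; nonzero result means some count exceeds 1 */
    static int any_count_gt1(const uint8_t counts[4])
    {
        uint32_t nnz4;
        memcpy(&nnz4, counts, 4);
        return (nnz4 & ~0x01010101u) != 0;
    }

    int main(void)
    {
        assert(!any_count_gt1((const uint8_t[4]){ 0, 1, 1, 0 })); /* DC-only   */
        assert( any_count_gt1((const uint8_t[4]){ 0, 2, 0, 0 })); /* full IDCT */
        return 0;
    }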
-static void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, int *level, int *inner, int *hev_thresh)
+static void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f )
{
int interior_limit, filter_level;
if (s->segmentation.enabled) {
- filter_level = s->segmentation.filter_level[mb->segment];
+ filter_level = s->segmentation.filter_level[s->segment];
if (!s->segmentation.absolute_vals)
filter_level += s->filter.level;
} else
@@ -1216,83 +1273,88 @@ static void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, int *level, in
}
interior_limit = FFMAX(interior_limit, 1);
- *level = filter_level;
- *inner = interior_limit;
-
- if (hev_thresh) {
- *hev_thresh = filter_level >= 15;
-
- if (s->keyframe) {
- if (filter_level >= 40)
- *hev_thresh = 2;
- } else {
- if (filter_level >= 40)
- *hev_thresh = 3;
- else if (filter_level >= 20)
- *hev_thresh = 2;
- }
- }
+ f->filter_level = filter_level;
+ f->inner_limit = interior_limit;
+ f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
}
-static void filter_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb, int mb_x, int mb_y)
+static void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
{
- int filter_level, inner_limit, hev_thresh, mbedge_lim, bedge_lim;
+ int mbedge_lim, bedge_lim, hev_thresh;
+ int filter_level = f->filter_level;
+ int inner_limit = f->inner_limit;
+ int inner_filter = f->inner_filter;
+ int linesize = s->linesize;
+ int uvlinesize = s->uvlinesize;
- filter_level_for_mb(s, mb, &filter_level, &inner_limit, &hev_thresh);
if (!filter_level)
return;
mbedge_lim = 2*(filter_level+2) + inner_limit;
bedge_lim = 2* filter_level + inner_limit;
+ hev_thresh = filter_level >= 15;
+
+ if (s->keyframe) {
+ if (filter_level >= 40)
+ hev_thresh = 2;
+ } else {
+ if (filter_level >= 40)
+ hev_thresh = 3;
+ else if (filter_level >= 20)
+ hev_thresh = 2;
+ }
if (mb_x) {
- s->vp8dsp.vp8_h_loop_filter16y(dst[0], s->linesize,
+ s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
mbedge_lim, inner_limit, hev_thresh);
- s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], s->uvlinesize,
+ s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
mbedge_lim, inner_limit, hev_thresh);
}
- if (!mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT) {
- s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, s->linesize, bedge_lim,
- inner_limit, hev_thresh);
- s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, s->linesize, bedge_lim,
- inner_limit, hev_thresh);
- s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, s->linesize, bedge_lim,
- inner_limit, hev_thresh);
- s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
- s->uvlinesize, bedge_lim,
- inner_limit, hev_thresh);
+ if (inner_filter) {
+ s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
+ inner_limit, hev_thresh);
+ s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
+ inner_limit, hev_thresh);
+ s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
+ inner_limit, hev_thresh);
+ s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
+ uvlinesize, bedge_lim,
+ inner_limit, hev_thresh);
}
if (mb_y) {
- s->vp8dsp.vp8_v_loop_filter16y(dst[0], s->linesize,
+ s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
mbedge_lim, inner_limit, hev_thresh);
- s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], s->uvlinesize,
+ s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
mbedge_lim, inner_limit, hev_thresh);
}
- if (!mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT) {
- s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*s->linesize,
- s->linesize, bedge_lim,
- inner_limit, hev_thresh);
- s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*s->linesize,
- s->linesize, bedge_lim,
- inner_limit, hev_thresh);
- s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*s->linesize,
- s->linesize, bedge_lim,
+ if (inner_filter) {
+ s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
+ linesize, bedge_lim,
+ inner_limit, hev_thresh);
+ s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
+ linesize, bedge_lim,
+ inner_limit, hev_thresh);
+ s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
+ linesize, bedge_lim,
+ inner_limit, hev_thresh);
+ s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
+ dst[2] + 4 * uvlinesize,
+ uvlinesize, bedge_lim,
inner_limit, hev_thresh);
- s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * s->uvlinesize,
- dst[2] + 4 * s->uvlinesize,
- s->uvlinesize, bedge_lim,
- inner_limit, hev_thresh);
}
}
-static void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8Macroblock *mb, int mb_x, int mb_y)
+static void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
{
- int filter_level, inner_limit, mbedge_lim, bedge_lim;
+ int mbedge_lim, bedge_lim;
+ int filter_level = f->filter_level;
+ int inner_limit = f->inner_limit;
+ int inner_filter = f->inner_filter;
+ int linesize = s->linesize;
- filter_level_for_mb(s, mb, &filter_level, &inner_limit, NULL);
if (!filter_level)
return;
@@ -1300,25 +1362,25 @@ static void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8Macroblock *mb, int
bedge_lim = 2* filter_level + inner_limit;
if (mb_x)
- s->vp8dsp.vp8_h_loop_filter_simple(dst, s->linesize, mbedge_lim);
- if (!mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT) {
- s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, s->linesize, bedge_lim);
- s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, s->linesize, bedge_lim);
- s->vp8dsp.vp8_h_loop_filter_simple(dst+12, s->linesize, bedge_lim);
+ s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
+ if (inner_filter) {
+ s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
+ s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
+ s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
}
if (mb_y)
- s->vp8dsp.vp8_v_loop_filter_simple(dst, s->linesize, mbedge_lim);
- if (!mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT) {
- s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*s->linesize, s->linesize, bedge_lim);
- s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*s->linesize, s->linesize, bedge_lim);
- s->vp8dsp.vp8_v_loop_filter_simple(dst+12*s->linesize, s->linesize, bedge_lim);
+ s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
+ if (inner_filter) {
+ s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
+ s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
+ s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
}
}
static void filter_mb_row(VP8Context *s, int mb_y)
{
- VP8Macroblock *mb = s->macroblocks + mb_y*s->mb_stride;
+ VP8FilterStrength *f = s->filter_strength;
uint8_t *dst[3] = {
s->framep[VP56_FRAME_CURRENT]->data[0] + 16*mb_y*s->linesize,
s->framep[VP56_FRAME_CURRENT]->data[1] + 8*mb_y*s->uvlinesize,
@@ -1328,7 +1390,7 @@ static void filter_mb_row(VP8Context *s, int mb_y)
for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
- filter_mb(s, dst, mb++, mb_x, mb_y);
+ filter_mb(s, dst, f++, mb_x, mb_y);
dst[0] += 16;
dst[1] += 8;
dst[2] += 8;
@@ -1337,26 +1399,26 @@ static void filter_mb_row(VP8Context *s, int mb_y)
static void filter_mb_row_simple(VP8Context *s, int mb_y)
{
+ VP8FilterStrength *f = s->filter_strength;
uint8_t *dst = s->framep[VP56_FRAME_CURRENT]->data[0] + 16*mb_y*s->linesize;
- VP8Macroblock *mb = s->macroblocks + mb_y*s->mb_stride;
int mb_x;
for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
backup_mb_border(s->top_border[mb_x+1], dst, NULL, NULL, s->linesize, 0, 1);
- filter_mb_simple(s, dst, mb++, mb_x, mb_y);
+ filter_mb_simple(s, dst, f++, mb_x, mb_y);
dst += 16;
}
}
static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
- const uint8_t *buf, int buf_size)
+ AVPacket *avpkt)
{
VP8Context *s = avctx->priv_data;
int ret, mb_x, mb_y, i, y, referenced;
enum AVDiscard skip_thresh;
- AVFrame *curframe;
+ AVFrame *curframe = NULL;
- if ((ret = decode_frame_header(s, buf, buf_size)) < 0)
+ if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
return ret;
referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
@@ -1407,13 +1469,19 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
+ /* Zero macroblock structures for top/left prediction from outside the frame. */
+ memset(s->macroblocks, 0, (s->mb_width + s->mb_height*2)*sizeof(*s->macroblocks));
+
// top edge of 127 for intra prediction
memset(s->top_border, 127, (s->mb_width+1)*sizeof(*s->top_border));
+ memset(s->ref_count, 0, sizeof(s->ref_count));
for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
- VP8Macroblock *mb = s->macroblocks + mb_y*s->mb_stride;
+ VP8Macroblock *mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
uint8_t *intra4x4 = s->intra4x4_pred_mode + 4*mb_y*s->b4_stride;
+ uint8_t *segment_map = s->segmentation_map + mb_y*s->mb_stride;
+ int mb_xy = mb_y * s->mb_stride;
uint8_t *dst[3] = {
curframe->data[0] + 16*mb_y*s->linesize,
curframe->data[1] + 8*mb_y*s->uvlinesize,
@@ -1430,25 +1498,30 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
if (mb_y)
memset(s->top_border, 129, sizeof(*s->top_border));
- for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
- decode_mb_mode(s, mb, mb_x, mb_y, intra4x4 + 4*mb_x);
+ for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
+ uint8_t *intra4x4_mb = s->keyframe ? intra4x4 + 4*mb_x : s->intra4x4_pred_mode_mb;
+ uint8_t *segment_mb = segment_map+mb_x;
+
+ /* Prefetch the current frame, 4 MBs ahead */
+ s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
+ s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
+
+ decode_mb_mode(s, mb, mb_x, mb_y, intra4x4_mb, segment_mb);
+
+ prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
if (!mb->skip)
decode_mb_coeffs(s, c, mb, s->top_nnz[mb_x], s->left_nnz);
- else {
- AV_ZERO128(s->non_zero_count_cache); // luma
- AV_ZERO64(s->non_zero_count_cache[4]); // chroma
- }
- if (mb->mode <= MODE_I4x4) {
- intra_predict(s, dst, mb, intra4x4 + 4*mb_x, mb_x, mb_y);
- memset(mb->bmv, 0, sizeof(mb->bmv));
- } else {
+ if (mb->mode <= MODE_I4x4)
+ intra_predict(s, dst, mb, intra4x4_mb, mb_x, mb_y);
+ else
inter_predict(s, dst, mb, mb_x, mb_y);
- }
+
+ prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
if (!mb->skip) {
- idct_mb(s, dst[0], dst[1], dst[2], mb);
+ idct_mb(s, dst, mb);
} else {
AV_ZERO64(s->left_nnz);
AV_WN64(s->top_nnz[mb_x], 0); // array of 9, so unaligned
@@ -1460,10 +1533,14 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
}
}
+ if (s->deblock_filter)
+ filter_level_for_mb(s, mb, &s->filter_strength[mb_x]);
+
+ prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
+
dst[0] += 16;
dst[1] += 8;
dst[2] += 8;
- mb++;
}
if (s->deblock_filter) {
if (s->filter.simple)
@@ -1508,7 +1585,7 @@ skip_decode:
*data_size = sizeof(AVFrame);
}
- return buf_size;
+ return avpkt->size;
}
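Annotation: both vp56_decode_frame and vp8_decode_frame switch from raw (buf, buf_size) arguments to an AVPacket, following the decode API libavcodec adopted in that era. A hedged sketch of the caller side (avcodec_decode_video2-generation API; exact entry points depend on the libavcodec version in this tree):

    #include <libavcodec/avcodec.h>

    static int decode_one(AVCodecContext *ctx, AVFrame *frame,
                          const uint8_t *buf, int size)
    {
        AVPacket pkt;
        int got_frame = 0;

        av_init_packet(&pkt);
        pkt.data = (uint8_t *)buf;  /* the packet now carries what used to
                                       be the raw (buf, buf_size) pair */
        pkt.size = size;

        if (avcodec_decode_video2(ctx, frame, &got_frame, &pkt) < 0)
            return -1;
        return got_frame;
    }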
static av_cold int vp8_decode_init(AVCodecContext *avctx)
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8data.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8data.h
index 5d718b4bb..9f56ab63b 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8data.h
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8data.h
@@ -369,8 +369,6 @@ static const uint8_t * const vp8_dct_cat_prob[6] =
vp8_dct_cat6_prob,
};
-static const uint8_t vp8_dct_cat_offset[6] = { 5, 7, 11, 19, 35, 67 };
-
static const uint8_t vp8_token_default_probs[4][8][3][NUM_DCT_TOKENS-1] =
{
{
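Annotation: the removed vp8_dct_cat_offset table is pure arithmetic: for cat = 0..5, 3 + (2 << cat) yields 5, 7, 11, 19, 35, 67, exactly what vp8.c now computes inline. A one-line verification:

    #include <assert.h>

    int main(void)
    {
        static const int offsets[6] = { 5, 7, 11, 19, 35, 67 };
        for (int cat = 0; cat < 6; cat++)
            assert(offsets[cat] == 3 + (2 << cat));
        return 0;
    }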
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8dsp.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8dsp.c
index 5e924017f..0c61d9252 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8dsp.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8dsp.c
@@ -69,6 +69,10 @@ static void vp8_idct_add_c(uint8_t *dst, DCTELEM block[16], int stride)
t1 = block[0*4+i] - block[2*4+i];
t2 = MUL_35468(block[1*4+i]) - MUL_20091(block[3*4+i]);
t3 = MUL_20091(block[1*4+i]) + MUL_35468(block[3*4+i]);
+ block[0*4+i] = 0;
+ block[1*4+i] = 0;
+ block[2*4+i] = 0;
+ block[3*4+i] = 0;
tmp[i*4+0] = t0 + t3;
tmp[i*4+1] = t1 + t2;
@@ -94,6 +98,7 @@ static void vp8_idct_dc_add_c(uint8_t *dst, DCTELEM block[16], int stride)
{
int i, dc = (block[0] + 4) >> 3;
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP + dc;
+ block[0] = 0;
for (i = 0; i < 4; i++) {
dst[0] = cm[dst[0]];
@@ -104,6 +109,21 @@ static void vp8_idct_dc_add_c(uint8_t *dst, DCTELEM block[16], int stride)
}
}
+static void vp8_idct_dc_add4uv_c(uint8_t *dst, DCTELEM block[4][16], int stride)
+{
+ vp8_idct_dc_add_c(dst+stride*0+0, block[0], stride);
+ vp8_idct_dc_add_c(dst+stride*0+4, block[1], stride);
+ vp8_idct_dc_add_c(dst+stride*4+0, block[2], stride);
+ vp8_idct_dc_add_c(dst+stride*4+4, block[3], stride);
+}
+
+static void vp8_idct_dc_add4y_c(uint8_t *dst, DCTELEM block[4][16], int stride)
+{
+ vp8_idct_dc_add_c(dst+ 0, block[0], stride);
+ vp8_idct_dc_add_c(dst+ 4, block[1], stride);
+ vp8_idct_dc_add_c(dst+ 8, block[2], stride);
+ vp8_idct_dc_add_c(dst+12, block[3], stride);
+}
// because I like only having two parameters to pass functions...
#define LOAD_PIXELS\
@@ -455,9 +475,11 @@ VP8_BILINEAR(4)
av_cold void ff_vp8dsp_init(VP8DSPContext *dsp)
{
- dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c;
- dsp->vp8_idct_add = vp8_idct_add_c;
- dsp->vp8_idct_dc_add = vp8_idct_dc_add_c;
+ dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c;
+ dsp->vp8_idct_add = vp8_idct_add_c;
+ dsp->vp8_idct_dc_add = vp8_idct_dc_add_c;
+ dsp->vp8_idct_dc_add4y = vp8_idct_dc_add4y_c;
+ dsp->vp8_idct_dc_add4uv = vp8_idct_dc_add4uv_c;
dsp->vp8_v_loop_filter16y = vp8_v_loop_filter16_c;
dsp->vp8_h_loop_filter16y = vp8_h_loop_filter16_c;
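Annotation: the two new IDCT entry points follow the usual libavcodec DSP pattern: the generic init fills function pointers with C implementations, and an arch-specific init (vp8dsp-init.c below) may overwrite them at runtime. A minimal model with hypothetical names:

    #include <stdio.h>

    typedef struct {
        void (*idct_dc_add4y)(const char *tag);
    } DSPContext;

    static void idct_dc_add4y_c(const char *tag)   { printf("C    %s\n", tag); }
    static void idct_dc_add4y_mmx(const char *tag) { printf("MMX  %s\n", tag); }

    static void dsp_init(DSPContext *dsp, int have_mmx)
    {
        dsp->idct_dc_add4y = idct_dc_add4y_c;        /* portable baseline */
        if (have_mmx)
            dsp->idct_dc_add4y = idct_dc_add4y_mmx;  /* runtime override  */
    }

    int main(void)
    {
        DSPContext dsp;
        dsp_init(&dsp, 1);
        dsp.idct_dc_add4y("dispatched via pointer");
        return 0;
    }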
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8dsp.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8dsp.h
index 64a3bfbc5..47b1a9077 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8dsp.h
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8dsp.h
@@ -33,6 +33,8 @@ typedef struct VP8DSPContext {
void (*vp8_luma_dc_wht)(DCTELEM block[4][4][16], DCTELEM dc[16]);
void (*vp8_idct_add)(uint8_t *dst, DCTELEM block[16], int stride);
void (*vp8_idct_dc_add)(uint8_t *dst, DCTELEM block[16], int stride);
+ void (*vp8_idct_dc_add4y)(uint8_t *dst, DCTELEM block[4][16], int stride);
+ void (*vp8_idct_dc_add4uv)(uint8_t *dst, DCTELEM block[4][16], int stride);
// loop filter applied to edges between macroblocks
void (*vp8_v_loop_filter16y)(uint8_t *dst, int stride,
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.c
index a94cfca0f..cd4e46219 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.c
@@ -2956,7 +2956,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
}
-#if CONFIG_GPL && HAVE_YASM
+#if HAVE_YASM
if (mm_flags & FF_MM_MMX2){
#if ARCH_X86_32
c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
@@ -2969,9 +2969,11 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2;
c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2;
#endif
+#if CONFIG_GPL
c->h264_idct_add16 = ff_h264_idct_add16_sse2;
c->h264_idct_add8 = ff_h264_idct_add8_sse2;
c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2;
+#endif
}
}
#endif
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock_sse2.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock_sse2.asm
index b2aa94023..a9e6dea3d 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock_sse2.asm
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock_sse2.asm
@@ -5,20 +5,22 @@
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
+;* This file is part of FFmpeg.
;*
-;* This program is distributed in the hope that it will be useful,
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;* GNU General Public License for more details.
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
-;*****************************************************************************
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
%include "x86inc.asm"
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_yasm.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_yasm.asm
index b4d50f5ad..660ff1169 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_yasm.asm
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_yasm.asm
@@ -1,330 +1,330 @@
-;******************************************************************************
-;* VC1 deblocking optimizations
-;* Copyright (c) 2009 David Conrad
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "x86inc.asm"
-%include "x86util.asm"
-
-cextern pw_4
-cextern pw_5
-
-section .text
-
-; dst_low, dst_high (src), zero
-; zero-extends one vector from 8 to 16 bits
-%macro UNPACK_8TO16 4
- mova m%2, m%3
- punpckh%1 m%3, m%4
- punpckl%1 m%2, m%4
-%endmacro
-
-%macro STORE_4_WORDS_MMX 6
- movd %6, %5
-%if mmsize==16
- psrldq %5, 4
-%else
- psrlq %5, 32
-%endif
- mov %1, %6w
- shr %6, 16
- mov %2, %6w
- movd %6, %5
- mov %3, %6w
- shr %6, 16
- mov %4, %6w
-%endmacro
-
-%macro STORE_4_WORDS_SSE4 6
- pextrw %1, %5, %6+0
- pextrw %2, %5, %6+1
- pextrw %3, %5, %6+2
- pextrw %4, %5, %6+3
-%endmacro
-
-; in: p1 p0 q0 q1, clobbers p0
-; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
-%macro VC1_LOOP_FILTER_A0 4
- psubw %1, %4
- psubw %2, %3
- paddw %1, %1
- pmullw %2, [pw_5]
- psubw %1, %2
- paddw %1, [pw_4]
- psraw %1, 3
-%endmacro
-
-; in: p0 q0 a0 a1 a2
-; m0 m1 m7 m6 m5
-; %1: size
-; out: m0=p0' m1=q0'
-%macro VC1_FILTER 1
- PABSW m4, m7
- PABSW m3, m6
- PABSW m2, m5
- mova m6, m4
- pminsw m3, m2
- pcmpgtw m6, m3 ; if (a2 < a0 || a1 < a0)
- psubw m3, m4
- pmullw m3, [pw_5] ; 5*(a3 - a0)
- PABSW m2, m3
- psraw m2, 3 ; abs(d/8)
- pxor m7, m3 ; d_sign ^= a0_sign
-
- pxor m5, m5
- movd m3, r2
-%if %1 > 4
- punpcklbw m3, m3
-%endif
- punpcklbw m3, m5
- pcmpgtw m3, m4 ; if (a0 < pq)
- pand m6, m3
-
- mova m3, m0
- psubw m3, m1
- PABSW m4, m3
- psraw m4, 1
- pxor m3, m7 ; d_sign ^ clip_sign
- psraw m3, 15
- pminsw m2, m4 ; min(d, clip)
- pcmpgtw m4, m5
- pand m6, m4 ; filt3 (C return value)
-
-; each set of 4 pixels is not filtered if the 3rd is not
-%if mmsize==16
- pshuflw m4, m6, 0xaa
-%if %1 > 4
- pshufhw m4, m4, 0xaa
-%endif
-%else
- pshufw m4, m6, 0xaa
-%endif
- pandn m3, m4
- pand m2, m6
- pand m3, m2 ; d final
-
- PSIGNW m3, m7
- psubw m0, m3
- paddw m1, m3
- packuswb m0, m0
- packuswb m1, m1
-%endmacro
-
-; 1st param: size of filter
-; 2nd param: mov suffix equivalent to the filter size
-%macro VC1_V_LOOP_FILTER 2
- pxor m5, m5
- mov%2 m6, [r4]
- mov%2 m4, [r4+r1]
- mov%2 m7, [r4+2*r1]
- mov%2 m0, [r4+r3]
- punpcklbw m6, m5
- punpcklbw m4, m5
- punpcklbw m7, m5
- punpcklbw m0, m5
-
- VC1_LOOP_FILTER_A0 m6, m4, m7, m0
- mov%2 m1, [r0]
- mov%2 m2, [r0+r1]
- punpcklbw m1, m5
- punpcklbw m2, m5
- mova m4, m0
- VC1_LOOP_FILTER_A0 m7, m4, m1, m2
- mov%2 m3, [r0+2*r1]
- mov%2 m4, [r0+r3]
- punpcklbw m3, m5
- punpcklbw m4, m5
- mova m5, m1
- VC1_LOOP_FILTER_A0 m5, m2, m3, m4
-
- VC1_FILTER %1
- mov%2 [r4+r3], m0
- mov%2 [r0], m1
-%endmacro
-
-; 1st param: size of filter
-; NOTE: UNPACK_8TO16 this number of 8 bit numbers are in half a register
-; 2nd (optional) param: temp register to use for storing words
-%macro VC1_H_LOOP_FILTER 1-2
-%if %1 == 4
- movq m0, [r0 -4]
- movq m1, [r0+ r1-4]
- movq m2, [r0+2*r1-4]
- movq m3, [r0+ r3-4]
- TRANSPOSE4x4B 0, 1, 2, 3, 4
-%else
- movq m0, [r0 -4]
- movq m4, [r0+ r1-4]
- movq m1, [r0+2*r1-4]
- movq m5, [r0+ r3-4]
- movq m2, [r4 -4]
- movq m6, [r4+ r1-4]
- movq m3, [r4+2*r1-4]
- movq m7, [r4+ r3-4]
- punpcklbw m0, m4
- punpcklbw m1, m5
- punpcklbw m2, m6
- punpcklbw m3, m7
- TRANSPOSE4x4W 0, 1, 2, 3, 4
-%endif
- pxor m5, m5
-
- UNPACK_8TO16 bw, 6, 0, 5
- UNPACK_8TO16 bw, 7, 1, 5
- VC1_LOOP_FILTER_A0 m6, m0, m7, m1
- UNPACK_8TO16 bw, 4, 2, 5
- mova m0, m1 ; m0 = p0
- VC1_LOOP_FILTER_A0 m7, m1, m4, m2
- UNPACK_8TO16 bw, 1, 3, 5
- mova m5, m4
- VC1_LOOP_FILTER_A0 m5, m2, m1, m3
- SWAP 1, 4 ; m1 = q0
-
- VC1_FILTER %1
- punpcklbw m0, m1
-%if %0 > 1
- STORE_4_WORDS_MMX [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
-%if %1 > 4
- psrldq m0, 4
- STORE_4_WORDS_MMX [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
-%endif
-%else
- STORE_4_WORDS_SSE4 [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
- STORE_4_WORDS_SSE4 [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
-%endif
-%endmacro
-
-
-%macro START_V_FILTER 0
- mov r4, r0
- lea r3, [4*r1]
- sub r4, r3
- lea r3, [r1+2*r1]
- imul r2, 0x01010101
-%endmacro
-
-%macro START_H_FILTER 1
- lea r3, [r1+2*r1]
-%if %1 > 4
- lea r4, [r0+4*r1]
-%endif
- imul r2, 0x01010101
-%endmacro
-
-; I dont know why the sign extension is needed...
-%macro PSIGNW_SRA_MMX 2
- psraw %2, 15
- PSIGNW_MMX %1, %2
-%endmacro
-
-
-%macro VC1_LF_MMX 1
-INIT_MMX
-cglobal vc1_v_loop_filter_internal_%1
- VC1_V_LOOP_FILTER 4, d
- ret
-
-cglobal vc1_h_loop_filter_internal_%1
- VC1_H_LOOP_FILTER 4, r4
- ret
-
-; void ff_vc1_v_loop_filter4_mmx2(uint8_t *src, int stride, int pq)
-cglobal vc1_v_loop_filter4_%1, 3,5,0
- START_V_FILTER
- call vc1_v_loop_filter_internal_%1
- RET
-
-; void ff_vc1_h_loop_filter4_mmx2(uint8_t *src, int stride, int pq)
-cglobal vc1_h_loop_filter4_%1, 3,5,0
- START_H_FILTER 4
- call vc1_h_loop_filter_internal_%1
- RET
-
-; void ff_vc1_v_loop_filter8_mmx2(uint8_t *src, int stride, int pq)
-cglobal vc1_v_loop_filter8_%1, 3,5,0
- START_V_FILTER
- call vc1_v_loop_filter_internal_%1
- add r4, 4
- add r0, 4
- call vc1_v_loop_filter_internal_%1
- RET
-
-; void ff_vc1_h_loop_filter8_mmx2(uint8_t *src, int stride, int pq)
-cglobal vc1_h_loop_filter8_%1, 3,5,0
- START_H_FILTER 4
- call vc1_h_loop_filter_internal_%1
- lea r0, [r0+4*r1]
- call vc1_h_loop_filter_internal_%1
- RET
-%endmacro
-
-%define PABSW PABSW_MMX
-%define PSIGNW PSIGNW_SRA_MMX
-VC1_LF_MMX mmx
-
-%define PABSW PABSW_MMX2
-VC1_LF_MMX mmx2
-
-INIT_XMM
-; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
-cglobal vc1_v_loop_filter8_sse2, 3,5,8
- START_V_FILTER
- VC1_V_LOOP_FILTER 8, q
- RET
-
-; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
-cglobal vc1_h_loop_filter8_sse2, 3,6,8
- START_H_FILTER 8
- VC1_H_LOOP_FILTER 8, r5
- RET
-
-%define PABSW PABSW_SSSE3
-%define PSIGNW PSIGNW_SSSE3
-
-INIT_MMX
-; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
-cglobal vc1_v_loop_filter4_ssse3, 3,5,0
- START_V_FILTER
- VC1_V_LOOP_FILTER 4, d
- RET
-
-; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
-cglobal vc1_h_loop_filter4_ssse3, 3,5,0
- START_H_FILTER 4
- VC1_H_LOOP_FILTER 4, r4
- RET
-
-INIT_XMM
-; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
-cglobal vc1_v_loop_filter8_ssse3, 3,5,8
- START_V_FILTER
- VC1_V_LOOP_FILTER 8, q
- RET
-
-; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
-cglobal vc1_h_loop_filter8_ssse3, 3,6,8
- START_H_FILTER 8
- VC1_H_LOOP_FILTER 8, r5
- RET
-
-; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
-cglobal vc1_h_loop_filter8_sse4, 3,5,8
- START_H_FILTER 8
- VC1_H_LOOP_FILTER 8
- RET
+;******************************************************************************
+;* VC1 deblocking optimizations
+;* Copyright (c) 2009 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+cextern pw_4
+cextern pw_5
+
+section .text
+
+; dst_low, dst_high (src), zero
+; zero-extends one vector from 8 to 16 bits
+%macro UNPACK_8TO16 4
+ mova m%2, m%3
+ punpckh%1 m%3, m%4
+ punpckl%1 m%2, m%4
+%endmacro
+
+%macro STORE_4_WORDS_MMX 6
+ movd %6, %5
+%if mmsize==16
+ psrldq %5, 4
+%else
+ psrlq %5, 32
+%endif
+ mov %1, %6w
+ shr %6, 16
+ mov %2, %6w
+ movd %6, %5
+ mov %3, %6w
+ shr %6, 16
+ mov %4, %6w
+%endmacro
+
+%macro STORE_4_WORDS_SSE4 6
+ pextrw %1, %5, %6+0
+ pextrw %2, %5, %6+1
+ pextrw %3, %5, %6+2
+ pextrw %4, %5, %6+3
+%endmacro
+
+; in: p1 p0 q0 q1, clobbers p0
+; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
+%macro VC1_LOOP_FILTER_A0 4
+ psubw %1, %4
+ psubw %2, %3
+ paddw %1, %1
+ pmullw %2, [pw_5]
+ psubw %1, %2
+ paddw %1, [pw_4]
+ psraw %1, 3
+%endmacro
+
+; in: p0 q0 a0 a1 a2
+; m0 m1 m7 m6 m5
+; %1: size
+; out: m0=p0' m1=q0'
+%macro VC1_FILTER 1
+ PABSW m4, m7
+ PABSW m3, m6
+ PABSW m2, m5
+ mova m6, m4
+ pminsw m3, m2
+ pcmpgtw m6, m3 ; if (a2 < a0 || a1 < a0)
+ psubw m3, m4
+ pmullw m3, [pw_5] ; 5*(a3 - a0)
+ PABSW m2, m3
+ psraw m2, 3 ; abs(d/8)
+ pxor m7, m3 ; d_sign ^= a0_sign
+
+ pxor m5, m5
+ movd m3, r2
+%if %1 > 4
+ punpcklbw m3, m3
+%endif
+ punpcklbw m3, m5
+ pcmpgtw m3, m4 ; if (a0 < pq)
+ pand m6, m3
+
+ mova m3, m0
+ psubw m3, m1
+ PABSW m4, m3
+ psraw m4, 1
+ pxor m3, m7 ; d_sign ^ clip_sign
+ psraw m3, 15
+ pminsw m2, m4 ; min(d, clip)
+ pcmpgtw m4, m5
+ pand m6, m4 ; filt3 (C return value)
+
+; each set of 4 pixels is not filtered if the 3rd is not
+%if mmsize==16
+ pshuflw m4, m6, 0xaa
+%if %1 > 4
+ pshufhw m4, m4, 0xaa
+%endif
+%else
+ pshufw m4, m6, 0xaa
+%endif
+ pandn m3, m4
+ pand m2, m6
+ pand m3, m2 ; d final
+
+ PSIGNW m3, m7
+ psubw m0, m3
+ paddw m1, m3
+ packuswb m0, m0
+ packuswb m1, m1
+%endmacro
+
+; 1st param: size of filter
+; 2nd param: mov suffix equivalent to the filter size
+%macro VC1_V_LOOP_FILTER 2
+ pxor m5, m5
+ mov%2 m6, [r4]
+ mov%2 m4, [r4+r1]
+ mov%2 m7, [r4+2*r1]
+ mov%2 m0, [r4+r3]
+ punpcklbw m6, m5
+ punpcklbw m4, m5
+ punpcklbw m7, m5
+ punpcklbw m0, m5
+
+ VC1_LOOP_FILTER_A0 m6, m4, m7, m0
+ mov%2 m1, [r0]
+ mov%2 m2, [r0+r1]
+ punpcklbw m1, m5
+ punpcklbw m2, m5
+ mova m4, m0
+ VC1_LOOP_FILTER_A0 m7, m4, m1, m2
+ mov%2 m3, [r0+2*r1]
+ mov%2 m4, [r0+r3]
+ punpcklbw m3, m5
+ punpcklbw m4, m5
+ mova m5, m1
+ VC1_LOOP_FILTER_A0 m5, m2, m3, m4
+
+ VC1_FILTER %1
+ mov%2 [r4+r3], m0
+ mov%2 [r0], m1
+%endmacro
+
+; 1st param: size of filter
+; NOTE: UNPACK_8TO16 this number of 8 bit numbers are in half a register
+; 2nd (optional) param: temp register to use for storing words
+%macro VC1_H_LOOP_FILTER 1-2
+%if %1 == 4
+ movq m0, [r0 -4]
+ movq m1, [r0+ r1-4]
+ movq m2, [r0+2*r1-4]
+ movq m3, [r0+ r3-4]
+ TRANSPOSE4x4B 0, 1, 2, 3, 4
+%else
+ movq m0, [r0 -4]
+ movq m4, [r0+ r1-4]
+ movq m1, [r0+2*r1-4]
+ movq m5, [r0+ r3-4]
+ movq m2, [r4 -4]
+ movq m6, [r4+ r1-4]
+ movq m3, [r4+2*r1-4]
+ movq m7, [r4+ r3-4]
+ punpcklbw m0, m4
+ punpcklbw m1, m5
+ punpcklbw m2, m6
+ punpcklbw m3, m7
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+%endif
+ pxor m5, m5
+
+ UNPACK_8TO16 bw, 6, 0, 5
+ UNPACK_8TO16 bw, 7, 1, 5
+ VC1_LOOP_FILTER_A0 m6, m0, m7, m1
+ UNPACK_8TO16 bw, 4, 2, 5
+ mova m0, m1 ; m0 = p0
+ VC1_LOOP_FILTER_A0 m7, m1, m4, m2
+ UNPACK_8TO16 bw, 1, 3, 5
+ mova m5, m4
+ VC1_LOOP_FILTER_A0 m5, m2, m1, m3
+ SWAP 1, 4 ; m1 = q0
+
+ VC1_FILTER %1
+ punpcklbw m0, m1
+%if %0 > 1
+ STORE_4_WORDS_MMX [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
+%if %1 > 4
+ psrldq m0, 4
+ STORE_4_WORDS_MMX [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
+%endif
+%else
+ STORE_4_WORDS_SSE4 [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
+ STORE_4_WORDS_SSE4 [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
+%endif
+%endmacro
+
+
+%macro START_V_FILTER 0
+ mov r4, r0
+ lea r3, [4*r1]
+ sub r4, r3
+ lea r3, [r1+2*r1]
+ imul r2, 0x01010101
+%endmacro
+
+%macro START_H_FILTER 1
+ lea r3, [r1+2*r1]
+%if %1 > 4
+ lea r4, [r0+4*r1]
+%endif
+ imul r2, 0x01010101
+%endmacro
+
+; I dont know why the sign extension is needed...
+%macro PSIGNW_SRA_MMX 2
+ psraw %2, 15
+ PSIGNW_MMX %1, %2
+%endmacro
+
+
+%macro VC1_LF_MMX 1
+INIT_MMX
+cglobal vc1_v_loop_filter_internal_%1
+ VC1_V_LOOP_FILTER 4, d
+ ret
+
+cglobal vc1_h_loop_filter_internal_%1
+ VC1_H_LOOP_FILTER 4, r4
+ ret
+
+; void ff_vc1_v_loop_filter4_mmx2(uint8_t *src, int stride, int pq)
+cglobal vc1_v_loop_filter4_%1, 3,5,0
+ START_V_FILTER
+ call vc1_v_loop_filter_internal_%1
+ RET
+
+; void ff_vc1_h_loop_filter4_mmx2(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter4_%1, 3,5,0
+ START_H_FILTER 4
+ call vc1_h_loop_filter_internal_%1
+ RET
+
+; void ff_vc1_v_loop_filter8_mmx2(uint8_t *src, int stride, int pq)
+cglobal vc1_v_loop_filter8_%1, 3,5,0
+ START_V_FILTER
+ call vc1_v_loop_filter_internal_%1
+ add r4, 4
+ add r0, 4
+ call vc1_v_loop_filter_internal_%1
+ RET
+
+; void ff_vc1_h_loop_filter8_mmx2(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter8_%1, 3,5,0
+ START_H_FILTER 4
+ call vc1_h_loop_filter_internal_%1
+ lea r0, [r0+4*r1]
+ call vc1_h_loop_filter_internal_%1
+ RET
+%endmacro
+
+%define PABSW PABSW_MMX
+%define PSIGNW PSIGNW_SRA_MMX
+VC1_LF_MMX mmx
+
+%define PABSW PABSW_MMX2
+VC1_LF_MMX mmx2
+
+INIT_XMM
+; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
+cglobal vc1_v_loop_filter8_sse2, 3,5,8
+ START_V_FILTER
+ VC1_V_LOOP_FILTER 8, q
+ RET
+
+; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter8_sse2, 3,6,8
+ START_H_FILTER 8
+ VC1_H_LOOP_FILTER 8, r5
+ RET
+
+%define PABSW PABSW_SSSE3
+%define PSIGNW PSIGNW_SSSE3
+
+INIT_MMX
+; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
+cglobal vc1_v_loop_filter4_ssse3, 3,5,0
+ START_V_FILTER
+ VC1_V_LOOP_FILTER 4, d
+ RET
+
+; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter4_ssse3, 3,5,0
+ START_H_FILTER 4
+ VC1_H_LOOP_FILTER 4, r4
+ RET
+
+INIT_XMM
+; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
+cglobal vc1_v_loop_filter8_ssse3, 3,5,8
+ START_V_FILTER
+ VC1_V_LOOP_FILTER 8, q
+ RET
+
+; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter8_ssse3, 3,6,8
+ START_H_FILTER 8
+ VC1_H_LOOP_FILTER 8, r5
+ RET
+
+; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter8_sse4, 3,5,8
+ START_H_FILTER 8
+ VC1_H_LOOP_FILTER 8
+ RET
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp-init.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp-init.c
index d75f1a1d8..e06da5e42 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp-init.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp-init.c
@@ -220,67 +220,39 @@ HVBILIN(ssse3, 8, 16, 16)
extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
-extern void ff_vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]);
+extern void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, DCTELEM block[4][16], int stride);
+extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, DCTELEM block[4][16], int stride);
+extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, DCTELEM block[4][16], int stride);
+extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
+extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride);
+
+#define DECLARE_LOOP_FILTER(NAME)\
+extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\
+extern void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\
+extern void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, int stride,\
+ int e, int i, int hvt);\
+extern void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, int stride,\
+ int e, int i, int hvt);\
+extern void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, uint8_t *dstV,\
+ int s, int e, int i, int hvt);\
+extern void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, uint8_t *dstV,\
+ int s, int e, int i, int hvt);\
+extern void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, int stride,\
+ int e, int i, int hvt);\
+extern void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, int stride,\
+ int e, int i, int hvt);\
+extern void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, uint8_t *dstV,\
+ int s, int e, int i, int hvt);\
+extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, uint8_t *dstV,\
+ int s, int e, int i, int hvt);
+
+DECLARE_LOOP_FILTER(mmx)
+DECLARE_LOOP_FILTER(mmxext)
+DECLARE_LOOP_FILTER(sse2)
+DECLARE_LOOP_FILTER(ssse3)
+DECLARE_LOOP_FILTER(sse4)
-extern void ff_vp8_v_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim);
-extern void ff_vp8_v_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
-extern void ff_vp8_v_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim);
-extern void ff_vp8_h_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim);
-extern void ff_vp8_h_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
-extern void ff_vp8_h_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim);
-
-extern void ff_vp8_v_loop_filter16y_inner_mmx (uint8_t *dst, int stride,
- int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter16y_inner_mmxext(uint8_t *dst, int stride,
- int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter16y_inner_sse2 (uint8_t *dst, int stride,
- int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter16y_inner_mmx (uint8_t *dst, int stride,
- int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter16y_inner_mmxext(uint8_t *dst, int stride,
- int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter16y_inner_sse2 (uint8_t *dst, int stride,
- int e, int i, int hvt);
-
-extern void ff_vp8_v_loop_filter8uv_inner_mmx (uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter8uv_inner_sse2 (uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter8uv_inner_mmx (uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter8uv_inner_sse2 (uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-
-extern void ff_vp8_v_loop_filter16y_mbedge_mmx (uint8_t *dst, int stride,
- int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter16y_mbedge_mmxext(uint8_t *dst, int stride,
- int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter16y_mbedge_sse2 (uint8_t *dst, int stride,
- int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter16y_mbedge_mmx (uint8_t *dst, int stride,
- int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter16y_mbedge_mmxext(uint8_t *dst, int stride,
- int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter16y_mbedge_sse2 (uint8_t *dst, int stride,
- int e, int i, int hvt);
-
-extern void ff_vp8_v_loop_filter8uv_mbedge_mmx (uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter8uv_mbedge_mmxext(uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter8uv_mbedge_sse2 (uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter8uv_mbedge_mmx (uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter8uv_mbedge_mmxext(uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter8uv_mbedge_sse2 (uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
#endif
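The DECLARE_LOOP_FILTER macro above collapses the ten loop-filter prototypes per instruction set into a single invocation per suffix. A trimmed, self-contained illustration of the token-pasting pattern (two prototypes per invocation instead of ten):

    #include <stdint.h>

    /* The ## operator pastes the ISA suffix onto each function name,
     * so one invocation per suffix replaces a block of hand-written
     * declarations. */
    #define DECLARE_SIMPLE_FILTER(NAME) \
    void ff_vp8_v_loop_filter_simple_##NAME(uint8_t *dst, int stride, int flim); \
    void ff_vp8_h_loop_filter_simple_##NAME(uint8_t *dst, int stride, int flim);

    DECLARE_SIMPLE_FILTER(mmx)  /* ff_vp8_v/h_loop_filter_simple_mmx */
    DECLARE_SIMPLE_FILTER(sse2) /* ff_vp8_v/h_loop_filter_simple_sse2 */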
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
@@ -313,8 +285,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
#if HAVE_YASM
if (mm_flags & FF_MM_MMX) {
- c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
- c->vp8_idct_add = ff_vp8_idct_add_mmx;
+ c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
+ c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx;
+ c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
+ c->vp8_idct_add = ff_vp8_idct_add_mmx;
+ c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
c->put_vp8_epel_pixels_tab[0][0][0] =
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
c->put_vp8_epel_pixels_tab[1][0][0] =
@@ -337,7 +312,6 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
/* note that 4-tap width=16 functions are missing because w=16
* is only used for luma, and luma is always a copy or sixtap. */
if (mm_flags & FF_MM_MMX2) {
- c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmxext;
#if ARCH_X86_32
VP8_LUMA_MC_FUNC(0, 16, mmxext);
VP8_MC_FUNC(1, 8, mmxext);
@@ -362,6 +336,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
}
if (mm_flags & FF_MM_SSE) {
+ c->vp8_idct_add = ff_vp8_idct_add_sse;
c->put_vp8_epel_pixels_tab[0][0][0] =
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
}
@@ -380,16 +355,18 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
- c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext;
- c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
+ c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
+ c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
}
if (mm_flags & FF_MM_SSE2) {
+ c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
+
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
- c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext;
- c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
+ c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
+ c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
}
if (mm_flags & FF_MM_SSSE3) {
@@ -401,10 +378,26 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
#endif
+
+ c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
+ c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
+
+ c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
+ c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
+ c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
+ c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;
+
+ c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3;
+ c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
+ c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
+ c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
}
if (mm_flags & FF_MM_SSE4) {
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
+
+ c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
+ c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
}
#endif
}
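Throughout ff_vp8dsp_init_x86 the flag checks run from weakest to strongest instruction set, so each block can simply overwrite pointers set by an earlier one; the SSE4 block, for instance, only replaces the three functions where SSE4 actually helps. A reduced sketch of the pattern (placeholder names, not the ffmpeg API):

    #include <stdint.h>

    /* Placeholder kernels standing in for the real asm implementations. */
    static void idct_dc_add_mmx(uint8_t *d, int16_t *b, int s)  { (void)d; (void)b; (void)s; }
    static void idct_dc_add_sse4(uint8_t *d, int16_t *b, int s) { (void)d; (void)b; (void)s; }

    typedef struct {
        void (*vp8_idct_dc_add)(uint8_t *dst, int16_t *block, int stride);
    } DSPCtx;

    enum { HAS_MMX = 1, HAS_SSE2 = 2, HAS_SSE4 = 4 };

    /* Checks run from weakest to strongest ISA, so a later block simply
     * overwrites the pointer set by an earlier one. */
    static void dsp_init(DSPCtx *c, int flags)
    {
        if (flags & HAS_MMX)
            c->vp8_idct_dc_add = idct_dc_add_mmx;
        if (flags & HAS_SSE4)
            c->vp8_idct_dc_add = idct_dc_add_sse4; /* overrides the MMX pick */
    }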
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp.asm
index 2ff415266..4aa901e27 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp.asm
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp.asm
@@ -900,74 +900,189 @@ cglobal put_vp8_pixels16_sse, 5,5,2
REP_RET
;-----------------------------------------------------------------------------
-; IDCT functions:
-;
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------
+%macro ADD_DC 4
+ %4 m2, [r0+%3]
+ %4 m3, [r0+r2+%3]
+ %4 m4, [r1+%3]
+ %4 m5, [r1+r2+%3]
+ paddusb m2, %1
+ paddusb m3, %1
+ paddusb m4, %1
+ paddusb m5, %1
+ psubusb m2, %2
+ psubusb m3, %2
+ psubusb m4, %2
+ psubusb m5, %2
+ %4 [r0+%3], m2
+ %4 [r0+r2+%3], m3
+ %4 [r1+%3], m4
+ %4 [r1+r2+%3], m5
+%endmacro
+
+INIT_MMX
cglobal vp8_idct_dc_add_mmx, 3, 3
; load data
- movd mm0, [r1]
+ movd m0, [r1]
; calculate DC
- paddw mm0, [pw_4]
- pxor mm1, mm1
- psraw mm0, 3
- psubw mm1, mm0
- packuswb mm0, mm0
- packuswb mm1, mm1
- punpcklbw mm0, mm0
- punpcklbw mm1, mm1
- punpcklwd mm0, mm0
- punpcklwd mm1, mm1
+ paddw m0, [pw_4]
+ pxor m1, m1
+ psraw m0, 3
+ movd [r1], m1
+ psubw m1, m0
+ packuswb m0, m0
+ packuswb m1, m1
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+ punpcklwd m0, m0
+ punpcklwd m1, m1
; add DC
- lea r1, [r0+r2*2]
- movd mm2, [r0]
- movd mm3, [r0+r2]
- movd mm4, [r1]
- movd mm5, [r1+r2]
- paddusb mm2, mm0
- paddusb mm3, mm0
- paddusb mm4, mm0
- paddusb mm5, mm0
- psubusb mm2, mm1
- psubusb mm3, mm1
- psubusb mm4, mm1
- psubusb mm5, mm1
- movd [r0], mm2
- movd [r0+r2], mm3
- movd [r1], mm4
- movd [r1+r2], mm5
+ lea r1, [r0+r2*2]
+ ADD_DC m0, m1, 0, movh
RET
+INIT_XMM
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
; load data
- movd xmm0, [r1]
- lea r1, [r0+r2*2]
- pxor xmm1, xmm1
- movq xmm2, [pw_4]
+ movd m0, [r1]
+ pxor m1, m1
+
+ ; calculate DC
+ paddw m0, [pw_4]
+ movd [r1], m1
+ lea r1, [r0+r2*2]
+ movd m2, [r0]
+ movd m3, [r0+r2]
+ movd m4, [r1]
+ movd m5, [r1+r2]
+ psraw m0, 3
+ pshuflw m0, m0, 0
+ punpcklqdq m0, m0
+ punpckldq m2, m3
+ punpckldq m4, m5
+ punpcklbw m2, m1
+ punpcklbw m4, m1
+ paddw m2, m0
+ paddw m4, m0
+ packuswb m2, m4
+ movd [r0], m2
+ pextrd [r0+r2], m2, 1
+ pextrd [r1], m2, 2
+ pextrd [r1+r2], m2, 3
+ RET
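Both routines compute the same thing, and the new movd [r1], m1 store also zeroes the DC coefficient so the caller no longer has to. A hedged scalar reference follows; note that ADD_DC's paddusb/psubusb pair is this same signed add, split into a saturating add of max(dc, 0) and a saturating subtract of max(-dc, 0):

    #include <stdint.h>

    static uint8_t clip_uint8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* Scalar model of vp8_idct_dc_add_<opt>: with only the DC
     * coefficient present, the whole 4x4 inverse transform collapses to
     * one rounded value added to every pixel, and the coefficient slot
     * is cleared in the same pass. */
    void idct_dc_add_c(uint8_t *dst, int16_t block[16], int stride)
    {
        int dc = (block[0] + 4) >> 3;

        block[0] = 0;
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                dst[y * stride + x] = clip_uint8(dst[y * stride + x] + dc);
    }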
+
+;-----------------------------------------------------------------------------
+; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
+;-----------------------------------------------------------------------------
+
+INIT_MMX
+cglobal vp8_idct_dc_add4y_mmx, 3, 3
+ ; load data
+ movd m0, [r1+32*0] ; A
+ movd m1, [r1+32*2] ; C
+ punpcklwd m0, [r1+32*1] ; A B
+ punpcklwd m1, [r1+32*3] ; C D
+ punpckldq m0, m1 ; A B C D
+ pxor m6, m6
+
+ ; calculate DC
+ paddw m0, [pw_4]
+ movd [r1+32*0], m6
+ movd [r1+32*1], m6
+ movd [r1+32*2], m6
+ movd [r1+32*3], m6
+ psraw m0, 3
+ psubw m6, m0
+ packuswb m0, m0
+ packuswb m6, m6
+ punpcklbw m0, m0 ; AABBCCDD
+ punpcklbw m6, m6 ; AABBCCDD
+ movq m1, m0
+ movq m7, m6
+ punpcklbw m0, m0 ; AAAABBBB
+ punpckhbw m1, m1 ; CCCCDDDD
+ punpcklbw m6, m6 ; AAAABBBB
+ punpckhbw m7, m7 ; CCCCDDDD
+
+ ; add DC
+ lea r1, [r0+r2*2]
+ ADD_DC m0, m6, 0, mova
+ ADD_DC m1, m7, 8, mova
+ RET
+
+INIT_XMM
+cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
+ ; load data
+ movd m0, [r1+32*0] ; A
+ movd m1, [r1+32*2] ; C
+ punpcklwd m0, [r1+32*1] ; A B
+ punpcklwd m1, [r1+32*3] ; C D
+ punpckldq m0, m1 ; A B C D
+ pxor m1, m1
; calculate DC
- paddw xmm0, xmm2
- movd xmm2, [r0]
- movd xmm3, [r0+r2]
- movd xmm4, [r1]
- movd xmm5, [r1+r2]
- psraw xmm0, 3
- pshuflw xmm0, xmm0, 0
- punpcklqdq xmm0, xmm0
- punpckldq xmm2, xmm3
- punpckldq xmm4, xmm5
- punpcklbw xmm2, xmm1
- punpcklbw xmm4, xmm1
- paddw xmm2, xmm0
- paddw xmm4, xmm0
- packuswb xmm2, xmm4
- movd [r0], xmm2
- pextrd [r0+r2], xmm2, 1
- pextrd [r1], xmm2, 2
- pextrd [r1+r2], xmm2, 3
+ paddw m0, [pw_4]
+ movd [r1+32*0], m1
+ movd [r1+32*1], m1
+ movd [r1+32*2], m1
+ movd [r1+32*3], m1
+ psraw m0, 3
+ psubw m1, m0
+ packuswb m0, m0
+ packuswb m1, m1
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+
+ ; add DC
+ lea r1, [r0+r2*2]
+ ADD_DC m0, m1, 0, mova
+ RET
+
+;-----------------------------------------------------------------------------
+; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
+;-----------------------------------------------------------------------------
+
+INIT_MMX
+cglobal vp8_idct_dc_add4uv_mmx, 3, 3
+ ; load data
+ movd m0, [r1+32*0] ; A
+ movd m1, [r1+32*2] ; C
+ punpcklwd m0, [r1+32*1] ; A B
+ punpcklwd m1, [r1+32*3] ; C D
+ punpckldq m0, m1 ; A B C D
+ pxor m6, m6
+
+ ; calculate DC
+ paddw m0, [pw_4]
+ movd [r1+32*0], m6
+ movd [r1+32*1], m6
+ movd [r1+32*2], m6
+ movd [r1+32*3], m6
+ psraw m0, 3
+ psubw m6, m0
+ packuswb m0, m0
+ packuswb m6, m6
+ punpcklbw m0, m0 ; AABBCCDD
+ punpcklbw m6, m6 ; AABBCCDD
+ movq m1, m0
+ movq m7, m6
+ punpcklbw m0, m0 ; AAAABBBB
+ punpckhbw m1, m1 ; CCCCDDDD
+ punpcklbw m6, m6 ; AAAABBBB
+ punpckhbw m7, m7 ; CCCCDDDD
+
+ ; add DC
+ lea r1, [r0+r2*2]
+ ADD_DC m0, m6, 0, mova
+ lea r0, [r0+r2*4]
+ lea r1, [r1+r2*4]
+ ADD_DC m1, m7, 0, mova
RET
;-----------------------------------------------------------------------------
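The add4y and add4uv variants batch four DC adds. For 4y the four luma blocks sit side by side in one row of stores; for 4uv the trailing lea r0, [r0+r2*4] drops down four lines, giving the 2x2 chroma arrangement. A scalar sketch, reusing idct_dc_add_c from the sketch above:

    #include <stdint.h>

    /* From the sketch after vp8_idct_dc_add_sse4 above. */
    void idct_dc_add_c(uint8_t *dst, int16_t block[16], int stride);

    /* Four luma blocks in a row: x offsets 0, 4, 8, 12. */
    static void idct_dc_add4y_c(uint8_t *dst, int16_t block[4][16], int stride)
    {
        for (int i = 0; i < 4; i++)
            idct_dc_add_c(dst + 4 * i, block[i], stride);
    }

    /* Chroma 2x2 arrangement: two blocks side by side, then two more
     * one block row (4 lines) further down. */
    static void idct_dc_add4uv_c(uint8_t *dst, int16_t block[4][16], int stride)
    {
        idct_dc_add_c(dst + stride * 0 + 0, block[0], stride);
        idct_dc_add_c(dst + stride * 0 + 4, block[1], stride);
        idct_dc_add_c(dst + stride * 4 + 0, block[2], stride);
        idct_dc_add_c(dst + stride * 4 + 4, block[3], stride);
    }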
@@ -1006,14 +1121,26 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6
%endmacro
INIT_MMX
-cglobal vp8_idct_add_mmx, 3, 3
+%macro VP8_IDCT_ADD 1
+cglobal vp8_idct_add_%1, 3, 3
; load block data
- movq m0, [r1]
- movq m1, [r1+8]
+ movq m0, [r1+ 0]
+ movq m1, [r1+ 8]
movq m2, [r1+16]
movq m3, [r1+24]
movq m6, [pw_20091]
movq m7, [pw_17734]
+%ifidn %1, sse
+ xorps xmm0, xmm0
+ movaps [r1+ 0], xmm0
+ movaps [r1+16], xmm0
+%else
+ pxor m4, m4
+ movq [r1+ 0], m4
+ movq [r1+ 8], m4
+ movq [r1+16], m4
+ movq [r1+24], m4
+%endif
; actual IDCT
VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
@@ -1029,20 +1156,34 @@ cglobal vp8_idct_add_mmx, 3, 3
STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
RET
+%endmacro
+
+VP8_IDCT_ADD mmx
+VP8_IDCT_ADD sse
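vp8_idct_add is now generated for both mmx and sse from one macro; the only difference is how the 32 bytes of coefficients are cleared after loading (two 16-byte movaps stores versus four 8-byte movq stores). In scalar terms the added behavior is just:

    #include <stdint.h>
    #include <string.h>

    /* The coefficients are consumed and then zeroed in the same call,
     * so the decoder does not need a separate clear between blocks. */
    static void clear_coeffs(int16_t block[16])
    {
        memset(block, 0, 16 * sizeof(*block));
    }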
;-----------------------------------------------------------------------------
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
;-----------------------------------------------------------------------------
-%macro SCATTER_WHT 1
- pextrw r1d, m0, %1
- pextrw r2d, m1, %1
- mov [r0+2*16*0], r1w
- mov [r0+2*16*1], r2w
- pextrw r1d, m2, %1
- pextrw r2d, m3, %1
- mov [r0+2*16*2], r1w
- mov [r0+2*16*3], r2w
+%macro SCATTER_WHT 3
+ movd r1d, m%1
+ movd r2d, m%2
+ mov [r0+2*16*(0+%3)], r1w
+ mov [r0+2*16*(1+%3)], r2w
+ shr r1d, 16
+ shr r2d, 16
+ psrlq m%1, 32
+ psrlq m%2, 32
+ mov [r0+2*16*(4+%3)], r1w
+ mov [r0+2*16*(5+%3)], r2w
+ movd r1d, m%1
+ movd r2d, m%2
+ mov [r0+2*16*(8+%3)], r1w
+ mov [r0+2*16*(9+%3)], r2w
+ shr r1d, 16
+ shr r2d, 16
+ mov [r0+2*16*(12+%3)], r1w
+ mov [r0+2*16*(13+%3)], r2w
%endmacro
%macro HADAMARD4_1D 4
@@ -1052,7 +1193,7 @@ cglobal vp8_idct_add_mmx, 3, 3
%endmacro
INIT_MMX
-cglobal vp8_luma_dc_wht_mmxext, 2,3
+cglobal vp8_luma_dc_wht_mmx, 2,3
movq m0, [r1]
movq m1, [r1+8]
movq m2, [r1+16]
@@ -1065,13 +1206,8 @@ cglobal vp8_luma_dc_wht_mmxext, 2,3
psraw m1, 3
psraw m2, 3
psraw m3, 3
- SCATTER_WHT 0
- add r0, 2*16*4
- SCATTER_WHT 1
- add r0, 2*16*4
- SCATTER_WHT 2
- add r0, 2*16*4
- SCATTER_WHT 3
+ SCATTER_WHT 0, 1, 0
+ SCATTER_WHT 2, 3, 2
RET
;-----------------------------------------------------------------------------
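For context, this routine is VP8's 4x4 inverse Walsh-Hadamard transform over the 16 luma DC values; SCATTER_WHT's 2*16-element strides place each result in the DC slot of one of the 16 luma subblocks. A hedged scalar sketch of the two Hadamard passes (an approximate reconstruction, not a verbatim copy of the C reference):

    #include <stdint.h>

    static void luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
    {
        for (int i = 0; i < 4; i++) {          /* vertical pass */
            int t0 = dc[i + 0] + dc[i + 12];
            int t1 = dc[i + 4] + dc[i +  8];
            int t2 = dc[i + 4] - dc[i +  8];
            int t3 = dc[i + 0] - dc[i + 12];

            dc[i +  0] = t0 + t1;
            dc[i +  4] = t3 + t2;
            dc[i +  8] = t0 - t1;
            dc[i + 12] = t3 - t2;
        }
        for (int i = 0; i < 4; i++) {          /* horizontal pass + scatter */
            int t0 = dc[i * 4 + 0] + dc[i * 4 + 3] + 3; /* +3 rounds >>3 */
            int t1 = dc[i * 4 + 1] + dc[i * 4 + 2];
            int t2 = dc[i * 4 + 1] - dc[i * 4 + 2];
            int t3 = dc[i * 4 + 0] - dc[i * 4 + 3] + 3;

            block[i][0][0] = (t0 + t1) >> 3;   /* DC slot of each subblock */
            block[i][1][0] = (t3 + t2) >> 3;
            block[i][2][0] = (t0 - t1) >> 3;
            block[i][3][0] = (t3 - t2) >> 3;
        }
    }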
@@ -1224,18 +1360,22 @@ cglobal vp8_luma_dc_wht_mmxext, 2,3
movd [%7+%9*2], m%4
%endmacro
-%macro SPLATB_REG 3
+%macro SPLATB_REG 3-4
movd %1, %2
+%ifidn %3, ssse3
+ pshufb %1, %4
+%else
punpcklbw %1, %1
%if mmsize == 16 ; sse2
- punpcklwd %1, %1
- pshufd %1, %1, 0x0
+ pshuflw %1, %1, 0x0
+ punpcklqdq %1, %1
%elifidn %3, mmx
punpcklwd %1, %1
punpckldq %1, %1
%else ; mmxext
pshufw %1, %1, 0x0
%endif
+%endif
%endmacro
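SPLATB_REG now takes an optional fourth argument: on SSSE3 a single pshufb against an all-zero index register broadcasts the low byte, replacing the punpcklbw/pshuf chains used on older ISAs. A scalar model of pshufb's per-byte semantics shows why an all-zero index is a broadcast:

    #include <stdint.h>

    /* Per-byte pshufb model (one 16-byte lane): a set sign bit in the
     * index zeroes the output byte, otherwise the low 4 bits select a
     * source byte. With an all-zero index vector, every output byte
     * becomes src[0]: a one-instruction byte broadcast. */
    static void pshufb_model(uint8_t dst[16], const uint8_t src[16],
                             const uint8_t idx[16])
    {
        for (int i = 0; i < 16; i++)
            dst[i] = (idx[i] & 0x80) ? 0 : src[idx[i] & 0x0f];
    }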
%macro SIMPLE_LOOPFILTER 3
@@ -1247,7 +1387,10 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3
%if mmsize == 8 ; mmx/mmxext
mov r3, 2
%endif
- SPLATB_REG m7, r2, %1 ; splat "flim" into register
+%ifidn %1, ssse3
+ pxor m0, m0
+%endif
+ SPLATB_REG m7, r2, %1, m0 ; splat "flim" into register
; set up indexes to address 4 rows
mov r2, r1
@@ -1393,6 +1536,8 @@ SIMPLE_LOOPFILTER mmxext, h, 6
INIT_XMM
SIMPLE_LOOPFILTER sse2, v, 3
SIMPLE_LOOPFILTER sse2, h, 6
+SIMPLE_LOOPFILTER ssse3, v, 3
+SIMPLE_LOOPFILTER ssse3, h, 6
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
@@ -1428,11 +1573,15 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
%define stack_reg hev_thr_reg
%endif
+%ifidn %1, ssse3
+ pxor m7, m7
+%endif
+
%ifndef m8 ; mmx/mmxext or sse2 on x86-32
; splat function arguments
- SPLATB_REG m0, E_reg, %1 ; E
- SPLATB_REG m1, I_reg, %1 ; I
- SPLATB_REG m2, hev_thr_reg, %1 ; hev_thresh
+ SPLATB_REG m0, E_reg, %1, m7 ; E
+ SPLATB_REG m1, I_reg, %1, m7 ; I
+ SPLATB_REG m2, hev_thr_reg, %1, m7 ; hev_thresh
; align stack
mov stack_reg, rsp ; backup stack pointer
@@ -1465,9 +1614,9 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
%define q0backup m8
; splat function arguments
- SPLATB_REG flim_E, E_reg, %1 ; E
- SPLATB_REG flim_I, I_reg, %1 ; I
- SPLATB_REG hev_thr, hev_thr_reg, %1 ; hev_thresh
+ SPLATB_REG flim_E, E_reg, %1, m7 ; E
+ SPLATB_REG flim_I, I_reg, %1, m7 ; I
+ SPLATB_REG hev_thr, hev_thr_reg, %1, m7 ; hev_thresh
%endif
%if mmsize == 8 && %4 == 16 ; mmx/mmxext
@@ -1879,15 +2028,15 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
%endmacro
INIT_MMX
-INNER_LOOPFILTER mmx, v, 6, 16, 8
-INNER_LOOPFILTER mmx, h, 6, 16, 8
-INNER_LOOPFILTER mmxext, v, 6, 16, 8
-INNER_LOOPFILTER mmxext, h, 6, 16, 8
+INNER_LOOPFILTER mmx, v, 6, 16, 0
+INNER_LOOPFILTER mmx, h, 6, 16, 0
+INNER_LOOPFILTER mmxext, v, 6, 16, 0
+INNER_LOOPFILTER mmxext, h, 6, 16, 0
-INNER_LOOPFILTER mmx, v, 6, 8, 8
-INNER_LOOPFILTER mmx, h, 6, 8, 8
-INNER_LOOPFILTER mmxext, v, 6, 8, 8
-INNER_LOOPFILTER mmxext, h, 6, 8, 8
+INNER_LOOPFILTER mmx, v, 6, 8, 0
+INNER_LOOPFILTER mmx, h, 6, 8, 0
+INNER_LOOPFILTER mmxext, v, 6, 8, 0
+INNER_LOOPFILTER mmxext, h, 6, 8, 0
INIT_XMM
INNER_LOOPFILTER sse2, v, 5, 16, 13
@@ -1899,6 +2048,15 @@ INNER_LOOPFILTER sse2, h, 6, 16, 13
INNER_LOOPFILTER sse2, v, 6, 8, 13
INNER_LOOPFILTER sse2, h, 6, 8, 13
+INNER_LOOPFILTER ssse3, v, 5, 16, 13
+%ifdef m8
+INNER_LOOPFILTER ssse3, h, 5, 16, 13
+%else
+INNER_LOOPFILTER ssse3, h, 6, 16, 13
+%endif
+INNER_LOOPFILTER ssse3, v, 6, 8, 13
+INNER_LOOPFILTER ssse3, h, 6, 8, 13
+
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
; int flimE, int flimI, int hev_thr);
@@ -1906,10 +2064,24 @@ INNER_LOOPFILTER sse2, h, 6, 8, 13
; write 4 or 8 words in the mmx/xmm registers as 8 lines
; 1 and 2 are the registers to write, this can be the same (for SSE2)
+; for pre-SSE4:
; 3 is a general-purpose register that we will clobber
+; for SSE4:
+; 3 is a pointer to the destination's 5th line
; 4 is a pointer to the destination's 4th line
-; 5 is -stride and +stride
-%macro WRITE_8W 6
+; 5/6 are -stride and +stride
+; 7 is the optimization name (e.g. sse4)
+%macro WRITE_8W 7
+%ifidn %7, sse4
+ pextrw [%4+%5*4], %1, 0
+ pextrw [%3+%5*4], %1, 1
+ pextrw [%4+%5*2], %1, 2
+ pextrw [%4+%5 ], %1, 3
+ pextrw [%4 ], %1, 4
+ pextrw [%3 ], %1, 5
+ pextrw [%3+%6 ], %1, 6
+ pextrw [%3+%6*2], %1, 7
+%else
movd %3, %1
%if mmsize == 8
punpckhdq %1, %1
@@ -1948,6 +2120,7 @@ INNER_LOOPFILTER sse2, h, 6, 8, 13
%if mmsize == 8
add %4, %5
%endif
+%endif
%endmacro
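Whichever path is taken, WRITE_8W's job is the same: eight filtered 16-bit results, one per row, stored two bytes at a time down a column; SSE4 just gets to pextrw each word straight to memory instead of bouncing it through a general-purpose register. A scalar model (illustrative name):

    #include <stdint.h>
    #include <string.h>

    /* Eight 16-bit pixel pairs, one per row, written down a column. */
    static void write_8_words(uint8_t *dst, int stride, const uint16_t w[8])
    {
        for (int row = 0; row < 8; row++)
            memcpy(dst + row * stride, &w[row], sizeof(w[row]));
    }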
%macro MBEDGE_LOOPFILTER 5
@@ -1979,11 +2152,15 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%define stack_reg hev_thr_reg
%endif
+%ifidn %1, ssse3
+ pxor m7, m7
+%endif
+
%ifndef m8 ; mmx/mmxext or sse2 on x86-32
; splat function arguments
- SPLATB_REG m0, E_reg, %1 ; E
- SPLATB_REG m1, I_reg, %1 ; I
- SPLATB_REG m2, hev_thr_reg, %1 ; hev_thresh
+ SPLATB_REG m0, E_reg, %1, m7 ; E
+ SPLATB_REG m1, I_reg, %1, m7 ; I
+ SPLATB_REG m2, hev_thr_reg, %1, m7 ; hev_thresh
; align stack
mov stack_reg, rsp ; backup stack pointer
@@ -2023,9 +2200,9 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%define lim_sign m15
; splat function arguments
- SPLATB_REG flim_E, E_reg, %1 ; E
- SPLATB_REG flim_I, I_reg, %1 ; I
- SPLATB_REG hev_thr, hev_thr_reg, %1 ; hev_thresh
+ SPLATB_REG flim_E, E_reg, %1, m7 ; E
+ SPLATB_REG flim_I, I_reg, %1, m7 ; I
+ SPLATB_REG hev_thr, hev_thr_reg, %1, m7 ; hev_thresh
%endif
%if mmsize == 8 && %4 == 16 ; mmx/mmxext
@@ -2479,14 +2656,17 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%if mmsize == 8 ; mmx/mmxext (h)
WRITE_4x2D 1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg
add dst_reg, 4
- WRITE_8W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg
+ WRITE_8W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg, %4
%else ; sse2 (h)
lea dst8_reg, [dst8_reg+mstride_reg+1]
WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
- add dst_reg, 4
- add dst8_reg, 4
- WRITE_8W m5, m5, dst2_reg, dst_reg, mstride_reg, stride_reg
- WRITE_8W m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg
+ lea dst_reg, [dst2_reg+mstride_reg+4]
+ lea dst8_reg, [dst8_reg+mstride_reg+4]
+ WRITE_8W m5, m5, dst2_reg, dst_reg, mstride_reg, stride_reg, %2
+%ifidn %2, sse4
+ lea dst_reg, [dst8_reg+ stride_reg]
+%endif
+ WRITE_8W m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg, %2
%endif
%endif
@@ -2516,15 +2696,15 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%endmacro
INIT_MMX
-MBEDGE_LOOPFILTER mmx, v, 6, 16, 8
-MBEDGE_LOOPFILTER mmx, h, 6, 16, 8
-MBEDGE_LOOPFILTER mmxext, v, 6, 16, 8
-MBEDGE_LOOPFILTER mmxext, h, 6, 16, 8
+MBEDGE_LOOPFILTER mmx, v, 6, 16, 0
+MBEDGE_LOOPFILTER mmx, h, 6, 16, 0
+MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0
+MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0
-MBEDGE_LOOPFILTER mmx, v, 6, 8, 8
-MBEDGE_LOOPFILTER mmx, h, 6, 8, 8
-MBEDGE_LOOPFILTER mmxext, v, 6, 8, 8
-MBEDGE_LOOPFILTER mmxext, h, 6, 8, 8
+MBEDGE_LOOPFILTER mmx, v, 6, 8, 0
+MBEDGE_LOOPFILTER mmx, h, 6, 8, 0
+MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0
+MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0
INIT_XMM
MBEDGE_LOOPFILTER sse2, v, 5, 16, 16
@@ -2535,3 +2715,19 @@ MBEDGE_LOOPFILTER sse2, h, 6, 16, 16
%endif
MBEDGE_LOOPFILTER sse2, v, 6, 8, 16
MBEDGE_LOOPFILTER sse2, h, 6, 8, 16
+
+MBEDGE_LOOPFILTER ssse3, v, 5, 16, 16
+%ifdef m8
+MBEDGE_LOOPFILTER ssse3, h, 5, 16, 16
+%else
+MBEDGE_LOOPFILTER ssse3, h, 6, 16, 16
+%endif
+MBEDGE_LOOPFILTER ssse3, v, 6, 8, 16
+MBEDGE_LOOPFILTER ssse3, h, 6, 8, 16
+
+%ifdef m8
+MBEDGE_LOOPFILTER sse4, h, 5, 16, 16
+%else
+MBEDGE_LOOPFILTER sse4, h, 6, 16, 16
+%endif
+MBEDGE_LOOPFILTER sse4, h, 6, 8, 16
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/x86inc.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/x86inc.asm
index 410b11bb2..b7d17742e 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/x86inc.asm
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/x86inc.asm
@@ -271,13 +271,21 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
ASSERT %2 >= %1
%assign regs_used %2
ASSERT regs_used <= 7
- %assign xmm_regs_used %3
- ASSERT xmm_regs_used <= 16
%if regs_used > 4
push r4
push r5
%assign stack_offset stack_offset+16
%endif
+ WIN64_SPILL_XMM %3
+ LOAD_IF_USED 4, %1
+ LOAD_IF_USED 5, %1
+ LOAD_IF_USED 6, %1
+ DEFINE_ARGS %4
+%endmacro
+
+%macro WIN64_SPILL_XMM 1
+ %assign xmm_regs_used %1
+ ASSERT xmm_regs_used <= 16
%if xmm_regs_used > 6
sub rsp, (xmm_regs_used-6)*16+16
%assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
@@ -287,13 +295,9 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
%endrep
%endif
- LOAD_IF_USED 4, %1
- LOAD_IF_USED 5, %1
- LOAD_IF_USED 6, %1
- DEFINE_ARGS %4
%endmacro
-%macro RESTORE_XMM_INTERNAL 1
+%macro WIN64_RESTORE_XMM_INTERNAL 1
%if xmm_regs_used > 6
%assign %%i xmm_regs_used
%rep (xmm_regs_used-6)
@@ -304,14 +308,14 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
%endif
%endmacro
-%macro RESTORE_XMM 1
- RESTORE_XMM_INTERNAL %1
+%macro WIN64_RESTORE_XMM 1
+ WIN64_RESTORE_XMM_INTERNAL %1
%assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
%assign xmm_regs_used 0
%endmacro
%macro RET 0
- RESTORE_XMM_INTERNAL rsp
+ WIN64_RESTORE_XMM_INTERNAL rsp
%if regs_used > 4
pop r5
pop r4
@@ -428,6 +432,13 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%endif ;======================================================================
+%ifndef WIN64
+%macro WIN64_SPILL_XMM 1
+%endmacro
+%macro WIN64_RESTORE_XMM 1
+%endmacro
+%endif
+
;=============================================================================
@@ -494,7 +505,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%define mova movq
%define movu movq
%define movh movd
- %define movnt movntq
+ %define movnta movntq
%assign %%i 0
%rep 8
CAT_XDEFINE m, %%i, mm %+ %%i
@@ -518,7 +529,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%define mova movdqa
%define movu movdqu
%define movh movq
- %define movnt movntdq
+ %define movnta movntdq
%assign %%i 0
%rep num_mmregs
CAT_XDEFINE m, %%i, xmm %+ %%i
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/mathematics.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/mathematics.h
index 2fbb99018..1fb848b38 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/mathematics.h
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/mathematics.h
@@ -38,6 +38,9 @@
#ifndef M_LOG2_10
#define M_LOG2_10 3.32192809488736234787 /* log_2 10 */
#endif
+#ifndef M_PHI
+#define M_PHI 1.61803398874989484820 /* phi / golden ratio */
+#endif
#ifndef M_PI
#define M_PI 3.14159265358979323846 /* pi */
#endif
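The new M_PHI constant is the golden ratio, the positive root of x^2 = x + 1, equal to (1 + sqrt(5)) / 2. A quick self-check:

    #include <assert.h>
    #include <math.h>

    #define M_PHI 1.61803398874989484820 /* phi / golden ratio */

    int main(void)
    {
        /* phi solves x*x == x + 1 and equals (1 + sqrt(5)) / 2 */
        assert(fabs(M_PHI * M_PHI - (M_PHI + 1.0)) < 1e-12);
        assert(fabs(M_PHI - (1.0 + sqrt(5.0)) / 2.0) < 1e-12);
        return 0;
    }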