github.com/mpc-hc/mpc-hc.git
author    XhmikosR <xhmikosr@users.sourceforge.net>  2010-07-23 15:03:50 +0400
committer XhmikosR <xhmikosr@users.sourceforge.net>  2010-07-23 15:03:50 +0400
commit    d8cb0bd04f30184e2622b50b056f7664aea2d814 (patch)
tree      5bcce552fbfccaba2ac8b5f024980c5b1910ca0c /src/filters/transform/MPCVideoDec
parent    7c67a6e4045516f0fa222e29b03aed8fe8189b7f (diff)
Updated ffmpeg
git-svn-id: https://mpc-hc.svn.sourceforge.net/svnroot/mpc-hc/trunk@2144 10f7b99b-c216-0410-bff0-8a66a9350fd8
Diffstat (limited to 'src/filters/transform/MPCVideoDec')
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.h                    |  13
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8.c                     | 539
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8data.h                 |   2
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8dsp.c                  |  28
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8dsp.h                  |   2
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.c         |   4
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock_sse2.asm |  24
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_yasm.asm       | 660
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp-init.c         | 125
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp.asm            | 420
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/x86inc.asm            |  35
-rw-r--r--  src/filters/transform/MPCVideoDec/ffmpeg/libavutil/mathematics.h              |   3
12 files changed, 1080 insertions(+), 775 deletions(-)
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.h
index dd541f22e..1e069dc4d 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.h
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp56.h
@@ -32,7 +32,11 @@
#include "vp56dsp.h"
typedef struct vp56_context VP56Context;
-typedef struct vp56_mv VP56mv;
+
+typedef struct {
+    DECLARE_ALIGNED(4, int16_t, x);
+    int16_t y;
+} VP56mv;
typedef void (*VP56ParseVectorAdjustment)(VP56Context *s,
VP56mv *vect);
@@ -61,11 +65,6 @@ typedef struct {
DCTELEM dc_coeff;
} VP56RefDc;
-struct vp56_mv {
- int x;
- int y;
-};
-
typedef struct {
uint8_t type;
VP56mv mv;
@@ -175,7 +174,7 @@ void vp56_init(AVCodecContext *avctx, int flip, int has_alpha);
int vp56_free(AVCodecContext *avctx);
void vp56_init_dequant(VP56Context *s, int quantizer);
int vp56_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
- const uint8_t *buf, int buf_size);
+ AVPacket *avpkt);
/**
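Annotation: the VP56mv change above (two int16_t in a 4-byte-aligned struct, replacing the old pair of ints) is what enables the 32-bit whole-vector tricks used throughout the vp8.c diff below. A minimal standalone sketch of the idea, using a hypothetical Mv stand-in type:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    typedef struct {
        int16_t x;
        int16_t y;
    } Mv;                           /* stand-in for VP56mv; alignment omitted */

    /* one aligned 32-bit load replaces two member reads (AV_RN32A's job) */
    static uint32_t mv_as_u32(const Mv *mv)
    {
        uint32_t v;
        memcpy(&v, mv, sizeof(v));  /* compilers fold this into one load */
        return v;
    }

    int main(void)
    {
        Mv a = { 3, -7 }, b = { 3, -7 };
        /* one word compare replaces (a.x ^ b.x) | (a.y ^ b.y) */
        printf("equal: %d\n", mv_as_u32(&a) == mv_as_u32(&b));
        return 0;
    }

Zeroing a vector becomes a single 32-bit store the same way, which is what the AV_WN32A(&mb->mv, 0) calls below do.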
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8.c
index 264fe72e7..90e873f6d 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8.c
@@ -29,7 +29,12 @@
#include "rectangle.h"
typedef struct {
- uint8_t segment;
+ uint8_t filter_level;
+ uint8_t inner_limit;
+ uint8_t inner_filter;
+} VP8FilterStrength;
+
+typedef struct {
uint8_t skip;
// todo: make it possible to check for at least (i4x4 or split_mv)
// in one op. are others needed?
@@ -79,10 +84,12 @@ typedef struct {
VP8Macroblock *macroblocks;
VP8Macroblock *macroblocks_base;
+ VP8FilterStrength *filter_strength;
int mb_stride;
uint8_t *intra4x4_pred_mode;
uint8_t *intra4x4_pred_mode_base;
+ uint8_t *segmentation_map;
int b4_stride;
/**
@@ -109,11 +116,14 @@ typedef struct {
*/
DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4];
DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16];
+ uint8_t intra4x4_pred_mode_mb[16];
int chroma_pred_mode; ///< 8x8c pred mode of the current macroblock
+ int segment; ///< segment of the current macroblock
int mbskip_enabled;
int sign_bias[4]; ///< one state [0, 1] per ref frame type
+ int ref_count[3];
/**
* Base parameters for segmentation, i.e. per-macroblock parameters.
@@ -205,6 +215,7 @@ static void vp8_decode_flush(AVCodecContext *avctx)
av_freep(&s->top_nnz);
av_freep(&s->edge_emu_buffer);
av_freep(&s->top_border);
+ av_freep(&s->segmentation_map);
s->macroblocks = NULL;
s->intra4x4_pred_mode = NULL;
@@ -229,15 +240,18 @@ static int update_dimensions(VP8Context *s, int width, int height)
s->mb_stride = s->mb_width+1;
s->b4_stride = 4*s->mb_stride;
- s->macroblocks_base = av_mallocz(s->mb_stride*(s->mb_height+1)*sizeof(*s->macroblocks));
+ s->macroblocks_base = av_mallocz((s->mb_stride+s->mb_height*2+2)*sizeof(*s->macroblocks));
+ s->filter_strength = av_mallocz(s->mb_stride*sizeof(*s->filter_strength));
s->intra4x4_pred_mode_base = av_mallocz(s->b4_stride*(4*s->mb_height+1));
s->top_nnz = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
s->top_border = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
+ s->segmentation_map = av_mallocz(s->mb_stride*s->mb_height);
- if (!s->macroblocks_base || !s->intra4x4_pred_mode_base || !s->top_nnz || !s->top_border)
+ if (!s->macroblocks_base || !s->filter_strength || !s->intra4x4_pred_mode_base ||
+ !s->top_nnz || !s->top_border || !s->segmentation_map)
return AVERROR(ENOMEM);
- s->macroblocks = s->macroblocks_base + 1 + s->mb_stride;
+ s->macroblocks = s->macroblocks_base + 1;
s->intra4x4_pred_mode = s->intra4x4_pred_mode_base + 4 + s->b4_stride;
memset(s->intra4x4_pred_mode_base, DC_PRED, s->b4_stride);
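Annotation: the smaller macroblocks_base allocation and the +1 base offset above go with a new interleaved two-row layout, which is why later hunks address the top neighbour as mb + 2 and the top-left as mb + 1 instead of using mb_stride. A sketch of the indexing as reconstructed from this patch (the mb_index helper is hypothetical):

    #include <assert.h>

    /* macroblock (x, y) appears to live at (mb_height - y - 1)*2 + x,
     * matching "mb = s->macroblocks + (s->mb_height - mb_y - 1)*2" in
     * the decode loop further down in this diff */
    static int mb_index(int x, int y, int mb_height)
    {
        return (mb_height - y - 1) * 2 + x;
    }

    int main(void)
    {
        int h = 5, x = 3, y = 2, cur = mb_index(x, y, h);
        assert(mb_index(x - 1, y,     h) == cur - 1); /* left     */
        assert(mb_index(x - 1, y - 1, h) == cur + 1); /* top-left */
        assert(mb_index(x,     y - 1, h) == cur + 2); /* top      */
        return 0;
    }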
@@ -520,39 +534,45 @@ static inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src,
}
static void find_near_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
- VP56mv near[2], VP56mv *best, int cnt[4])
+ VP56mv near[2], VP56mv *best, uint8_t cnt[4])
{
- VP8Macroblock *mb_edge[3] = { mb - s->mb_stride /* top */,
- mb - 1 /* left */,
- mb - s->mb_stride - 1 /* top-left */ };
+ VP8Macroblock *mb_edge[3] = { mb + 2 /* top */,
+ mb - 1 /* left */,
+ mb + 1 /* top-left */ };
enum { EDGE_TOP, EDGE_LEFT, EDGE_TOPLEFT };
VP56mv near_mv[4] = {{ 0 }};
enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
- int idx = CNT_ZERO, n;
+ int idx = CNT_ZERO;
int best_idx = CNT_ZERO;
+ int cur_sign_bias = s->sign_bias[mb->ref_frame];
+ int *sign_bias = s->sign_bias;
/* Process MB on top, left and top-left */
- for (n = 0; n < 3; n++) {
- VP8Macroblock *edge = mb_edge[n];
- if (edge->ref_frame != VP56_FRAME_CURRENT) {
- if (edge->mv.x | edge->mv.y) {
- VP56mv tmp = edge->mv;
- if (s->sign_bias[mb->ref_frame] != s->sign_bias[edge->ref_frame]) {
- tmp.x *= -1;
- tmp.y *= -1;
- }
- if ((tmp.x ^ near_mv[idx].x) | (tmp.y ^ near_mv[idx].y))
- near_mv[++idx] = tmp;
- cnt[idx] += 1 + (n != 2);
- } else
- cnt[CNT_ZERO] += 1 + (n != 2);
- }
+ #define MV_EDGE_CHECK(n)\
+ {\
+ VP8Macroblock *edge = mb_edge[n];\
+ int edge_ref = edge->ref_frame;\
+ if (edge_ref != VP56_FRAME_CURRENT) {\
+ uint32_t mv = AV_RN32A(&edge->mv);\
+ if (mv) {\
+ if (cur_sign_bias != sign_bias[edge_ref]) {\
+ /* SWAR negate of the values in mv. */\
+ mv = ~mv;\
+ mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
+ }\
+ if (!n || mv != AV_RN32A(&near_mv[idx]))\
+ AV_WN32A(&near_mv[++idx], mv);\
+ cnt[idx] += 1 + (n != 2);\
+ } else\
+ cnt[CNT_ZERO] += 1 + (n != 2);\
+ }\
}
+ MV_EDGE_CHECK(0)
+ MV_EDGE_CHECK(1)
+ MV_EDGE_CHECK(2)
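Annotation: the "SWAR negate" in MV_EDGE_CHECK flips the sign of both packed int16_t lanes in one 32-bit operation. A standalone check of the expression:

    #include <stdint.h>
    #include <assert.h>

    static uint32_t swar_negate16x2(uint32_t mv)
    {
        mv = ~mv;
        /* per-lane "+1" of two's-complement negation: the masked add
         * cannot carry across bit 15; the xor folds the sign bits back */
        return ((mv & 0x7fff7fff) + 0x00010001) ^ (mv & 0x80008000);
    }

    int main(void)
    {
        int16_t x = 13, y = -200;
        uint32_t packed = (uint16_t)x | ((uint32_t)(uint16_t)y << 16);
        uint32_t neg = swar_negate16x2(packed);
        assert((int16_t)(neg & 0xffff) == -x);
        assert((int16_t)(neg >> 16)    == -y);
        return 0;
    }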
- /* If we have three distinct MV's, merge first and last if they're the same */
- if (cnt[CNT_SPLITMV] &&
- !((near_mv[1+EDGE_TOP].x ^ near_mv[1+EDGE_TOPLEFT].x) |
- (near_mv[1+EDGE_TOP].y ^ near_mv[1+EDGE_TOPLEFT].y)))
+ /* If we have three distinct MVs, merge first and last if they're the same */
+ if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1+EDGE_TOP]) == AV_RN32A(&near_mv[1+EDGE_TOPLEFT]))
cnt[CNT_NEAREST] += 1;
cnt[CNT_SPLITMV] = ((mb_edge[EDGE_LEFT]->mode == VP8_MVMODE_SPLIT) +
@@ -561,8 +581,8 @@ static void find_near_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
/* Swap near and nearest if necessary */
if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
- FFSWAP(int, cnt[CNT_NEAREST], cnt[CNT_NEAR]);
- FFSWAP(VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
+ FFSWAP(uint8_t, cnt[CNT_NEAREST], cnt[CNT_NEAR]);
+ FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
}
/* Choose the best mv out of 0,0 and the nearest mv */
@@ -596,17 +616,13 @@ static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
}
-static const uint8_t *get_submv_prob(const VP56mv *left, const VP56mv *top)
+static const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
{
- int l_is_zero = !(left->x | left->y);
- int t_is_zero = !(top->x | top->y);
- int equal = !((left->x ^ top->x) | (left->y ^ top->y));
-
- if (equal)
- return l_is_zero ? vp8_submv_prob[4] : vp8_submv_prob[3];
- if (t_is_zero)
+ if (left == top)
+ return vp8_submv_prob[4-!!left];
+ if (!top)
return vp8_submv_prob[2];
- return l_is_zero ? vp8_submv_prob[1] : vp8_submv_prob[0];
+ return vp8_submv_prob[1-!!left];
}
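Annotation: the rewritten get_submv_prob folds the old four-way branch into table-index arithmetic; since left and top are now whole MVs packed into uint32_t, "is zero" is just !value. A quick equivalence check of the two formulations:

    #include <assert.h>
    #include <stdint.h>

    static int old_idx(uint32_t left, uint32_t top)
    {
        if (left == top) return left ? 3 : 4;
        if (!top)        return 2;
        return left ? 0 : 1;
    }

    static int new_idx(uint32_t left, uint32_t top)
    {
        if (left == top) return 4 - !!left;
        if (!top)        return 2;
        return 1 - !!left;
    }

    int main(void)
    {
        uint32_t vals[3] = { 0, 1, 0xdeadbeef };
        for (int i = 0; i < 3; i++)
            for (int j = 0; j < 3; j++)
                assert(old_idx(vals[i], vals[j]) == new_idx(vals[i], vals[j]));
        return 0;
    }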
/**
@@ -619,24 +635,29 @@ static int decode_splitmvs(VP8Context *s, VP56RangeCoder *c,
int part_idx = mb->partitioning =
vp8_rac_get_tree(c, vp8_mbsplit_tree, vp8_mbsplit_prob);
int n, num = vp8_mbsplit_count[part_idx];
- const uint8_t *mbsplits = vp8_mbsplits[part_idx],
+ VP8Macroblock *top_mb = &mb[2];
+ VP8Macroblock *left_mb = &mb[-1];
+ const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
+ *mbsplits_top = vp8_mbsplits[top_mb->partitioning],
+ *mbsplits_cur = vp8_mbsplits[part_idx],
*firstidx = vp8_mbfirstidx[part_idx];
+ VP56mv *top_mv = top_mb->bmv;
+ VP56mv *left_mv = left_mb->bmv;
+ VP56mv *cur_mv = mb->bmv;
for (n = 0; n < num; n++) {
int k = firstidx[n];
- const VP56mv *left, *above;
+ uint32_t left, above;
const uint8_t *submv_prob;
- if (!(k & 3)) {
- VP8Macroblock *left_mb = &mb[-1];
- left = &left_mb->bmv[vp8_mbsplits[left_mb->partitioning][k + 3]];
- } else
- left = &mb->bmv[mbsplits[k - 1]];
- if (k <= 3) {
- VP8Macroblock *above_mb = &mb[-s->mb_stride];
- above = &above_mb->bmv[vp8_mbsplits[above_mb->partitioning][k + 12]];
- } else
- above = &mb->bmv[mbsplits[k - 4]];
+ if (!(k & 3))
+ left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
+ else
+ left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
+ if (k <= 3)
+ above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
+ else
+ above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
submv_prob = get_submv_prob(left, above);
@@ -646,14 +667,13 @@ static int decode_splitmvs(VP8Context *s, VP56RangeCoder *c,
mb->bmv[n].x = base_mv->x + read_mv_component(c, s->prob->mvc[1]);
break;
case VP8_SUBMVMODE_ZERO4X4:
- mb->bmv[n].x = 0;
- mb->bmv[n].y = 0;
+ AV_WN32A(&mb->bmv[n], 0);
break;
case VP8_SUBMVMODE_LEFT4X4:
- mb->bmv[n] = *left;
+ AV_WN32A(&mb->bmv[n], left);
break;
case VP8_SUBMVMODE_TOP4X4:
- mb->bmv[n] = *above;
+ AV_WN32A(&mb->bmv[n], above);
break;
}
}
@@ -664,30 +684,33 @@ static int decode_splitmvs(VP8Context *s, VP56RangeCoder *c,
static inline void decode_intra4x4_modes(VP56RangeCoder *c, uint8_t *intra4x4,
int stride, int keyframe)
{
- int x, y, t, l;
- const uint8_t *ctx = vp8_pred4x4_prob_inter;
+ int x, y, t, l, i;
- for (y = 0; y < 4; y++) {
- for (x = 0; x < 4; x++) {
- if (keyframe) {
+ if (keyframe) {
+ const uint8_t *ctx;
+ for (y = 0; y < 4; y++) {
+ for (x = 0; x < 4; x++) {
t = intra4x4[x - stride];
l = intra4x4[x - 1];
ctx = vp8_pred4x4_prob_intra[t][l];
+ intra4x4[x] = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
}
- intra4x4[x] = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
+ intra4x4 += stride;
}
- intra4x4 += stride;
+ } else {
+ for (i = 0; i < 16; i++)
+ intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
}
}
static void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
- uint8_t *intra4x4)
+ uint8_t *intra4x4, uint8_t *segment)
{
VP56RangeCoder *c = &s->c;
- int n;
if (s->segmentation.update_map)
- mb->segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
+ *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
+ s->segment = *segment;
mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
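Annotation: intra4x4 modes are now stored two ways: keyframes keep the full strided plane (neighbouring macroblocks need the contexts), while inter frames decode into the new flat 16-byte intra4x4_pred_mode_mb scratch buffer. intra_predict below therefore walks the grid with stride = s->keyframe ? s->b4_stride : 4. A toy illustration of the shared addressing, with hypothetical buffer sizes:

    #include <stdint.h>
    #include <stdio.h>

    /* bx, by index the 4x4 sub-blocks of one macroblock */
    static uint8_t get_mode(const uint8_t *modes, int stride, int bx, int by)
    {
        return modes[by * stride + bx];
    }

    int main(void)
    {
        uint8_t per_mb[16]    = { 0 };  /* inter frame: stride 4        */
        uint8_t plane[4 * 64] = { 0 };  /* keyframe: stride = b4_stride */
        per_mb[1 * 4 + 2] = 5;
        plane[1 * 64 + 2] = 5;
        printf("%d %d\n", get_mode(per_mb, 4, 2, 1), get_mode(plane, 64, 2, 1));
        return 0;
    }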
@@ -703,7 +726,7 @@ static void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
mb->ref_frame = VP56_FRAME_CURRENT;
} else if (vp56_rac_get_prob(c, s->prob->intra)) {
VP56mv near[2], best;
- int cnt[4] = { 0 };
+ uint8_t cnt[4] = { 0 };
uint8_t p[4];
// inter MB, 16.2
@@ -712,19 +735,21 @@ static void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
else
mb->ref_frame = VP56_FRAME_PREVIOUS;
+ s->ref_count[mb->ref_frame-1]++;
// motion vectors, 16.3
find_near_mvs(s, mb, mb_x, mb_y, near, &best, cnt);
- for (n = 0; n < 4; n++)
- p[n] = vp8_mode_contexts[cnt[n]][n];
+ p[0] = vp8_mode_contexts[cnt[0]][0];
+ p[1] = vp8_mode_contexts[cnt[1]][1];
+ p[2] = vp8_mode_contexts[cnt[2]][2];
+ p[3] = vp8_mode_contexts[cnt[3]][3];
mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_mvinter, p);
switch (mb->mode) {
case VP8_MVMODE_SPLIT:
mb->mv = mb->bmv[decode_splitmvs(s, c, mb, &best) - 1];
break;
case VP8_MVMODE_ZERO:
- mb->mv.x = 0;
- mb->mv.y = 0;
+ AV_WN32A(&mb->mv, 0);
break;
case VP8_MVMODE_NEAREST:
clamp_mv(s, &mb->mv, &near[0], mb_x, mb_y);
@@ -745,13 +770,13 @@ static void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
// intra MB, 16.1
mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
- if (mb->mode == MODE_I4x4) {
- decode_intra4x4_modes(c, intra4x4, s->b4_stride, 0);
- } else
- fill_rectangle(intra4x4, 4, 4, s->b4_stride, vp8_pred4x4_mode[mb->mode], 1);
+ if (mb->mode == MODE_I4x4)
+ decode_intra4x4_modes(c, intra4x4, 4, 0);
s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
mb->ref_frame = VP56_FRAME_CURRENT;
+ mb->partitioning = VP8_SPLITMVMODE_NONE;
+ AV_WN32A(&mb->bmv[0], 0);
}
}
@@ -781,7 +806,7 @@ static int decode_block_coeffs(VP56RangeCoder *c, DCTELEM block[16],
else if (token >= DCT_CAT1) {
int cat = token-DCT_CAT1;
token = vp8_rac_get_coeff(c, vp8_dct_cat_prob[cat]);
- token += vp8_dct_cat_offset[cat];
+ token += 3 + (2<<cat);
}
// after the first token, the non-zero prediction context becomes
@@ -809,9 +834,7 @@ static void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb
LOCAL_ALIGNED_16(DCTELEM, dc,[16]);
int i, x, y, luma_start = 0, luma_ctx = 3;
int nnz_pred, nnz, nnz_total = 0;
- int segment = s->segmentation.enabled ? mb->segment : 0;
-
- s->dsp.clear_blocks((DCTELEM *)s->block);
+ int segment = s->segment;
if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
AV_ZERO128(dc);
@@ -917,7 +940,7 @@ static int check_intra_pred_mode(int mode, int mb_x, int mb_y)
}
static void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
- uint8_t *bmode, int mb_x, int mb_y)
+ uint8_t *intra4x4, int mb_x, int mb_y)
{
int x, y, mode, nnz, tr;
@@ -933,6 +956,7 @@ static void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
s->hpc.pred16x16[mode](dst[0], s->linesize);
} else {
uint8_t *ptr = dst[0];
+ int stride = s->keyframe ? s->b4_stride : 4;
// all blocks on the right edge of the macroblock use bottom edge
// the top macroblock for their topright edge
@@ -945,13 +969,16 @@ static void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
tr_right = (uint8_t *)&tr;
}
+ if (mb->skip)
+ AV_ZERO128(s->non_zero_count_cache);
+
for (y = 0; y < 4; y++) {
uint8_t *topright = ptr + 4 - s->linesize;
for (x = 0; x < 4; x++) {
if (x == 3)
topright = tr_right;
- s->hpc.pred4x4[bmode[x]](ptr+4*x, topright, s->linesize);
+ s->hpc.pred4x4[intra4x4[x]](ptr+4*x, topright, s->linesize);
nnz = s->non_zero_count_cache[y][x];
if (nnz) {
@@ -964,7 +991,7 @@ static void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
}
ptr += 4*s->linesize;
- bmode += s->b4_stride;
+ intra4x4 += stride;
}
}
@@ -1001,24 +1028,26 @@ static inline void vp8_mc(VP8Context *s, int luma,
int width, int height, int linesize,
vp8_mc_func mc_func[3][3])
{
- static const uint8_t idx[8] = { 0, 1, 2, 1, 2, 1, 2, 1 };
- int mx = (mv->x << luma)&7, mx_idx = idx[mx];
- int my = (mv->y << luma)&7, my_idx = idx[my];
-
- x_off += mv->x >> (3 - luma);
- y_off += mv->y >> (3 - luma);
-
- // edge emulation
- src += y_off * linesize + x_off;
- if (x_off < 2 || x_off >= width - block_w - 3 ||
- y_off < 2 || y_off >= height - block_h - 3) {
- ff_emulated_edge_mc(s->edge_emu_buffer, src - 2 * linesize - 2, linesize,
- block_w + 5, block_h + 5,
- x_off - 2, y_off - 2, width, height);
- src = s->edge_emu_buffer + 2 + linesize * 2;
- }
-
- mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
+ if (AV_RN32A(mv)) {
+ static const uint8_t idx[8] = { 0, 1, 2, 1, 2, 1, 2, 1 };
+ int mx = (mv->x << luma)&7, mx_idx = idx[mx];
+ int my = (mv->y << luma)&7, my_idx = idx[my];
+
+ x_off += mv->x >> (3 - luma);
+ y_off += mv->y >> (3 - luma);
+
+ // edge emulation
+ src += y_off * linesize + x_off;
+ if (x_off < 2 || x_off >= width - block_w - 3 ||
+ y_off < 2 || y_off >= height - block_h - 3) {
+ ff_emulated_edge_mc(s->edge_emu_buffer, src - 2 * linesize - 2, linesize,
+ block_w + 5, block_h + 5,
+ x_off - 2, y_off - 2, width, height);
+ src = s->edge_emu_buffer + 2 + linesize * 2;
+ }
+ mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
+ } else
+ mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
}
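Annotation: wrapping vp8_mc's body in if (AV_RN32A(mv)) gives zero vectors a fast path: no fractional-pel index math, no edge-emulation test, just the full-pel copy mc_func[0][0]. A stripped-down model of the dispatch (stand-in functions, not the real DSP table):

    #include <stdint.h>
    #include <stdio.h>

    /* hypothetical stand-ins; the real code dispatches through
     * mc_func[my_idx][mx_idx] from s->put_pixels_tab */
    static void subpel_filter(void) { puts("subpel + edge emulation path"); }
    static void copy_fullpel(void)  { puts("plain copy, mc_func[0][0]"); }

    static void mc(int16_t mvx, int16_t mvy)
    {
        uint32_t packed = (uint16_t)mvx | ((uint32_t)(uint16_t)mvy << 16);
        if (packed)             /* the AV_RN32A(mv) test from the patch */
            subpel_filter();
        else
            copy_fullpel();
    }

    int main(void)
    {
        mc(0, 0);   /* fast path     */
        mc(3, -1);  /* filtered path */
        return 0;
    }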
static inline void vp8_mc_part(VP8Context *s, uint8_t *dst[3],
@@ -1054,6 +1083,23 @@ static inline void vp8_mc_part(VP8Context *s, uint8_t *dst[3],
s->put_pixels_tab[1 + (block_w == 4)]);
}
+/* Fetch pixels for estimated mv 4 macroblocks ahead.
+ * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
+static inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
+{
+ /* Don't prefetch refs that haven't been used very often this frame. */
+ if (s->ref_count[ref-1] > (mb_xy >> 5)) {
+ int x_off = mb_x << 4, y_off = mb_y << 4;
+ int mx = mb->mv.x + x_off + 8;
+ int my = mb->mv.y + y_off;
+ uint8_t **src= s->framep[ref]->data;
+ int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
+ s->dsp.prefetch(src[0]+off, s->linesize, 4);
+ off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
+ s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
+ }
+}
+
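Annotation: the guard in prefetch_motion skips reference frames that have been used in fewer than one of every 32 macroblocks decoded so far this frame, so rarely used refs do not pollute the cache. The heuristic in isolation:

    #include <stdio.h>

    static int should_prefetch(int uses_so_far, int mbs_decoded)
    {
        return uses_so_far > (mbs_decoded >> 5);
    }

    int main(void)
    {
        printf("%d\n", should_prefetch(10, 64));  /* 10 > 2 -> prefetch */
        printf("%d\n", should_prefetch(1, 256));  /* 1 > 8  -> skip     */
        return 0;
    }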
/**
* Apply motion vectors to prediction buffer, chapter 18.
*/
@@ -1062,9 +1108,11 @@ static void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
{
int x_off = mb_x << 4, y_off = mb_y << 4;
int width = 16*s->mb_width, height = 16*s->mb_height;
+ AVFrame *ref = s->framep[mb->ref_frame];
+ VP56mv *bmv = mb->bmv;
if (mb->mode < VP8_MVMODE_SPLIT) {
- vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
+ vp8_mc_part(s, dst, ref, x_off, y_off,
0, 0, 16, 16, width, height, &mb->mv);
} else switch (mb->partitioning) {
case VP8_SPLITMVMODE_4x4: {
@@ -1075,7 +1123,7 @@ static void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
for (y = 0; y < 4; y++) {
for (x = 0; x < 4; x++) {
vp8_mc(s, 1, dst[0] + 4*y*s->linesize + x*4,
- s->framep[mb->ref_frame]->data[0], &mb->bmv[4*y + x],
+ ref->data[0], &bmv[4*y + x],
4*x + x_off, 4*y + y_off, 4, 4,
width, height, s->linesize,
s->put_pixels_tab[2]);
@@ -1101,12 +1149,12 @@ static void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
uvmv.y &= ~7;
}
vp8_mc(s, 0, dst[1] + 4*y*s->uvlinesize + x*4,
- s->framep[mb->ref_frame]->data[1], &uvmv,
+ ref->data[1], &uvmv,
4*x + x_off, 4*y + y_off, 4, 4,
width, height, s->uvlinesize,
s->put_pixels_tab[2]);
vp8_mc(s, 0, dst[2] + 4*y*s->uvlinesize + x*4,
- s->framep[mb->ref_frame]->data[2], &uvmv,
+ ref->data[2], &uvmv,
4*x + x_off, 4*y + y_off, 4, 4,
width, height, s->uvlinesize,
s->put_pixels_tab[2]);
@@ -1115,78 +1163,87 @@ static void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
break;
}
case VP8_SPLITMVMODE_16x8:
- vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
- 0, 0, 16, 8, width, height, &mb->bmv[0]);
- vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
- 0, 8, 16, 8, width, height, &mb->bmv[1]);
+ vp8_mc_part(s, dst, ref, x_off, y_off,
+ 0, 0, 16, 8, width, height, &bmv[0]);
+ vp8_mc_part(s, dst, ref, x_off, y_off,
+ 0, 8, 16, 8, width, height, &bmv[1]);
break;
case VP8_SPLITMVMODE_8x16:
- vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
- 0, 0, 8, 16, width, height, &mb->bmv[0]);
- vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
- 8, 0, 8, 16, width, height, &mb->bmv[1]);
+ vp8_mc_part(s, dst, ref, x_off, y_off,
+ 0, 0, 8, 16, width, height, &bmv[0]);
+ vp8_mc_part(s, dst, ref, x_off, y_off,
+ 8, 0, 8, 16, width, height, &bmv[1]);
break;
case VP8_SPLITMVMODE_8x8:
- vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
- 0, 0, 8, 8, width, height, &mb->bmv[0]);
- vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
- 8, 0, 8, 8, width, height, &mb->bmv[1]);
- vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
- 0, 8, 8, 8, width, height, &mb->bmv[2]);
- vp8_mc_part(s, dst, s->framep[mb->ref_frame], x_off, y_off,
- 8, 8, 8, 8, width, height, &mb->bmv[3]);
+ vp8_mc_part(s, dst, ref, x_off, y_off,
+ 0, 0, 8, 8, width, height, &bmv[0]);
+ vp8_mc_part(s, dst, ref, x_off, y_off,
+ 8, 0, 8, 8, width, height, &bmv[1]);
+ vp8_mc_part(s, dst, ref, x_off, y_off,
+ 0, 8, 8, 8, width, height, &bmv[2]);
+ vp8_mc_part(s, dst, ref, x_off, y_off,
+ 8, 8, 8, 8, width, height, &bmv[3]);
break;
}
}
-static void idct_mb(VP8Context *s, uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_dst,
- VP8Macroblock *mb)
+static void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb)
{
- int x, y, nnz;
+ int x, y, ch;
- if (mb->mode != MODE_I4x4)
+ if (mb->mode != MODE_I4x4) {
+ uint8_t *y_dst = dst[0];
for (y = 0; y < 4; y++) {
- for (x = 0; x < 4; x++) {
- nnz = s->non_zero_count_cache[y][x];
- if (nnz) {
- if (nnz == 1)
- s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
- else
- s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
+ uint32_t nnz4 = AV_RN32A(s->non_zero_count_cache[y]);
+ if (nnz4) {
+ if (nnz4&~0x01010101) {
+ for (x = 0; x < 4; x++) {
+ int nnz = s->non_zero_count_cache[y][x];
+ if (nnz) {
+ if (nnz == 1)
+ s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
+ else
+ s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
+ }
+ }
+ } else {
+ s->vp8dsp.vp8_idct_dc_add4y(y_dst, s->block[y], s->linesize);
}
}
y_dst += 4*s->linesize;
}
+ }
- for (y = 0; y < 2; y++) {
- for (x = 0; x < 2; x++) {
- nnz = s->non_zero_count_cache[4][(y<<1)+x];
- if (nnz) {
- if (nnz == 1)
- s->vp8dsp.vp8_idct_dc_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize);
- else
- s->vp8dsp.vp8_idct_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize);
- }
-
- nnz = s->non_zero_count_cache[5][(y<<1)+x];
- if (nnz) {
- if (nnz == 1)
- s->vp8dsp.vp8_idct_dc_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize);
- else
- s->vp8dsp.vp8_idct_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize);
+ for (ch = 0; ch < 2; ch++) {
+ uint32_t nnz4 = AV_RN32A(s->non_zero_count_cache[4+ch]);
+ if (nnz4) {
+ uint8_t *ch_dst = dst[1+ch];
+ if (nnz4&~0x01010101) {
+ for (y = 0; y < 2; y++) {
+ for (x = 0; x < 2; x++) {
+ int nnz = s->non_zero_count_cache[4+ch][(y<<1)+x];
+ if (nnz) {
+ if (nnz == 1)
+ s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
+ else
+ s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
+ }
+ }
+ ch_dst += 4*s->uvlinesize;
+ }
+ } else {
+ s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, s->block[4+ch], s->uvlinesize);
}
}
- u_dst += 4*s->uvlinesize;
- v_dst += 4*s->uvlinesize;
}
}
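Annotation: idct_mb now reads four per-block nonzero counts as one 32-bit word; nnz4 == 0 skips the row entirely, and nnz4 & ~0x01010101 separates "some block needs the full IDCT" from "every block is DC-only", which batches into the new vp8_idct_dc_add4y/4uv calls. A standalone check of the byte-wise test:

    #include <stdint.h>
    #include <assert.h>
    #include <string.h>

    /* one count per byte; nonzero result means some count exceeds 1 */
    static int any_count_gt1(const uint8_t counts[4])
    {
        uint32_t nnz4;
        memcpy(&nnz4, counts, 4);
        return (nnz4 & ~0x01010101u) != 0;
    }

    int main(void)
    {
        assert(!any_count_gt1((const uint8_t[4]){ 0, 1, 1, 0 })); /* DC-only   */
        assert( any_count_gt1((const uint8_t[4]){ 0, 2, 0, 0 })); /* full IDCT */
        return 0;
    }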
-static void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, int *level, int *inner, int *hev_thresh)
+static void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f )
{
int interior_limit, filter_level;
if (s->segmentation.enabled) {
- filter_level = s->segmentation.filter_level[mb->segment];
+ filter_level = s->segmentation.filter_level[s->segment];
if (!s->segmentation.absolute_vals)
filter_level += s->filter.level;
} else
@@ -1216,83 +1273,88 @@ static void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, int *level, in
}
interior_limit = FFMAX(interior_limit, 1);
- *level = filter_level;
- *inner = interior_limit;
-
- if (hev_thresh) {
- *hev_thresh = filter_level >= 15;
-
- if (s->keyframe) {
- if (filter_level >= 40)
- *hev_thresh = 2;
- } else {
- if (filter_level >= 40)
- *hev_thresh = 3;
- else if (filter_level >= 20)
- *hev_thresh = 2;
- }
- }
+ f->filter_level = filter_level;
+ f->inner_limit = interior_limit;
+ f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
}
-static void filter_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb, int mb_x, int mb_y)
+static void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
{
- int filter_level, inner_limit, hev_thresh, mbedge_lim, bedge_lim;
+ int mbedge_lim, bedge_lim, hev_thresh;
+ int filter_level = f->filter_level;
+ int inner_limit = f->inner_limit;
+ int inner_filter = f->inner_filter;
+ int linesize = s->linesize;
+ int uvlinesize = s->uvlinesize;
- filter_level_for_mb(s, mb, &filter_level, &inner_limit, &hev_thresh);
if (!filter_level)
return;
mbedge_lim = 2*(filter_level+2) + inner_limit;
bedge_lim = 2* filter_level + inner_limit;
+ hev_thresh = filter_level >= 15;
+
+ if (s->keyframe) {
+ if (filter_level >= 40)
+ hev_thresh = 2;
+ } else {
+ if (filter_level >= 40)
+ hev_thresh = 3;
+ else if (filter_level >= 20)
+ hev_thresh = 2;
+ }
if (mb_x) {
- s->vp8dsp.vp8_h_loop_filter16y(dst[0], s->linesize,
+ s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
mbedge_lim, inner_limit, hev_thresh);
- s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], s->uvlinesize,
+ s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
mbedge_lim, inner_limit, hev_thresh);
}
- if (!mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT) {
- s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, s->linesize, bedge_lim,
- inner_limit, hev_thresh);
- s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, s->linesize, bedge_lim,
- inner_limit, hev_thresh);
- s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, s->linesize, bedge_lim,
- inner_limit, hev_thresh);
- s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
- s->uvlinesize, bedge_lim,
- inner_limit, hev_thresh);
+ if (inner_filter) {
+ s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
+ inner_limit, hev_thresh);
+ s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
+ inner_limit, hev_thresh);
+ s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
+ inner_limit, hev_thresh);
+ s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
+ uvlinesize, bedge_lim,
+ inner_limit, hev_thresh);
}
if (mb_y) {
- s->vp8dsp.vp8_v_loop_filter16y(dst[0], s->linesize,
+ s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
mbedge_lim, inner_limit, hev_thresh);
- s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], s->uvlinesize,
+ s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
mbedge_lim, inner_limit, hev_thresh);
}
- if (!mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT) {
- s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*s->linesize,
- s->linesize, bedge_lim,
- inner_limit, hev_thresh);
- s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*s->linesize,
- s->linesize, bedge_lim,
- inner_limit, hev_thresh);
- s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*s->linesize,
- s->linesize, bedge_lim,
+ if (inner_filter) {
+ s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
+ linesize, bedge_lim,
+ inner_limit, hev_thresh);
+ s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
+ linesize, bedge_lim,
+ inner_limit, hev_thresh);
+ s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
+ linesize, bedge_lim,
+ inner_limit, hev_thresh);
+ s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
+ dst[2] + 4 * uvlinesize,
+ uvlinesize, bedge_lim,
inner_limit, hev_thresh);
- s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * s->uvlinesize,
- dst[2] + 4 * s->uvlinesize,
- s->uvlinesize, bedge_lim,
- inner_limit, hev_thresh);
}
}
-static void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8Macroblock *mb, int mb_x, int mb_y)
+static void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
{
- int filter_level, inner_limit, mbedge_lim, bedge_lim;
+ int mbedge_lim, bedge_lim;
+ int filter_level = f->filter_level;
+ int inner_limit = f->inner_limit;
+ int inner_filter = f->inner_filter;
+ int linesize = s->linesize;
- filter_level_for_mb(s, mb, &filter_level, &inner_limit, NULL);
if (!filter_level)
return;
@@ -1300,25 +1362,25 @@ static void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8Macroblock *mb, int
bedge_lim = 2* filter_level + inner_limit;
if (mb_x)
- s->vp8dsp.vp8_h_loop_filter_simple(dst, s->linesize, mbedge_lim);
- if (!mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT) {
- s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, s->linesize, bedge_lim);
- s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, s->linesize, bedge_lim);
- s->vp8dsp.vp8_h_loop_filter_simple(dst+12, s->linesize, bedge_lim);
+ s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
+ if (inner_filter) {
+ s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
+ s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
+ s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
}
if (mb_y)
- s->vp8dsp.vp8_v_loop_filter_simple(dst, s->linesize, mbedge_lim);
- if (!mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT) {
- s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*s->linesize, s->linesize, bedge_lim);
- s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*s->linesize, s->linesize, bedge_lim);
- s->vp8dsp.vp8_v_loop_filter_simple(dst+12*s->linesize, s->linesize, bedge_lim);
+ s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
+ if (inner_filter) {
+ s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
+ s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
+ s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
}
}
static void filter_mb_row(VP8Context *s, int mb_y)
{
- VP8Macroblock *mb = s->macroblocks + mb_y*s->mb_stride;
+ VP8FilterStrength *f = s->filter_strength;
uint8_t *dst[3] = {
s->framep[VP56_FRAME_CURRENT]->data[0] + 16*mb_y*s->linesize,
s->framep[VP56_FRAME_CURRENT]->data[1] + 8*mb_y*s->uvlinesize,
@@ -1328,7 +1390,7 @@ static void filter_mb_row(VP8Context *s, int mb_y)
for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
- filter_mb(s, dst, mb++, mb_x, mb_y);
+ filter_mb(s, dst, f++, mb_x, mb_y);
dst[0] += 16;
dst[1] += 8;
dst[2] += 8;
@@ -1337,26 +1399,26 @@ static void filter_mb_row(VP8Context *s, int mb_y)
static void filter_mb_row_simple(VP8Context *s, int mb_y)
{
+ VP8FilterStrength *f = s->filter_strength;
uint8_t *dst = s->framep[VP56_FRAME_CURRENT]->data[0] + 16*mb_y*s->linesize;
- VP8Macroblock *mb = s->macroblocks + mb_y*s->mb_stride;
int mb_x;
for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
backup_mb_border(s->top_border[mb_x+1], dst, NULL, NULL, s->linesize, 0, 1);
- filter_mb_simple(s, dst, mb++, mb_x, mb_y);
+ filter_mb_simple(s, dst, f++, mb_x, mb_y);
dst += 16;
}
}
static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
- const uint8_t *buf, int buf_size)
+ AVPacket *avpkt)
{
VP8Context *s = avctx->priv_data;
int ret, mb_x, mb_y, i, y, referenced;
enum AVDiscard skip_thresh;
- AVFrame *curframe;
+ AVFrame *curframe = NULL;
- if ((ret = decode_frame_header(s, buf, buf_size)) < 0)
+ if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
return ret;
referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
@@ -1407,13 +1469,19 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
+ /* Zero macroblock structures for top/left prediction from outside the frame. */
+ memset(s->macroblocks, 0, (s->mb_width + s->mb_height*2)*sizeof(*s->macroblocks));
+
// top edge of 127 for intra prediction
memset(s->top_border, 127, (s->mb_width+1)*sizeof(*s->top_border));
+ memset(s->ref_count, 0, sizeof(s->ref_count));
for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
- VP8Macroblock *mb = s->macroblocks + mb_y*s->mb_stride;
+ VP8Macroblock *mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
uint8_t *intra4x4 = s->intra4x4_pred_mode + 4*mb_y*s->b4_stride;
+ uint8_t *segment_map = s->segmentation_map + mb_y*s->mb_stride;
+ int mb_xy = mb_y * s->mb_stride;
uint8_t *dst[3] = {
curframe->data[0] + 16*mb_y*s->linesize,
curframe->data[1] + 8*mb_y*s->uvlinesize,
@@ -1430,25 +1498,30 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
if (mb_y)
memset(s->top_border, 129, sizeof(*s->top_border));
- for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
- decode_mb_mode(s, mb, mb_x, mb_y, intra4x4 + 4*mb_x);
+ for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
+ uint8_t *intra4x4_mb = s->keyframe ? intra4x4 + 4*mb_x : s->intra4x4_pred_mode_mb;
+ uint8_t *segment_mb = segment_map+mb_x;
+
+ /* Prefetch the current frame, 4 MBs ahead */
+ s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
+ s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
+
+ decode_mb_mode(s, mb, mb_x, mb_y, intra4x4_mb, segment_mb);
+
+ prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
if (!mb->skip)
decode_mb_coeffs(s, c, mb, s->top_nnz[mb_x], s->left_nnz);
- else {
- AV_ZERO128(s->non_zero_count_cache); // luma
- AV_ZERO64(s->non_zero_count_cache[4]); // chroma
- }
- if (mb->mode <= MODE_I4x4) {
- intra_predict(s, dst, mb, intra4x4 + 4*mb_x, mb_x, mb_y);
- memset(mb->bmv, 0, sizeof(mb->bmv));
- } else {
+ if (mb->mode <= MODE_I4x4)
+ intra_predict(s, dst, mb, intra4x4_mb, mb_x, mb_y);
+ else
inter_predict(s, dst, mb, mb_x, mb_y);
- }
+
+ prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
if (!mb->skip) {
- idct_mb(s, dst[0], dst[1], dst[2], mb);
+ idct_mb(s, dst, mb);
} else {
AV_ZERO64(s->left_nnz);
AV_WN64(s->top_nnz[mb_x], 0); // array of 9, so unaligned
@@ -1460,10 +1533,14 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
}
}
+ if (s->deblock_filter)
+ filter_level_for_mb(s, mb, &s->filter_strength[mb_x]);
+
+ prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
+
dst[0] += 16;
dst[1] += 8;
dst[2] += 8;
- mb++;
}
if (s->deblock_filter) {
if (s->filter.simple)
@@ -1508,7 +1585,7 @@ skip_decode:
*data_size = sizeof(AVFrame);
}
- return buf_size;
+ return avpkt->size;
}
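Annotation: both vp56_decode_frame and vp8_decode_frame switch from raw (buf, buf_size) arguments to an AVPacket, following the decode API libavcodec adopted in that era. A hedged sketch of the caller side (avcodec_decode_video2-generation API; exact entry points depend on the libavcodec version in this tree):

    #include <libavcodec/avcodec.h>

    static int decode_one(AVCodecContext *ctx, AVFrame *frame,
                          const uint8_t *buf, int size)
    {
        AVPacket pkt;
        int got_frame = 0;

        av_init_packet(&pkt);
        pkt.data = (uint8_t *)buf;  /* the packet now carries what used to
                                       be the raw (buf, buf_size) pair */
        pkt.size = size;

        if (avcodec_decode_video2(ctx, frame, &got_frame, &pkt) < 0)
            return -1;
        return got_frame;
    }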
static av_cold int vp8_decode_init(AVCodecContext *avctx)
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8data.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8data.h
index 5d718b4bb..9f56ab63b 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8data.h
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8data.h
@@ -369,8 +369,6 @@ static const uint8_t * const vp8_dct_cat_prob[6] =
vp8_dct_cat6_prob,
};
-static const uint8_t vp8_dct_cat_offset[6] = { 5, 7, 11, 19, 35, 67 };
-
static const uint8_t vp8_token_default_probs[4][8][3][NUM_DCT_TOKENS-1] =
{
{
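Annotation: the removed vp8_dct_cat_offset table is pure arithmetic: for cat = 0..5, 3 + (2 << cat) yields 5, 7, 11, 19, 35, 67, exactly what vp8.c now computes inline. A one-line verification:

    #include <assert.h>

    int main(void)
    {
        static const int offsets[6] = { 5, 7, 11, 19, 35, 67 };
        for (int cat = 0; cat < 6; cat++)
            assert(offsets[cat] == 3 + (2 << cat));
        return 0;
    }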
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8dsp.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8dsp.c
index 5e924017f..0c61d9252 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8dsp.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8dsp.c
@@ -69,6 +69,10 @@ static void vp8_idct_add_c(uint8_t *dst, DCTELEM block[16], int stride)
t1 = block[0*4+i] - block[2*4+i];
t2 = MUL_35468(block[1*4+i]) - MUL_20091(block[3*4+i]);
t3 = MUL_20091(block[1*4+i]) + MUL_35468(block[3*4+i]);
+ block[0*4+i] = 0;
+ block[1*4+i] = 0;
+ block[2*4+i] = 0;
+ block[3*4+i] = 0;
tmp[i*4+0] = t0 + t3;
tmp[i*4+1] = t1 + t2;
@@ -94,6 +98,7 @@ static void vp8_idct_dc_add_c(uint8_t *dst, DCTELEM block[16], int stride)
{
int i, dc = (block[0] + 4) >> 3;
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP + dc;
+ block[0] = 0;
for (i = 0; i < 4; i++) {
dst[0] = cm[dst[0]];
@@ -104,6 +109,21 @@ static void vp8_idct_dc_add_c(uint8_t *dst, DCTELEM block[16], int stride)
}
}
+static void vp8_idct_dc_add4uv_c(uint8_t *dst, DCTELEM block[4][16], int stride)
+{
+ vp8_idct_dc_add_c(dst+stride*0+0, block[0], stride);
+ vp8_idct_dc_add_c(dst+stride*0+4, block[1], stride);
+ vp8_idct_dc_add_c(dst+stride*4+0, block[2], stride);
+ vp8_idct_dc_add_c(dst+stride*4+4, block[3], stride);
+}
+
+static void vp8_idct_dc_add4y_c(uint8_t *dst, DCTELEM block[4][16], int stride)
+{
+ vp8_idct_dc_add_c(dst+ 0, block[0], stride);
+ vp8_idct_dc_add_c(dst+ 4, block[1], stride);
+ vp8_idct_dc_add_c(dst+ 8, block[2], stride);
+ vp8_idct_dc_add_c(dst+12, block[3], stride);
+}
// because I like only having two parameters to pass functions...
#define LOAD_PIXELS\
@@ -455,9 +475,11 @@ VP8_BILINEAR(4)
av_cold void ff_vp8dsp_init(VP8DSPContext *dsp)
{
- dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c;
- dsp->vp8_idct_add = vp8_idct_add_c;
- dsp->vp8_idct_dc_add = vp8_idct_dc_add_c;
+ dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c;
+ dsp->vp8_idct_add = vp8_idct_add_c;
+ dsp->vp8_idct_dc_add = vp8_idct_dc_add_c;
+ dsp->vp8_idct_dc_add4y = vp8_idct_dc_add4y_c;
+ dsp->vp8_idct_dc_add4uv = vp8_idct_dc_add4uv_c;
dsp->vp8_v_loop_filter16y = vp8_v_loop_filter16_c;
dsp->vp8_h_loop_filter16y = vp8_h_loop_filter16_c;
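Annotation: the two new IDCT entry points follow the usual libavcodec DSP pattern: the generic init fills function pointers with C implementations, and an arch-specific init (vp8dsp-init.c below) may overwrite them at runtime. A minimal model with hypothetical names:

    #include <stdio.h>

    typedef struct {
        void (*idct_dc_add4y)(const char *tag);
    } DSPContext;

    static void idct_dc_add4y_c(const char *tag)   { printf("C    %s\n", tag); }
    static void idct_dc_add4y_mmx(const char *tag) { printf("MMX  %s\n", tag); }

    static void dsp_init(DSPContext *dsp, int have_mmx)
    {
        dsp->idct_dc_add4y = idct_dc_add4y_c;        /* portable baseline */
        if (have_mmx)
            dsp->idct_dc_add4y = idct_dc_add4y_mmx;  /* runtime override  */
    }

    int main(void)
    {
        DSPContext dsp;
        dsp_init(&dsp, 1);
        dsp.idct_dc_add4y("dispatched via pointer");
        return 0;
    }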
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8dsp.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8dsp.h
index 64a3bfbc5..47b1a9077 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8dsp.h
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/vp8dsp.h
@@ -33,6 +33,8 @@ typedef struct VP8DSPContext {
void (*vp8_luma_dc_wht)(DCTELEM block[4][4][16], DCTELEM dc[16]);
void (*vp8_idct_add)(uint8_t *dst, DCTELEM block[16], int stride);
void (*vp8_idct_dc_add)(uint8_t *dst, DCTELEM block[16], int stride);
+ void (*vp8_idct_dc_add4y)(uint8_t *dst, DCTELEM block[4][16], int stride);
+ void (*vp8_idct_dc_add4uv)(uint8_t *dst, DCTELEM block[4][16], int stride);
// loop filter applied to edges between macroblocks
void (*vp8_v_loop_filter16y)(uint8_t *dst, int stride,
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.c
index a94cfca0f..cd4e46219 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/dsputil_mmx.c
@@ -2956,7 +2956,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
}
-#if CONFIG_GPL && HAVE_YASM
+#if HAVE_YASM
if (mm_flags & FF_MM_MMX2){
#if ARCH_X86_32
c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
@@ -2969,9 +2969,11 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2;
c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2;
#endif
+#if CONFIG_GPL
c->h264_idct_add16 = ff_h264_idct_add16_sse2;
c->h264_idct_add8 = ff_h264_idct_add8_sse2;
c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2;
+#endif
}
}
#endif
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock_sse2.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock_sse2.asm
index b2aa94023..a9e6dea3d 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock_sse2.asm
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/h264_deblock_sse2.asm
@@ -5,20 +5,22 @@
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
+;* This file is part of FFmpeg.
;*
-;* This program is distributed in the hope that it will be useful,
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;* GNU General Public License for more details.
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
-;*****************************************************************************
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
%include "x86inc.asm"
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_yasm.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_yasm.asm
index b4d50f5ad..660ff1169 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_yasm.asm
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vc1dsp_yasm.asm
@@ -1,330 +1,330 @@
-;******************************************************************************
-;* VC1 deblocking optimizations
-;* Copyright (c) 2009 David Conrad
-;*
-;* This file is part of FFmpeg.
-;*
-;* FFmpeg is free software; you can redistribute it and/or
-;* modify it under the terms of the GNU Lesser General Public
-;* License as published by the Free Software Foundation; either
-;* version 2.1 of the License, or (at your option) any later version.
-;*
-;* FFmpeg is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-;* Lesser General Public License for more details.
-;*
-;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-;******************************************************************************
-
-%include "x86inc.asm"
-%include "x86util.asm"
-
-cextern pw_4
-cextern pw_5
-
-section .text
-
-; dst_low, dst_high (src), zero
-; zero-extends one vector from 8 to 16 bits
-%macro UNPACK_8TO16 4
- mova m%2, m%3
- punpckh%1 m%3, m%4
- punpckl%1 m%2, m%4
-%endmacro
-
-%macro STORE_4_WORDS_MMX 6
- movd %6, %5
-%if mmsize==16
- psrldq %5, 4
-%else
- psrlq %5, 32
-%endif
- mov %1, %6w
- shr %6, 16
- mov %2, %6w
- movd %6, %5
- mov %3, %6w
- shr %6, 16
- mov %4, %6w
-%endmacro
-
-%macro STORE_4_WORDS_SSE4 6
- pextrw %1, %5, %6+0
- pextrw %2, %5, %6+1
- pextrw %3, %5, %6+2
- pextrw %4, %5, %6+3
-%endmacro
-
-; in: p1 p0 q0 q1, clobbers p0
-; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
-%macro VC1_LOOP_FILTER_A0 4
- psubw %1, %4
- psubw %2, %3
- paddw %1, %1
- pmullw %2, [pw_5]
- psubw %1, %2
- paddw %1, [pw_4]
- psraw %1, 3
-%endmacro
-
-; in: p0 q0 a0 a1 a2
-; m0 m1 m7 m6 m5
-; %1: size
-; out: m0=p0' m1=q0'
-%macro VC1_FILTER 1
- PABSW m4, m7
- PABSW m3, m6
- PABSW m2, m5
- mova m6, m4
- pminsw m3, m2
- pcmpgtw m6, m3 ; if (a2 < a0 || a1 < a0)
- psubw m3, m4
- pmullw m3, [pw_5] ; 5*(a3 - a0)
- PABSW m2, m3
- psraw m2, 3 ; abs(d/8)
- pxor m7, m3 ; d_sign ^= a0_sign
-
- pxor m5, m5
- movd m3, r2
-%if %1 > 4
- punpcklbw m3, m3
-%endif
- punpcklbw m3, m5
- pcmpgtw m3, m4 ; if (a0 < pq)
- pand m6, m3
-
- mova m3, m0
- psubw m3, m1
- PABSW m4, m3
- psraw m4, 1
- pxor m3, m7 ; d_sign ^ clip_sign
- psraw m3, 15
- pminsw m2, m4 ; min(d, clip)
- pcmpgtw m4, m5
- pand m6, m4 ; filt3 (C return value)
-
-; each set of 4 pixels is not filtered if the 3rd is not
-%if mmsize==16
- pshuflw m4, m6, 0xaa
-%if %1 > 4
- pshufhw m4, m4, 0xaa
-%endif
-%else
- pshufw m4, m6, 0xaa
-%endif
- pandn m3, m4
- pand m2, m6
- pand m3, m2 ; d final
-
- PSIGNW m3, m7
- psubw m0, m3
- paddw m1, m3
- packuswb m0, m0
- packuswb m1, m1
-%endmacro
-
-; 1st param: size of filter
-; 2nd param: mov suffix equivalent to the filter size
-%macro VC1_V_LOOP_FILTER 2
- pxor m5, m5
- mov%2 m6, [r4]
- mov%2 m4, [r4+r1]
- mov%2 m7, [r4+2*r1]
- mov%2 m0, [r4+r3]
- punpcklbw m6, m5
- punpcklbw m4, m5
- punpcklbw m7, m5
- punpcklbw m0, m5
-
- VC1_LOOP_FILTER_A0 m6, m4, m7, m0
- mov%2 m1, [r0]
- mov%2 m2, [r0+r1]
- punpcklbw m1, m5
- punpcklbw m2, m5
- mova m4, m0
- VC1_LOOP_FILTER_A0 m7, m4, m1, m2
- mov%2 m3, [r0+2*r1]
- mov%2 m4, [r0+r3]
- punpcklbw m3, m5
- punpcklbw m4, m5
- mova m5, m1
- VC1_LOOP_FILTER_A0 m5, m2, m3, m4
-
- VC1_FILTER %1
- mov%2 [r4+r3], m0
- mov%2 [r0], m1
-%endmacro
-
-; 1st param: size of filter
-; NOTE: UNPACK_8TO16 this number of 8 bit numbers are in half a register
-; 2nd (optional) param: temp register to use for storing words
-%macro VC1_H_LOOP_FILTER 1-2
-%if %1 == 4
- movq m0, [r0 -4]
- movq m1, [r0+ r1-4]
- movq m2, [r0+2*r1-4]
- movq m3, [r0+ r3-4]
- TRANSPOSE4x4B 0, 1, 2, 3, 4
-%else
- movq m0, [r0 -4]
- movq m4, [r0+ r1-4]
- movq m1, [r0+2*r1-4]
- movq m5, [r0+ r3-4]
- movq m2, [r4 -4]
- movq m6, [r4+ r1-4]
- movq m3, [r4+2*r1-4]
- movq m7, [r4+ r3-4]
- punpcklbw m0, m4
- punpcklbw m1, m5
- punpcklbw m2, m6
- punpcklbw m3, m7
- TRANSPOSE4x4W 0, 1, 2, 3, 4
-%endif
- pxor m5, m5
-
- UNPACK_8TO16 bw, 6, 0, 5
- UNPACK_8TO16 bw, 7, 1, 5
- VC1_LOOP_FILTER_A0 m6, m0, m7, m1
- UNPACK_8TO16 bw, 4, 2, 5
- mova m0, m1 ; m0 = p0
- VC1_LOOP_FILTER_A0 m7, m1, m4, m2
- UNPACK_8TO16 bw, 1, 3, 5
- mova m5, m4
- VC1_LOOP_FILTER_A0 m5, m2, m1, m3
- SWAP 1, 4 ; m1 = q0
-
- VC1_FILTER %1
- punpcklbw m0, m1
-%if %0 > 1
- STORE_4_WORDS_MMX [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
-%if %1 > 4
- psrldq m0, 4
- STORE_4_WORDS_MMX [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
-%endif
-%else
- STORE_4_WORDS_SSE4 [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
- STORE_4_WORDS_SSE4 [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
-%endif
-%endmacro
-
-
-%macro START_V_FILTER 0
- mov r4, r0
- lea r3, [4*r1]
- sub r4, r3
- lea r3, [r1+2*r1]
- imul r2, 0x01010101
-%endmacro
-
-%macro START_H_FILTER 1
- lea r3, [r1+2*r1]
-%if %1 > 4
- lea r4, [r0+4*r1]
-%endif
- imul r2, 0x01010101
-%endmacro
-
-; I dont know why the sign extension is needed...
-%macro PSIGNW_SRA_MMX 2
- psraw %2, 15
- PSIGNW_MMX %1, %2
-%endmacro
-
-
-%macro VC1_LF_MMX 1
-INIT_MMX
-cglobal vc1_v_loop_filter_internal_%1
- VC1_V_LOOP_FILTER 4, d
- ret
-
-cglobal vc1_h_loop_filter_internal_%1
- VC1_H_LOOP_FILTER 4, r4
- ret
-
-; void ff_vc1_v_loop_filter4_mmx2(uint8_t *src, int stride, int pq)
-cglobal vc1_v_loop_filter4_%1, 3,5,0
- START_V_FILTER
- call vc1_v_loop_filter_internal_%1
- RET
-
-; void ff_vc1_h_loop_filter4_mmx2(uint8_t *src, int stride, int pq)
-cglobal vc1_h_loop_filter4_%1, 3,5,0
- START_H_FILTER 4
- call vc1_h_loop_filter_internal_%1
- RET
-
-; void ff_vc1_v_loop_filter8_mmx2(uint8_t *src, int stride, int pq)
-cglobal vc1_v_loop_filter8_%1, 3,5,0
- START_V_FILTER
- call vc1_v_loop_filter_internal_%1
- add r4, 4
- add r0, 4
- call vc1_v_loop_filter_internal_%1
- RET
-
-; void ff_vc1_h_loop_filter8_mmx2(uint8_t *src, int stride, int pq)
-cglobal vc1_h_loop_filter8_%1, 3,5,0
- START_H_FILTER 4
- call vc1_h_loop_filter_internal_%1
- lea r0, [r0+4*r1]
- call vc1_h_loop_filter_internal_%1
- RET
-%endmacro
-
-%define PABSW PABSW_MMX
-%define PSIGNW PSIGNW_SRA_MMX
-VC1_LF_MMX mmx
-
-%define PABSW PABSW_MMX2
-VC1_LF_MMX mmx2
-
-INIT_XMM
-; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
-cglobal vc1_v_loop_filter8_sse2, 3,5,8
- START_V_FILTER
- VC1_V_LOOP_FILTER 8, q
- RET
-
-; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
-cglobal vc1_h_loop_filter8_sse2, 3,6,8
- START_H_FILTER 8
- VC1_H_LOOP_FILTER 8, r5
- RET
-
-%define PABSW PABSW_SSSE3
-%define PSIGNW PSIGNW_SSSE3
-
-INIT_MMX
-; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
-cglobal vc1_v_loop_filter4_ssse3, 3,5,0
- START_V_FILTER
- VC1_V_LOOP_FILTER 4, d
- RET
-
-; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
-cglobal vc1_h_loop_filter4_ssse3, 3,5,0
- START_H_FILTER 4
- VC1_H_LOOP_FILTER 4, r4
- RET
-
-INIT_XMM
-; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
-cglobal vc1_v_loop_filter8_ssse3, 3,5,8
- START_V_FILTER
- VC1_V_LOOP_FILTER 8, q
- RET
-
-; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
-cglobal vc1_h_loop_filter8_ssse3, 3,6,8
- START_H_FILTER 8
- VC1_H_LOOP_FILTER 8, r5
- RET
-
-; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
-cglobal vc1_h_loop_filter8_sse4, 3,5,8
- START_H_FILTER 8
- VC1_H_LOOP_FILTER 8
- RET
+;******************************************************************************
+;* VC1 deblocking optimizations
+;* Copyright (c) 2009 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+cextern pw_4
+cextern pw_5
+
+section .text
+
+; dst_low, dst_high (src), zero
+; zero-extends one vector from 8 to 16 bits
+%macro UNPACK_8TO16 4
+ mova m%2, m%3
+ punpckh%1 m%3, m%4
+ punpckl%1 m%2, m%4
+%endmacro
+
+%macro STORE_4_WORDS_MMX 6
+ movd %6, %5
+%if mmsize==16
+ psrldq %5, 4
+%else
+ psrlq %5, 32
+%endif
+ mov %1, %6w
+ shr %6, 16
+ mov %2, %6w
+ movd %6, %5
+ mov %3, %6w
+ shr %6, 16
+ mov %4, %6w
+%endmacro
+
+%macro STORE_4_WORDS_SSE4 6
+ pextrw %1, %5, %6+0
+ pextrw %2, %5, %6+1
+ pextrw %3, %5, %6+2
+ pextrw %4, %5, %6+3
+%endmacro
+
+; in: p1 p0 q0 q1, clobbers p0
+; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
+%macro VC1_LOOP_FILTER_A0 4
+ psubw %1, %4
+ psubw %2, %3
+ paddw %1, %1
+ pmullw %2, [pw_5]
+ psubw %1, %2
+ paddw %1, [pw_4]
+ psraw %1, 3
+%endmacro
+
+; in: p0 q0 a0 a1 a2
+; m0 m1 m7 m6 m5
+; %1: size
+; out: m0=p0' m1=q0'
+%macro VC1_FILTER 1
+ PABSW m4, m7
+ PABSW m3, m6
+ PABSW m2, m5
+ mova m6, m4
+ pminsw m3, m2
+ pcmpgtw m6, m3 ; if (a2 < a0 || a1 < a0)
+ psubw m3, m4
+ pmullw m3, [pw_5] ; 5*(a3 - a0)
+ PABSW m2, m3
+ psraw m2, 3 ; abs(d/8)
+ pxor m7, m3 ; d_sign ^= a0_sign
+
+ pxor m5, m5
+ movd m3, r2
+%if %1 > 4
+ punpcklbw m3, m3
+%endif
+ punpcklbw m3, m5
+ pcmpgtw m3, m4 ; if (a0 < pq)
+ pand m6, m3
+
+ mova m3, m0
+ psubw m3, m1
+ PABSW m4, m3
+ psraw m4, 1
+ pxor m3, m7 ; d_sign ^ clip_sign
+ psraw m3, 15
+ pminsw m2, m4 ; min(d, clip)
+ pcmpgtw m4, m5
+ pand m6, m4 ; filt3 (C return value)
+
+; each set of 4 pixels is not filtered if the 3rd is not
+%if mmsize==16
+ pshuflw m4, m6, 0xaa
+%if %1 > 4
+ pshufhw m4, m4, 0xaa
+%endif
+%else
+ pshufw m4, m6, 0xaa
+%endif
+ pandn m3, m4
+ pand m2, m6
+ pand m3, m2 ; d final
+
+ PSIGNW m3, m7
+ psubw m0, m3
+ paddw m1, m3
+ packuswb m0, m0
+ packuswb m1, m1
+%endmacro
+
+; 1st param: size of filter
+; 2nd param: mov suffix equivalent to the filter size
+%macro VC1_V_LOOP_FILTER 2
+ pxor m5, m5
+ mov%2 m6, [r4]
+ mov%2 m4, [r4+r1]
+ mov%2 m7, [r4+2*r1]
+ mov%2 m0, [r4+r3]
+ punpcklbw m6, m5
+ punpcklbw m4, m5
+ punpcklbw m7, m5
+ punpcklbw m0, m5
+
+ VC1_LOOP_FILTER_A0 m6, m4, m7, m0
+ mov%2 m1, [r0]
+ mov%2 m2, [r0+r1]
+ punpcklbw m1, m5
+ punpcklbw m2, m5
+ mova m4, m0
+ VC1_LOOP_FILTER_A0 m7, m4, m1, m2
+ mov%2 m3, [r0+2*r1]
+ mov%2 m4, [r0+r3]
+ punpcklbw m3, m5
+ punpcklbw m4, m5
+ mova m5, m1
+ VC1_LOOP_FILTER_A0 m5, m2, m3, m4
+
+ VC1_FILTER %1
+ mov%2 [r4+r3], m0
+ mov%2 [r0], m1
+%endmacro
+
+; 1st param: size of filter
+; NOTE: UNPACK_8TO16 this number of 8 bit numbers are in half a register
+; 2nd (optional) param: temp register to use for storing words
+%macro VC1_H_LOOP_FILTER 1-2
+%if %1 == 4
+ movq m0, [r0 -4]
+ movq m1, [r0+ r1-4]
+ movq m2, [r0+2*r1-4]
+ movq m3, [r0+ r3-4]
+ TRANSPOSE4x4B 0, 1, 2, 3, 4
+%else
+ movq m0, [r0 -4]
+ movq m4, [r0+ r1-4]
+ movq m1, [r0+2*r1-4]
+ movq m5, [r0+ r3-4]
+ movq m2, [r4 -4]
+ movq m6, [r4+ r1-4]
+ movq m3, [r4+2*r1-4]
+ movq m7, [r4+ r3-4]
+ punpcklbw m0, m4
+ punpcklbw m1, m5
+ punpcklbw m2, m6
+ punpcklbw m3, m7
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+%endif
+ pxor m5, m5
+
+ UNPACK_8TO16 bw, 6, 0, 5
+ UNPACK_8TO16 bw, 7, 1, 5
+ VC1_LOOP_FILTER_A0 m6, m0, m7, m1
+ UNPACK_8TO16 bw, 4, 2, 5
+ mova m0, m1 ; m0 = p0
+ VC1_LOOP_FILTER_A0 m7, m1, m4, m2
+ UNPACK_8TO16 bw, 1, 3, 5
+ mova m5, m4
+ VC1_LOOP_FILTER_A0 m5, m2, m1, m3
+ SWAP 1, 4 ; m1 = q0
+
+ VC1_FILTER %1
+ punpcklbw m0, m1
+%if %0 > 1
+ STORE_4_WORDS_MMX [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
+%if %1 > 4
+ psrldq m0, 4
+ STORE_4_WORDS_MMX [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
+%endif
+%else
+ STORE_4_WORDS_SSE4 [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
+ STORE_4_WORDS_SSE4 [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
+%endif
+%endmacro
+
+
+%macro START_V_FILTER 0
+ mov r4, r0
+ lea r3, [4*r1]
+ sub r4, r3
+ lea r3, [r1+2*r1]
+ imul r2, 0x01010101
+%endmacro
+
+%macro START_H_FILTER 1
+ lea r3, [r1+2*r1]
+%if %1 > 4
+ lea r4, [r0+4*r1]
+%endif
+ imul r2, 0x01010101
+%endmacro
+
+; I dont know why the sign extension is needed...
+%macro PSIGNW_SRA_MMX 2
+ psraw %2, 15
+ PSIGNW_MMX %1, %2
+%endmacro
+
+
+%macro VC1_LF_MMX 1
+INIT_MMX
+cglobal vc1_v_loop_filter_internal_%1
+ VC1_V_LOOP_FILTER 4, d
+ ret
+
+cglobal vc1_h_loop_filter_internal_%1
+ VC1_H_LOOP_FILTER 4, r4
+ ret
+
+; void ff_vc1_v_loop_filter4_mmx2(uint8_t *src, int stride, int pq)
+cglobal vc1_v_loop_filter4_%1, 3,5,0
+ START_V_FILTER
+ call vc1_v_loop_filter_internal_%1
+ RET
+
+; void ff_vc1_h_loop_filter4_mmx2(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter4_%1, 3,5,0
+ START_H_FILTER 4
+ call vc1_h_loop_filter_internal_%1
+ RET
+
+; void ff_vc1_v_loop_filter8_mmx2(uint8_t *src, int stride, int pq)
+cglobal vc1_v_loop_filter8_%1, 3,5,0
+ START_V_FILTER
+ call vc1_v_loop_filter_internal_%1
+ add r4, 4
+ add r0, 4
+ call vc1_v_loop_filter_internal_%1
+ RET
+
+; void ff_vc1_h_loop_filter8_mmx2(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter8_%1, 3,5,0
+ START_H_FILTER 4
+ call vc1_h_loop_filter_internal_%1
+ lea r0, [r0+4*r1]
+ call vc1_h_loop_filter_internal_%1
+ RET
+%endmacro
+
+%define PABSW PABSW_MMX
+%define PSIGNW PSIGNW_SRA_MMX
+VC1_LF_MMX mmx
+
+%define PABSW PABSW_MMX2
+VC1_LF_MMX mmx2
+
+INIT_XMM
+; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
+cglobal vc1_v_loop_filter8_sse2, 3,5,8
+ START_V_FILTER
+ VC1_V_LOOP_FILTER 8, q
+ RET
+
+; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter8_sse2, 3,6,8
+ START_H_FILTER 8
+ VC1_H_LOOP_FILTER 8, r5
+ RET
+
+%define PABSW PABSW_SSSE3
+%define PSIGNW PSIGNW_SSSE3
+
+INIT_MMX
+; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
+cglobal vc1_v_loop_filter4_ssse3, 3,5,0
+ START_V_FILTER
+ VC1_V_LOOP_FILTER 4, d
+ RET
+
+; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter4_ssse3, 3,5,0
+ START_H_FILTER 4
+ VC1_H_LOOP_FILTER 4, r4
+ RET
+
+INIT_XMM
+; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
+cglobal vc1_v_loop_filter8_ssse3, 3,5,8
+ START_V_FILTER
+ VC1_V_LOOP_FILTER 8, q
+ RET
+
+; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter8_ssse3, 3,6,8
+ START_H_FILTER 8
+ VC1_H_LOOP_FILTER 8, r5
+ RET
+
+; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter8_sse4, 3,5,8
+ START_H_FILTER 8
+ VC1_H_LOOP_FILTER 8
+ RET
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp-init.c b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp-init.c
index d75f1a1d8..e06da5e42 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp-init.c
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp-init.c
@@ -220,67 +220,39 @@ HVBILIN(ssse3, 8, 16, 16)
extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
-extern void ff_vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16]);
+extern void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, DCTELEM block[4][16], int stride);
+extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, DCTELEM block[4][16], int stride);
+extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, DCTELEM block[4][16], int stride);
+extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
+extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride);
+
+#define DECLARE_LOOP_FILTER(NAME)\
+extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\
+extern void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\
+extern void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, int stride,\
+ int e, int i, int hvt);\
+extern void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, int stride,\
+ int e, int i, int hvt);\
+extern void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, uint8_t *dstV,\
+ int s, int e, int i, int hvt);\
+extern void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, uint8_t *dstV,\
+ int s, int e, int i, int hvt);\
+extern void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, int stride,\
+ int e, int i, int hvt);\
+extern void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, int stride,\
+ int e, int i, int hvt);\
+extern void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, uint8_t *dstV,\
+ int s, int e, int i, int hvt);\
+extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, uint8_t *dstV,\
+ int s, int e, int i, int hvt);
+
+DECLARE_LOOP_FILTER(mmx)
+DECLARE_LOOP_FILTER(mmxext)
+DECLARE_LOOP_FILTER(sse2)
+DECLARE_LOOP_FILTER(ssse3)
+DECLARE_LOOP_FILTER(sse4)
-extern void ff_vp8_v_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim);
-extern void ff_vp8_v_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
-extern void ff_vp8_v_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim);
-extern void ff_vp8_h_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim);
-extern void ff_vp8_h_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
-extern void ff_vp8_h_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim);
-
-extern void ff_vp8_v_loop_filter16y_inner_mmx (uint8_t *dst, int stride,
- int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter16y_inner_mmxext(uint8_t *dst, int stride,
- int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter16y_inner_sse2 (uint8_t *dst, int stride,
- int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter16y_inner_mmx (uint8_t *dst, int stride,
- int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter16y_inner_mmxext(uint8_t *dst, int stride,
- int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter16y_inner_sse2 (uint8_t *dst, int stride,
- int e, int i, int hvt);
-
-extern void ff_vp8_v_loop_filter8uv_inner_mmx (uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter8uv_inner_sse2 (uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter8uv_inner_mmx (uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter8uv_inner_sse2 (uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-
-extern void ff_vp8_v_loop_filter16y_mbedge_mmx (uint8_t *dst, int stride,
- int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter16y_mbedge_mmxext(uint8_t *dst, int stride,
- int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter16y_mbedge_sse2 (uint8_t *dst, int stride,
- int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter16y_mbedge_mmx (uint8_t *dst, int stride,
- int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter16y_mbedge_mmxext(uint8_t *dst, int stride,
- int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter16y_mbedge_sse2 (uint8_t *dst, int stride,
- int e, int i, int hvt);
-
-extern void ff_vp8_v_loop_filter8uv_mbedge_mmx (uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter8uv_mbedge_mmxext(uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-extern void ff_vp8_v_loop_filter8uv_mbedge_sse2 (uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter8uv_mbedge_mmx (uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter8uv_mbedge_mmxext(uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
-extern void ff_vp8_h_loop_filter8uv_mbedge_sse2 (uint8_t *dstU, uint8_t *dstV,
- int s, int e, int i, int hvt);
#endif
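The DECLARE_LOOP_FILTER macro above collapses the ten loop-filter prototypes per instruction set into a single invocation per suffix. A trimmed, self-contained illustration of the token-pasting pattern (two prototypes per invocation instead of ten):

    #include <stdint.h>

    /* The ## operator pastes the ISA suffix onto each function name,
     * so one invocation per suffix replaces a block of hand-written
     * declarations. */
    #define DECLARE_SIMPLE_FILTER(NAME) \
    void ff_vp8_v_loop_filter_simple_##NAME(uint8_t *dst, int stride, int flim); \
    void ff_vp8_h_loop_filter_simple_##NAME(uint8_t *dst, int stride, int flim);

    DECLARE_SIMPLE_FILTER(mmx)  /* ff_vp8_v/h_loop_filter_simple_mmx */
    DECLARE_SIMPLE_FILTER(sse2) /* ff_vp8_v/h_loop_filter_simple_sse2 */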
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
@@ -313,8 +285,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
#if HAVE_YASM
if (mm_flags & FF_MM_MMX) {
- c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
- c->vp8_idct_add = ff_vp8_idct_add_mmx;
+ c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
+ c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx;
+ c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
+ c->vp8_idct_add = ff_vp8_idct_add_mmx;
+ c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
c->put_vp8_epel_pixels_tab[0][0][0] =
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
c->put_vp8_epel_pixels_tab[1][0][0] =
@@ -337,7 +312,6 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
/* note that 4-tap width=16 functions are missing because w=16
* is only used for luma, and luma is always a copy or sixtap. */
if (mm_flags & FF_MM_MMX2) {
- c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmxext;
#if ARCH_X86_32
VP8_LUMA_MC_FUNC(0, 16, mmxext);
VP8_MC_FUNC(1, 8, mmxext);
@@ -362,6 +336,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
}
if (mm_flags & FF_MM_SSE) {
+ c->vp8_idct_add = ff_vp8_idct_add_sse;
c->put_vp8_epel_pixels_tab[0][0][0] =
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
}
@@ -380,16 +355,18 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
- c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext;
- c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
+ c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
+ c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
}
if (mm_flags & FF_MM_SSE2) {
+ c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
+
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
- c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext;
- c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
+ c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
+ c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
}
if (mm_flags & FF_MM_SSSE3) {
@@ -401,10 +378,26 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
#endif
+
+ c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
+ c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
+
+ c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
+ c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
+ c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
+ c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;
+
+ c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3;
+ c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
+ c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
+ c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
}
if (mm_flags & FF_MM_SSE4) {
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
+
+ c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
+ c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
}
#endif
}
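Throughout ff_vp8dsp_init_x86 the flag checks run from weakest to strongest instruction set, so each block can simply overwrite pointers set by an earlier one; the SSE4 block, for instance, only replaces the three functions where SSE4 actually helps. A reduced sketch of the pattern (placeholder names, not the ffmpeg API):

    #include <stdint.h>

    /* Placeholder kernels standing in for the real asm implementations. */
    static void idct_dc_add_mmx(uint8_t *d, int16_t *b, int s)  { (void)d; (void)b; (void)s; }
    static void idct_dc_add_sse4(uint8_t *d, int16_t *b, int s) { (void)d; (void)b; (void)s; }

    typedef struct {
        void (*vp8_idct_dc_add)(uint8_t *dst, int16_t *block, int stride);
    } DSPCtx;

    enum { HAS_MMX = 1, HAS_SSE2 = 2, HAS_SSE4 = 4 };

    /* Checks run from weakest to strongest ISA, so a later block simply
     * overwrites the pointer set by an earlier one. */
    static void dsp_init(DSPCtx *c, int flags)
    {
        if (flags & HAS_MMX)
            c->vp8_idct_dc_add = idct_dc_add_mmx;
        if (flags & HAS_SSE4)
            c->vp8_idct_dc_add = idct_dc_add_sse4; /* overrides the MMX pick */
    }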
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp.asm
index 2ff415266..4aa901e27 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp.asm
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/vp8dsp.asm
@@ -900,74 +900,189 @@ cglobal put_vp8_pixels16_sse, 5,5,2
REP_RET
;-----------------------------------------------------------------------------
-; IDCT functions:
-;
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------
+%macro ADD_DC 4
+ %4 m2, [r0+%3]
+ %4 m3, [r0+r2+%3]
+ %4 m4, [r1+%3]
+ %4 m5, [r1+r2+%3]
+ paddusb m2, %1
+ paddusb m3, %1
+ paddusb m4, %1
+ paddusb m5, %1
+ psubusb m2, %2
+ psubusb m3, %2
+ psubusb m4, %2
+ psubusb m5, %2
+ %4 [r0+%3], m2
+ %4 [r0+r2+%3], m3
+ %4 [r1+%3], m4
+ %4 [r1+r2+%3], m5
+%endmacro
+
+INIT_MMX
cglobal vp8_idct_dc_add_mmx, 3, 3
; load data
- movd mm0, [r1]
+ movd m0, [r1]
; calculate DC
- paddw mm0, [pw_4]
- pxor mm1, mm1
- psraw mm0, 3
- psubw mm1, mm0
- packuswb mm0, mm0
- packuswb mm1, mm1
- punpcklbw mm0, mm0
- punpcklbw mm1, mm1
- punpcklwd mm0, mm0
- punpcklwd mm1, mm1
+ paddw m0, [pw_4]
+ pxor m1, m1
+ psraw m0, 3
+ movd [r1], m1
+ psubw m1, m0
+ packuswb m0, m0
+ packuswb m1, m1
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+ punpcklwd m0, m0
+ punpcklwd m1, m1
; add DC
- lea r1, [r0+r2*2]
- movd mm2, [r0]
- movd mm3, [r0+r2]
- movd mm4, [r1]
- movd mm5, [r1+r2]
- paddusb mm2, mm0
- paddusb mm3, mm0
- paddusb mm4, mm0
- paddusb mm5, mm0
- psubusb mm2, mm1
- psubusb mm3, mm1
- psubusb mm4, mm1
- psubusb mm5, mm1
- movd [r0], mm2
- movd [r0+r2], mm3
- movd [r1], mm4
- movd [r1+r2], mm5
+ lea r1, [r0+r2*2]
+ ADD_DC m0, m1, 0, movh
RET
+INIT_XMM
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
; load data
- movd xmm0, [r1]
- lea r1, [r0+r2*2]
- pxor xmm1, xmm1
- movq xmm2, [pw_4]
+ movd m0, [r1]
+ pxor m1, m1
+
+ ; calculate DC
+ paddw m0, [pw_4]
+ movd [r1], m1
+ lea r1, [r0+r2*2]
+ movd m2, [r0]
+ movd m3, [r0+r2]
+ movd m4, [r1]
+ movd m5, [r1+r2]
+ psraw m0, 3
+ pshuflw m0, m0, 0
+ punpcklqdq m0, m0
+ punpckldq m2, m3
+ punpckldq m4, m5
+ punpcklbw m2, m1
+ punpcklbw m4, m1
+ paddw m2, m0
+ paddw m4, m0
+ packuswb m2, m4
+ movd [r0], m2
+ pextrd [r0+r2], m2, 1
+ pextrd [r1], m2, 2
+ pextrd [r1+r2], m2, 3
+ RET
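Both routines compute the same thing, and the new movd [r1], m1 store also zeroes the DC coefficient so the caller no longer has to. A hedged scalar reference follows; note that ADD_DC's paddusb/psubusb pair is this same signed add, split into a saturating add of max(dc, 0) and a saturating subtract of max(-dc, 0):

    #include <stdint.h>

    static uint8_t clip_uint8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* Scalar model of vp8_idct_dc_add_<opt>: with only the DC
     * coefficient present, the whole 4x4 inverse transform collapses to
     * one rounded value added to every pixel, and the coefficient slot
     * is cleared in the same pass. */
    void idct_dc_add_c(uint8_t *dst, int16_t block[16], int stride)
    {
        int dc = (block[0] + 4) >> 3;

        block[0] = 0;
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                dst[y * stride + x] = clip_uint8(dst[y * stride + x] + dc);
    }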
+
+;-----------------------------------------------------------------------------
+; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
+;-----------------------------------------------------------------------------
+
+INIT_MMX
+cglobal vp8_idct_dc_add4y_mmx, 3, 3
+ ; load data
+ movd m0, [r1+32*0] ; A
+ movd m1, [r1+32*2] ; C
+ punpcklwd m0, [r1+32*1] ; A B
+ punpcklwd m1, [r1+32*3] ; C D
+ punpckldq m0, m1 ; A B C D
+ pxor m6, m6
+
+ ; calculate DC
+ paddw m0, [pw_4]
+ movd [r1+32*0], m6
+ movd [r1+32*1], m6
+ movd [r1+32*2], m6
+ movd [r1+32*3], m6
+ psraw m0, 3
+ psubw m6, m0
+ packuswb m0, m0
+ packuswb m6, m6
+ punpcklbw m0, m0 ; AABBCCDD
+ punpcklbw m6, m6 ; AABBCCDD
+ movq m1, m0
+ movq m7, m6
+ punpcklbw m0, m0 ; AAAABBBB
+ punpckhbw m1, m1 ; CCCCDDDD
+ punpcklbw m6, m6 ; AAAABBBB
+ punpckhbw m7, m7 ; CCCCDDDD
+
+ ; add DC
+ lea r1, [r0+r2*2]
+ ADD_DC m0, m6, 0, mova
+ ADD_DC m1, m7, 8, mova
+ RET
+
+INIT_XMM
+cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6
+ ; load data
+ movd m0, [r1+32*0] ; A
+ movd m1, [r1+32*2] ; C
+ punpcklwd m0, [r1+32*1] ; A B
+ punpcklwd m1, [r1+32*3] ; C D
+ punpckldq m0, m1 ; A B C D
+ pxor m1, m1
; calculate DC
- paddw xmm0, xmm2
- movd xmm2, [r0]
- movd xmm3, [r0+r2]
- movd xmm4, [r1]
- movd xmm5, [r1+r2]
- psraw xmm0, 3
- pshuflw xmm0, xmm0, 0
- punpcklqdq xmm0, xmm0
- punpckldq xmm2, xmm3
- punpckldq xmm4, xmm5
- punpcklbw xmm2, xmm1
- punpcklbw xmm4, xmm1
- paddw xmm2, xmm0
- paddw xmm4, xmm0
- packuswb xmm2, xmm4
- movd [r0], xmm2
- pextrd [r0+r2], xmm2, 1
- pextrd [r1], xmm2, 2
- pextrd [r1+r2], xmm2, 3
+ paddw m0, [pw_4]
+ movd [r1+32*0], m1
+ movd [r1+32*1], m1
+ movd [r1+32*2], m1
+ movd [r1+32*3], m1
+ psraw m0, 3
+ psubw m1, m0
+ packuswb m0, m0
+ packuswb m1, m1
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+
+ ; add DC
+ lea r1, [r0+r2*2]
+ ADD_DC m0, m1, 0, mova
+ RET
+
+;-----------------------------------------------------------------------------
+; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
+;-----------------------------------------------------------------------------
+
+INIT_MMX
+cglobal vp8_idct_dc_add4uv_mmx, 3, 3
+ ; load data
+ movd m0, [r1+32*0] ; A
+ movd m1, [r1+32*2] ; C
+ punpcklwd m0, [r1+32*1] ; A B
+ punpcklwd m1, [r1+32*3] ; C D
+ punpckldq m0, m1 ; A B C D
+ pxor m6, m6
+
+ ; calculate DC
+ paddw m0, [pw_4]
+ movd [r1+32*0], m6
+ movd [r1+32*1], m6
+ movd [r1+32*2], m6
+ movd [r1+32*3], m6
+ psraw m0, 3
+ psubw m6, m0
+ packuswb m0, m0
+ packuswb m6, m6
+ punpcklbw m0, m0 ; AABBCCDD
+ punpcklbw m6, m6 ; AABBCCDD
+ movq m1, m0
+ movq m7, m6
+ punpcklbw m0, m0 ; AAAABBBB
+ punpckhbw m1, m1 ; CCCCDDDD
+ punpcklbw m6, m6 ; AAAABBBB
+ punpckhbw m7, m7 ; CCCCDDDD
+
+ ; add DC
+ lea r1, [r0+r2*2]
+ ADD_DC m0, m6, 0, mova
+ lea r0, [r0+r2*4]
+ lea r1, [r1+r2*4]
+ ADD_DC m1, m7, 0, mova
RET
;-----------------------------------------------------------------------------
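The add4y and add4uv variants batch four DC adds. For 4y the four luma blocks sit side by side in one row of stores; for 4uv the trailing lea r0, [r0+r2*4] drops down four lines, giving the 2x2 chroma arrangement. A scalar sketch, reusing idct_dc_add_c from the sketch above:

    #include <stdint.h>

    /* From the sketch after vp8_idct_dc_add_sse4 above. */
    void idct_dc_add_c(uint8_t *dst, int16_t block[16], int stride);

    /* Four luma blocks in a row: x offsets 0, 4, 8, 12. */
    static void idct_dc_add4y_c(uint8_t *dst, int16_t block[4][16], int stride)
    {
        for (int i = 0; i < 4; i++)
            idct_dc_add_c(dst + 4 * i, block[i], stride);
    }

    /* Chroma 2x2 arrangement: two blocks side by side, then two more
     * one block row (4 lines) further down. */
    static void idct_dc_add4uv_c(uint8_t *dst, int16_t block[4][16], int stride)
    {
        idct_dc_add_c(dst + stride * 0 + 0, block[0], stride);
        idct_dc_add_c(dst + stride * 0 + 4, block[1], stride);
        idct_dc_add_c(dst + stride * 4 + 0, block[2], stride);
        idct_dc_add_c(dst + stride * 4 + 4, block[3], stride);
    }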
@@ -1006,14 +1121,26 @@ cglobal vp8_idct_dc_add_sse4, 3, 3, 6
%endmacro
INIT_MMX
-cglobal vp8_idct_add_mmx, 3, 3
+%macro VP8_IDCT_ADD 1
+cglobal vp8_idct_add_%1, 3, 3
; load block data
- movq m0, [r1]
- movq m1, [r1+8]
+ movq m0, [r1+ 0]
+ movq m1, [r1+ 8]
movq m2, [r1+16]
movq m3, [r1+24]
movq m6, [pw_20091]
movq m7, [pw_17734]
+%ifidn %1, sse
+ xorps xmm0, xmm0
+ movaps [r1+ 0], xmm0
+ movaps [r1+16], xmm0
+%else
+ pxor m4, m4
+ movq [r1+ 0], m4
+ movq [r1+ 8], m4
+ movq [r1+16], m4
+ movq [r1+24], m4
+%endif
; actual IDCT
VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
@@ -1029,20 +1156,34 @@ cglobal vp8_idct_add_mmx, 3, 3
STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
RET
+%endmacro
+
+VP8_IDCT_ADD mmx
+VP8_IDCT_ADD sse
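vp8_idct_add is now generated for both mmx and sse from one macro; the only difference is how the 32 bytes of coefficients are cleared after loading (two 16-byte movaps stores versus four 8-byte movq stores). In scalar terms the added behavior is just:

    #include <stdint.h>
    #include <string.h>

    /* The coefficients are consumed and then zeroed in the same call,
     * so the decoder does not need a separate clear between blocks. */
    static void clear_coeffs(int16_t block[16])
    {
        memset(block, 0, 16 * sizeof(*block));
    }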
;-----------------------------------------------------------------------------
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
;-----------------------------------------------------------------------------
-%macro SCATTER_WHT 1
- pextrw r1d, m0, %1
- pextrw r2d, m1, %1
- mov [r0+2*16*0], r1w
- mov [r0+2*16*1], r2w
- pextrw r1d, m2, %1
- pextrw r2d, m3, %1
- mov [r0+2*16*2], r1w
- mov [r0+2*16*3], r2w
+%macro SCATTER_WHT 3
+ movd r1d, m%1
+ movd r2d, m%2
+ mov [r0+2*16*(0+%3)], r1w
+ mov [r0+2*16*(1+%3)], r2w
+ shr r1d, 16
+ shr r2d, 16
+ psrlq m%1, 32
+ psrlq m%2, 32
+ mov [r0+2*16*(4+%3)], r1w
+ mov [r0+2*16*(5+%3)], r2w
+ movd r1d, m%1
+ movd r2d, m%2
+ mov [r0+2*16*(8+%3)], r1w
+ mov [r0+2*16*(9+%3)], r2w
+ shr r1d, 16
+ shr r2d, 16
+ mov [r0+2*16*(12+%3)], r1w
+ mov [r0+2*16*(13+%3)], r2w
%endmacro
%macro HADAMARD4_1D 4
@@ -1052,7 +1193,7 @@ cglobal vp8_idct_add_mmx, 3, 3
%endmacro
INIT_MMX
-cglobal vp8_luma_dc_wht_mmxext, 2,3
+cglobal vp8_luma_dc_wht_mmx, 2,3
movq m0, [r1]
movq m1, [r1+8]
movq m2, [r1+16]
@@ -1065,13 +1206,8 @@ cglobal vp8_luma_dc_wht_mmxext, 2,3
psraw m1, 3
psraw m2, 3
psraw m3, 3
- SCATTER_WHT 0
- add r0, 2*16*4
- SCATTER_WHT 1
- add r0, 2*16*4
- SCATTER_WHT 2
- add r0, 2*16*4
- SCATTER_WHT 3
+ SCATTER_WHT 0, 1, 0
+ SCATTER_WHT 2, 3, 2
RET
;-----------------------------------------------------------------------------
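For context, this routine is VP8's 4x4 inverse Walsh-Hadamard transform over the 16 luma DC values; SCATTER_WHT's 2*16-element strides place each result in the DC slot of one of the 16 luma subblocks. A hedged scalar sketch of the two Hadamard passes (an approximate reconstruction, not a verbatim copy of the C reference):

    #include <stdint.h>

    static void luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
    {
        for (int i = 0; i < 4; i++) {          /* vertical pass */
            int t0 = dc[i + 0] + dc[i + 12];
            int t1 = dc[i + 4] + dc[i +  8];
            int t2 = dc[i + 4] - dc[i +  8];
            int t3 = dc[i + 0] - dc[i + 12];

            dc[i +  0] = t0 + t1;
            dc[i +  4] = t3 + t2;
            dc[i +  8] = t0 - t1;
            dc[i + 12] = t3 - t2;
        }
        for (int i = 0; i < 4; i++) {          /* horizontal pass + scatter */
            int t0 = dc[i * 4 + 0] + dc[i * 4 + 3] + 3; /* +3 rounds >>3 */
            int t1 = dc[i * 4 + 1] + dc[i * 4 + 2];
            int t2 = dc[i * 4 + 1] - dc[i * 4 + 2];
            int t3 = dc[i * 4 + 0] - dc[i * 4 + 3] + 3;

            block[i][0][0] = (t0 + t1) >> 3;   /* DC slot of each subblock */
            block[i][1][0] = (t3 + t2) >> 3;
            block[i][2][0] = (t0 - t1) >> 3;
            block[i][3][0] = (t3 - t2) >> 3;
        }
    }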
@@ -1224,18 +1360,22 @@ cglobal vp8_luma_dc_wht_mmxext, 2,3
movd [%7+%9*2], m%4
%endmacro
-%macro SPLATB_REG 3
+%macro SPLATB_REG 3-4
movd %1, %2
+%ifidn %3, ssse3
+ pshufb %1, %4
+%else
punpcklbw %1, %1
%if mmsize == 16 ; sse2
- punpcklwd %1, %1
- pshufd %1, %1, 0x0
+ pshuflw %1, %1, 0x0
+ punpcklqdq %1, %1
%elifidn %3, mmx
punpcklwd %1, %1
punpckldq %1, %1
%else ; mmxext
pshufw %1, %1, 0x0
%endif
+%endif
%endmacro
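SPLATB_REG now takes an optional fourth argument: on SSSE3 a single pshufb against an all-zero index register broadcasts the low byte, replacing the punpcklbw/pshuf chains used on older ISAs. A scalar model of pshufb's per-byte semantics shows why an all-zero index is a broadcast:

    #include <stdint.h>

    /* Per-byte pshufb model (one 16-byte lane): a set sign bit in the
     * index zeroes the output byte, otherwise the low 4 bits select a
     * source byte. With an all-zero index vector, every output byte
     * becomes src[0]: a one-instruction byte broadcast. */
    static void pshufb_model(uint8_t dst[16], const uint8_t src[16],
                             const uint8_t idx[16])
    {
        for (int i = 0; i < 16; i++)
            dst[i] = (idx[i] & 0x80) ? 0 : src[idx[i] & 0x0f];
    }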
%macro SIMPLE_LOOPFILTER 3
@@ -1247,7 +1387,10 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3
%if mmsize == 8 ; mmx/mmxext
mov r3, 2
%endif
- SPLATB_REG m7, r2, %1 ; splat "flim" into register
+%ifidn %1, ssse3
+ pxor m0, m0
+%endif
+ SPLATB_REG m7, r2, %1, m0 ; splat "flim" into register
; set up indexes to address 4 rows
mov r2, r1
@@ -1393,6 +1536,8 @@ SIMPLE_LOOPFILTER mmxext, h, 6
INIT_XMM
SIMPLE_LOOPFILTER sse2, v, 3
SIMPLE_LOOPFILTER sse2, h, 6
+SIMPLE_LOOPFILTER ssse3, v, 3
+SIMPLE_LOOPFILTER ssse3, h, 6
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
@@ -1428,11 +1573,15 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
%define stack_reg hev_thr_reg
%endif
+%ifidn %1, ssse3
+ pxor m7, m7
+%endif
+
%ifndef m8 ; mmx/mmxext or sse2 on x86-32
; splat function arguments
- SPLATB_REG m0, E_reg, %1 ; E
- SPLATB_REG m1, I_reg, %1 ; I
- SPLATB_REG m2, hev_thr_reg, %1 ; hev_thresh
+ SPLATB_REG m0, E_reg, %1, m7 ; E
+ SPLATB_REG m1, I_reg, %1, m7 ; I
+ SPLATB_REG m2, hev_thr_reg, %1, m7 ; hev_thresh
; align stack
mov stack_reg, rsp ; backup stack pointer
@@ -1465,9 +1614,9 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
%define q0backup m8
; splat function arguments
- SPLATB_REG flim_E, E_reg, %1 ; E
- SPLATB_REG flim_I, I_reg, %1 ; I
- SPLATB_REG hev_thr, hev_thr_reg, %1 ; hev_thresh
+ SPLATB_REG flim_E, E_reg, %1, m7 ; E
+ SPLATB_REG flim_I, I_reg, %1, m7 ; I
+ SPLATB_REG hev_thr, hev_thr_reg, %1, m7 ; hev_thresh
%endif
%if mmsize == 8 && %4 == 16 ; mmx/mmxext
@@ -1879,15 +2028,15 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5
%endmacro
INIT_MMX
-INNER_LOOPFILTER mmx, v, 6, 16, 8
-INNER_LOOPFILTER mmx, h, 6, 16, 8
-INNER_LOOPFILTER mmxext, v, 6, 16, 8
-INNER_LOOPFILTER mmxext, h, 6, 16, 8
+INNER_LOOPFILTER mmx, v, 6, 16, 0
+INNER_LOOPFILTER mmx, h, 6, 16, 0
+INNER_LOOPFILTER mmxext, v, 6, 16, 0
+INNER_LOOPFILTER mmxext, h, 6, 16, 0
-INNER_LOOPFILTER mmx, v, 6, 8, 8
-INNER_LOOPFILTER mmx, h, 6, 8, 8
-INNER_LOOPFILTER mmxext, v, 6, 8, 8
-INNER_LOOPFILTER mmxext, h, 6, 8, 8
+INNER_LOOPFILTER mmx, v, 6, 8, 0
+INNER_LOOPFILTER mmx, h, 6, 8, 0
+INNER_LOOPFILTER mmxext, v, 6, 8, 0
+INNER_LOOPFILTER mmxext, h, 6, 8, 0
INIT_XMM
INNER_LOOPFILTER sse2, v, 5, 16, 13
@@ -1899,6 +2048,15 @@ INNER_LOOPFILTER sse2, h, 6, 16, 13
INNER_LOOPFILTER sse2, v, 6, 8, 13
INNER_LOOPFILTER sse2, h, 6, 8, 13
+INNER_LOOPFILTER ssse3, v, 5, 16, 13
+%ifdef m8
+INNER_LOOPFILTER ssse3, h, 5, 16, 13
+%else
+INNER_LOOPFILTER ssse3, h, 6, 16, 13
+%endif
+INNER_LOOPFILTER ssse3, v, 6, 8, 13
+INNER_LOOPFILTER ssse3, h, 6, 8, 13
+
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
; int flimE, int flimI, int hev_thr);
@@ -1906,10 +2064,24 @@ INNER_LOOPFILTER sse2, h, 6, 8, 13
; write 4 or 8 words in the mmx/xmm registers as 8 lines
; 1 and 2 are the registers to write, this can be the same (for SSE2)
+; for pre-SSE4:
; 3 is a general-purpose register that we will clobber
+; for SSE4:
+; 3 is a pointer to the destination's 5th line
; 4 is a pointer to the destination's 4th line
-; 5 is -stride and +stride
-%macro WRITE_8W 6
+; 5/6 are -stride and +stride
+; 7 is the optimization name (e.g. sse4)
+%macro WRITE_8W 7
+%ifidn %7, sse4
+ pextrw [%4+%5*4], %1, 0
+ pextrw [%3+%5*4], %1, 1
+ pextrw [%4+%5*2], %1, 2
+ pextrw [%4+%5 ], %1, 3
+ pextrw [%4 ], %1, 4
+ pextrw [%3 ], %1, 5
+ pextrw [%3+%6 ], %1, 6
+ pextrw [%3+%6*2], %1, 7
+%else
movd %3, %1
%if mmsize == 8
punpckhdq %1, %1
@@ -1948,6 +2120,7 @@ INNER_LOOPFILTER sse2, h, 6, 8, 13
%if mmsize == 8
add %4, %5
%endif
+%endif
%endmacro
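Whichever path is taken, WRITE_8W's job is the same: eight filtered 16-bit results, one per row, stored two bytes at a time down a column; SSE4 just gets to pextrw each word straight to memory instead of bouncing it through a general-purpose register. A scalar model (illustrative name):

    #include <stdint.h>
    #include <string.h>

    /* Eight 16-bit pixel pairs, one per row, written down a column. */
    static void write_8_words(uint8_t *dst, int stride, const uint16_t w[8])
    {
        for (int row = 0; row < 8; row++)
            memcpy(dst + row * stride, &w[row], sizeof(w[row]));
    }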
%macro MBEDGE_LOOPFILTER 5
@@ -1979,11 +2152,15 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%define stack_reg hev_thr_reg
%endif
+%ifidn %1, ssse3
+ pxor m7, m7
+%endif
+
%ifndef m8 ; mmx/mmxext or sse2 on x86-32
; splat function arguments
- SPLATB_REG m0, E_reg, %1 ; E
- SPLATB_REG m1, I_reg, %1 ; I
- SPLATB_REG m2, hev_thr_reg, %1 ; hev_thresh
+ SPLATB_REG m0, E_reg, %1, m7 ; E
+ SPLATB_REG m1, I_reg, %1, m7 ; I
+ SPLATB_REG m2, hev_thr_reg, %1, m7 ; hev_thresh
; align stack
mov stack_reg, rsp ; backup stack pointer
@@ -2023,9 +2200,9 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%define lim_sign m15
; splat function arguments
- SPLATB_REG flim_E, E_reg, %1 ; E
- SPLATB_REG flim_I, I_reg, %1 ; I
- SPLATB_REG hev_thr, hev_thr_reg, %1 ; hev_thresh
+ SPLATB_REG flim_E, E_reg, %1, m7 ; E
+ SPLATB_REG flim_I, I_reg, %1, m7 ; I
+ SPLATB_REG hev_thr, hev_thr_reg, %1, m7 ; hev_thresh
%endif
%if mmsize == 8 && %4 == 16 ; mmx/mmxext
@@ -2479,14 +2656,17 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%if mmsize == 8 ; mmx/mmxext (h)
WRITE_4x2D 1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg
add dst_reg, 4
- WRITE_8W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg
+ WRITE_8W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg, %4
%else ; sse2 (h)
lea dst8_reg, [dst8_reg+mstride_reg+1]
WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4
- add dst_reg, 4
- add dst8_reg, 4
- WRITE_8W m5, m5, dst2_reg, dst_reg, mstride_reg, stride_reg
- WRITE_8W m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg
+ lea dst_reg, [dst2_reg+mstride_reg+4]
+ lea dst8_reg, [dst8_reg+mstride_reg+4]
+ WRITE_8W m5, m5, dst2_reg, dst_reg, mstride_reg, stride_reg, %2
+%ifidn %2, sse4
+ lea dst_reg, [dst8_reg+ stride_reg]
+%endif
+ WRITE_8W m6, m6, dst2_reg, dst8_reg, mstride_reg, stride_reg, %2
%endif
%endif
@@ -2516,15 +2696,15 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5
%endmacro
INIT_MMX
-MBEDGE_LOOPFILTER mmx, v, 6, 16, 8
-MBEDGE_LOOPFILTER mmx, h, 6, 16, 8
-MBEDGE_LOOPFILTER mmxext, v, 6, 16, 8
-MBEDGE_LOOPFILTER mmxext, h, 6, 16, 8
+MBEDGE_LOOPFILTER mmx, v, 6, 16, 0
+MBEDGE_LOOPFILTER mmx, h, 6, 16, 0
+MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0
+MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0
-MBEDGE_LOOPFILTER mmx, v, 6, 8, 8
-MBEDGE_LOOPFILTER mmx, h, 6, 8, 8
-MBEDGE_LOOPFILTER mmxext, v, 6, 8, 8
-MBEDGE_LOOPFILTER mmxext, h, 6, 8, 8
+MBEDGE_LOOPFILTER mmx, v, 6, 8, 0
+MBEDGE_LOOPFILTER mmx, h, 6, 8, 0
+MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0
+MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0
INIT_XMM
MBEDGE_LOOPFILTER sse2, v, 5, 16, 16
@@ -2535,3 +2715,19 @@ MBEDGE_LOOPFILTER sse2, h, 6, 16, 16
%endif
MBEDGE_LOOPFILTER sse2, v, 6, 8, 16
MBEDGE_LOOPFILTER sse2, h, 6, 8, 16
+
+MBEDGE_LOOPFILTER ssse3, v, 5, 16, 16
+%ifdef m8
+MBEDGE_LOOPFILTER ssse3, h, 5, 16, 16
+%else
+MBEDGE_LOOPFILTER ssse3, h, 6, 16, 16
+%endif
+MBEDGE_LOOPFILTER ssse3, v, 6, 8, 16
+MBEDGE_LOOPFILTER ssse3, h, 6, 8, 16
+
+%ifdef m8
+MBEDGE_LOOPFILTER sse4, h, 5, 16, 16
+%else
+MBEDGE_LOOPFILTER sse4, h, 6, 16, 16
+%endif
+MBEDGE_LOOPFILTER sse4, h, 6, 8, 16
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/x86inc.asm b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/x86inc.asm
index 410b11bb2..b7d17742e 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/x86inc.asm
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavcodec/x86/x86inc.asm
@@ -271,13 +271,21 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
ASSERT %2 >= %1
%assign regs_used %2
ASSERT regs_used <= 7
- %assign xmm_regs_used %3
- ASSERT xmm_regs_used <= 16
%if regs_used > 4
push r4
push r5
%assign stack_offset stack_offset+16
%endif
+ WIN64_SPILL_XMM %3
+ LOAD_IF_USED 4, %1
+ LOAD_IF_USED 5, %1
+ LOAD_IF_USED 6, %1
+ DEFINE_ARGS %4
+%endmacro
+
+%macro WIN64_SPILL_XMM 1
+ %assign xmm_regs_used %1
+ ASSERT xmm_regs_used <= 16
%if xmm_regs_used > 6
sub rsp, (xmm_regs_used-6)*16+16
%assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
@@ -287,13 +295,9 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
%endrep
%endif
- LOAD_IF_USED 4, %1
- LOAD_IF_USED 5, %1
- LOAD_IF_USED 6, %1
- DEFINE_ARGS %4
%endmacro
-%macro RESTORE_XMM_INTERNAL 1
+%macro WIN64_RESTORE_XMM_INTERNAL 1
%if xmm_regs_used > 6
%assign %%i xmm_regs_used
%rep (xmm_regs_used-6)
@@ -304,14 +308,14 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
%endif
%endmacro
-%macro RESTORE_XMM 1
- RESTORE_XMM_INTERNAL %1
+%macro WIN64_RESTORE_XMM 1
+ WIN64_RESTORE_XMM_INTERNAL %1
%assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
%assign xmm_regs_used 0
%endmacro
%macro RET 0
- RESTORE_XMM_INTERNAL rsp
+ WIN64_RESTORE_XMM_INTERNAL rsp
%if regs_used > 4
pop r5
pop r4
@@ -428,6 +432,13 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%endif ;======================================================================
+%ifndef WIN64
+%macro WIN64_SPILL_XMM 1
+%endmacro
+%macro WIN64_RESTORE_XMM 1
+%endmacro
+%endif
+
;=============================================================================
@@ -494,7 +505,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%define mova movq
%define movu movq
%define movh movd
- %define movnt movntq
+ %define movnta movntq
%assign %%i 0
%rep 8
CAT_XDEFINE m, %%i, mm %+ %%i
@@ -518,7 +529,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%define mova movdqa
%define movu movdqu
%define movh movq
- %define movnt movntdq
+ %define movnta movntdq
%assign %%i 0
%rep num_mmregs
CAT_XDEFINE m, %%i, xmm %+ %%i
diff --git a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/mathematics.h b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/mathematics.h
index 2fbb99018..1fb848b38 100644
--- a/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/mathematics.h
+++ b/src/filters/transform/MPCVideoDec/ffmpeg/libavutil/mathematics.h
@@ -38,6 +38,9 @@
#ifndef M_LOG2_10
#define M_LOG2_10 3.32192809488736234787 /* log_2 10 */
#endif
+#ifndef M_PHI
+#define M_PHI 1.61803398874989484820 /* phi / golden ratio */
+#endif
#ifndef M_PI
#define M_PI 3.14159265358979323846 /* pi */
#endif
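The new M_PHI constant is the golden ratio, the positive root of x^2 = x + 1, equal to (1 + sqrt(5)) / 2. A quick self-check:

    #include <assert.h>
    #include <math.h>

    #define M_PHI 1.61803398874989484820 /* phi / golden ratio */

    int main(void)
    {
        /* phi solves x*x == x + 1 and equals (1 + sqrt(5)) / 2 */
        assert(fabs(M_PHI * M_PHI - (M_PHI + 1.0)) < 1e-12);
        assert(fabs(M_PHI - (1.0 + sqrt(5.0)) / 2.0) < 1e-12);
        return 0;
    }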