diff options
author | Henrik Gramner <gramner@twoorioles.com> | 2019-05-18 02:36:35 +0300 |
---|---|---|
committer | Henrik Gramner <henrik@gramner.com> | 2019-05-18 17:29:47 +0300 |
commit | f64fdae55128ff1c2204f578ee26b6d577862b26 (patch) | |
tree | a457c8d48d298c9da2dc1a3f000375386e5a52c9 | |
parent | 3d6479cee8170cbcc1b6c3cea7338e86b3594683 (diff) |
Optimize obmc blend
The last 1/4 of the mask is always zero, so we can skip some
calculations that don't change the output.
-rw-r--r-- | src/mc_tmpl.c | 26 | ||||
-rw-r--r-- | src/recon_tmpl.c | 2 | ||||
-rw-r--r-- | src/tables.c | 2 | ||||
-rw-r--r-- | src/x86/mc.asm | 5 | ||||
-rw-r--r-- | src/x86/mc_ssse3.asm | 36 |
5 files changed, 41 insertions, 30 deletions
diff --git a/src/mc_tmpl.c b/src/mc_tmpl.c index 9fe3515..9744f24 100644 --- a/src/mc_tmpl.c +++ b/src/mc_tmpl.c @@ -635,10 +635,8 @@ static void mask_c(pixel *dst, const ptrdiff_t dst_stride, } #define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6) -static NOINLINE void -blend_internal_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, - const int w, int h, const uint8_t *mask, - const ptrdiff_t mask_stride) +static void blend_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, + const int w, int h, const uint8_t *mask) { do { for (int x = 0; x < w; x++) { @@ -646,26 +644,28 @@ blend_internal_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, } dst += PXSTRIDE(dst_stride); tmp += w; - mask += mask_stride; + mask += w; } while (--h); } -static void blend_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, - const int w, const int h, const uint8_t *mask) -{ - blend_internal_c(dst, dst_stride, tmp, w, h, mask, w); -} - static void blend_v_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, - const int w, const int h) + const int w, int h) { - blend_internal_c(dst, dst_stride, tmp, w, h, &dav1d_obmc_masks[w], 0); + const uint8_t *const mask = &dav1d_obmc_masks[w]; + do { + for (int x = 0; x < (w * 3) >> 2; x++) { + dst[x] = blend_px(dst[x], tmp[x], mask[x]); + } + dst += PXSTRIDE(dst_stride); + tmp += w; + } while (--h); } static void blend_h_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp, const int w, int h) { const uint8_t *mask = &dav1d_obmc_masks[h]; + h = (h * 3) >> 2; do { const int m = *mask++; for (int x = 0; x < w; x++) { diff --git a/src/recon_tmpl.c b/src/recon_tmpl.c index 17b8406..0cadaad 100644 --- a/src/recon_tmpl.c +++ b/src/recon_tmpl.c @@ -644,7 +644,7 @@ static int obmc(Dav1dTileContext *const t, if (a_r->ref[0] > 0) { const int ow4 = iclip(a_b_dim[0], 2, b_dim[0]); const int oh4 = imin(b_dim[1], 16) >> 1; - res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, oh4, + res = mc(t, lap, 
NULL, ow4 * h_mul * sizeof(pixel), ow4, (oh4 * 3 + 3) >> 2, t->bx + x, t->by, pl, a_r->mv[0], &f->refp[a_r->ref[0] - 1], a_r->ref[0] - 1, dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]); diff --git a/src/tables.c b/src/tables.c index 4117a24..31d288c 100644 --- a/src/tables.c +++ b/src/tables.c @@ -861,7 +861,7 @@ const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 16) = { } }; -const uint8_t ALIGN(dav1d_obmc_masks[64], 32) = { +const uint8_t dav1d_obmc_masks[64] = { /* Unused */ 0, 0, /* 2 */ diff --git a/src/x86/mc.asm b/src/x86/mc.asm index 26130ee..9c9422e 100644 --- a/src/x86/mc.asm +++ b/src/x86/mc.asm @@ -3837,7 +3837,10 @@ cglobal blend_h, 4, 7, 6, dst, ds, tmp, w, h, mask movsxd wq, dword [r5+wq*4] vpbroadcastd m5, [base+pw_512] add wq, r5 - lea maskq, [base+obmc_masks+hq*4] + lea maskq, [base+obmc_masks+hq*2] + lea hd, [hq*3] + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] neg hq jmp wq .w2: diff --git a/src/x86/mc_ssse3.asm b/src/x86/mc_ssse3.asm index abca6cf..82dcb75 100644 --- a/src/x86/mc_ssse3.asm +++ b/src/x86/mc_ssse3.asm @@ -44,8 +44,8 @@ obmc_masks: db 0, 0, 0, 0 db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20 db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9 db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2 - db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0 +blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 @@ -53,7 +53,6 @@ subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11 bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 -blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 
pb_64: times 16 db 64 pw_8: times 8 dw 8 @@ -3773,7 +3772,7 @@ cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask jg .w32 RET -cglobal blend_v, 3, 6, 8, dst, ds, tmp, w, h, mask +cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask %define base r5-blend_v_ssse3_table LEA r5, blend_v_ssse3_table tzcnt wd, wm @@ -3833,8 +3832,7 @@ cglobal blend_v, 3, 6, 8, dst, ds, tmp, w, h, mask mova m2, [tmpq]; b BLEND_64M m1, m2, m3, m3 movq [dstq+dsq*0], m0 - punpckhqdq m0, m0 - movq [dstq+dsq*1], m0 + movhps [dstq+dsq*1], m0 add tmpq, 16 lea dstq, [dstq+dsq*2] sub hd, 2 @@ -3855,24 +3853,31 @@ cglobal blend_v, 3, 6, 8, dst, ds, tmp, w, h, mask jg .w16_loop RET .w32: - mova m3, [maskq+64 ] ; obmc_masks_32[0] (64-m[0]) - mova m4, [maskq+80 ] ; obmc_masks_32[1] (64-m[1]) - mova m6, [maskq+96 ] ; obmc_masks_32[2] (64-m[2]) - mova m7, [maskq+112] ; obmc_masks_32[3] (64-m[3]) +%if WIN64 + mova [rsp+8], xmm6 +%endif + mova m3, [maskq+64] ; obmc_masks_32[0] (64-m[0]) + mova m4, [maskq+80] ; obmc_masks_32[1] (64-m[1]) + mova m6, [maskq+96] ; obmc_masks_32[2] (64-m[2]) ; 16 mask blend is provided for 64 pixels .w32_loop: mova m1, [dstq+16*0] ; a mova m2, [tmpq+16*0] ; b BLEND_64M m1, m2, m3, m4 + movq m1, [dstq+16*1] ; a + punpcklbw m1, [tmpq+16*1] ; b + pmaddubsw m1, m6 + pmulhrsw m1, m5 + packuswb m1, m1 mova [dstq+16*0], m0 - mova m1, [dstq+16*1] ; a - mova m2, [tmpq+16*1] ; b - BLEND_64M m1, m2, m6, m7 - mova [dstq+16*1], m0 + movq [dstq+16*1], m1 add tmpq, 32 add dstq, dsq dec hd jg .w32_loop +%if WIN64 + mova xmm6, [rsp+8] +%endif RET cglobal blend_h, 3, 7, 6, dst, ds, tmp, w, h, mask @@ -3890,7 +3895,10 @@ cglobal blend_h, 3, 7, 6, dst, ds, tmp, w, h, mask movsxd wq, dword [t0+wq*4] mova m5, [base+pw_512] add wq, t0 - lea maskq, [base+obmc_masks+hq*4] + lea maskq, [base+obmc_masks+hq*2] + lea hd, [hq*3] + shr hd, 2 ; h * 3/4 + lea maskq, [maskq+hq*2] neg hq jmp wq .w2: |