diff options
author | Victorien Le Couviour--Tuffet <victorien@videolan.org> | 2021-07-26 18:02:02 +0300 |
---|---|---|
committer | Victorien Le Couviour--Tuffet <victorien@videolan.org> | 2021-09-03 19:06:31 +0300 |
commit | 753eef833bdd8ff1585c5c858cafeca8fefbb16e (patch) | |
tree | f32e9802b5b466d6ddded5a8e398f3d56d79e377 /src/decode.c | |
parent | 7b433e077298d0f4faf8da6d6eb5774e29bffa54 (diff) |
Merge the 3 threading models into a single one
Merges the 3 threading parameters into a single `--threads=` argument.
Frame threading can still be controlled via the `--framedelay=` argument.
Internally, the threading model is now a global thread/task pool design.
Co-authored-by: Ronald S. Bultje <rsbultje@gmail.com>
Diffstat (limited to 'src/decode.c')
-rw-r--r-- | src/decode.c | 779 |
1 files changed, 425 insertions, 354 deletions
diff --git a/src/decode.c b/src/decode.c index 1caea76..747ee7b 100644 --- a/src/decode.c +++ b/src/decode.c @@ -73,7 +73,7 @@ static void init_quant_tables(const Dav1dSequenceHeader *const seq_hdr, } } -static int read_mv_component_diff(Dav1dTileContext *const t, +static int read_mv_component_diff(Dav1dTaskContext *const t, CdfMvComponent *const mv_comp, const int have_fp) { @@ -117,7 +117,7 @@ static int read_mv_component_diff(Dav1dTileContext *const t, return sign ? -diff : diff; } -static void read_mv_residual(Dav1dTileContext *const t, mv *const ref_mv, +static void read_mv_residual(Dav1dTaskContext *const t, mv *const ref_mv, CdfMvContext *const mv_cdf, const int have_fp) { switch (dav1d_msac_decode_symbol_adapt4(&t->ts->msac, t->ts->cdf.mv.joint, @@ -138,7 +138,7 @@ static void read_mv_residual(Dav1dTileContext *const t, mv *const ref_mv, } } -static void read_tx_tree(Dav1dTileContext *const t, +static void read_tx_tree(Dav1dTaskContext *const t, const enum RectTxfmSize from, const int depth, uint16_t *const masks, const int x_off, const int y_off) @@ -216,7 +216,7 @@ static int neg_deinterleave(int diff, int ref, int max) { } } -static void find_matching_ref(const Dav1dTileContext *const t, +static void find_matching_ref(const Dav1dTaskContext *const t, const enum EdgeFlags intra_edge_flags, const int bw4, const int bh4, const int w4, const int h4, @@ -289,7 +289,7 @@ static void find_matching_ref(const Dav1dTileContext *const t, #undef matches } -static void derive_warpmv(const Dav1dTileContext *const t, +static void derive_warpmv(const Dav1dTaskContext *const t, const int bw4, const int bh4, const uint64_t masks[2], const union mv mv, Dav1dWarpedMotionParams *const wmp) @@ -370,7 +370,7 @@ static inline int findoddzero(const uint8_t *buf, int len) { return 0; } -static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b, +static void read_pal_plane(Dav1dTaskContext *const t, Av1Block *const b, const int pl, const int sz_ctx, const int bx4, const int by4) { @@ -425,7 +425,7 @@ static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b, const int n_used_cache = i; // parse new entries - uint16_t *const pal = f->frame_thread.pass ? + uint16_t *const pal = t->frame_thread.pass ? f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + ((t->bx >> 1) + (t->by & 1))][pl] : t->scratch.pal[pl]; if (i < pal_sz) { @@ -473,7 +473,7 @@ static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b, } } -static void read_pal_uv(Dav1dTileContext *const t, Av1Block *const b, +static void read_pal_uv(Dav1dTaskContext *const t, Av1Block *const b, const int sz_ctx, const int bx4, const int by4) { read_pal_plane(t, b, 1, sz_ctx, bx4, by4); @@ -481,7 +481,7 @@ static void read_pal_uv(Dav1dTileContext *const t, Av1Block *const b, // V pal coding Dav1dTileState *const ts = t->ts; const Dav1dFrameContext *const f = t->f; - uint16_t *const pal = f->frame_thread.pass ? + uint16_t *const pal = t->frame_thread.pass ? f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + ((t->bx >> 1) + (t->by & 1))][2] : t->scratch.pal[2]; if (dav1d_msac_decode_bool_equi(&ts->msac)) { @@ -575,7 +575,7 @@ static void order_palette(const uint8_t *pal_idx, const ptrdiff_t stride, } } -static void read_pal_indices(Dav1dTileContext *const t, +static void read_pal_indices(Dav1dTaskContext *const t, uint8_t *const pal_idx, const Av1Block *const b, const int pl, const int w4, const int h4, @@ -612,7 +612,7 @@ static void read_pal_indices(Dav1dTileContext *const t, } } -static void read_vartx_tree(Dav1dTileContext *const t, +static void read_vartx_tree(Dav1dTaskContext *const t, Av1Block *const b, const enum BlockSize bs, const int bx4, const int by4) { @@ -674,11 +674,6 @@ static inline unsigned get_prev_frame_segid(const Dav1dFrameContext *const f, const ptrdiff_t stride) { assert(f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE); - if (dav1d_thread_picture_wait(&f->refp[f->frame_hdr->primary_ref_frame], - (by + h4) * 4, PLANE_TYPE_BLOCK)) - { - return 8; - } unsigned seg_id = 8; ref_seg_map += by * stride + bx; @@ -693,7 +688,7 @@ static inline unsigned get_prev_frame_segid(const Dav1dFrameContext *const f, } static inline void splat_oneref_mv(const Dav1dContext *const c, - Dav1dTileContext *const t, + Dav1dTaskContext *const t, const enum BlockSize bs, const Av1Block *const b, const int bw4, const int bh4) @@ -709,7 +704,7 @@ static inline void splat_oneref_mv(const Dav1dContext *const c, } static inline void splat_intrabc_mv(const Dav1dContext *const c, - Dav1dTileContext *const t, + Dav1dTaskContext *const t, const enum BlockSize bs, const Av1Block *const b, const int bw4, const int bh4) @@ -724,7 +719,7 @@ static inline void splat_intrabc_mv(const Dav1dContext *const c, } static inline void splat_tworef_mv(const Dav1dContext *const c, - Dav1dTileContext *const t, + Dav1dTaskContext *const t, const enum BlockSize bs, const Av1Block *const b, const int bw4, const int bh4) @@ -741,7 +736,7 @@ static inline void splat_tworef_mv(const Dav1dContext *const c, } static inline void splat_intraref(const Dav1dContext *const c, - Dav1dTileContext *const t, + Dav1dTaskContext *const t, const enum BlockSize bs, const int bw4, const int bh4) { @@ -754,7 +749,98 @@ static inline void splat_intraref(const Dav1dContext *const c, c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4); } -static int decode_b(Dav1dTileContext *const t, +static inline void mc_lowest_px(int *const dst, const int by4, const int bh4, + const int mvy, const int ss_ver, + const struct ScalableMotionParams *const smp) +{ + const int v_mul = 4 >> ss_ver; + if (!smp->scale) { + const int my = mvy >> (3 + ss_ver), dy = mvy & (15 >> !ss_ver); + *dst = imax(*dst, (by4 + bh4) * v_mul + my + 4 * !!dy); + } else { + int y = (by4 * v_mul << 4) + mvy * (1 << !ss_ver); + const int64_t tmp = (int64_t)(y) * smp->scale + (smp->scale - 0x4000) * 8; + y = apply_sign64((int)((llabs(tmp) + 128) >> 8), tmp) + 32; + const int bottom = ((y + (bh4 * v_mul - 1) * smp->step) >> 10) + 1 + 4; + *dst = imax(*dst, bottom); + } +} + +static inline void affine_lowest_px(Dav1dTaskContext *const t, + int *const dst, const int is_chroma, + const uint8_t *const b_dim, + const Dav1dWarpedMotionParams *const wmp) +{ + const Dav1dFrameContext *const f = t->f; + const int ss_ver = is_chroma && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = is_chroma && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; + const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver; + assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7)); + const int32_t *const mat = wmp->matrix; + const int y = b_dim[1] * v_mul - 8; // lowest y + + const int src_y = t->by * 4 + ((y + 4) << ss_ver); + const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1]; + // check left- and right-most blocks + for (int x = 0; x < b_dim[0] * h_mul; x += imax(8, b_dim[0] * h_mul - 8)) { + // calculate transformation relative to center of 8x8 block in + // luma pixel units + const int src_x = t->bx * 4 + ((x + 4) << ss_hor); + const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver; + const int dy = (int) (mvy >> 16) - 4; + *dst = imax(*dst, dy + 4 + 8); + } +} + +static void obmc_lowest_px(Dav1dTaskContext *const t, + int (*const dst)[2], const int is_chroma, + const uint8_t *const b_dim, + const int bx4, const int by4, const int w4, const int h4) +{ + assert(!(t->bx & 1) && !(t->by & 1)); + const Dav1dFrameContext *const f = t->f; + /*const*/ refmvs_block **r = &t->rt.r[(t->by & 31) + 5]; + const int ss_ver = is_chroma && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = is_chroma && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; + const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver; + + if (t->by > t->ts->tiling.row_start && + (!is_chroma || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16)) + { + for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) { + // only odd blocks are considered for overlap handling, hence +1 + const refmvs_block *const a_r = &r[-1][t->bx + x + 1]; + const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs]; + + if (a_r->ref.ref[0] > 0) { + const int oh4 = imin(b_dim[1], 16) >> 1; + mc_lowest_px(&dst[a_r->ref.ref[0] - 1][is_chroma], t->by, + (oh4 * 3 + 3) >> 2, a_r->mv.mv[0].y, ss_ver, + &f->svc[a_r->ref.ref[0] - 1][1]); + i++; + } + x += imax(a_b_dim[0], 2); + } + } + + if (t->bx > t->ts->tiling.col_start) + for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) { + // only odd blocks are considered for overlap handling, hence +1 + const refmvs_block *const l_r = &r[y + 1][t->bx - 1]; + const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs]; + + if (l_r->ref.ref[0] > 0) { + const int oh4 = iclip(l_b_dim[1], 2, b_dim[1]); + mc_lowest_px(&dst[l_r->ref.ref[0] - 1][is_chroma], + t->by + y, oh4, l_r->mv.mv[0].y, ss_ver, + &f->svc[l_r->ref.ref[0] - 1][1]); + i++; + } + y += imax(l_b_dim[1], 2); + } +} + +static int decode_b(Dav1dTaskContext *const t, const enum BlockLevel bl, const enum BlockSize bs, const enum BlockPartition bp, @@ -762,7 +848,7 @@ static int decode_b(Dav1dTileContext *const t, { Dav1dTileState *const ts = t->ts; const Dav1dFrameContext *const f = t->f; - Av1Block b_mem, *const b = f->frame_thread.pass ? + Av1Block b_mem, *const b = t->frame_thread.pass ? &f->frame_thread.b[t->by * f->b4_stride + t->bx] : &b_mem; const uint8_t *const b_dim = dav1d_block_dimensions[bs]; const int bx4 = t->bx & 31, by4 = t->by & 31; @@ -778,7 +864,7 @@ static int decode_b(Dav1dTileContext *const t, (bw4 > ss_hor || t->bx & 1) && (bh4 > ss_ver || t->by & 1); - if (f->frame_thread.pass == 2) { + if (t->frame_thread.pass == 2) { if (b->intra) { f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b); @@ -1236,10 +1322,11 @@ static int decode_b(Dav1dTileContext *const t, if (b->pal_sz[0]) { uint8_t *pal_idx; - if (f->frame_thread.pass) { - assert(ts->frame_thread.pal_idx); - pal_idx = ts->frame_thread.pal_idx; - ts->frame_thread.pal_idx += bw4 * bh4 * 16; + if (t->frame_thread.pass) { + const int p = t->frame_thread.pass & 1; + assert(ts->frame_thread[p].pal_idx); + pal_idx = ts->frame_thread[p].pal_idx; + ts->frame_thread[p].pal_idx += bw4 * bh4 * 16; } else pal_idx = t->scratch.pal_idx; read_pal_indices(t, pal_idx, b, 0, w4, h4, bw4, bh4); @@ -1249,10 +1336,11 @@ static int decode_b(Dav1dTileContext *const t, if (has_chroma && b->pal_sz[1]) { uint8_t *pal_idx; - if (f->frame_thread.pass) { - assert(ts->frame_thread.pal_idx); - pal_idx = ts->frame_thread.pal_idx; - ts->frame_thread.pal_idx += cbw4 * cbh4 * 16; + if (t->frame_thread.pass) { + const int p = t->frame_thread.pass & 1; + assert(ts->frame_thread[p].pal_idx); + pal_idx = ts->frame_thread[p].pal_idx; + ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 16; } else pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16]; read_pal_indices(t, pal_idx, b, 1, cw4, ch4, cbw4, cbh4); @@ -1284,7 +1372,7 @@ static int decode_b(Dav1dTileContext *const t, } // reconstruction - if (f->frame_thread.pass == 1) { + if (t->frame_thread.pass == 1) { f->bd_fn.read_coef_blocks(t, bs, b); } else { f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b); @@ -1328,7 +1416,7 @@ static int decode_b(Dav1dTileContext *const t, case_set(bw4, a->, 0, bx4); #undef set_ctx if (b->pal_sz[0]) { - uint16_t *const pal = f->frame_thread.pass ? + uint16_t *const pal = t->frame_thread.pass ? f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + ((t->bx >> 1) + (t->by & 1))][0] : t->scratch.pal[0]; for (int x = 0; x < bw4; x++) @@ -1343,7 +1431,7 @@ static int decode_b(Dav1dTileContext *const t, case_set(cbw4, a->, 0, cbx4); #undef set_ctx if (b->pal_sz[1]) { - const uint16_t (*const pal)[8] = f->frame_thread.pass ? + const uint16_t (*const pal)[8] = t->frame_thread.pass ? f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) + ((t->bx >> 1) + (t->by & 1))] : t->scratch.pal; @@ -1446,7 +1534,7 @@ static int decode_b(Dav1dTileContext *const t, read_vartx_tree(t, b, bs, bx4, by4); // reconstruction - if (f->frame_thread.pass == 1) { + if (t->frame_thread.pass == 1) { f->bd_fn.read_coef_blocks(t, bs, b); b->filter2d = FILTER_2D_BILINEAR; } else { @@ -1910,7 +1998,7 @@ static int decode_b(Dav1dTileContext *const t, signabs(t->warpmv.u.p.delta), b->mv[0].y, b->mv[0].x); #undef signabs - if (f->frame_thread.pass) { + if (t->frame_thread.pass) { if (t->warpmv.type == DAV1D_WM_TYPE_AFFINE) { b->matrix[0] = t->warpmv.matrix[2] - 0x10000; b->matrix[1] = t->warpmv.matrix[3]; @@ -1970,7 +2058,7 @@ static int decode_b(Dav1dTileContext *const t, read_vartx_tree(t, b, bs, bx4, by4); // reconstruction - if (f->frame_thread.pass == 1) { + if (t->frame_thread.pass == 1) { f->bd_fn.read_coef_blocks(t, bs, b); } else { if (f->bd_fn.recon_b_inter(t, bs, b)) return -1; @@ -2052,6 +2140,110 @@ static int decode_b(Dav1dTileContext *const t, } } + if (t->frame_thread.pass == 1 && !b->intra && IS_INTER_OR_SWITCH(f->frame_hdr)) { + const int sby = (t->by - ts->tiling.row_start) >> f->sb_shift; + int (*const lowest_px)[2] = ts->lowest_pixel[sby]; + + // keep track of motion vectors for each reference + if (b->comp_type == COMP_INTER_NONE) { + // y + if (imin(bw4, bh4) > 1 && + ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) || + (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION))) + { + affine_lowest_px(t, &lowest_px[b->ref[0]][0], 0, b_dim, + b->motion_mode == MM_WARP ? &t->warpmv : + &f->frame_hdr->gmv[b->ref[0]]); + } else { + mc_lowest_px(&lowest_px[b->ref[0]][0], t->by, bh4, b->mv[0].y, + 0, &f->svc[b->ref[0]][1]); + if (b->motion_mode == MM_OBMC) { + obmc_lowest_px(t, lowest_px, 0, b_dim, bx4, by4, w4, h4); + } + } + + // uv + if (has_chroma) { + // sub8x8 derivation + int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver; + refmvs_block *const *r; + if (is_sub8x8) { + assert(ss_hor == 1); + r = &t->rt.r[(t->by & 31) + 5]; + if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0; + if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0; + if (bw4 == 1 && bh4 == ss_ver) + is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0; + } + + // chroma prediction + if (is_sub8x8) { + assert(ss_hor == 1); + if (bw4 == 1 && bh4 == ss_ver) { + const refmvs_block *const rr = &r[-1][t->bx - 1]; + mc_lowest_px(&lowest_px[rr->ref.ref[0] - 1][1], + t->by - 1, bh4, rr->mv.mv[0].y, ss_ver, + &f->svc[rr->ref.ref[0] - 1][1]); + } + if (bw4 == 1) { + const refmvs_block *const rr = &r[0][t->bx - 1]; + mc_lowest_px(&lowest_px[rr->ref.ref[0] - 1][1], + t->by, bh4, rr->mv.mv[0].y, ss_ver, + &f->svc[rr->ref.ref[0] - 1][1]); + } + if (bh4 == ss_ver) { + const refmvs_block *const rr = &r[-1][t->bx]; + mc_lowest_px(&lowest_px[rr->ref.ref[0] - 1][1], + t->by - 1, bh4, rr->mv.mv[0].y, ss_ver, + &f->svc[rr->ref.ref[0] - 1][1]); + } + mc_lowest_px(&lowest_px[b->ref[0]][1], t->by, bh4, + b->mv[0].y, ss_ver, &f->svc[b->ref[0]][1]); + } else { + if (imin(cbw4, cbh4) > 1 && + ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) || + (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION))) + { + affine_lowest_px(t, &lowest_px[b->ref[0]][1], 1, b_dim, + b->motion_mode == MM_WARP ? &t->warpmv : + &f->frame_hdr->gmv[b->ref[0]]); + } else { + mc_lowest_px(&lowest_px[b->ref[0]][1], + t->by & ~ss_ver, bh4 << (bh4 == ss_ver), + b->mv[0].y, ss_ver, &f->svc[b->ref[0]][1]); + if (b->motion_mode == MM_OBMC) { + obmc_lowest_px(t, lowest_px, 1, b_dim, bx4, by4, w4, h4); + } + } + } + } + } else { + // y + for (int i = 0; i < 2; i++) { + if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) { + affine_lowest_px(t, &lowest_px[b->ref[i]][0], 0, b_dim, + &f->frame_hdr->gmv[b->ref[i]]); + } else { + mc_lowest_px(&lowest_px[b->ref[i]][0], t->by, bh4, + b->mv[i].y, 0, &f->svc[b->ref[i]][1]); + } + } + + // uv + if (has_chroma) for (int i = 0; i < 2; i++) { + if (b->inter_mode == GLOBALMV_GLOBALMV && + imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]]) + { + affine_lowest_px(t, &lowest_px[b->ref[i]][1], 1, b_dim, + &f->frame_hdr->gmv[b->ref[i]]); + } else { + mc_lowest_px(&lowest_px[b->ref[i]][1], t->by, bh4, + b->mv[i].y, ss_ver, &f->svc[b->ref[i]][1]); + } + } + } + } + return 0; } @@ -2059,7 +2251,7 @@ static int decode_b(Dav1dTileContext *const t, #include <sanitizer/msan_interface.h> -static int checked_decode_b(Dav1dTileContext *const t, +static int checked_decode_b(Dav1dTaskContext *const t, const enum BlockLevel bl, const enum BlockSize bs, const enum BlockPartition bp, @@ -2068,7 +2260,7 @@ static int checked_decode_b(Dav1dTileContext *const t, const Dav1dFrameContext *const f = t->f; const int err = decode_b(t, bl, bs, bp, intra_edge_flags); - if (err == 0 && !(f->frame_thread.pass & 1)) { + if (err == 0 && !(t->frame_thread.pass & 1)) { const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; const uint8_t *const b_dim = dav1d_block_dimensions[bs]; @@ -2108,7 +2300,7 @@ static int checked_decode_b(Dav1dTileContext *const t, #endif /* defined(__has_feature) */ -static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl, +static int decode_sb(Dav1dTaskContext *const t, const enum BlockLevel bl, const EdgeNode *const node) { const Dav1dFrameContext *const f = t->f; @@ -2124,7 +2316,7 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl, uint16_t *pc; enum BlockPartition bp; int ctx, bx8, by8; - if (f->frame_thread.pass != 2) { + if (t->frame_thread.pass != 2) { if (0 && bl == BL_64X64) printf("poc=%d,y=%d,x=%d,bl=%d,r=%d\n", f->frame_hdr->frame_offset, t->by, t->bx, bl, t->ts->msac.rng); @@ -2135,7 +2327,7 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl, } if (have_h_split && have_v_split) { - if (f->frame_thread.pass == 2) { + if (t->frame_thread.pass == 2) { const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx]; bp = b->bl == bl ? b->bp : PARTITION_SPLIT; } else { @@ -2307,7 +2499,7 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl, } } else if (have_h_split) { unsigned is_split; - if (f->frame_thread.pass == 2) { + if (t->frame_thread.pass == 2) { const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx]; is_split = b->bl != bl; } else { @@ -2336,7 +2528,7 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl, } else { assert(have_v_split); unsigned is_split; - if (f->frame_thread.pass == 2) { + if (t->frame_thread.pass == 2) { const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx]; is_split = b->bl != bl; } else { @@ -2366,7 +2558,7 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl, } } - if (f->frame_thread.pass != 2 && (bp != PARTITION_SPLIT || bl == BL_8X8)) { + if (t->frame_thread.pass != 2 && (bp != PARTITION_SPLIT || bl == BL_8X8)) { #define set_ctx(type, dir, diridx, off, mul, rep_macro) \ rep_macro(type, t->a->partition, bx8, mul * dav1d_al_part_ctx[0][bl][bp]); \ rep_macro(type, t->l.partition, by8, mul * dav1d_al_part_ctx[1][bl][bp]) @@ -2426,14 +2618,15 @@ static void setup_tile(Dav1dTileState *const ts, const int sb_shift = f->sb_shift; const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout]; - ts->frame_thread.pal_idx = f->frame_thread.pal_idx ? - &f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4] : - NULL; - - ts->frame_thread.cf = f->frame_thread.cf ? - (uint8_t*)f->frame_thread.cf + - (((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) : - NULL; + for (int p = 0; p < 2; p++) { + ts->frame_thread[p].pal_idx = f->frame_thread.pal_idx ? + &f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4] : + NULL; + ts->frame_thread[p].cf = f->frame_thread.cf ? + (uint8_t*)f->frame_thread.cf + + (((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) : + NULL; + } dav1d_cdf_thread_copy(&ts->cdf, &f->in_cdf); ts->last_qidx = f->frame_hdr->quant.yac; @@ -2488,11 +2681,13 @@ static void setup_tile(Dav1dTileState *const ts, ts->lr_ref[p]->sgr_weights[1] = 31; } - if (f->n_tc > 1) - atomic_init(&ts->progress, row_sb_start); + if (f->c->n_tc > 1) { + for (int p = 0; p < 2; p++) + atomic_init(&ts->progress[p], row_sb_start); + } } -static void read_restoration_info(Dav1dTileContext *const t, +static void read_restoration_info(Dav1dTaskContext *const t, Av1RestorationUnit *const lr, const int p, const enum Dav1dRestorationType frame_type) { @@ -2558,7 +2753,7 @@ static void read_restoration_info(Dav1dTileContext *const t, } } -int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) { +int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) { const Dav1dFrameContext *const f = t->f; const enum BlockLevel root_bl = f->seq_hdr->sb128 ? BL_128X128 : BL_64X64; Dav1dTileState *const ts = t->ts; @@ -2572,13 +2767,22 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) { dav1d_refmvs_tile_sbrow_init(&t->rt, &f->rf, ts->tiling.col_start, ts->tiling.col_end, ts->tiling.row_start, ts->tiling.row_end, t->by >> f->sb_shift, - ts->tiling.row); + ts->tiling.row, t->frame_thread.pass); + } + + if (IS_INTER_OR_SWITCH(f->frame_hdr) && c->n_fc > 1) { + const int sby = (t->by - ts->tiling.row_start) >> f->sb_shift; + int (*const lowest_px)[2] = ts->lowest_pixel[sby]; + for (int n = 0; n < 7; n++) + for (int m = 0; m < 2; m++) + lowest_px[n][m] = INT_MIN; } - reset_context(&t->l, IS_KEY_OR_INTRA(f->frame_hdr), f->frame_thread.pass); - if (f->frame_thread.pass == 2) { + reset_context(&t->l, IS_KEY_OR_INTRA(f->frame_hdr), t->frame_thread.pass); + if (t->frame_thread.pass == 2) { + const int off_2pass = c->n_tc > 1 ? f->sb128w * f->frame_hdr->tiling.rows : 0; for (t->bx = ts->tiling.col_start, - t->a = f->a + col_sb128_start + tile_row * f->sb128w; + t->a = f->a + off_2pass + col_sb128_start + tile_row * f->sb128w; t->bx < ts->tiling.col_end; t->bx += sb_step) { if (atomic_load_explicit(c->flush, memory_order_acquire)) @@ -2595,13 +2799,7 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) { // error out on symbol decoder overread if (ts->msac.cnt < -15) return 1; - if (f->n_tc > 1 && f->frame_hdr->use_ref_frame_mvs) { - if (c->n_fc > 1) for (int n = 0; n < 7; n++) - if (dav1d_thread_picture_wait(&f->refp[n], 4 * (t->by + sb_step), - PLANE_TYPE_BLOCK)) - { - return 1; - } + if (f->c->n_tc > 1 && f->frame_hdr->use_ref_frame_mvs) { dav1d_refmvs_load_tmvs(&f->rf, ts->tiling.row, ts->tiling.col_start >> 1, ts->tiling.col_end >> 1, t->by >> 1, (t->by + sb_step) >> 1); @@ -2686,14 +2884,14 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) { } } - if (f->seq_hdr->ref_frame_mvs && f->n_tc > 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) { + if (f->seq_hdr->ref_frame_mvs && f->c->n_tc > 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) { dav1d_refmvs_save_tmvs(&t->rt, ts->tiling.col_start >> 1, ts->tiling.col_end >> 1, t->by >> 1, (t->by + sb_step) >> 1); } // backup pre-loopfilter pixels for intra prediction of the next sbrow - if (f->frame_thread.pass != 1) + if (t->frame_thread.pass != 1) f->bd_fn.backup_ipred_edge(t); // backup t->a/l.tx_lpf_y/uv at tile boundaries to use them to "fix" @@ -2709,50 +2907,24 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) { return 0; } -int dav1d_decode_frame(Dav1dFrameContext *const f) { +int dav1d_decode_frame_init(Dav1dFrameContext *const f) { const Dav1dContext *const c = f->c; int retval = DAV1D_ERR(ENOMEM); - if (f->n_tc > 1) { - const int titsati_sz = f->frame_hdr->tiling.cols * f->sbh; - if (titsati_sz != f->tile_thread.titsati_sz) { - freep(&f->tile_thread.task_idx_to_sby_and_tile_idx); - f->tile_thread.task_idx_to_sby_and_tile_idx = - malloc(sizeof(*f->tile_thread.task_idx_to_sby_and_tile_idx) * - titsati_sz); - if (!f->tile_thread.task_idx_to_sby_and_tile_idx) { - f->tile_thread.titsati_sz = 0; - goto error; - } - f->tile_thread.titsati_sz = titsati_sz; - } - if (f->tile_thread.titsati_init[0] != f->frame_hdr->tiling.cols || - f->tile_thread.titsati_init[1] != f->frame_hdr->tiling.rows || - memcmp(f->frame_hdr->tiling.row_start_sb, f->tile_thread.titsati_index_rows, - sizeof(*f->tile_thread.titsati_index_rows) * - (f->frame_hdr->tiling.rows + 1))) - { - for (int tile_row = 0, task_idx = 0; - tile_row < f->frame_hdr->tiling.rows; tile_row++) - { - for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row]; - sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]; sby++) - { - for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; - tile_col++, task_idx++) - { - f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][0] = sby; - f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][1] = - tile_row * f->frame_hdr->tiling.cols + tile_col; - } - } - } - f->tile_thread.titsati_init[0] = f->frame_hdr->tiling.cols; - f->tile_thread.titsati_init[1] = f->frame_hdr->tiling.rows; - memcpy(f->tile_thread.titsati_index_rows, f->frame_hdr->tiling.row_start_sb, - sizeof(*f->tile_thread.titsati_index_rows) * - (f->frame_hdr->tiling.rows + 1)); + if (f->sbh > f->lf.start_of_tile_row_sz) { + free(f->lf.start_of_tile_row); + f->lf.start_of_tile_row = malloc(f->sbh * sizeof(uint8_t)); + if (!f->lf.start_of_tile_row) { + f->lf.start_of_tile_row_sz = 0; + goto error; } + f->lf.start_of_tile_row_sz = f->sbh; + } + int sby = 0; + for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) { + f->lf.start_of_tile_row[sby++] = tile_row; + while (sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]) + f->lf.start_of_tile_row[sby++] = 0; } const int n_ts = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows; @@ -2762,45 +2934,21 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { f->frame_thread.tile_start_off = malloc(sizeof(*f->frame_thread.tile_start_off) * n_ts); if (!f->frame_thread.tile_start_off) { - for (int n = 0; n < f->n_ts; n++) { - Dav1dTileState *const ts = &f->ts[n]; - pthread_cond_destroy(&ts->tile_thread.cond); - pthread_mutex_destroy(&ts->tile_thread.lock); - } f->n_ts = 0; goto error; } } Dav1dTileState *ts_new = dav1d_alloc_aligned(sizeof(*f->ts) * n_ts, 32); if (!ts_new) goto error; - if (n_ts > f->n_ts) { - if (f->ts) { - memcpy(ts_new, f->ts, sizeof(*f->ts) * f->n_ts); - dav1d_free_aligned(f->ts); - } - f->ts = ts_new; - for (int n = f->n_ts; n < n_ts; f->n_ts = ++n) { - Dav1dTileState *const ts = &f->ts[n]; - if (pthread_mutex_init(&ts->tile_thread.lock, NULL)) goto error; - if (pthread_cond_init(&ts->tile_thread.cond, NULL)) { - pthread_mutex_destroy(&ts->tile_thread.lock); - goto error; - } - } - } else { - for (int n = n_ts; n < f->n_ts; n++) { - Dav1dTileState *const ts = &f->ts[n]; - pthread_cond_destroy(&ts->tile_thread.cond); - pthread_mutex_destroy(&ts->tile_thread.lock); - } - memcpy(ts_new, f->ts, sizeof(*f->ts) * n_ts); + if (f->ts) { + memcpy(ts_new, f->ts, sizeof(*f->ts) * imin(n_ts, f->n_ts)); dav1d_free_aligned(f->ts); - f->n_ts = n_ts; - f->ts = ts_new; } + f->n_ts = n_ts; + f->ts = ts_new; } - const int a_sz = f->sb128w * f->frame_hdr->tiling.rows; + const int a_sz = f->sb128w * f->frame_hdr->tiling.rows * (1 + (c->n_fc > 1 && c->n_tc > 1)); if (a_sz != f->a_sz) { freep(&f->a); f->a = malloc(sizeof(*f->a) * a_sz); @@ -2827,6 +2975,29 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { } } + const int lowest_pixel_mem_sz = f->frame_hdr->tiling.cols * f->sbh; + if (lowest_pixel_mem_sz != f->tile_thread.lowest_pixel_mem_sz) { + free(f->tile_thread.lowest_pixel_mem); + f->tile_thread.lowest_pixel_mem = + malloc(lowest_pixel_mem_sz * sizeof(*f->tile_thread.lowest_pixel_mem)); + if (!f->tile_thread.lowest_pixel_mem) { + f->tile_thread.lowest_pixel_mem_sz = 0; + goto error; + } + f->tile_thread.lowest_pixel_mem_sz = lowest_pixel_mem_sz; + } + int (*lowest_pixel_ptr)[7][2] = f->tile_thread.lowest_pixel_mem; + for (int tile_row = 0, tile_row_base = 0; tile_row < f->frame_hdr->tiling.rows; + tile_row++, tile_row_base += f->frame_hdr->tiling.cols) + { + const int tile_row_sb_h = f->frame_hdr->tiling.row_start_sb[tile_row + 1] - + f->frame_hdr->tiling.row_start_sb[tile_row]; + for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) { + f->ts[tile_row_base + tile_col].lowest_pixel = lowest_pixel_ptr; + lowest_pixel_ptr += tile_row_sb_h; + } + } + const int cf_sz = (num_sb128 * size_mul[0]) << hbd; if (cf_sz != f->frame_thread.cf_sz) { dav1d_freep_aligned(&f->frame_thread.cf); @@ -2914,7 +3085,7 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { const int lr_line_sz = ((f->sr_cur.p.p.w + 31) & ~31) << hbd; if (lr_line_sz != f->lf.lr_line_sz) { dav1d_freep_aligned(&f->lf.lr_lpf_line[0]); - const int num_lines = c->n_pfc > 1 ? f->sbh * (4 << f->seq_hdr->sb128) : 12; + const int num_lines = c->n_tc > 1 ? f->sbh * (4 << f->seq_hdr->sb128) : 12; // lr simd may overread the input, so slightly over-allocate the lpf buffer uint8_t *lr_ptr = dav1d_alloc_aligned(lr_line_sz * num_lines * 3 + 64, 32); if (!lr_ptr) { @@ -3009,15 +3180,11 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) { const int ret = dav1d_refmvs_init_frame(&f->rf, f->seq_hdr, f->frame_hdr, - f->refpoc, f->mvs, f->refrefpoc, f->ref_mvs, f->n_tc); + f->refpoc, f->mvs, f->refrefpoc, f->ref_mvs, + f->c->n_tc, f->c->n_fc); if (ret < 0) goto error; } - // create post-filtering tasks - if (c->n_pfc > 1) - if (dav1d_task_create_filter_sbrow(f)) - goto error; - retval = DAV1D_ERR(EINVAL); // setup dequant tables @@ -3081,14 +3248,13 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { f->lf.sr_p[0] = f->sr_cur.p.data[0]; f->lf.sr_p[1] = f->sr_cur.p.data[has_chroma ? 1 : 0]; f->lf.sr_p[2] = f->sr_cur.p.data[has_chroma ? 2 : 0]; - f->lf.tile_row = 1; - dav1d_cdf_thread_wait(&f->in_cdf); if (f->frame_hdr->refresh_context) dav1d_cdf_thread_copy(f->out_cdf.data.cdf, &f->in_cdf); // parse individual tiles per tile group - int update_set = 0, tile_row = 0, tile_col = 0; + int tile_row = 0, tile_col = 0; + f->task_thread.update_set = 0; for (int i = 0; i < f->n_tile_data; i++) { const uint8_t *data = f->tile[i].data.data; size_t size = f->tile[i].data.sz; @@ -3115,213 +3281,76 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { tile_row++; } if (j == f->frame_hdr->tiling.update && f->frame_hdr->refresh_context) - update_set = 1; + f->task_thread.update_set = 1; data += tile_sz; size -= tile_sz; } } - // 2-pass decoding: - // - enabled for frame-threading, so that one frame can do symbol parsing - // as another (or multiple) are doing reconstruction. One advantage here - // is that although reconstruction is limited by reference availability, - // symbol parsing is not. Therefore, symbol parsing can effectively use - // row and col tile threading, but reconstruction only col tile threading; - // - pass 0 means no 2-pass; - // - pass 1 means symbol parsing only; - // - pass 2 means reconstruction and loop filtering. + if (c->n_tc > 1) { + const int uses_2pass = c->n_fc > 1; + for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows * (1 + uses_2pass); n++) + reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr), + uses_2pass ? 1 + (n >= f->sb128w * f->frame_hdr->tiling.rows) : 0); + } - const int uses_2pass = c->n_fc > 1 && f->frame_hdr->refresh_context; - for (f->frame_thread.pass = uses_2pass; - f->frame_thread.pass <= 2 * uses_2pass; f->frame_thread.pass++) - { - const enum PlaneType progress_plane_type = - f->frame_thread.pass == 0 ? PLANE_TYPE_ALL : - f->frame_thread.pass == 1 ? PLANE_TYPE_BLOCK : PLANE_TYPE_Y; - - for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows; n++) - reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr), f->frame_thread.pass); - - if (f->n_tc == 1 || (c->n_pfc > 1 && f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows == 1)) { - Dav1dTileContext *const t = f->tc; - - // no tile threading - we explicitly interleave tile/sbrow decoding - // and post-filtering, so that the full process runs in-line, so - // that frame threading is still possible - for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) { - const int sbh_end = - imin(f->frame_hdr->tiling.row_start_sb[tile_row + 1], f->sbh); - for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row]; - sby < sbh_end; sby++) - { - t->by = sby << (4 + f->seq_hdr->sb128); - const int by_end = (t->by + f->sb_step) >> 1; - if (f->frame_thread.pass <= 1 && f->frame_hdr->use_ref_frame_mvs) { - if (c->n_fc > 1) for (int n = 0; n < 7; n++) - if (dav1d_thread_picture_wait(&f->refp[n], - 4 * (t->by + f->sb_step), - PLANE_TYPE_BLOCK)) - { - goto error; - } - dav1d_refmvs_load_tmvs(&f->rf, tile_row, - 0, f->bw >> 1, t->by >> 1, by_end); - } - for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) { - t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col]; - if (dav1d_decode_tile_sbrow(t)) goto error; - } - if (f->seq_hdr->ref_frame_mvs && f->frame_thread.pass <= 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) { - dav1d_refmvs_save_tmvs(&t->rt, 0, f->bw >> 1, t->by >> 1, by_end); - } + retval = 0; +error: + return retval; +} - // loopfilter + cdef + restoration - if (f->frame_thread.pass != 1) { - if (c->n_pfc == 1) - f->bd_fn.filter_sbrow(f, sby); - else { - pthread_mutex_lock(&f->lf.thread.pftd->lock); - if (f->lf.thread.npf != 0 && !f->lf.thread.done) { - Dav1dTask *const t = &f->lf.thread.tasks[sby * f->lf.thread.npf]; - t->start = 1; - if (t->status == DAV1D_TASK_READY) - dav1d_task_schedule(f->lf.thread.pftd, t); - } - pthread_mutex_unlock(&f->lf.thread.pftd->lock); - } - } - if (c->n_pfc == 1 || f->frame_thread.pass == 1 || f->lf.thread.npf == 0) - dav1d_thread_picture_signal(&f->sr_cur, - (sby + 1) * f->sb_step * 4, - progress_plane_type); - } - } - } else { - // signal available tasks to worker threads - int num_tasks; - - pthread_mutex_lock(&f->tile_thread.lock); - assert(!f->tile_thread.tasks_left); - if (f->frame_thread.pass == 1 || f->n_tc >= f->frame_hdr->tiling.cols) { - // we can (or in fact, if >, we need to) do full tile decoding. - // loopfilter happens below - num_tasks = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows; - } else { - // we need to interleave sbrow decoding for all tile cols in a - // tile row, since otherwise subsequent threads will be blocked - // waiting for the post-filter to complete - num_tasks = f->sbh * f->frame_hdr->tiling.cols; - } - f->tile_thread.num_tasks = f->tile_thread.tasks_left = num_tasks; - pthread_cond_broadcast(&f->tile_thread.cond); - pthread_mutex_unlock(&f->tile_thread.lock); - - for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) { - for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row]; - sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]; sby++) - { - for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; - tile_col++) - { - int progress; - Dav1dTileState *const ts = - &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col]; - - if ((progress = atomic_load(&ts->progress)) <= sby) { - pthread_mutex_lock(&ts->tile_thread.lock); - while ((progress = atomic_load(&ts->progress)) <= sby) - pthread_cond_wait(&ts->tile_thread.cond, - &ts->tile_thread.lock); - pthread_mutex_unlock(&ts->tile_thread.lock); - } - if (progress == TILE_ERROR) { - dav1d_thread_picture_signal(&f->sr_cur, FRAME_ERROR, - PLANE_TYPE_ALL); - const uint64_t all_mask = ~0ULL >> (64 - f->n_tc); - pthread_mutex_lock(&f->tile_thread.lock); - while (f->tile_thread.available != all_mask) - pthread_cond_wait(&f->tile_thread.icond, &f->tile_thread.lock); - pthread_mutex_unlock(&f->tile_thread.lock); - goto error; - } - } +int dav1d_decode_frame_main(Dav1dFrameContext *const f) { + const Dav1dContext *const c = f->c; + int retval = DAV1D_ERR(ENOMEM); - // loopfilter + cdef + restoration - if (f->frame_thread.pass != 1) { - if (c->n_pfc == 1) - f->bd_fn.filter_sbrow(f, sby); - else { - pthread_mutex_lock(&f->lf.thread.pftd->lock); - if (f->lf.thread.npf != 0 && !f->lf.thread.done) { - Dav1dTask *const t = &f->lf.thread.tasks[sby * f->lf.thread.npf]; - t->start = 1; - if (t->status == DAV1D_TASK_READY) - dav1d_task_schedule(f->lf.thread.pftd, t); - } - pthread_mutex_unlock(&f->lf.thread.pftd->lock); - } - } - if (c->n_pfc == 1 || f->frame_thread.pass == 1 || f->lf.thread.npf == 0) - dav1d_thread_picture_signal(&f->sr_cur, - (sby + 1) * f->sb_step * 4, - progress_plane_type); - } - } + assert(f->c->n_tc == 1); - const uint64_t all_mask = ~0ULL >> (64 - f->n_tc); - pthread_mutex_lock(&f->tile_thread.lock); - while (f->tile_thread.available != all_mask) - pthread_cond_wait(&f->tile_thread.icond, &f->tile_thread.lock); - pthread_mutex_unlock(&f->tile_thread.lock); - } + Dav1dTaskContext *const t = &c->tc[f - c->fc]; + t->f = f; + t->frame_thread.pass = 0; - if (f->frame_thread.pass <= 1 && f->frame_hdr->refresh_context) { - // cdf update - if (update_set) - dav1d_cdf_thread_update(f->frame_hdr, f->out_cdf.data.cdf, - &f->ts[f->frame_hdr->tiling.update].cdf); - dav1d_cdf_thread_signal(&f->out_cdf); - } - if (f->frame_thread.pass == 1) { - assert(c->n_fc > 1); - for (int tile_idx = 0; - tile_idx < f->frame_hdr->tiling.rows * f->frame_hdr->tiling.cols; - tile_idx++) - { - Dav1dTileState *const ts = &f->ts[tile_idx]; - const size_t tile_start_off = - (size_t) f->frame_thread.tile_start_off[tile_idx]; - ts->frame_thread.pal_idx = f->frame_thread.pal_idx ? - &f->frame_thread.pal_idx[tile_start_off * size_mul[1] / 4] : - NULL; - ts->frame_thread.cf = f->frame_thread.cf ? - (uint8_t*)f->frame_thread.cf + - ((tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) : - NULL; - if (f->n_tc > 0) { - const unsigned row_sb_start = - f->frame_hdr->tiling.row_start_sb[ts->tiling.row]; - atomic_init(&ts->progress, row_sb_start); - } + for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows; n++) + reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr), 0); + + // no threading - we explicitly interleave tile/sbrow decoding + // and post-filtering, so that the full process runs in-line + for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) { + const int sbh_end = + imin(f->frame_hdr->tiling.row_start_sb[tile_row + 1], f->sbh); + for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row]; + sby < sbh_end; sby++) + { + t->by = sby << (4 + f->seq_hdr->sb128); + const int by_end = (t->by + f->sb_step) >> 1; + if (f->frame_hdr->use_ref_frame_mvs) { + dav1d_refmvs_load_tmvs(&f->rf, tile_row, + 0, f->bw >> 1, t->by >> 1, by_end); + } + for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) { + t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col]; + if (dav1d_decode_tile_sbrow(t)) goto error; + } + if (IS_INTER_OR_SWITCH(f->frame_hdr)) { + dav1d_refmvs_save_tmvs(&t->rt, 0, f->bw >> 1, t->by >> 1, by_end); } + + // loopfilter + cdef + restoration + f->bd_fn.filter_sbrow(f, sby); } } retval = 0; error: - if (c->n_pfc > 1) { - pthread_mutex_lock(&f->lf.thread.pftd->lock); - if (!f->lf.thread.done) { - if (retval != 0) { - f->lf.thread.done = -1; - pthread_cond_signal(&f->lf.thread.pftd->cond); - } - pthread_cond_wait(&f->lf.thread.cond, &f->lf.thread.pftd->lock); - } - pthread_mutex_unlock(&f->lf.thread.pftd->lock); + return retval; +} + +void dav1d_decode_frame_exit(Dav1dFrameContext *const f, const int retval) { + const Dav1dContext *const c = f->c; + + if (c->n_fc > 1 && retval && f->frame_thread.cf) { + memset(f->frame_thread.cf, 0, + (size_t)f->frame_thread.cf_sz * 128 * 128 / 2); } - dav1d_thread_picture_signal(&f->sr_cur, retval == 0 ? UINT_MAX : FRAME_ERROR, - PLANE_TYPE_ALL); for (int i = 0; i < 7; i++) { if (f->refp[i].p.data[0]) dav1d_thread_picture_unref(&f->refp[i]); @@ -3331,8 +3360,9 @@ error: dav1d_picture_unref_internal(&f->cur); dav1d_thread_picture_unref(&f->sr_cur); dav1d_cdf_thread_unref(&f->in_cdf); - if (f->frame_hdr->refresh_context) { - dav1d_cdf_thread_signal(&f->out_cdf); + if (f->frame_hdr && f->frame_hdr->refresh_context) { + if (f->out_cdf.progress) + atomic_store(f->out_cdf.progress, retval == 0 ? 1 : TILE_ERROR); dav1d_cdf_thread_unref(&f->out_cdf); } dav1d_ref_dec(&f->cur_segmap_ref); @@ -3343,8 +3373,39 @@ error: for (int i = 0; i < f->n_tile_data; i++) dav1d_data_unref_internal(&f->tile[i].data); +} - return retval; +int dav1d_decode_frame(Dav1dFrameContext *const f) { + assert(f->c->n_fc == 1); + // if n_tc > 1 (but n_fc == 1), we could run init/exit in the task + // threads also. Not sure it makes a measurable difference. + int res = dav1d_decode_frame_init(f); + // wait until all threads have completed + if (!res) { + if (f->c->n_tc > 1) { + pthread_mutex_lock(&f->task_thread.ttd->lock); + res = dav1d_task_create_tile_sbrow(f, 0, 1); + if (!res) { + const int uses_2pass = f->c->n_fc > 1; + while (!f->task_thread.done[0] || + (uses_2pass && !f->task_thread.done[1]) || + f->task_thread.task_counter > 0) + { + pthread_cond_wait(&f->task_thread.cond, + &f->task_thread.ttd->lock); + } + } + pthread_mutex_unlock(&f->task_thread.ttd->lock); + } else { + res = dav1d_decode_frame_main(f); + if (!res && f->frame_hdr->refresh_context && f->task_thread.update_set) { + dav1d_cdf_thread_update(f->frame_hdr, f->out_cdf.data.cdf, + &f->ts[f->frame_hdr->tiling.update].cdf); + } + } + } + dav1d_decode_frame_exit(f, res); + return res; } static int get_upscale_x0(const int in_w, const int out_w, const int step) { @@ -3360,17 +3421,23 @@ int dav1d_submit_frame(Dav1dContext *const c) { // wait for c->out_delayed[next] and move into c->out if visible Dav1dThreadPicture *out_delayed; if (c->n_fc > 1) { + pthread_mutex_lock(&c->task_thread.lock); const unsigned next = c->frame_thread.next++; if (c->frame_thread.next == c->n_fc) c->frame_thread.next = 0; f = &c->fc[next]; - pthread_mutex_lock(&f->frame_thread.td.lock); while (f->n_tile_data > 0) - pthread_cond_wait(&f->frame_thread.td.cond, - &f->frame_thread.td.lock); + pthread_cond_wait(&f->task_thread.cond, + &f->task_thread.ttd->lock); out_delayed = &c->frame_thread.out_delayed[next]; if (out_delayed->p.data[0]) { + if (atomic_load(&c->task_thread.first) + 1 < c->n_fc) + atomic_fetch_add(&c->task_thread.first, 1); + else + atomic_store(&c->task_thread.first, 0); + if (c->task_thread.cur < c->n_fc) + c->task_thread.cur--; const unsigned progress = atomic_load_explicit(&out_delayed->progress[1], memory_order_relaxed); if (out_delayed->visible && progress != FRAME_ERROR) { @@ -3429,7 +3496,8 @@ int dav1d_submit_frame(Dav1dContext *const c) { f->bd_fn.recon_b_inter = dav1d_recon_b_inter_##bd##bpc; \ f->bd_fn.recon_b_intra = dav1d_recon_b_intra_##bd##bpc; \ f->bd_fn.filter_sbrow = dav1d_filter_sbrow_##bd##bpc; \ - f->bd_fn.filter_sbrow_deblock = dav1d_filter_sbrow_deblock_##bd##bpc; \ + f->bd_fn.filter_sbrow_deblock_cols = dav1d_filter_sbrow_deblock_cols_##bd##bpc; \ + f->bd_fn.filter_sbrow_deblock_rows = dav1d_filter_sbrow_deblock_rows_##bd##bpc; \ f->bd_fn.filter_sbrow_cdef = dav1d_filter_sbrow_cdef_##bd##bpc; \ f->bd_fn.filter_sbrow_resize = dav1d_filter_sbrow_resize_##bd##bpc; \ f->bd_fn.filter_sbrow_lr = dav1d_filter_sbrow_lr_##bd##bpc; \ @@ -3484,7 +3552,7 @@ int dav1d_submit_frame(Dav1dContext *const c) { f->svc[i][0].step = (f->svc[i][0].scale + 8) >> 4; f->svc[i][1].step = (f->svc[i][1].scale + 8) >> 4; } else { - f->svc[i][0].scale = 0; + f->svc[i][0].scale = f->svc[i][1].scale = 0; } f->gmv_warp_allowed[i] = f->frame_hdr->gmv[i].type > DAV1D_WM_TYPE_TRANSLATION && !f->frame_hdr->force_integer_mv && @@ -3501,7 +3569,7 @@ int dav1d_submit_frame(Dav1dContext *const c) { dav1d_cdf_thread_ref(&f->in_cdf, &c->cdf[pri_ref]); } if (f->frame_hdr->refresh_context) { - res = dav1d_cdf_thread_alloc(c, &f->out_cdf, c->n_fc > 1 ? &f->frame_thread.td : NULL); + res = dav1d_cdf_thread_alloc(c, &f->out_cdf, c->n_fc > 1); if (res < 0) goto error; } @@ -3565,6 +3633,11 @@ int dav1d_submit_frame(Dav1dContext *const c) { f->sbh = (f->bh + f->sb_step - 1) >> f->sb_shift; f->b4_stride = (f->bw + 31) & ~31; f->bitdepth_max = (1 << f->cur.p.bpc) - 1; + f->task_thread.error = 0; + const int uses_2pass = c->n_fc > 1; + const int cols = f->frame_hdr->tiling.cols; + const int rows = f->frame_hdr->tiling.rows; + f->task_thread.task_counter = (cols * rows + f->sbh) << uses_2pass; // ref_mvs if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) { @@ -3707,8 +3780,8 @@ int dav1d_submit_frame(Dav1dContext *const c) { return res; } } else { - pthread_cond_signal(&f->frame_thread.td.cond); - pthread_mutex_unlock(&f->frame_thread.td.lock); + dav1d_task_frame_init(f); + pthread_mutex_unlock(&c->task_thread.lock); } return 0; @@ -3735,10 +3808,8 @@ error: dav1d_data_unref_internal(&f->tile[i].data); f->n_tile_data = 0; - if (c->n_fc > 1) { - pthread_cond_signal(&f->frame_thread.td.cond); - pthread_mutex_unlock(&f->frame_thread.td.lock); - } + if (c->n_fc > 1) + pthread_mutex_unlock(&f->task_thread.ttd->lock); return res; } |