Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/videolan/dav1d.git - dav1d: a small and fast AV1 decoder (mirror of the VideoLAN repository).
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorVictorien Le Couviour--Tuffet <victorien@videolan.org>2021-07-26 18:02:02 +0300
committerVictorien Le Couviour--Tuffet <victorien@videolan.org>2021-09-03 19:06:31 +0300
commit753eef833bdd8ff1585c5c858cafeca8fefbb16e (patch)
treef32e9802b5b466d6ddded5a8e398f3d56d79e377 /src/decode.c
parent7b433e077298d0f4faf8da6d6eb5774e29bffa54 (diff)
Merge the 3 threading models into a single one
Merges the 3 threading parameters into a single `--threads=` argument. Frame threading can still be controlled via the `--framedelay=` argument. Internally, the threading model is now a global thread/task pool design. Co-authored-by: Ronald S. Bultje <rsbultje@gmail.com>
Diffstat (limited to 'src/decode.c')
-rw-r--r--src/decode.c779
1 file changed, 425 insertions, 354 deletions
diff --git a/src/decode.c b/src/decode.c
index 1caea76..747ee7b 100644
--- a/src/decode.c
+++ b/src/decode.c
@@ -73,7 +73,7 @@ static void init_quant_tables(const Dav1dSequenceHeader *const seq_hdr,
}
}
-static int read_mv_component_diff(Dav1dTileContext *const t,
+static int read_mv_component_diff(Dav1dTaskContext *const t,
CdfMvComponent *const mv_comp,
const int have_fp)
{
@@ -117,7 +117,7 @@ static int read_mv_component_diff(Dav1dTileContext *const t,
return sign ? -diff : diff;
}
-static void read_mv_residual(Dav1dTileContext *const t, mv *const ref_mv,
+static void read_mv_residual(Dav1dTaskContext *const t, mv *const ref_mv,
CdfMvContext *const mv_cdf, const int have_fp)
{
switch (dav1d_msac_decode_symbol_adapt4(&t->ts->msac, t->ts->cdf.mv.joint,
@@ -138,7 +138,7 @@ static void read_mv_residual(Dav1dTileContext *const t, mv *const ref_mv,
}
}
-static void read_tx_tree(Dav1dTileContext *const t,
+static void read_tx_tree(Dav1dTaskContext *const t,
const enum RectTxfmSize from,
const int depth, uint16_t *const masks,
const int x_off, const int y_off)
@@ -216,7 +216,7 @@ static int neg_deinterleave(int diff, int ref, int max) {
}
}
-static void find_matching_ref(const Dav1dTileContext *const t,
+static void find_matching_ref(const Dav1dTaskContext *const t,
const enum EdgeFlags intra_edge_flags,
const int bw4, const int bh4,
const int w4, const int h4,
@@ -289,7 +289,7 @@ static void find_matching_ref(const Dav1dTileContext *const t,
#undef matches
}
-static void derive_warpmv(const Dav1dTileContext *const t,
+static void derive_warpmv(const Dav1dTaskContext *const t,
const int bw4, const int bh4,
const uint64_t masks[2], const union mv mv,
Dav1dWarpedMotionParams *const wmp)
@@ -370,7 +370,7 @@ static inline int findoddzero(const uint8_t *buf, int len) {
return 0;
}
-static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b,
+static void read_pal_plane(Dav1dTaskContext *const t, Av1Block *const b,
const int pl, const int sz_ctx,
const int bx4, const int by4)
{
@@ -425,7 +425,7 @@ static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b,
const int n_used_cache = i;
// parse new entries
- uint16_t *const pal = f->frame_thread.pass ?
+ uint16_t *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][pl] : t->scratch.pal[pl];
if (i < pal_sz) {
@@ -473,7 +473,7 @@ static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b,
}
}
-static void read_pal_uv(Dav1dTileContext *const t, Av1Block *const b,
+static void read_pal_uv(Dav1dTaskContext *const t, Av1Block *const b,
const int sz_ctx, const int bx4, const int by4)
{
read_pal_plane(t, b, 1, sz_ctx, bx4, by4);
@@ -481,7 +481,7 @@ static void read_pal_uv(Dav1dTileContext *const t, Av1Block *const b,
// V pal coding
Dav1dTileState *const ts = t->ts;
const Dav1dFrameContext *const f = t->f;
- uint16_t *const pal = f->frame_thread.pass ?
+ uint16_t *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][2] : t->scratch.pal[2];
if (dav1d_msac_decode_bool_equi(&ts->msac)) {
@@ -575,7 +575,7 @@ static void order_palette(const uint8_t *pal_idx, const ptrdiff_t stride,
}
}
-static void read_pal_indices(Dav1dTileContext *const t,
+static void read_pal_indices(Dav1dTaskContext *const t,
uint8_t *const pal_idx,
const Av1Block *const b, const int pl,
const int w4, const int h4,
@@ -612,7 +612,7 @@ static void read_pal_indices(Dav1dTileContext *const t,
}
}
-static void read_vartx_tree(Dav1dTileContext *const t,
+static void read_vartx_tree(Dav1dTaskContext *const t,
Av1Block *const b, const enum BlockSize bs,
const int bx4, const int by4)
{
@@ -674,11 +674,6 @@ static inline unsigned get_prev_frame_segid(const Dav1dFrameContext *const f,
const ptrdiff_t stride)
{
assert(f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE);
- if (dav1d_thread_picture_wait(&f->refp[f->frame_hdr->primary_ref_frame],
- (by + h4) * 4, PLANE_TYPE_BLOCK))
- {
- return 8;
- }
unsigned seg_id = 8;
ref_seg_map += by * stride + bx;
@@ -693,7 +688,7 @@ static inline unsigned get_prev_frame_segid(const Dav1dFrameContext *const f,
}
static inline void splat_oneref_mv(const Dav1dContext *const c,
- Dav1dTileContext *const t,
+ Dav1dTaskContext *const t,
const enum BlockSize bs,
const Av1Block *const b,
const int bw4, const int bh4)
@@ -709,7 +704,7 @@ static inline void splat_oneref_mv(const Dav1dContext *const c,
}
static inline void splat_intrabc_mv(const Dav1dContext *const c,
- Dav1dTileContext *const t,
+ Dav1dTaskContext *const t,
const enum BlockSize bs,
const Av1Block *const b,
const int bw4, const int bh4)
@@ -724,7 +719,7 @@ static inline void splat_intrabc_mv(const Dav1dContext *const c,
}
static inline void splat_tworef_mv(const Dav1dContext *const c,
- Dav1dTileContext *const t,
+ Dav1dTaskContext *const t,
const enum BlockSize bs,
const Av1Block *const b,
const int bw4, const int bh4)
@@ -741,7 +736,7 @@ static inline void splat_tworef_mv(const Dav1dContext *const c,
}
static inline void splat_intraref(const Dav1dContext *const c,
- Dav1dTileContext *const t,
+ Dav1dTaskContext *const t,
const enum BlockSize bs,
const int bw4, const int bh4)
{
@@ -754,7 +749,98 @@ static inline void splat_intraref(const Dav1dContext *const c,
c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4);
}
-static int decode_b(Dav1dTileContext *const t,
+static inline void mc_lowest_px(int *const dst, const int by4, const int bh4,
+ const int mvy, const int ss_ver,
+ const struct ScalableMotionParams *const smp)
+{
+ const int v_mul = 4 >> ss_ver;
+ if (!smp->scale) {
+ const int my = mvy >> (3 + ss_ver), dy = mvy & (15 >> !ss_ver);
+ *dst = imax(*dst, (by4 + bh4) * v_mul + my + 4 * !!dy);
+ } else {
+ int y = (by4 * v_mul << 4) + mvy * (1 << !ss_ver);
+ const int64_t tmp = (int64_t)(y) * smp->scale + (smp->scale - 0x4000) * 8;
+ y = apply_sign64((int)((llabs(tmp) + 128) >> 8), tmp) + 32;
+ const int bottom = ((y + (bh4 * v_mul - 1) * smp->step) >> 10) + 1 + 4;
+ *dst = imax(*dst, bottom);
+ }
+}
+
+static inline void affine_lowest_px(Dav1dTaskContext *const t,
+ int *const dst, const int is_chroma,
+ const uint8_t *const b_dim,
+ const Dav1dWarpedMotionParams *const wmp)
+{
+ const Dav1dFrameContext *const f = t->f;
+ const int ss_ver = is_chroma && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = is_chroma && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+ assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
+ const int32_t *const mat = wmp->matrix;
+ const int y = b_dim[1] * v_mul - 8; // lowest y
+
+ const int src_y = t->by * 4 + ((y + 4) << ss_ver);
+ const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1];
+ // check left- and right-most blocks
+ for (int x = 0; x < b_dim[0] * h_mul; x += imax(8, b_dim[0] * h_mul - 8)) {
+ // calculate transformation relative to center of 8x8 block in
+ // luma pixel units
+ const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
+ const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;
+ const int dy = (int) (mvy >> 16) - 4;
+ *dst = imax(*dst, dy + 4 + 8);
+ }
+}
+
+static void obmc_lowest_px(Dav1dTaskContext *const t,
+ int (*const dst)[2], const int is_chroma,
+ const uint8_t *const b_dim,
+ const int bx4, const int by4, const int w4, const int h4)
+{
+ assert(!(t->bx & 1) && !(t->by & 1));
+ const Dav1dFrameContext *const f = t->f;
+ /*const*/ refmvs_block **r = &t->rt.r[(t->by & 31) + 5];
+ const int ss_ver = is_chroma && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ const int ss_hor = is_chroma && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
+
+ if (t->by > t->ts->tiling.row_start &&
+ (!is_chroma || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
+ {
+ for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
+ // only odd blocks are considered for overlap handling, hence +1
+ const refmvs_block *const a_r = &r[-1][t->bx + x + 1];
+ const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs];
+
+ if (a_r->ref.ref[0] > 0) {
+ const int oh4 = imin(b_dim[1], 16) >> 1;
+ mc_lowest_px(&dst[a_r->ref.ref[0] - 1][is_chroma], t->by,
+ (oh4 * 3 + 3) >> 2, a_r->mv.mv[0].y, ss_ver,
+ &f->svc[a_r->ref.ref[0] - 1][1]);
+ i++;
+ }
+ x += imax(a_b_dim[0], 2);
+ }
+ }
+
+ if (t->bx > t->ts->tiling.col_start)
+ for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
+ // only odd blocks are considered for overlap handling, hence +1
+ const refmvs_block *const l_r = &r[y + 1][t->bx - 1];
+ const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs];
+
+ if (l_r->ref.ref[0] > 0) {
+ const int oh4 = iclip(l_b_dim[1], 2, b_dim[1]);
+ mc_lowest_px(&dst[l_r->ref.ref[0] - 1][is_chroma],
+ t->by + y, oh4, l_r->mv.mv[0].y, ss_ver,
+ &f->svc[l_r->ref.ref[0] - 1][1]);
+ i++;
+ }
+ y += imax(l_b_dim[1], 2);
+ }
+}
+
+static int decode_b(Dav1dTaskContext *const t,
const enum BlockLevel bl,
const enum BlockSize bs,
const enum BlockPartition bp,
@@ -762,7 +848,7 @@ static int decode_b(Dav1dTileContext *const t,
{
Dav1dTileState *const ts = t->ts;
const Dav1dFrameContext *const f = t->f;
- Av1Block b_mem, *const b = f->frame_thread.pass ?
+ Av1Block b_mem, *const b = t->frame_thread.pass ?
&f->frame_thread.b[t->by * f->b4_stride + t->bx] : &b_mem;
const uint8_t *const b_dim = dav1d_block_dimensions[bs];
const int bx4 = t->bx & 31, by4 = t->by & 31;
@@ -778,7 +864,7 @@ static int decode_b(Dav1dTileContext *const t,
(bw4 > ss_hor || t->bx & 1) &&
(bh4 > ss_ver || t->by & 1);
- if (f->frame_thread.pass == 2) {
+ if (t->frame_thread.pass == 2) {
if (b->intra) {
f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
@@ -1236,10 +1322,11 @@ static int decode_b(Dav1dTileContext *const t,
if (b->pal_sz[0]) {
uint8_t *pal_idx;
- if (f->frame_thread.pass) {
- assert(ts->frame_thread.pal_idx);
- pal_idx = ts->frame_thread.pal_idx;
- ts->frame_thread.pal_idx += bw4 * bh4 * 16;
+ if (t->frame_thread.pass) {
+ const int p = t->frame_thread.pass & 1;
+ assert(ts->frame_thread[p].pal_idx);
+ pal_idx = ts->frame_thread[p].pal_idx;
+ ts->frame_thread[p].pal_idx += bw4 * bh4 * 16;
} else
pal_idx = t->scratch.pal_idx;
read_pal_indices(t, pal_idx, b, 0, w4, h4, bw4, bh4);
@@ -1249,10 +1336,11 @@ static int decode_b(Dav1dTileContext *const t,
if (has_chroma && b->pal_sz[1]) {
uint8_t *pal_idx;
- if (f->frame_thread.pass) {
- assert(ts->frame_thread.pal_idx);
- pal_idx = ts->frame_thread.pal_idx;
- ts->frame_thread.pal_idx += cbw4 * cbh4 * 16;
+ if (t->frame_thread.pass) {
+ const int p = t->frame_thread.pass & 1;
+ assert(ts->frame_thread[p].pal_idx);
+ pal_idx = ts->frame_thread[p].pal_idx;
+ ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 16;
} else
pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
read_pal_indices(t, pal_idx, b, 1, cw4, ch4, cbw4, cbh4);
@@ -1284,7 +1372,7 @@ static int decode_b(Dav1dTileContext *const t,
}
// reconstruction
- if (f->frame_thread.pass == 1) {
+ if (t->frame_thread.pass == 1) {
f->bd_fn.read_coef_blocks(t, bs, b);
} else {
f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
@@ -1328,7 +1416,7 @@ static int decode_b(Dav1dTileContext *const t,
case_set(bw4, a->, 0, bx4);
#undef set_ctx
if (b->pal_sz[0]) {
- uint16_t *const pal = f->frame_thread.pass ?
+ uint16_t *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][0] : t->scratch.pal[0];
for (int x = 0; x < bw4; x++)
@@ -1343,7 +1431,7 @@ static int decode_b(Dav1dTileContext *const t,
case_set(cbw4, a->, 0, cbx4);
#undef set_ctx
if (b->pal_sz[1]) {
- const uint16_t (*const pal)[8] = f->frame_thread.pass ?
+ const uint16_t (*const pal)[8] = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) *
(f->b4_stride >> 1) + ((t->bx >> 1) + (t->by & 1))] :
t->scratch.pal;
@@ -1446,7 +1534,7 @@ static int decode_b(Dav1dTileContext *const t,
read_vartx_tree(t, b, bs, bx4, by4);
// reconstruction
- if (f->frame_thread.pass == 1) {
+ if (t->frame_thread.pass == 1) {
f->bd_fn.read_coef_blocks(t, bs, b);
b->filter2d = FILTER_2D_BILINEAR;
} else {
@@ -1910,7 +1998,7 @@ static int decode_b(Dav1dTileContext *const t,
signabs(t->warpmv.u.p.delta),
b->mv[0].y, b->mv[0].x);
#undef signabs
- if (f->frame_thread.pass) {
+ if (t->frame_thread.pass) {
if (t->warpmv.type == DAV1D_WM_TYPE_AFFINE) {
b->matrix[0] = t->warpmv.matrix[2] - 0x10000;
b->matrix[1] = t->warpmv.matrix[3];
@@ -1970,7 +2058,7 @@ static int decode_b(Dav1dTileContext *const t,
read_vartx_tree(t, b, bs, bx4, by4);
// reconstruction
- if (f->frame_thread.pass == 1) {
+ if (t->frame_thread.pass == 1) {
f->bd_fn.read_coef_blocks(t, bs, b);
} else {
if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
@@ -2052,6 +2140,110 @@ static int decode_b(Dav1dTileContext *const t,
}
}
+ if (t->frame_thread.pass == 1 && !b->intra && IS_INTER_OR_SWITCH(f->frame_hdr)) {
+ const int sby = (t->by - ts->tiling.row_start) >> f->sb_shift;
+ int (*const lowest_px)[2] = ts->lowest_pixel[sby];
+
+ // keep track of motion vectors for each reference
+ if (b->comp_type == COMP_INTER_NONE) {
+ // y
+ if (imin(bw4, bh4) > 1 &&
+ ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
+ (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
+ {
+ affine_lowest_px(t, &lowest_px[b->ref[0]][0], 0, b_dim,
+ b->motion_mode == MM_WARP ? &t->warpmv :
+ &f->frame_hdr->gmv[b->ref[0]]);
+ } else {
+ mc_lowest_px(&lowest_px[b->ref[0]][0], t->by, bh4, b->mv[0].y,
+ 0, &f->svc[b->ref[0]][1]);
+ if (b->motion_mode == MM_OBMC) {
+ obmc_lowest_px(t, lowest_px, 0, b_dim, bx4, by4, w4, h4);
+ }
+ }
+
+ // uv
+ if (has_chroma) {
+ // sub8x8 derivation
+ int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
+ refmvs_block *const *r;
+ if (is_sub8x8) {
+ assert(ss_hor == 1);
+ r = &t->rt.r[(t->by & 31) + 5];
+ if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0;
+ if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0;
+ if (bw4 == 1 && bh4 == ss_ver)
+ is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0;
+ }
+
+ // chroma prediction
+ if (is_sub8x8) {
+ assert(ss_hor == 1);
+ if (bw4 == 1 && bh4 == ss_ver) {
+ const refmvs_block *const rr = &r[-1][t->bx - 1];
+ mc_lowest_px(&lowest_px[rr->ref.ref[0] - 1][1],
+ t->by - 1, bh4, rr->mv.mv[0].y, ss_ver,
+ &f->svc[rr->ref.ref[0] - 1][1]);
+ }
+ if (bw4 == 1) {
+ const refmvs_block *const rr = &r[0][t->bx - 1];
+ mc_lowest_px(&lowest_px[rr->ref.ref[0] - 1][1],
+ t->by, bh4, rr->mv.mv[0].y, ss_ver,
+ &f->svc[rr->ref.ref[0] - 1][1]);
+ }
+ if (bh4 == ss_ver) {
+ const refmvs_block *const rr = &r[-1][t->bx];
+ mc_lowest_px(&lowest_px[rr->ref.ref[0] - 1][1],
+ t->by - 1, bh4, rr->mv.mv[0].y, ss_ver,
+ &f->svc[rr->ref.ref[0] - 1][1]);
+ }
+ mc_lowest_px(&lowest_px[b->ref[0]][1], t->by, bh4,
+ b->mv[0].y, ss_ver, &f->svc[b->ref[0]][1]);
+ } else {
+ if (imin(cbw4, cbh4) > 1 &&
+ ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
+ (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
+ {
+ affine_lowest_px(t, &lowest_px[b->ref[0]][1], 1, b_dim,
+ b->motion_mode == MM_WARP ? &t->warpmv :
+ &f->frame_hdr->gmv[b->ref[0]]);
+ } else {
+ mc_lowest_px(&lowest_px[b->ref[0]][1],
+ t->by & ~ss_ver, bh4 << (bh4 == ss_ver),
+ b->mv[0].y, ss_ver, &f->svc[b->ref[0]][1]);
+ if (b->motion_mode == MM_OBMC) {
+ obmc_lowest_px(t, lowest_px, 1, b_dim, bx4, by4, w4, h4);
+ }
+ }
+ }
+ }
+ } else {
+ // y
+ for (int i = 0; i < 2; i++) {
+ if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
+ affine_lowest_px(t, &lowest_px[b->ref[i]][0], 0, b_dim,
+ &f->frame_hdr->gmv[b->ref[i]]);
+ } else {
+ mc_lowest_px(&lowest_px[b->ref[i]][0], t->by, bh4,
+ b->mv[i].y, 0, &f->svc[b->ref[i]][1]);
+ }
+ }
+
+ // uv
+ if (has_chroma) for (int i = 0; i < 2; i++) {
+ if (b->inter_mode == GLOBALMV_GLOBALMV &&
+ imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
+ {
+ affine_lowest_px(t, &lowest_px[b->ref[i]][1], 1, b_dim,
+ &f->frame_hdr->gmv[b->ref[i]]);
+ } else {
+ mc_lowest_px(&lowest_px[b->ref[i]][1], t->by, bh4,
+ b->mv[i].y, ss_ver, &f->svc[b->ref[i]][1]);
+ }
+ }
+ }
+ }
+
return 0;
}
@@ -2059,7 +2251,7 @@ static int decode_b(Dav1dTileContext *const t,
#include <sanitizer/msan_interface.h>
-static int checked_decode_b(Dav1dTileContext *const t,
+static int checked_decode_b(Dav1dTaskContext *const t,
const enum BlockLevel bl,
const enum BlockSize bs,
const enum BlockPartition bp,
@@ -2068,7 +2260,7 @@ static int checked_decode_b(Dav1dTileContext *const t,
const Dav1dFrameContext *const f = t->f;
const int err = decode_b(t, bl, bs, bp, intra_edge_flags);
- if (err == 0 && !(f->frame_thread.pass & 1)) {
+ if (err == 0 && !(t->frame_thread.pass & 1)) {
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const uint8_t *const b_dim = dav1d_block_dimensions[bs];
@@ -2108,7 +2300,7 @@ static int checked_decode_b(Dav1dTileContext *const t,
#endif /* defined(__has_feature) */
-static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl,
+static int decode_sb(Dav1dTaskContext *const t, const enum BlockLevel bl,
const EdgeNode *const node)
{
const Dav1dFrameContext *const f = t->f;
@@ -2124,7 +2316,7 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl,
uint16_t *pc;
enum BlockPartition bp;
int ctx, bx8, by8;
- if (f->frame_thread.pass != 2) {
+ if (t->frame_thread.pass != 2) {
if (0 && bl == BL_64X64)
printf("poc=%d,y=%d,x=%d,bl=%d,r=%d\n",
f->frame_hdr->frame_offset, t->by, t->bx, bl, t->ts->msac.rng);
@@ -2135,7 +2327,7 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl,
}
if (have_h_split && have_v_split) {
- if (f->frame_thread.pass == 2) {
+ if (t->frame_thread.pass == 2) {
const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
bp = b->bl == bl ? b->bp : PARTITION_SPLIT;
} else {
@@ -2307,7 +2499,7 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl,
}
} else if (have_h_split) {
unsigned is_split;
- if (f->frame_thread.pass == 2) {
+ if (t->frame_thread.pass == 2) {
const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
is_split = b->bl != bl;
} else {
@@ -2336,7 +2528,7 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl,
} else {
assert(have_v_split);
unsigned is_split;
- if (f->frame_thread.pass == 2) {
+ if (t->frame_thread.pass == 2) {
const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
is_split = b->bl != bl;
} else {
@@ -2366,7 +2558,7 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl,
}
}
- if (f->frame_thread.pass != 2 && (bp != PARTITION_SPLIT || bl == BL_8X8)) {
+ if (t->frame_thread.pass != 2 && (bp != PARTITION_SPLIT || bl == BL_8X8)) {
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->a->partition, bx8, mul * dav1d_al_part_ctx[0][bl][bp]); \
rep_macro(type, t->l.partition, by8, mul * dav1d_al_part_ctx[1][bl][bp])
@@ -2426,14 +2618,15 @@ static void setup_tile(Dav1dTileState *const ts,
const int sb_shift = f->sb_shift;
const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
- ts->frame_thread.pal_idx = f->frame_thread.pal_idx ?
- &f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4] :
- NULL;
-
- ts->frame_thread.cf = f->frame_thread.cf ?
- (uint8_t*)f->frame_thread.cf +
- (((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) :
- NULL;
+ for (int p = 0; p < 2; p++) {
+ ts->frame_thread[p].pal_idx = f->frame_thread.pal_idx ?
+ &f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4] :
+ NULL;
+ ts->frame_thread[p].cf = f->frame_thread.cf ?
+ (uint8_t*)f->frame_thread.cf +
+ (((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) :
+ NULL;
+ }
dav1d_cdf_thread_copy(&ts->cdf, &f->in_cdf);
ts->last_qidx = f->frame_hdr->quant.yac;
@@ -2488,11 +2681,13 @@ static void setup_tile(Dav1dTileState *const ts,
ts->lr_ref[p]->sgr_weights[1] = 31;
}
- if (f->n_tc > 1)
- atomic_init(&ts->progress, row_sb_start);
+ if (f->c->n_tc > 1) {
+ for (int p = 0; p < 2; p++)
+ atomic_init(&ts->progress[p], row_sb_start);
+ }
}
-static void read_restoration_info(Dav1dTileContext *const t,
+static void read_restoration_info(Dav1dTaskContext *const t,
Av1RestorationUnit *const lr, const int p,
const enum Dav1dRestorationType frame_type)
{
@@ -2558,7 +2753,7 @@ static void read_restoration_info(Dav1dTileContext *const t,
}
}
-int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
+int dav1d_decode_tile_sbrow(Dav1dTaskContext *const t) {
const Dav1dFrameContext *const f = t->f;
const enum BlockLevel root_bl = f->seq_hdr->sb128 ? BL_128X128 : BL_64X64;
Dav1dTileState *const ts = t->ts;
@@ -2572,13 +2767,22 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
dav1d_refmvs_tile_sbrow_init(&t->rt, &f->rf, ts->tiling.col_start,
ts->tiling.col_end, ts->tiling.row_start,
ts->tiling.row_end, t->by >> f->sb_shift,
- ts->tiling.row);
+ ts->tiling.row, t->frame_thread.pass);
+ }
+
+ if (IS_INTER_OR_SWITCH(f->frame_hdr) && c->n_fc > 1) {
+ const int sby = (t->by - ts->tiling.row_start) >> f->sb_shift;
+ int (*const lowest_px)[2] = ts->lowest_pixel[sby];
+ for (int n = 0; n < 7; n++)
+ for (int m = 0; m < 2; m++)
+ lowest_px[n][m] = INT_MIN;
}
- reset_context(&t->l, IS_KEY_OR_INTRA(f->frame_hdr), f->frame_thread.pass);
- if (f->frame_thread.pass == 2) {
+ reset_context(&t->l, IS_KEY_OR_INTRA(f->frame_hdr), t->frame_thread.pass);
+ if (t->frame_thread.pass == 2) {
+ const int off_2pass = c->n_tc > 1 ? f->sb128w * f->frame_hdr->tiling.rows : 0;
for (t->bx = ts->tiling.col_start,
- t->a = f->a + col_sb128_start + tile_row * f->sb128w;
+ t->a = f->a + off_2pass + col_sb128_start + tile_row * f->sb128w;
t->bx < ts->tiling.col_end; t->bx += sb_step)
{
if (atomic_load_explicit(c->flush, memory_order_acquire))
@@ -2595,13 +2799,7 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
// error out on symbol decoder overread
if (ts->msac.cnt < -15) return 1;
- if (f->n_tc > 1 && f->frame_hdr->use_ref_frame_mvs) {
- if (c->n_fc > 1) for (int n = 0; n < 7; n++)
- if (dav1d_thread_picture_wait(&f->refp[n], 4 * (t->by + sb_step),
- PLANE_TYPE_BLOCK))
- {
- return 1;
- }
+ if (f->c->n_tc > 1 && f->frame_hdr->use_ref_frame_mvs) {
dav1d_refmvs_load_tmvs(&f->rf, ts->tiling.row,
ts->tiling.col_start >> 1, ts->tiling.col_end >> 1,
t->by >> 1, (t->by + sb_step) >> 1);
@@ -2686,14 +2884,14 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
}
}
- if (f->seq_hdr->ref_frame_mvs && f->n_tc > 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) {
+ if (f->seq_hdr->ref_frame_mvs && f->c->n_tc > 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) {
dav1d_refmvs_save_tmvs(&t->rt,
ts->tiling.col_start >> 1, ts->tiling.col_end >> 1,
t->by >> 1, (t->by + sb_step) >> 1);
}
// backup pre-loopfilter pixels for intra prediction of the next sbrow
- if (f->frame_thread.pass != 1)
+ if (t->frame_thread.pass != 1)
f->bd_fn.backup_ipred_edge(t);
// backup t->a/l.tx_lpf_y/uv at tile boundaries to use them to "fix"
@@ -2709,50 +2907,24 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
return 0;
}
-int dav1d_decode_frame(Dav1dFrameContext *const f) {
+int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
const Dav1dContext *const c = f->c;
int retval = DAV1D_ERR(ENOMEM);
- if (f->n_tc > 1) {
- const int titsati_sz = f->frame_hdr->tiling.cols * f->sbh;
- if (titsati_sz != f->tile_thread.titsati_sz) {
- freep(&f->tile_thread.task_idx_to_sby_and_tile_idx);
- f->tile_thread.task_idx_to_sby_and_tile_idx =
- malloc(sizeof(*f->tile_thread.task_idx_to_sby_and_tile_idx) *
- titsati_sz);
- if (!f->tile_thread.task_idx_to_sby_and_tile_idx) {
- f->tile_thread.titsati_sz = 0;
- goto error;
- }
- f->tile_thread.titsati_sz = titsati_sz;
- }
- if (f->tile_thread.titsati_init[0] != f->frame_hdr->tiling.cols ||
- f->tile_thread.titsati_init[1] != f->frame_hdr->tiling.rows ||
- memcmp(f->frame_hdr->tiling.row_start_sb, f->tile_thread.titsati_index_rows,
- sizeof(*f->tile_thread.titsati_index_rows) *
- (f->frame_hdr->tiling.rows + 1)))
- {
- for (int tile_row = 0, task_idx = 0;
- tile_row < f->frame_hdr->tiling.rows; tile_row++)
- {
- for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row];
- sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]; sby++)
- {
- for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols;
- tile_col++, task_idx++)
- {
- f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][0] = sby;
- f->tile_thread.task_idx_to_sby_and_tile_idx[task_idx][1] =
- tile_row * f->frame_hdr->tiling.cols + tile_col;
- }
- }
- }
- f->tile_thread.titsati_init[0] = f->frame_hdr->tiling.cols;
- f->tile_thread.titsati_init[1] = f->frame_hdr->tiling.rows;
- memcpy(f->tile_thread.titsati_index_rows, f->frame_hdr->tiling.row_start_sb,
- sizeof(*f->tile_thread.titsati_index_rows) *
- (f->frame_hdr->tiling.rows + 1));
+ if (f->sbh > f->lf.start_of_tile_row_sz) {
+ free(f->lf.start_of_tile_row);
+ f->lf.start_of_tile_row = malloc(f->sbh * sizeof(uint8_t));
+ if (!f->lf.start_of_tile_row) {
+ f->lf.start_of_tile_row_sz = 0;
+ goto error;
}
+ f->lf.start_of_tile_row_sz = f->sbh;
+ }
+ int sby = 0;
+ for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
+ f->lf.start_of_tile_row[sby++] = tile_row;
+ while (sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1])
+ f->lf.start_of_tile_row[sby++] = 0;
}
const int n_ts = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
@@ -2762,45 +2934,21 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
f->frame_thread.tile_start_off =
malloc(sizeof(*f->frame_thread.tile_start_off) * n_ts);
if (!f->frame_thread.tile_start_off) {
- for (int n = 0; n < f->n_ts; n++) {
- Dav1dTileState *const ts = &f->ts[n];
- pthread_cond_destroy(&ts->tile_thread.cond);
- pthread_mutex_destroy(&ts->tile_thread.lock);
- }
f->n_ts = 0;
goto error;
}
}
Dav1dTileState *ts_new = dav1d_alloc_aligned(sizeof(*f->ts) * n_ts, 32);
if (!ts_new) goto error;
- if (n_ts > f->n_ts) {
- if (f->ts) {
- memcpy(ts_new, f->ts, sizeof(*f->ts) * f->n_ts);
- dav1d_free_aligned(f->ts);
- }
- f->ts = ts_new;
- for (int n = f->n_ts; n < n_ts; f->n_ts = ++n) {
- Dav1dTileState *const ts = &f->ts[n];
- if (pthread_mutex_init(&ts->tile_thread.lock, NULL)) goto error;
- if (pthread_cond_init(&ts->tile_thread.cond, NULL)) {
- pthread_mutex_destroy(&ts->tile_thread.lock);
- goto error;
- }
- }
- } else {
- for (int n = n_ts; n < f->n_ts; n++) {
- Dav1dTileState *const ts = &f->ts[n];
- pthread_cond_destroy(&ts->tile_thread.cond);
- pthread_mutex_destroy(&ts->tile_thread.lock);
- }
- memcpy(ts_new, f->ts, sizeof(*f->ts) * n_ts);
+ if (f->ts) {
+ memcpy(ts_new, f->ts, sizeof(*f->ts) * imin(n_ts, f->n_ts));
dav1d_free_aligned(f->ts);
- f->n_ts = n_ts;
- f->ts = ts_new;
}
+ f->n_ts = n_ts;
+ f->ts = ts_new;
}
- const int a_sz = f->sb128w * f->frame_hdr->tiling.rows;
+ const int a_sz = f->sb128w * f->frame_hdr->tiling.rows * (1 + (c->n_fc > 1 && c->n_tc > 1));
if (a_sz != f->a_sz) {
freep(&f->a);
f->a = malloc(sizeof(*f->a) * a_sz);
@@ -2827,6 +2975,29 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
}
}
+ const int lowest_pixel_mem_sz = f->frame_hdr->tiling.cols * f->sbh;
+ if (lowest_pixel_mem_sz != f->tile_thread.lowest_pixel_mem_sz) {
+ free(f->tile_thread.lowest_pixel_mem);
+ f->tile_thread.lowest_pixel_mem =
+ malloc(lowest_pixel_mem_sz * sizeof(*f->tile_thread.lowest_pixel_mem));
+ if (!f->tile_thread.lowest_pixel_mem) {
+ f->tile_thread.lowest_pixel_mem_sz = 0;
+ goto error;
+ }
+ f->tile_thread.lowest_pixel_mem_sz = lowest_pixel_mem_sz;
+ }
+ int (*lowest_pixel_ptr)[7][2] = f->tile_thread.lowest_pixel_mem;
+ for (int tile_row = 0, tile_row_base = 0; tile_row < f->frame_hdr->tiling.rows;
+ tile_row++, tile_row_base += f->frame_hdr->tiling.cols)
+ {
+ const int tile_row_sb_h = f->frame_hdr->tiling.row_start_sb[tile_row + 1] -
+ f->frame_hdr->tiling.row_start_sb[tile_row];
+ for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
+ f->ts[tile_row_base + tile_col].lowest_pixel = lowest_pixel_ptr;
+ lowest_pixel_ptr += tile_row_sb_h;
+ }
+ }
+
const int cf_sz = (num_sb128 * size_mul[0]) << hbd;
if (cf_sz != f->frame_thread.cf_sz) {
dav1d_freep_aligned(&f->frame_thread.cf);
@@ -2914,7 +3085,7 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
const int lr_line_sz = ((f->sr_cur.p.p.w + 31) & ~31) << hbd;
if (lr_line_sz != f->lf.lr_line_sz) {
dav1d_freep_aligned(&f->lf.lr_lpf_line[0]);
- const int num_lines = c->n_pfc > 1 ? f->sbh * (4 << f->seq_hdr->sb128) : 12;
+ const int num_lines = c->n_tc > 1 ? f->sbh * (4 << f->seq_hdr->sb128) : 12;
// lr simd may overread the input, so slightly over-allocate the lpf buffer
uint8_t *lr_ptr = dav1d_alloc_aligned(lr_line_sz * num_lines * 3 + 64, 32);
if (!lr_ptr) {
@@ -3009,15 +3180,11 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
const int ret =
dav1d_refmvs_init_frame(&f->rf, f->seq_hdr, f->frame_hdr,
- f->refpoc, f->mvs, f->refrefpoc, f->ref_mvs, f->n_tc);
+ f->refpoc, f->mvs, f->refrefpoc, f->ref_mvs,
+ f->c->n_tc, f->c->n_fc);
if (ret < 0) goto error;
}
- // create post-filtering tasks
- if (c->n_pfc > 1)
- if (dav1d_task_create_filter_sbrow(f))
- goto error;
-
retval = DAV1D_ERR(EINVAL);
// setup dequant tables
@@ -3081,14 +3248,13 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
f->lf.sr_p[0] = f->sr_cur.p.data[0];
f->lf.sr_p[1] = f->sr_cur.p.data[has_chroma ? 1 : 0];
f->lf.sr_p[2] = f->sr_cur.p.data[has_chroma ? 2 : 0];
- f->lf.tile_row = 1;
- dav1d_cdf_thread_wait(&f->in_cdf);
if (f->frame_hdr->refresh_context)
dav1d_cdf_thread_copy(f->out_cdf.data.cdf, &f->in_cdf);
// parse individual tiles per tile group
- int update_set = 0, tile_row = 0, tile_col = 0;
+ int tile_row = 0, tile_col = 0;
+ f->task_thread.update_set = 0;
for (int i = 0; i < f->n_tile_data; i++) {
const uint8_t *data = f->tile[i].data.data;
size_t size = f->tile[i].data.sz;
@@ -3115,213 +3281,76 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
tile_row++;
}
if (j == f->frame_hdr->tiling.update && f->frame_hdr->refresh_context)
- update_set = 1;
+ f->task_thread.update_set = 1;
data += tile_sz;
size -= tile_sz;
}
}
- // 2-pass decoding:
- // - enabled for frame-threading, so that one frame can do symbol parsing
- // as another (or multiple) are doing reconstruction. One advantage here
- // is that although reconstruction is limited by reference availability,
- // symbol parsing is not. Therefore, symbol parsing can effectively use
- // row and col tile threading, but reconstruction only col tile threading;
- // - pass 0 means no 2-pass;
- // - pass 1 means symbol parsing only;
- // - pass 2 means reconstruction and loop filtering.
+ if (c->n_tc > 1) {
+ const int uses_2pass = c->n_fc > 1;
+ for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows * (1 + uses_2pass); n++)
+ reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr),
+ uses_2pass ? 1 + (n >= f->sb128w * f->frame_hdr->tiling.rows) : 0);
+ }
- const int uses_2pass = c->n_fc > 1 && f->frame_hdr->refresh_context;
- for (f->frame_thread.pass = uses_2pass;
- f->frame_thread.pass <= 2 * uses_2pass; f->frame_thread.pass++)
- {
- const enum PlaneType progress_plane_type =
- f->frame_thread.pass == 0 ? PLANE_TYPE_ALL :
- f->frame_thread.pass == 1 ? PLANE_TYPE_BLOCK : PLANE_TYPE_Y;
-
- for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows; n++)
- reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr), f->frame_thread.pass);
-
- if (f->n_tc == 1 || (c->n_pfc > 1 && f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows == 1)) {
- Dav1dTileContext *const t = f->tc;
-
- // no tile threading - we explicitly interleave tile/sbrow decoding
- // and post-filtering, so that the full process runs in-line, so
- // that frame threading is still possible
- for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
- const int sbh_end =
- imin(f->frame_hdr->tiling.row_start_sb[tile_row + 1], f->sbh);
- for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row];
- sby < sbh_end; sby++)
- {
- t->by = sby << (4 + f->seq_hdr->sb128);
- const int by_end = (t->by + f->sb_step) >> 1;
- if (f->frame_thread.pass <= 1 && f->frame_hdr->use_ref_frame_mvs) {
- if (c->n_fc > 1) for (int n = 0; n < 7; n++)
- if (dav1d_thread_picture_wait(&f->refp[n],
- 4 * (t->by + f->sb_step),
- PLANE_TYPE_BLOCK))
- {
- goto error;
- }
- dav1d_refmvs_load_tmvs(&f->rf, tile_row,
- 0, f->bw >> 1, t->by >> 1, by_end);
- }
- for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
- t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col];
- if (dav1d_decode_tile_sbrow(t)) goto error;
- }
- if (f->seq_hdr->ref_frame_mvs && f->frame_thread.pass <= 1 && IS_INTER_OR_SWITCH(f->frame_hdr)) {
- dav1d_refmvs_save_tmvs(&t->rt, 0, f->bw >> 1, t->by >> 1, by_end);
- }
+ retval = 0;
+error:
+ return retval;
+}
- // loopfilter + cdef + restoration
- if (f->frame_thread.pass != 1) {
- if (c->n_pfc == 1)
- f->bd_fn.filter_sbrow(f, sby);
- else {
- pthread_mutex_lock(&f->lf.thread.pftd->lock);
- if (f->lf.thread.npf != 0 && !f->lf.thread.done) {
- Dav1dTask *const t = &f->lf.thread.tasks[sby * f->lf.thread.npf];
- t->start = 1;
- if (t->status == DAV1D_TASK_READY)
- dav1d_task_schedule(f->lf.thread.pftd, t);
- }
- pthread_mutex_unlock(&f->lf.thread.pftd->lock);
- }
- }
- if (c->n_pfc == 1 || f->frame_thread.pass == 1 || f->lf.thread.npf == 0)
- dav1d_thread_picture_signal(&f->sr_cur,
- (sby + 1) * f->sb_step * 4,
- progress_plane_type);
- }
- }
- } else {
- // signal available tasks to worker threads
- int num_tasks;
-
- pthread_mutex_lock(&f->tile_thread.lock);
- assert(!f->tile_thread.tasks_left);
- if (f->frame_thread.pass == 1 || f->n_tc >= f->frame_hdr->tiling.cols) {
- // we can (or in fact, if >, we need to) do full tile decoding.
- // loopfilter happens below
- num_tasks = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
- } else {
- // we need to interleave sbrow decoding for all tile cols in a
- // tile row, since otherwise subsequent threads will be blocked
- // waiting for the post-filter to complete
- num_tasks = f->sbh * f->frame_hdr->tiling.cols;
- }
- f->tile_thread.num_tasks = f->tile_thread.tasks_left = num_tasks;
- pthread_cond_broadcast(&f->tile_thread.cond);
- pthread_mutex_unlock(&f->tile_thread.lock);
-
- for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
- for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row];
- sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]; sby++)
- {
- for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols;
- tile_col++)
- {
- int progress;
- Dav1dTileState *const ts =
- &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col];
-
- if ((progress = atomic_load(&ts->progress)) <= sby) {
- pthread_mutex_lock(&ts->tile_thread.lock);
- while ((progress = atomic_load(&ts->progress)) <= sby)
- pthread_cond_wait(&ts->tile_thread.cond,
- &ts->tile_thread.lock);
- pthread_mutex_unlock(&ts->tile_thread.lock);
- }
- if (progress == TILE_ERROR) {
- dav1d_thread_picture_signal(&f->sr_cur, FRAME_ERROR,
- PLANE_TYPE_ALL);
- const uint64_t all_mask = ~0ULL >> (64 - f->n_tc);
- pthread_mutex_lock(&f->tile_thread.lock);
- while (f->tile_thread.available != all_mask)
- pthread_cond_wait(&f->tile_thread.icond, &f->tile_thread.lock);
- pthread_mutex_unlock(&f->tile_thread.lock);
- goto error;
- }
- }
+int dav1d_decode_frame_main(Dav1dFrameContext *const f) {
+ const Dav1dContext *const c = f->c;
+ int retval = DAV1D_ERR(ENOMEM);
- // loopfilter + cdef + restoration
- if (f->frame_thread.pass != 1) {
- if (c->n_pfc == 1)
- f->bd_fn.filter_sbrow(f, sby);
- else {
- pthread_mutex_lock(&f->lf.thread.pftd->lock);
- if (f->lf.thread.npf != 0 && !f->lf.thread.done) {
- Dav1dTask *const t = &f->lf.thread.tasks[sby * f->lf.thread.npf];
- t->start = 1;
- if (t->status == DAV1D_TASK_READY)
- dav1d_task_schedule(f->lf.thread.pftd, t);
- }
- pthread_mutex_unlock(&f->lf.thread.pftd->lock);
- }
- }
- if (c->n_pfc == 1 || f->frame_thread.pass == 1 || f->lf.thread.npf == 0)
- dav1d_thread_picture_signal(&f->sr_cur,
- (sby + 1) * f->sb_step * 4,
- progress_plane_type);
- }
- }
+ assert(f->c->n_tc == 1);
- const uint64_t all_mask = ~0ULL >> (64 - f->n_tc);
- pthread_mutex_lock(&f->tile_thread.lock);
- while (f->tile_thread.available != all_mask)
- pthread_cond_wait(&f->tile_thread.icond, &f->tile_thread.lock);
- pthread_mutex_unlock(&f->tile_thread.lock);
- }
+ Dav1dTaskContext *const t = &c->tc[f - c->fc];
+ t->f = f;
+ t->frame_thread.pass = 0;
- if (f->frame_thread.pass <= 1 && f->frame_hdr->refresh_context) {
- // cdf update
- if (update_set)
- dav1d_cdf_thread_update(f->frame_hdr, f->out_cdf.data.cdf,
- &f->ts[f->frame_hdr->tiling.update].cdf);
- dav1d_cdf_thread_signal(&f->out_cdf);
- }
- if (f->frame_thread.pass == 1) {
- assert(c->n_fc > 1);
- for (int tile_idx = 0;
- tile_idx < f->frame_hdr->tiling.rows * f->frame_hdr->tiling.cols;
- tile_idx++)
- {
- Dav1dTileState *const ts = &f->ts[tile_idx];
- const size_t tile_start_off =
- (size_t) f->frame_thread.tile_start_off[tile_idx];
- ts->frame_thread.pal_idx = f->frame_thread.pal_idx ?
- &f->frame_thread.pal_idx[tile_start_off * size_mul[1] / 4] :
- NULL;
- ts->frame_thread.cf = f->frame_thread.cf ?
- (uint8_t*)f->frame_thread.cf +
- ((tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) :
- NULL;
- if (f->n_tc > 0) {
- const unsigned row_sb_start =
- f->frame_hdr->tiling.row_start_sb[ts->tiling.row];
- atomic_init(&ts->progress, row_sb_start);
- }
+ for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows; n++)
+ reset_context(&f->a[n], IS_KEY_OR_INTRA(f->frame_hdr), 0);
+
+ // no threading - we explicitly interleave tile/sbrow decoding
+ // and post-filtering, so that the full process runs in-line
+ for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
+ const int sbh_end =
+ imin(f->frame_hdr->tiling.row_start_sb[tile_row + 1], f->sbh);
+ for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row];
+ sby < sbh_end; sby++)
+ {
+ t->by = sby << (4 + f->seq_hdr->sb128);
+ const int by_end = (t->by + f->sb_step) >> 1;
+ if (f->frame_hdr->use_ref_frame_mvs) {
+ dav1d_refmvs_load_tmvs(&f->rf, tile_row,
+ 0, f->bw >> 1, t->by >> 1, by_end);
+ }
+ for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
+ t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col];
+ if (dav1d_decode_tile_sbrow(t)) goto error;
+ }
+ if (IS_INTER_OR_SWITCH(f->frame_hdr)) {
+ dav1d_refmvs_save_tmvs(&t->rt, 0, f->bw >> 1, t->by >> 1, by_end);
}
+
+ // loopfilter + cdef + restoration
+ f->bd_fn.filter_sbrow(f, sby);
}
}
retval = 0;
error:
- if (c->n_pfc > 1) {
- pthread_mutex_lock(&f->lf.thread.pftd->lock);
- if (!f->lf.thread.done) {
- if (retval != 0) {
- f->lf.thread.done = -1;
- pthread_cond_signal(&f->lf.thread.pftd->cond);
- }
- pthread_cond_wait(&f->lf.thread.cond, &f->lf.thread.pftd->lock);
- }
- pthread_mutex_unlock(&f->lf.thread.pftd->lock);
+ return retval;
+}
+
+void dav1d_decode_frame_exit(Dav1dFrameContext *const f, const int retval) {
+ const Dav1dContext *const c = f->c;
+
+ if (c->n_fc > 1 && retval && f->frame_thread.cf) {
+ memset(f->frame_thread.cf, 0,
+ (size_t)f->frame_thread.cf_sz * 128 * 128 / 2);
}
- dav1d_thread_picture_signal(&f->sr_cur, retval == 0 ? UINT_MAX : FRAME_ERROR,
- PLANE_TYPE_ALL);
for (int i = 0; i < 7; i++) {
if (f->refp[i].p.data[0])
dav1d_thread_picture_unref(&f->refp[i]);
@@ -3331,8 +3360,9 @@ error:
dav1d_picture_unref_internal(&f->cur);
dav1d_thread_picture_unref(&f->sr_cur);
dav1d_cdf_thread_unref(&f->in_cdf);
- if (f->frame_hdr->refresh_context) {
- dav1d_cdf_thread_signal(&f->out_cdf);
+ if (f->frame_hdr && f->frame_hdr->refresh_context) {
+ if (f->out_cdf.progress)
+ atomic_store(f->out_cdf.progress, retval == 0 ? 1 : TILE_ERROR);
dav1d_cdf_thread_unref(&f->out_cdf);
}
dav1d_ref_dec(&f->cur_segmap_ref);
@@ -3343,8 +3373,39 @@ error:
for (int i = 0; i < f->n_tile_data; i++)
dav1d_data_unref_internal(&f->tile[i].data);
+}
- return retval;
+int dav1d_decode_frame(Dav1dFrameContext *const f) {
+ assert(f->c->n_fc == 1);
+ // if n_tc > 1 (but n_fc == 1), we could run init/exit in the task
+ // threads also. Not sure it makes a measurable difference.
+ int res = dav1d_decode_frame_init(f);
+ // wait until all threads have completed
+ if (!res) {
+ if (f->c->n_tc > 1) {
+ pthread_mutex_lock(&f->task_thread.ttd->lock);
+ res = dav1d_task_create_tile_sbrow(f, 0, 1);
+ if (!res) {
+ const int uses_2pass = f->c->n_fc > 1;
+ while (!f->task_thread.done[0] ||
+ (uses_2pass && !f->task_thread.done[1]) ||
+ f->task_thread.task_counter > 0)
+ {
+ pthread_cond_wait(&f->task_thread.cond,
+ &f->task_thread.ttd->lock);
+ }
+ }
+ pthread_mutex_unlock(&f->task_thread.ttd->lock);
+ } else {
+ res = dav1d_decode_frame_main(f);
+ if (!res && f->frame_hdr->refresh_context && f->task_thread.update_set) {
+ dav1d_cdf_thread_update(f->frame_hdr, f->out_cdf.data.cdf,
+ &f->ts[f->frame_hdr->tiling.update].cdf);
+ }
+ }
+ }
+ dav1d_decode_frame_exit(f, res);
+ return res;
}
static int get_upscale_x0(const int in_w, const int out_w, const int step) {
@@ -3360,17 +3421,23 @@ int dav1d_submit_frame(Dav1dContext *const c) {
// wait for c->out_delayed[next] and move into c->out if visible
Dav1dThreadPicture *out_delayed;
if (c->n_fc > 1) {
+ pthread_mutex_lock(&c->task_thread.lock);
const unsigned next = c->frame_thread.next++;
if (c->frame_thread.next == c->n_fc)
c->frame_thread.next = 0;
f = &c->fc[next];
- pthread_mutex_lock(&f->frame_thread.td.lock);
while (f->n_tile_data > 0)
- pthread_cond_wait(&f->frame_thread.td.cond,
- &f->frame_thread.td.lock);
+ pthread_cond_wait(&f->task_thread.cond,
+ &f->task_thread.ttd->lock);
out_delayed = &c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0]) {
+ if (atomic_load(&c->task_thread.first) + 1 < c->n_fc)
+ atomic_fetch_add(&c->task_thread.first, 1);
+ else
+ atomic_store(&c->task_thread.first, 0);
+ if (c->task_thread.cur < c->n_fc)
+ c->task_thread.cur--;
const unsigned progress = atomic_load_explicit(&out_delayed->progress[1],
memory_order_relaxed);
if (out_delayed->visible && progress != FRAME_ERROR) {
@@ -3429,7 +3496,8 @@ int dav1d_submit_frame(Dav1dContext *const c) {
f->bd_fn.recon_b_inter = dav1d_recon_b_inter_##bd##bpc; \
f->bd_fn.recon_b_intra = dav1d_recon_b_intra_##bd##bpc; \
f->bd_fn.filter_sbrow = dav1d_filter_sbrow_##bd##bpc; \
- f->bd_fn.filter_sbrow_deblock = dav1d_filter_sbrow_deblock_##bd##bpc; \
+ f->bd_fn.filter_sbrow_deblock_cols = dav1d_filter_sbrow_deblock_cols_##bd##bpc; \
+ f->bd_fn.filter_sbrow_deblock_rows = dav1d_filter_sbrow_deblock_rows_##bd##bpc; \
f->bd_fn.filter_sbrow_cdef = dav1d_filter_sbrow_cdef_##bd##bpc; \
f->bd_fn.filter_sbrow_resize = dav1d_filter_sbrow_resize_##bd##bpc; \
f->bd_fn.filter_sbrow_lr = dav1d_filter_sbrow_lr_##bd##bpc; \
@@ -3484,7 +3552,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
f->svc[i][0].step = (f->svc[i][0].scale + 8) >> 4;
f->svc[i][1].step = (f->svc[i][1].scale + 8) >> 4;
} else {
- f->svc[i][0].scale = 0;
+ f->svc[i][0].scale = f->svc[i][1].scale = 0;
}
f->gmv_warp_allowed[i] = f->frame_hdr->gmv[i].type > DAV1D_WM_TYPE_TRANSLATION &&
!f->frame_hdr->force_integer_mv &&
@@ -3501,7 +3569,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
dav1d_cdf_thread_ref(&f->in_cdf, &c->cdf[pri_ref]);
}
if (f->frame_hdr->refresh_context) {
- res = dav1d_cdf_thread_alloc(c, &f->out_cdf, c->n_fc > 1 ? &f->frame_thread.td : NULL);
+ res = dav1d_cdf_thread_alloc(c, &f->out_cdf, c->n_fc > 1);
if (res < 0) goto error;
}
@@ -3565,6 +3633,11 @@ int dav1d_submit_frame(Dav1dContext *const c) {
f->sbh = (f->bh + f->sb_step - 1) >> f->sb_shift;
f->b4_stride = (f->bw + 31) & ~31;
f->bitdepth_max = (1 << f->cur.p.bpc) - 1;
+ f->task_thread.error = 0;
+ const int uses_2pass = c->n_fc > 1;
+ const int cols = f->frame_hdr->tiling.cols;
+ const int rows = f->frame_hdr->tiling.rows;
+ f->task_thread.task_counter = (cols * rows + f->sbh) << uses_2pass;
// ref_mvs
if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
@@ -3707,8 +3780,8 @@ int dav1d_submit_frame(Dav1dContext *const c) {
return res;
}
} else {
- pthread_cond_signal(&f->frame_thread.td.cond);
- pthread_mutex_unlock(&f->frame_thread.td.lock);
+ dav1d_task_frame_init(f);
+ pthread_mutex_unlock(&c->task_thread.lock);
}
return 0;
@@ -3735,10 +3808,8 @@ error:
dav1d_data_unref_internal(&f->tile[i].data);
f->n_tile_data = 0;
- if (c->n_fc > 1) {
- pthread_cond_signal(&f->frame_thread.td.cond);
- pthread_mutex_unlock(&f->frame_thread.td.lock);
- }
+ if (c->n_fc > 1)
+ pthread_mutex_unlock(&f->task_thread.ttd->lock);
return res;
}