diff options
-rw-r--r-- | .gitlab-ci.yml | 16 | ||||
-rw-r--r-- | examples/dav1dplay.c | 7 | ||||
-rw-r--r-- | include/dav1d/dav1d.h | 4 | ||||
-rw-r--r-- | meson.build | 2 | ||||
-rw-r--r-- | src/decode.c | 78 | ||||
-rw-r--r-- | src/internal.h | 42 | ||||
-rw-r--r-- | src/lib.c | 105 | ||||
-rw-r--r-- | src/lr_apply_tmpl.c | 77 | ||||
-rw-r--r-- | src/recon.h | 8 | ||||
-rw-r--r-- | src/recon_tmpl.c | 147 | ||||
-rw-r--r-- | src/thread_task.c | 226 | ||||
-rw-r--r-- | src/thread_task.h | 27 | ||||
-rw-r--r-- | tools/dav1d_cli_parse.c | 7 |
13 files changed, 615 insertions, 131 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index db8fefd..b18301b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -410,7 +410,8 @@ test-debian: - ninja coverage-xml - grep -Eo 'line-rate="[^"]+"' meson-logs/coverage.xml | head -n 1 | grep -Eo '[0-9.]+' | awk '{ print "coverage:", $1 * 100 } ' - - time meson test -v --suite testdata_seek-stress --test-args "--tilethreads 2 --framethreads 2" + - time meson test -v --suite testdata_seek-stress --test-args "--tilethreads 2 --framethreads 1 --pfthreads=2" + - time meson test -v --suite testdata_seek-stress --test-args "--tilethreads 2 --framethreads 2 --pfthreads=2" coverage: '/^coverage: (\d+.\d+)$/' artifacts: expose_as: 'Coverage HTML report' @@ -520,11 +521,14 @@ test-debian-tsan: - ninja -C build - cd build - exit_code=0 - - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 1 --framethreads 2" || exit_code=$((exit_code + $?)) - - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 2 --framethreads 1" || exit_code=$((exit_code + $?)) - - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 2 --framethreads 2" || exit_code=$((exit_code + $?)) - - time meson test -v --setup=sanitizer --suite testdata_seek-stress --test-args "--tilethreads 2 --framethreads 2" || exit_code=$((exit_code + $?)) - - time meson test -v --setup=sanitizer --suite oss-fuzz-asan --suite oss-fuzz-msan --suite oss-fuzz-ubsan || exit_code=$((exit_code + $?)) + - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 1 --framethreads 2 --pfthreads 1" || exit_code=$((exit_code + $?)) + - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 2 --framethreads 1 --pfthreads 1" || exit_code=$((exit_code + $?)) + - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 2 --framethreads 2 --pfthreads 1" || exit_code=$((exit_code + $?)) + - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 2 --framethreads 1 --pfthreads 2" || exit_code=$((exit_code + $?)) + - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 2 --framethreads 2 --pfthreads 2" || exit_code=$((exit_code + $?)) + - time meson test -v --setup=sanitizer --suite testdata_seek-stress --test-args "--tilethreads 2 --framethreads 1 --pfthreads 2" || exit_code=$((exit_code + $?)) + - time meson test -v --setup=sanitizer --suite testdata_seek-stress --test-args "--tilethreads 2 --framethreads 2 --pfthreads 2" || exit_code=$((exit_code + $?)) + - time meson test -v --setup=sanitizer --suite oss-fuzz-asan --suite oss-fuzz-msan --suite oss-fuzz-ubsan || exit_code=$((exit_code + $?)) - if [ $exit_code -ne 0 ]; then exit $exit_code; fi test-win64: diff --git a/examples/dav1dplay.c b/examples/dav1dplay.c index d6bb262..a7eca93 100644 --- a/examples/dav1dplay.c +++ b/examples/dav1dplay.c @@ -95,6 +95,7 @@ static void dp_settings_print_usage(const char *const app, " --untimed/-u: ignore PTS, render as fast as possible\n" " --framethreads $num: number of frame threads (default: 1)\n" " --tilethreads $num: number of tile threads (default: 1)\n" + " --pfthreads $num: number of postfilter threads(default: 1)\n" " --highquality: enable high quality rendering\n" " --zerocopy/-z: enable zero copy upload path\n" " --gpugrain/-g: enable GPU grain synthesis\n" @@ -127,6 +128,7 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx, enum { ARG_FRAME_THREADS = 256, ARG_TILE_THREADS, + ARG_POSTFILTER_THREADS, ARG_HIGH_QUALITY, }; @@ -137,6 +139,7 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx, { "untimed", 0, NULL, 'u' }, { "framethreads", 1, NULL, ARG_FRAME_THREADS }, { "tilethreads", 1, NULL, ARG_TILE_THREADS }, + { "pfthreads", 1, NULL, ARG_POSTFILTER_THREADS }, { "highquality", 0, NULL, ARG_HIGH_QUALITY }, { "zerocopy", 0, NULL, 'z' }, { "gpugrain", 0, NULL, 'g' }, @@ -175,6 +178,10 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx, lib_settings->n_tile_threads = parse_unsigned(optarg, ARG_TILE_THREADS, argv[0]); break; + case ARG_POSTFILTER_THREADS: + lib_settings->n_postfilter_threads = + parse_unsigned(optarg, ARG_POSTFILTER_THREADS, argv[0]); + break; default: dp_settings_print_usage(argv[0], NULL); } diff --git a/include/dav1d/dav1d.h b/include/dav1d/dav1d.h index 9d484e5..165e1c0 100644 --- a/include/dav1d/dav1d.h +++ b/include/dav1d/dav1d.h @@ -45,6 +45,7 @@ typedef struct Dav1dRef Dav1dRef; #define DAV1D_MAX_FRAME_THREADS 256 #define DAV1D_MAX_TILE_THREADS 64 +#define DAV1D_MAX_POSTFILTER_THREADS 256 typedef struct Dav1dLogger { void *cookie; ///< Custom data to pass to the callback. @@ -67,7 +68,8 @@ typedef struct Dav1dSettings { unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited) Dav1dPicAllocator allocator; ///< Picture allocator callback. Dav1dLogger logger; ///< Logger callback. - uint8_t reserved[32]; ///< reserved for future use + int n_postfilter_threads; + uint8_t reserved[28]; ///< reserved for future use } Dav1dSettings; /** diff --git a/meson.build b/meson.build index 3b354d6..2af8302 100644 --- a/meson.build +++ b/meson.build @@ -30,7 +30,7 @@ project('dav1d', ['c'], 'b_ndebug=if-release'], meson_version: '>= 0.49.0') -dav1d_soname_version = '5.0.0' +dav1d_soname_version = '5.0.1' dav1d_api_version_array = dav1d_soname_version.split('.') dav1d_api_version_major = dav1d_api_version_array[0] dav1d_api_version_minor = dav1d_api_version_array[1] diff --git a/src/decode.c b/src/decode.c index 197af98..6730b22 100644 --- a/src/decode.c +++ b/src/decode.c @@ -2526,7 +2526,7 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) { t->a = f->a + col_sb128_start + tile_row * f->sb128w; t->bx < ts->tiling.col_end; t->bx += sb_step) { - if (atomic_load_explicit(c->frame_thread.flush, memory_order_acquire)) + if (atomic_load_explicit(c->flush, memory_order_acquire)) return 1; if (decode_sb(t, root_bl, c->intra_edge.root[root_bl])) return 1; @@ -2557,7 +2557,7 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) { t->lf_mask = f->lf.mask + sb128y * f->sb128w + col_sb128_start; t->bx < ts->tiling.col_end; t->bx += sb_step) { - if (atomic_load_explicit(c->frame_thread.flush, memory_order_acquire)) + if (atomic_load_explicit(c->flush, memory_order_acquire)) return 1; if (root_bl == BL_128X128) { t->cur_sb_cdef_idx_ptr = t->lf_mask->cdef_idx; @@ -2859,7 +2859,8 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { const int lr_line_sz = ((f->sr_cur.p.p.w + 31) & ~31) << hbd; if (lr_line_sz != f->lf.lr_line_sz) { dav1d_freep_aligned(&f->lf.lr_lpf_line[0]); - uint8_t *lr_ptr = dav1d_alloc_aligned(lr_line_sz * 3 * 12, 32); + const int num_lines = c->n_pfc > 1 ? f->sbh * (4 << f->seq_hdr->sb128) : 12; + uint8_t *lr_ptr = dav1d_alloc_aligned(lr_line_sz * num_lines * 3, 32); if (!lr_ptr) { f->lf.lr_line_sz = 0; goto error; @@ -2867,7 +2868,7 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { for (int pl = 0; pl <= 2; pl++) { f->lf.lr_lpf_line[pl] = lr_ptr; - lr_ptr += lr_line_sz * 12; + lr_ptr += lr_line_sz * num_lines; } f->lf.lr_line_sz = lr_line_sz; @@ -2955,6 +2956,12 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { f->refpoc, f->mvs, f->refrefpoc, f->ref_mvs, f->n_tc); if (ret < 0) goto error; } + + // create post-filtering tasks + if (c->n_pfc > 1) + if (dav1d_task_create_filter_sbrow(f)) + goto error; + retval = DAV1D_ERR(EINVAL); // setup dequant tables @@ -3081,7 +3088,7 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows; n++) reset_context(&f->a[n], !(f->frame_hdr->frame_type & 1), f->frame_thread.pass); - if (f->n_tc == 1) { + if (f->n_tc == 1 || (c->n_pfc > 1 && f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows == 1)) { Dav1dTileContext *const t = f->tc; // no tile threading - we explicitly interleave tile/sbrow decoding @@ -3108,7 +3115,6 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { } for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) { t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col]; - if (dav1d_decode_tile_sbrow(t)) goto error; } if (f->frame_thread.pass <= 1 && f->frame_hdr->frame_type & 1) { @@ -3116,10 +3122,24 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { } // loopfilter + cdef + restoration - if (f->frame_thread.pass != 1) - f->bd_fn.filter_sbrow(f, sby); - dav1d_thread_picture_signal(&f->sr_cur, (sby + 1) * f->sb_step * 4, - progress_plane_type); + if (f->frame_thread.pass != 1) { + if (c->n_pfc == 1) + f->bd_fn.filter_sbrow(f, sby); + else { + pthread_mutex_lock(&f->lf.thread.pftd->lock); + if (f->lf.thread.npf != 0 && !f->lf.thread.done) { + Dav1dTask *const t = &f->lf.thread.tasks[sby * f->lf.thread.npf]; + t->start = 1; + if (t->status == DAV1D_TASK_READY) + dav1d_task_schedule(f->lf.thread.pftd, t); + } + pthread_mutex_unlock(&f->lf.thread.pftd->lock); + } + } + if (c->n_pfc == 1 || f->frame_thread.pass == 1 || f->lf.thread.npf == 0) + dav1d_thread_picture_signal(&f->sr_cur, + (sby + 1) * f->sb_step * 4, + progress_plane_type); } } } else { @@ -3142,7 +3162,6 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { pthread_cond_broadcast(&f->tile_thread.cond); pthread_mutex_unlock(&f->tile_thread.lock); - // loopfilter + cdef + restoration for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) { for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row]; sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]; sby++) @@ -3174,10 +3193,24 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { } // loopfilter + cdef + restoration - if (f->frame_thread.pass != 1) - f->bd_fn.filter_sbrow(f, sby); - dav1d_thread_picture_signal(&f->sr_cur, (sby + 1) * f->sb_step * 4, - progress_plane_type); + if (f->frame_thread.pass != 1) { + if (c->n_pfc == 1) + f->bd_fn.filter_sbrow(f, sby); + else { + pthread_mutex_lock(&f->lf.thread.pftd->lock); + if (f->lf.thread.npf != 0 && !f->lf.thread.done) { + Dav1dTask *const t = &f->lf.thread.tasks[sby * f->lf.thread.npf]; + t->start = 1; + if (t->status == DAV1D_TASK_READY) + dav1d_task_schedule(f->lf.thread.pftd, t); + } + pthread_mutex_unlock(&f->lf.thread.pftd->lock); + } + } + if (c->n_pfc == 1 || f->frame_thread.pass == 1 || f->lf.thread.npf == 0) + dav1d_thread_picture_signal(&f->sr_cur, + (sby + 1) * f->sb_step * 4, + progress_plane_type); } } @@ -3222,6 +3255,17 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { retval = 0; error: + if (c->n_pfc > 1) { + pthread_mutex_lock(&f->lf.thread.pftd->lock); + if (!f->lf.thread.done) { + if (retval != 0) { + f->lf.thread.done = -1; + pthread_cond_signal(&f->lf.thread.pftd->cond); + } + pthread_cond_wait(&f->lf.thread.cond, &f->lf.thread.pftd->lock); + } + pthread_mutex_unlock(&f->lf.thread.pftd->lock); + } dav1d_thread_picture_signal(&f->sr_cur, retval == 0 ? UINT_MAX : FRAME_ERROR, PLANE_TYPE_ALL); for (int i = 0; i < 7; i++) { @@ -3329,6 +3373,10 @@ int dav1d_submit_frame(Dav1dContext *const c) { f->bd_fn.recon_b_inter = dav1d_recon_b_inter_##bd##bpc; \ f->bd_fn.recon_b_intra = dav1d_recon_b_intra_##bd##bpc; \ f->bd_fn.filter_sbrow = dav1d_filter_sbrow_##bd##bpc; \ + f->bd_fn.filter_sbrow_deblock = dav1d_filter_sbrow_deblock_##bd##bpc; \ + f->bd_fn.filter_sbrow_cdef = dav1d_filter_sbrow_cdef_##bd##bpc; \ + f->bd_fn.filter_sbrow_resize = dav1d_filter_sbrow_resize_##bd##bpc; \ + f->bd_fn.filter_sbrow_lr = dav1d_filter_sbrow_lr_##bd##bpc; \ f->bd_fn.backup_ipred_edge = dav1d_backup_ipred_edge_##bd##bpc; \ f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc if (!f->seq_hdr->hbd) { diff --git a/src/internal.h b/src/internal.h index ea29d8f..0a71420 100644 --- a/src/internal.h +++ b/src/internal.h @@ -35,6 +35,8 @@ typedef struct Dav1dFrameContext Dav1dFrameContext; typedef struct Dav1dTileState Dav1dTileState; typedef struct Dav1dTileContext Dav1dTileContext; +typedef struct Dav1dPostFilterContext Dav1dPostFilterContext; +typedef struct Dav1dTask Dav1dTask; #include "common/attributes.h" @@ -76,6 +78,9 @@ struct Dav1dContext { Dav1dFrameContext *fc; unsigned n_fc; + Dav1dPostFilterContext *pfc; + unsigned n_pfc; + // cache of OBUs that make up a single frame before we submit them // to a frame worker to be decoded struct Dav1dTileGroup *tile; @@ -99,15 +104,23 @@ struct Dav1dContext { // decoded output picture queue Dav1dData in; Dav1dPicture out; + // dummy is a pointer to prevent compiler errors about atomic_load() + // not taking const arguments + atomic_int flush_mem, *flush; struct { Dav1dThreadPicture *out_delayed; unsigned next; - // dummy is a pointer to prevent compiler errors about atomic_load() - // not taking const arguments; the const attribute is not taken - // from pointers - atomic_int flush_mem, *flush; } frame_thread; + // postfilter threading (refer to pfc[] for per_thread thingies) + struct PostFilterThreadData { + pthread_mutex_t lock; + pthread_cond_t cond; + struct Dav1dTask *tasks; + int frame_cnt; + int inited; + } postfilter_thread; + // reference/entropy state Dav1dMemPool *segmap_pool; Dav1dMemPool *refmvs_pool; @@ -182,6 +195,10 @@ struct Dav1dFrameContext { recon_b_intra_fn recon_b_intra; recon_b_inter_fn recon_b_inter; filter_sbrow_fn filter_sbrow; + filter_sbrow_fn filter_sbrow_deblock; + filter_sbrow_fn filter_sbrow_cdef; + filter_sbrow_fn filter_sbrow_resize; + filter_sbrow_fn filter_sbrow_lr; backup_ipred_edge_fn backup_ipred_edge; read_coef_blocks_fn read_coef_blocks; } bd_fn; @@ -238,6 +255,16 @@ struct Dav1dFrameContext { pixel *p[3], *sr_p[3]; Av1Filter *mask_ptr, *prev_mask_ptr; int restore_planes; // enum LrRestorePlanes + + struct { + pthread_cond_t cond; + struct PostFilterThreadData *pftd; + struct Dav1dTask *tasks; + int num_tasks; + int npf; + int done; + int inited; + } thread; } lf; // threading (refer to tc[] for per-thread things) @@ -353,4 +380,11 @@ struct Dav1dTileContext { } tile_thread; }; +struct Dav1dPostFilterContext { + Dav1dContext *c; + struct thread_data td; + int flushed; + int die; +}; + #endif /* DAV1D_SRC_INTERNAL_H */ @@ -65,6 +65,7 @@ COLD const char *dav1d_version(void) { COLD void dav1d_default_settings(Dav1dSettings *const s) { s->n_frame_threads = 1; s->n_tile_threads = 1; + s->n_postfilter_threads = 1; s->apply_grain = 1; s->allocator.cookie = NULL; s->allocator.alloc_picture_callback = dav1d_default_picture_alloc; @@ -100,6 +101,8 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { validate_input_or_ret(c_out != NULL, DAV1D_ERR(EINVAL)); validate_input_or_ret(s != NULL, DAV1D_ERR(EINVAL)); + validate_input_or_ret(s->n_postfilter_threads >= 1 && + s->n_postfilter_threads <= DAV1D_MAX_POSTFILTER_THREADS, DAV1D_ERR(EINVAL)); validate_input_or_ret(s->n_tile_threads >= 1 && s->n_tile_threads <= DAV1D_MAX_TILE_THREADS, DAV1D_ERR(EINVAL)); validate_input_or_ret(s->n_frame_threads >= 1 && @@ -160,12 +163,49 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) { s->frame_size_limit, c->frame_size_limit); } - c->frame_thread.flush = &c->frame_thread.flush_mem; - atomic_init(c->frame_thread.flush, 0); + c->flush = &c->flush_mem; + atomic_init(c->flush, 0); + + c->n_pfc = s->n_postfilter_threads; c->n_fc = s->n_frame_threads; c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * s->n_frame_threads, 32); if (!c->fc) goto error; memset(c->fc, 0, sizeof(*c->fc) * s->n_frame_threads); + + if (c->n_pfc > 1) { + c->pfc = dav1d_alloc_aligned(sizeof(*c->pfc) * s->n_postfilter_threads, 32); + if (!c->pfc) goto error; + memset(c->pfc, 0, sizeof(*c->pfc) * s->n_postfilter_threads); + if (pthread_mutex_init(&c->postfilter_thread.lock, NULL)) goto error; + if (pthread_cond_init(&c->postfilter_thread.cond, NULL)) { + pthread_mutex_destroy(&c->postfilter_thread.lock); + goto error; + } + c->postfilter_thread.inited = 1; + for (int n = 0; n < s->n_frame_threads; n++) { + Dav1dFrameContext *const f = &c->fc[n]; + if (pthread_cond_init(&f->lf.thread.cond, NULL)) goto error; + f->lf.thread.pftd = &c->postfilter_thread; + f->lf.thread.done = 1; + f->lf.thread.inited = 1; + } + for (int n = 0; n < s->n_postfilter_threads; ++n) { + Dav1dPostFilterContext *const pf = &c->pfc[n]; + pf->c = c; + if (pthread_mutex_init(&pf->td.lock, NULL)) goto error; + if (pthread_cond_init(&pf->td.cond, NULL)) { + pthread_mutex_destroy(&pf->td.lock); + goto error; + } + if (pthread_create(&pf->td.thread, &thread_attr, dav1d_postfilter_task, pf)) { + pthread_cond_destroy(&c->postfilter_thread.cond); + pthread_mutex_destroy(&c->postfilter_thread.lock); + goto error; + } + pf->td.inited = 1; + } + } + if (c->n_fc > 1) { c->frame_thread.out_delayed = calloc(c->n_fc, sizeof(*c->frame_thread.out_delayed)); @@ -467,11 +507,17 @@ void dav1d_flush(Dav1dContext *const c) { dav1d_ref_dec(&c->content_light_ref); dav1d_ref_dec(&c->itut_t35_ref); - if (c->n_fc == 1) return; + if (c->n_fc == 1 && c->n_pfc == 1) return; - // mark each currently-running frame as flushing, so that we - // exit out as quickly as the running thread checks this flag - atomic_store(c->frame_thread.flush, 1); + // wait for threads to complete flushing + if (c->n_pfc > 1) + pthread_mutex_lock(&c->postfilter_thread.lock); + atomic_store(c->flush, 1); + if (c->n_pfc > 1) { + pthread_cond_broadcast(&c->postfilter_thread.cond); + pthread_mutex_unlock(&c->postfilter_thread.lock); + } + if (c->n_fc == 1) goto skip_ft_flush; for (unsigned n = 0, next = c->frame_thread.next; n < c->n_fc; n++, next++) { if (next == c->n_fc) next = 0; Dav1dFrameContext *const f = &c->fc[next]; @@ -483,13 +529,31 @@ void dav1d_flush(Dav1dContext *const c) { assert(!f->cur.data[0]); } pthread_mutex_unlock(&f->frame_thread.td.lock); - Dav1dThreadPicture *const out_delayed = &c->frame_thread.out_delayed[next]; + Dav1dThreadPicture *const out_delayed = + &c->frame_thread.out_delayed[next]; if (out_delayed->p.data[0]) dav1d_thread_picture_unref(out_delayed); } - atomic_store(c->frame_thread.flush, 0); - c->frame_thread.next = 0; +skip_ft_flush: + if (c->n_pfc > 1) { + for (unsigned i = 0; i < c->n_pfc; ++i) { + Dav1dPostFilterContext *const pf = &c->pfc[i]; + pthread_mutex_lock(&pf->td.lock); + if (!pf->flushed) + pthread_cond_wait(&pf->td.cond, &pf->td.lock); + pf->flushed = 0; + pthread_mutex_unlock(&pf->td.lock); + } + pthread_mutex_lock(&c->postfilter_thread.lock); + c->postfilter_thread.tasks = NULL; + pthread_mutex_unlock(&c->postfilter_thread.lock); + for (unsigned i = 0; i < c->n_fc; ++i) { + freep(&c->fc[i].lf.thread.tasks); + c->fc[i].lf.thread.num_tasks = 0; + } + } + atomic_store(c->flush, 0); } COLD void dav1d_close(Dav1dContext **const c_out) { @@ -503,6 +567,25 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) { if (flush) dav1d_flush(c); + if (c->pfc) { + struct PostFilterThreadData *pftd = &c->postfilter_thread; + if (pftd->inited) { + pthread_mutex_lock(&pftd->lock); + for (unsigned n = 0; n < c->n_pfc && c->pfc[n].td.inited; n++) + c->pfc[n].die = 1; + pthread_cond_broadcast(&pftd->cond); + pthread_mutex_unlock(&pftd->lock); + for (unsigned n = 0; n < c->n_pfc && c->pfc[n].td.inited; n++) { + pthread_join(c->pfc[n].td.thread, NULL); + pthread_cond_destroy(&c->pfc[n].td.cond); + pthread_mutex_destroy(&c->pfc[n].td.lock); + } + pthread_cond_destroy(&pftd->cond); + pthread_mutex_destroy(&pftd->lock); + } + dav1d_free_aligned(c->pfc); + } + for (unsigned n = 0; c->fc && n < c->n_fc; n++) { Dav1dFrameContext *const f = &c->fc[n]; @@ -554,6 +637,10 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) { pthread_cond_destroy(&ts->tile_thread.cond); pthread_mutex_destroy(&ts->tile_thread.lock); } + if (f->lf.thread.inited) { + freep(&f->lf.thread.tasks); + pthread_cond_destroy(&f->lf.thread.cond); + } dav1d_free_aligned(f->ts); dav1d_free_aligned(f->tc); dav1d_free_aligned(f->ipred_edge[0]); diff --git a/src/lr_apply_tmpl.c b/src/lr_apply_tmpl.c index 3ba4613..6fcfb67 100644 --- a/src/lr_apply_tmpl.c +++ b/src/lr_apply_tmpl.c @@ -48,31 +48,32 @@ static void backup_lpf(const Dav1dFrameContext *const f, const pixel *src, const ptrdiff_t src_stride, const int ss_ver, const int sb128, int row, const int row_h, const int src_w, - const int h, const int ss_hor) + const int h, const int ss_hor, const int pft) { const int dst_w = f->frame_hdr->super_res.enabled ? (f->frame_hdr->width[1] + ss_hor) >> ss_hor : src_w; // The first stripe of the frame is shorter by 8 luma pixel rows. int stripe_h = (64 - 8 * !row) >> ss_ver; + src += (stripe_h - 2) * PXSTRIDE(src_stride); - if (row) { - const int top = 4 << sb128; - // Copy the top part of the stored loop filtered pixels from the - // previous sb row needed above the first stripe of this sb row. - pixel_copy(&dst[PXSTRIDE(dst_stride) * 0], - &dst[PXSTRIDE(dst_stride) * top], dst_w); - pixel_copy(&dst[PXSTRIDE(dst_stride) * 1], - &dst[PXSTRIDE(dst_stride) * (top + 1)], dst_w); - pixel_copy(&dst[PXSTRIDE(dst_stride) * 2], - &dst[PXSTRIDE(dst_stride) * (top + 2)], dst_w); - pixel_copy(&dst[PXSTRIDE(dst_stride) * 3], - &dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w); + if (!pft) { + if (row) { + const int top = 4 << sb128; + // Copy the top part of the stored loop filtered pixels from the + // previous sb row needed above the first stripe of this sb row. + pixel_copy(&dst[PXSTRIDE(dst_stride) * 0], + &dst[PXSTRIDE(dst_stride) * top], dst_w); + pixel_copy(&dst[PXSTRIDE(dst_stride) * 1], + &dst[PXSTRIDE(dst_stride) * (top + 1)], dst_w); + pixel_copy(&dst[PXSTRIDE(dst_stride) * 2], + &dst[PXSTRIDE(dst_stride) * (top + 2)], dst_w); + pixel_copy(&dst[PXSTRIDE(dst_stride) * 3], + &dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w); + } + dst += 4 * PXSTRIDE(dst_stride); } - dst += 4 * PXSTRIDE(dst_stride); - src += (stripe_h - 2) * PXSTRIDE(src_stride); - if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) { while (row + stripe_h <= row_h) { const int n_lines = 4 - (row + stripe_h + 1 == h); @@ -107,9 +108,15 @@ static void backup_lpf(const Dav1dFrameContext *const f, void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f, /*const*/ pixel *const src[3], const int sby) { + const int pft = f->c->n_pfc > 1; const int offset = 8 * !!sby; const ptrdiff_t *const src_stride = f->cur.stride; const ptrdiff_t lr_stride = ((f->sr_cur.p.p.w + 31) & ~31) * sizeof(pixel); + pixel *const dst[3] = { + f->lf.lr_lpf_line[0] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride), + f->lf.lr_lpf_line[1] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride), + f->lf.lr_lpf_line[2] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride) + }; // TODO Also check block level restore type to reduce copying. const int restore_planes = f->lf.restore_planes; @@ -119,9 +126,9 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f, const int w = f->bw << 2; const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1); const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset; - backup_lpf(f, f->lf.lr_lpf_line[0], lr_stride, + backup_lpf(f, dst[0], lr_stride, src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0], - 0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0); + 0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, pft); } if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) { const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420; @@ -130,18 +137,16 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f, const int w = f->bw << (2 - ss_hor); const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 1); const int offset_uv = offset >> ss_ver; - const int y_stripe = - (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv; - + const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv; if (restore_planes & LR_RESTORE_U) { - backup_lpf(f, f->lf.lr_lpf_line[1], lr_stride, + backup_lpf(f, dst[1], lr_stride, src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1], - ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor); + ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor, pft); } if (restore_planes & LR_RESTORE_V) { - backup_lpf(f, f->lf.lr_lpf_line[2], lr_stride, + backup_lpf(f, dst[2], lr_stride, src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1], - ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor); + ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor, pft); } } } @@ -154,10 +159,10 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p, const Dav1dDSPContext *const dsp = f->dsp; const int chroma = !!plane; const int ss_ver = chroma & (f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420); - const int sbrow_has_bottom = (edges & LR_HAVE_BOTTOM); - const pixel *lpf = f->lf.lr_lpf_line[plane] + x; const ptrdiff_t p_stride = f->sr_cur.p.stride[chroma]; const ptrdiff_t lpf_stride = sizeof(pixel) * ((f->sr_cur.p.p.w + 31) & ~31); + const int sby = (y + (y ? 8 << ss_ver : 0)) >> (6 - ss_ver + f->seq_hdr->sb128); + const pixel *lpf = f->lf.lr_lpf_line[plane] + (f->c->n_pfc > 1) * (sby * (4 << f->seq_hdr->sb128) - 4) * PXSTRIDE(lpf_stride) + x; // The first stripe of the frame is shorter by 8 luma pixel rows. int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y); @@ -186,8 +191,8 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p, } while (y + stripe_h <= row_h) { - // Change HAVE_BOTTOM bit in edges to (y + stripe_h != row_h) - edges ^= (-(y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM; + // Change the HAVE_BOTTOM bit in edges to (sby + 1 != f->sbh || y + stripe_h != row_h) + edges ^= (-(sby + 1 != f->sbh || y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM; if (wiener_fn) { wiener_fn(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h, filter, edges HIGHBD_CALL_SUFFIX); @@ -198,7 +203,6 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p, left += stripe_h; y += stripe_h; - if (y + stripe_h > row_h && sbrow_has_bottom) break; p += stripe_h * PXSTRIDE(p_stride); edges |= LR_HAVE_TOP; stripe_h = imin(64 >> ss_ver, row_h - y); @@ -242,8 +246,7 @@ static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y, pixel pre_lr_border[2][128 + 8 /* maximum sbrow height is 128 + 8 rows offset */][4]; const Av1RestorationUnit *lr[2]; - enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) | LR_HAVE_RIGHT | - (row_h < h ? LR_HAVE_BOTTOM : 0); + enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) | LR_HAVE_RIGHT; int aligned_unit_pos = row_y & ~(unit_size - 1); if (aligned_unit_pos && aligned_unit_pos + half_unit_size > h) @@ -281,11 +284,13 @@ void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3], const int offset_y = 8 * !!sby; const ptrdiff_t *const dst_stride = f->sr_cur.p.stride; const int restore_planes = f->lf.restore_planes; + const int not_last = sby + 1 < f->sbh; if (restore_planes & LR_RESTORE_Y) { const int h = f->sr_cur.p.p.h; const int w = f->sr_cur.p.p.w; - const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h); + const int next_row_y = (sby + 1) << (6 + f->seq_hdr->sb128); + const int row_h = imin(next_row_y - 8 * not_last, h); const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset_y; lr_sbrow(f, dst[0] - offset_y * PXSTRIDE(dst_stride[0]), y_stripe, w, h, row_h, 0); @@ -295,10 +300,10 @@ void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3], const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444; const int h = (f->sr_cur.p.p.h + ss_ver) >> ss_ver; const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor; - const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h); + const int next_row_y = (sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128); + const int row_h = imin(next_row_y - (8 >> ss_ver) * not_last, h); const int offset_uv = offset_y >> ss_ver; - const int y_stripe = - (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv; + const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv; if (restore_planes & LR_RESTORE_U) lr_sbrow(f, dst[1] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe, w, h, row_h, 1); diff --git a/src/recon.h b/src/recon.h index f84c8ab..9751810 100644 --- a/src/recon.h +++ b/src/recon.h @@ -65,6 +65,14 @@ decl_recon_b_inter_fn(dav1d_recon_b_inter_16bpc); decl_filter_sbrow_fn(dav1d_filter_sbrow_8bpc); decl_filter_sbrow_fn(dav1d_filter_sbrow_16bpc); +decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_8bpc); +decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_16bpc); +decl_filter_sbrow_fn(dav1d_filter_sbrow_cdef_8bpc); +decl_filter_sbrow_fn(dav1d_filter_sbrow_cdef_16bpc); +decl_filter_sbrow_fn(dav1d_filter_sbrow_resize_8bpc); +decl_filter_sbrow_fn(dav1d_filter_sbrow_resize_16bpc); +decl_filter_sbrow_fn(dav1d_filter_sbrow_lr_8bpc); +decl_filter_sbrow_fn(dav1d_filter_sbrow_lr_16bpc); decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_8bpc); decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_16bpc); diff --git a/src/recon_tmpl.c b/src/recon_tmpl.c index 5a3e81d..9905914 100644 --- a/src/recon_tmpl.c +++ b/src/recon_tmpl.c @@ -1965,76 +1965,109 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize return 0; } -void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) { - const int sbsz = f->sb_step, sbh = f->sbh; - - if (f->frame_hdr->loopfilter.level_y[0] || - f->frame_hdr->loopfilter.level_y[1]) - { +void bytefn(dav1d_filter_sbrow_deblock)(Dav1dFrameContext*const f, const int sby) { + const int y = sby * f->sb_step * 4; + const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + pixel *const p[3] = { + f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]), + f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver), + f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver) + }; + Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w; + if (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]) { int start_of_tile_row = 0; if (f->frame_hdr->tiling.row_start_sb[f->lf.tile_row] == sby) start_of_tile_row = f->lf.tile_row++; - bytefn(dav1d_loopfilter_sbrow)(f, f->lf.p, f->lf.mask_ptr, sby, - start_of_tile_row); + bytefn(dav1d_loopfilter_sbrow)(f, p, mask, sby, start_of_tile_row); } - if (f->lf.restore_planes) { // Store loop filtered pixels required by loop restoration - bytefn(dav1d_lr_copy_lpf)(f, f->lf.p, sby); + bytefn(dav1d_lr_copy_lpf)(f, p, sby); } - if (f->seq_hdr->cdef) { - if (sby) { - const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; - pixel *p_up[3] = { - f->lf.p[0] - 8 * PXSTRIDE(f->cur.stride[0]), - f->lf.p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver), - f->lf.p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver), - }; - bytefn(dav1d_cdef_brow)(f, p_up, f->lf.prev_mask_ptr, - sby * sbsz - 2, sby * sbsz); - } - const int n_blks = sbsz - 2 * (sby + 1 < sbh); - bytefn(dav1d_cdef_brow)(f, f->lf.p, f->lf.mask_ptr, sby * sbsz, - imin(sby * sbsz + n_blks, f->bh)); - } - if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) { - const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400; - for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) { - const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; - const int h_start = 8 * !!sby >> ss_ver; - const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl]; - pixel *dst = f->lf.sr_p[pl] - h_start * PXSTRIDE(dst_stride); - const ptrdiff_t src_stride = f->cur.stride[!!pl]; - const pixel *src = f->lf.p[pl] - h_start * PXSTRIDE(src_stride); - const int h_end = 4 * (sbsz - 2 * (sby + 1 < sbh)) >> ss_ver; - const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; - const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor; - const int src_w = (4 * f->bw + ss_hor) >> ss_hor; - const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver; - - f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w, - imin(img_h, h_end) + h_start, src_w, - f->resize_step[!!pl], f->resize_start[!!pl] - HIGHBD_CALL_SUFFIX); - } - } - if (f->lf.restore_planes) { - bytefn(dav1d_lr_sbrow)(f, f->lf.sr_p, sby); +} + +void bytefn(dav1d_filter_sbrow_cdef)(Dav1dFrameContext *const f, const int sby) { + const int sbsz = f->sb_step; + const int y = sby * sbsz * 4; + const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + pixel *const p[3] = { + f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]), + f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver), + f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver) + }; + Av1Filter *prev_mask = f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w; + Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w; + const int start = sby * sbsz; + if (sby) { + const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + pixel *p_up[3] = { + p[0] - 8 * PXSTRIDE(f->cur.stride[0]), + p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver), + p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver), + }; + bytefn(dav1d_cdef_brow)(f, p_up, prev_mask, start - 2, start); } + const int n_blks = sbsz - 2 * (sby + 1 < f->sbh); + const int end = imin(start + n_blks, f->bh); + bytefn(dav1d_cdef_brow)(f, p, mask, start, end); +} +void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) { + const int sbsz = f->sb_step; + const int y = sby * sbsz * 4; const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; - f->lf.p[0] += sbsz * 4 * PXSTRIDE(f->cur.stride[0]); - f->lf.p[1] += sbsz * 4 * PXSTRIDE(f->cur.stride[1]) >> ss_ver; - f->lf.p[2] += sbsz * 4 * PXSTRIDE(f->cur.stride[1]) >> ss_ver; - f->lf.sr_p[0] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[0]); - f->lf.sr_p[1] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver; - f->lf.sr_p[2] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver; - f->lf.prev_mask_ptr = f->lf.mask_ptr; - if ((sby & 1) || f->seq_hdr->sb128) { - f->lf.mask_ptr += f->sb128w; + const pixel *const p[3] = { + f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]), + f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver), + f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver) + }; + pixel *const sr_p[3] = { + f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]), + f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver), + f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver) + }; + const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400; + for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) { + const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int h_start = 8 * !!sby >> ss_ver; + const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl]; + pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride); + const ptrdiff_t src_stride = f->cur.stride[!!pl]; + const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride); + const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> ss_ver; + const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444; + const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor; + const int src_w = (4 * f->bw + ss_hor) >> ss_hor; + const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver; + + f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w, + imin(img_h, h_end) + h_start, src_w, + f->resize_step[!!pl], f->resize_start[!!pl] + HIGHBD_CALL_SUFFIX); } } +void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) { + const int y = sby * f->sb_step * 4; + const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420; + pixel *const sr_p[3] = { + f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]), + f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver), + f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver) + }; + bytefn(dav1d_lr_sbrow)(f, sr_p, sby); +} + +void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) { + bytefn(dav1d_filter_sbrow_deblock)(f, sby); + if (f->seq_hdr->cdef) + bytefn(dav1d_filter_sbrow_cdef)(f, sby); + if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) + bytefn(dav1d_filter_sbrow_resize)(f, sby); + if (f->lf.restore_planes) + bytefn(dav1d_filter_sbrow_lr)(f, sby); +} + void bytefn(dav1d_backup_ipred_edge)(Dav1dTileContext *const t) { const Dav1dFrameContext *const f = t->f; Dav1dTileState *const ts = t->ts; diff --git a/src/thread_task.c b/src/thread_task.c index 6c1c139..9c2f49b 100644 --- a/src/thread_task.c +++ b/src/thread_task.c @@ -29,6 +29,137 @@ #include "src/thread_task.h" +int dav1d_task_create_filter_sbrow(Dav1dFrameContext *const f) { + struct PostFilterThreadData *const pftd = f->lf.thread.pftd; + const int frame_idx = (int)(f - f->c->fc); + + const int has_deblock = f->frame_hdr->loopfilter.level_y[0] || + f->frame_hdr->loopfilter.level_y[1] || + f->lf.restore_planes; + const int has_cdef = f->seq_hdr->cdef; + const int has_resize = f->frame_hdr->width[0] != f->frame_hdr->width[1]; + const int has_lr = !!f->lf.restore_planes; + f->lf.thread.npf = has_deblock + has_cdef + has_resize + has_lr; + if (f->lf.thread.npf == 0) return 0; + + pthread_mutex_lock(&pftd->lock); + + Dav1dTask *tasks = f->lf.thread.tasks; + int num_tasks = f->sbh * f->lf.thread.npf; + if (num_tasks > f->lf.thread.num_tasks) { + const size_t size = sizeof(Dav1dTask) * num_tasks; + tasks = realloc(f->lf.thread.tasks, size); + if (!tasks) return -1; + memset(tasks, 0, size); + f->lf.thread.tasks = tasks; + f->lf.thread.num_tasks = num_tasks; + } + +#define create_task(task, ready_cond, start_cond) \ + do { \ + t = &tasks[num_tasks++]; \ + t->status = ready_cond ? DAV1D_TASK_READY : DAV1D_TASK_DEFAULT; \ + t->start = start_cond; \ + t->frame_id = frame_cnt; \ + t->frame_idx = frame_idx; \ + t->sby = sby; \ + t->fn = f->bd_fn.filter_sbrow_##task; \ + t->last_deps[0] = NULL; \ + t->last_deps[1] = NULL; \ + t->next_deps[0] = NULL; \ + t->next_deps[1] = NULL; \ + t->next_exec = NULL; \ + } while (0) + + Dav1dTask *last_sbrow_deblock = NULL; + Dav1dTask *last_sbrow_cdef = NULL; + Dav1dTask *last_sbrow_resize = NULL; + Dav1dTask *last_sbrow_lr = NULL; + num_tasks = 0; + const int frame_cnt = pftd->frame_cnt++; + + for (int sby = 0; sby < f->sbh; ++sby) { + Dav1dTask *t; + Dav1dTask *last = NULL; + if (has_deblock) { + create_task(deblock, sby == 0, 0); + if (sby) { + t->last_deps[1] = last_sbrow_deblock; + last_sbrow_deblock->next_deps[1] = t; + } + last = t; + last_sbrow_deblock = t; + } + if (has_cdef) { + create_task(cdef, sby == 0 && !has_deblock, has_deblock); + if (has_deblock) { + t->last_deps[0] = last; + last->next_deps[0] = t; + } + if (sby) { + t->last_deps[1] = last_sbrow_cdef; + last_sbrow_cdef->next_deps[1] = t; + } + last = t; + last_sbrow_cdef = t; + }; + if (has_resize) { + create_task(resize, sby == 0 && !last, !!last); + if (last) { + t->last_deps[0] = last; + last->next_deps[0] = t; + } + if (sby) { + t->last_deps[1] = last_sbrow_resize; + last_sbrow_resize->next_deps[1] = t; + } + last = t; + last_sbrow_resize = t; + } + if (has_lr) { + create_task(lr, sby == 0 && !last, !!last); + if (last) { + t->last_deps[0] = last; + last->next_deps[0] = t; + } + if (sby) { + t->last_deps[1] = last_sbrow_lr; + last_sbrow_lr->next_deps[1] = t; + } + last_sbrow_lr = t; + } + } + f->lf.thread.done = 0; + pthread_mutex_unlock(&pftd->lock); + + return 0; +} + +void dav1d_task_schedule(struct PostFilterThreadData *const pftd, + Dav1dTask *const t) +{ + Dav1dTask **pt = &pftd->tasks; + while (*pt && + ((*pt)->sby < t->sby || + ((*pt)->sby == t->sby && (*pt)->frame_id <= t->frame_id))) + pt = &(*pt)->next_exec; + t->next_exec = *pt; + *pt = t; + pthread_cond_signal(&pftd->cond); +} + +static inline void update_task(Dav1dTask *const t, const int dep_type, + Dav1dFrameContext *const f) +{ + if (!t->last_deps[!dep_type] || + t->last_deps[!dep_type]->status == DAV1D_TASK_DONE) + { + t->status = DAV1D_TASK_READY; + if (t->start) + dav1d_task_schedule(f->lf.thread.pftd, t); + } +} + void *dav1d_frame_task(void *const data) { Dav1dFrameContext *const f = data; @@ -140,3 +271,98 @@ void *dav1d_tile_task(void *const data) { return NULL; } + +static inline int handle_abortion(Dav1dPostFilterContext *const pf, + Dav1dContext *const c, + struct PostFilterThreadData *const pftd) +{ + const int flush = atomic_load_explicit(c->flush, memory_order_acquire); + if (flush) { + pthread_mutex_lock(&pf->td.lock); + pf->flushed = 0; + pthread_mutex_unlock(&pf->td.lock); + } + for (unsigned i = 0; i < c->n_fc; i++) { + Dav1dFrameContext *const f = &c->fc[i]; + int send_signal; + if (flush) // TODO before merge, see if this can be safely merged + send_signal = f->lf.thread.done != 1 && f->lf.thread.num_tasks != 0; + else + send_signal = f->lf.thread.done == -1; + for (int j = 0; send_signal && j < f->lf.thread.num_tasks; j++) { + Dav1dTask *const t = &f->lf.thread.tasks[j]; + if (t->status == DAV1D_TASK_RUNNING || + (t->status == DAV1D_TASK_DONE && t->start != -1)) + send_signal = 0; + } + if (send_signal) { + if (!flush) { + Dav1dTask **pt = &pftd->tasks; + while (*pt) { + if ((*pt)->frame_idx == i) + *pt = (*pt)->next_exec; + else + pt = &(*pt)->next_exec; + } + } + f->lf.thread.done = 1; + pthread_cond_signal(&f->lf.thread.cond); + } + } + if (flush) { + pthread_mutex_lock(&pf->td.lock); + pf->flushed = 1; + pthread_cond_signal(&pf->td.cond); + pthread_mutex_unlock(&pf->td.lock); + } + return !flush; +} + +void *dav1d_postfilter_task(void *data) { + Dav1dPostFilterContext *const pf = data; + Dav1dContext *const c = pf->c; + struct PostFilterThreadData *pftd = &c->postfilter_thread; + + dav1d_set_thread_name("dav1d-postfilter"); + + int exec = 1; + pthread_mutex_lock(&pftd->lock); + for (;;) { + if (!exec && !pf->die) + pthread_cond_wait(&pftd->cond, &pftd->lock); + if (!(exec = handle_abortion(pf, c, pftd))) continue; + if (pf->die) break; + + Dav1dTask *const t = pftd->tasks; + if (!t) { exec = 0; continue; } + pftd->tasks = t->next_exec; + t->status = DAV1D_TASK_RUNNING; + + pthread_mutex_unlock(&pftd->lock); + Dav1dFrameContext *const f = &c->fc[t->frame_idx]; + t->fn(f, t->sby); + exec = 1; + pthread_mutex_lock(&pftd->lock); + + if (t->next_deps[0]) + update_task(t->next_deps[0], 0, f); + if (t->next_deps[1]) + update_task(t->next_deps[1], 1, f); + t->status = DAV1D_TASK_DONE; + if (!t->next_deps[0]) { + const enum PlaneType progress_plane_type = + c->n_fc > 1 && f->frame_hdr->refresh_context ? + PLANE_TYPE_Y : PLANE_TYPE_ALL; + const int y = (t->sby + 1) * f->sb_step * 4; + dav1d_thread_picture_signal(&f->sr_cur, y, progress_plane_type); + if (t->sby + 1 == f->sbh) { + f->lf.thread.done = 1; + pthread_cond_signal(&f->lf.thread.cond); + } + } + t->start = -1; + } + pthread_mutex_unlock(&pftd->lock); + + return NULL; +} diff --git a/src/thread_task.h b/src/thread_task.h index 309a714..b3f3905 100644 --- a/src/thread_task.h +++ b/src/thread_task.h @@ -35,10 +35,33 @@ #define FRAME_ERROR (UINT_MAX - 1) #define TILE_ERROR (INT_MAX - 1) -int dav1d_decode_frame(Dav1dFrameContext *f); +enum TaskStatus { + DAV1D_TASK_DEFAULT, + DAV1D_TASK_READY, + DAV1D_TASK_RUNNING, + DAV1D_TASK_DONE, +}; + +struct Dav1dTask { + enum TaskStatus status; // task status + int start; // frame thread start flag + unsigned frame_idx; // frame thread id + int frame_id; // frame ordering + int sby; // sbrow + filter_sbrow_fn fn; // task work + Dav1dTask *last_deps[2]; // dependencies + Dav1dTask *next_deps[2]; // dependant tasks + Dav1dTask *next_exec; // tasks scheduling +}; + +int dav1d_task_create_filter_sbrow(Dav1dFrameContext *f); +void dav1d_task_schedule(struct PostFilterThreadData *pftd, Dav1dTask *t); + void *dav1d_frame_task(void *data); +void *dav1d_tile_task(void *data); +void *dav1d_postfilter_task(void *data); +int dav1d_decode_frame(Dav1dFrameContext *f); int dav1d_decode_tile_sbrow(Dav1dTileContext *t); -void *dav1d_tile_task(void *data); #endif /* DAV1D_SRC_THREAD_TASK_H */ diff --git a/tools/dav1d_cli_parse.c b/tools/dav1d_cli_parse.c index 96f81e0..35d8e17 100644 --- a/tools/dav1d_cli_parse.c +++ b/tools/dav1d_cli_parse.c @@ -51,6 +51,7 @@ enum { ARG_REALTIME_CACHE, ARG_FRAME_THREADS, ARG_TILE_THREADS, + ARG_POSTFILTER_THREADS, ARG_VERIFY, ARG_FILM_GRAIN, ARG_OPPOINT, @@ -73,6 +74,7 @@ static const struct option long_opts[] = { { "realtimecache", 1, NULL, ARG_REALTIME_CACHE }, { "framethreads", 1, NULL, ARG_FRAME_THREADS }, { "tilethreads", 1, NULL, ARG_TILE_THREADS }, + { "pfthreads", 1, NULL, ARG_POSTFILTER_THREADS }, { "verify", 1, NULL, ARG_VERIFY }, { "filmgrain", 1, NULL, ARG_FILM_GRAIN }, { "oppoint", 1, NULL, ARG_OPPOINT }, @@ -117,6 +119,7 @@ static void usage(const char *const app, const char *const reason, ...) { " --version/-v: print version and exit\n" " --framethreads $num: number of frame threads (default: 1)\n" " --tilethreads $num: number of tile threads (default: 1)\n" + " --pfthreads $num: number of postfilter threads (default: 1)\n" " --filmgrain $num: enable film grain application (default: 1, except if muxer is md5)\n" " --oppoint $num: select an operating point of a scalable AV1 bitstream (0 - 31)\n" " --alllayers $num: output all spatial layers of a scalable AV1 bitstream (default: 1)\n" @@ -295,6 +298,10 @@ void parse(const int argc, char *const *const argv, lib_settings->n_tile_threads = parse_unsigned(optarg, ARG_TILE_THREADS, argv[0]); break; + case ARG_POSTFILTER_THREADS: + lib_settings->n_postfilter_threads = + parse_unsigned(optarg, ARG_POSTFILTER_THREADS, argv[0]); + break; case ARG_VERIFY: cli_settings->verify = optarg; break; |