Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/videolan/dav1d.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- .gitlab-ci.yml 16
-rw-r--r-- examples/dav1dplay.c 7
-rw-r--r-- include/dav1d/dav1d.h 4
-rw-r--r-- meson.build 2
-rw-r--r-- src/decode.c 78
-rw-r--r-- src/internal.h 42
-rw-r--r-- src/lib.c 105
-rw-r--r-- src/lr_apply_tmpl.c 77
-rw-r--r-- src/recon.h 8
-rw-r--r-- src/recon_tmpl.c 147
-rw-r--r-- src/thread_task.c 226
-rw-r--r-- src/thread_task.h 27
-rw-r--r-- tools/dav1d_cli_parse.c 7
13 files changed, 615 insertions, 131 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index db8fefd..b18301b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -410,7 +410,8 @@ test-debian:
- ninja coverage-xml
- grep -Eo 'line-rate="[^"]+"' meson-logs/coverage.xml | head -n 1 |
grep -Eo '[0-9.]+' | awk '{ print "coverage:", $1 * 100 } '
- - time meson test -v --suite testdata_seek-stress --test-args "--tilethreads 2 --framethreads 2"
+ - time meson test -v --suite testdata_seek-stress --test-args "--tilethreads 2 --framethreads 1 --pfthreads=2"
+ - time meson test -v --suite testdata_seek-stress --test-args "--tilethreads 2 --framethreads 2 --pfthreads=2"
coverage: '/^coverage: (\d+.\d+)$/'
artifacts:
expose_as: 'Coverage HTML report'
@@ -520,11 +521,14 @@ test-debian-tsan:
- ninja -C build
- cd build
- exit_code=0
- - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 1 --framethreads 2" || exit_code=$((exit_code + $?))
- - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 2 --framethreads 1" || exit_code=$((exit_code + $?))
- - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 2 --framethreads 2" || exit_code=$((exit_code + $?))
- - time meson test -v --setup=sanitizer --suite testdata_seek-stress --test-args "--tilethreads 2 --framethreads 2" || exit_code=$((exit_code + $?))
- - time meson test -v --setup=sanitizer --suite oss-fuzz-asan --suite oss-fuzz-msan --suite oss-fuzz-ubsan || exit_code=$((exit_code + $?))
+ - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 1 --framethreads 2 --pfthreads 1" || exit_code=$((exit_code + $?))
+ - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 2 --framethreads 1 --pfthreads 1" || exit_code=$((exit_code + $?))
+ - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 2 --framethreads 2 --pfthreads 1" || exit_code=$((exit_code + $?))
+ - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 2 --framethreads 1 --pfthreads 2" || exit_code=$((exit_code + $?))
+ - time meson test -v --setup=sanitizer --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 2 --framethreads 2 --pfthreads 2" || exit_code=$((exit_code + $?))
+ - time meson test -v --setup=sanitizer --suite testdata_seek-stress --test-args "--tilethreads 2 --framethreads 1 --pfthreads 2" || exit_code=$((exit_code + $?))
+ - time meson test -v --setup=sanitizer --suite testdata_seek-stress --test-args "--tilethreads 2 --framethreads 2 --pfthreads 2" || exit_code=$((exit_code + $?))
+ - time meson test -v --setup=sanitizer --suite oss-fuzz-asan --suite oss-fuzz-msan --suite oss-fuzz-ubsan || exit_code=$((exit_code + $?))
- if [ $exit_code -ne 0 ]; then exit $exit_code; fi
test-win64:
diff --git a/examples/dav1dplay.c b/examples/dav1dplay.c
index d6bb262..a7eca93 100644
--- a/examples/dav1dplay.c
+++ b/examples/dav1dplay.c
@@ -95,6 +95,7 @@ static void dp_settings_print_usage(const char *const app,
" --untimed/-u: ignore PTS, render as fast as possible\n"
" --framethreads $num: number of frame threads (default: 1)\n"
" --tilethreads $num: number of tile threads (default: 1)\n"
+ " --pfthreads $num: number of postfilter threads(default: 1)\n"
" --highquality: enable high quality rendering\n"
" --zerocopy/-z: enable zero copy upload path\n"
" --gpugrain/-g: enable GPU grain synthesis\n"
@@ -127,6 +128,7 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
enum {
ARG_FRAME_THREADS = 256,
ARG_TILE_THREADS,
+ ARG_POSTFILTER_THREADS,
ARG_HIGH_QUALITY,
};
@@ -137,6 +139,7 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
{ "untimed", 0, NULL, 'u' },
{ "framethreads", 1, NULL, ARG_FRAME_THREADS },
{ "tilethreads", 1, NULL, ARG_TILE_THREADS },
+ { "pfthreads", 1, NULL, ARG_POSTFILTER_THREADS },
{ "highquality", 0, NULL, ARG_HIGH_QUALITY },
{ "zerocopy", 0, NULL, 'z' },
{ "gpugrain", 0, NULL, 'g' },
@@ -175,6 +178,10 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
lib_settings->n_tile_threads =
parse_unsigned(optarg, ARG_TILE_THREADS, argv[0]);
break;
+ case ARG_POSTFILTER_THREADS:
+ lib_settings->n_postfilter_threads =
+ parse_unsigned(optarg, ARG_POSTFILTER_THREADS, argv[0]);
+ break;
default:
dp_settings_print_usage(argv[0], NULL);
}
diff --git a/include/dav1d/dav1d.h b/include/dav1d/dav1d.h
index 9d484e5..165e1c0 100644
--- a/include/dav1d/dav1d.h
+++ b/include/dav1d/dav1d.h
@@ -45,6 +45,7 @@ typedef struct Dav1dRef Dav1dRef;
#define DAV1D_MAX_FRAME_THREADS 256
#define DAV1D_MAX_TILE_THREADS 64
+#define DAV1D_MAX_POSTFILTER_THREADS 256
typedef struct Dav1dLogger {
void *cookie; ///< Custom data to pass to the callback.
@@ -67,7 +68,8 @@ typedef struct Dav1dSettings {
unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited)
Dav1dPicAllocator allocator; ///< Picture allocator callback.
Dav1dLogger logger; ///< Logger callback.
- uint8_t reserved[32]; ///< reserved for future use
+ int n_postfilter_threads;
+ uint8_t reserved[28]; ///< reserved for future use
} Dav1dSettings;
/**
diff --git a/meson.build b/meson.build
index 3b354d6..2af8302 100644
--- a/meson.build
+++ b/meson.build
@@ -30,7 +30,7 @@ project('dav1d', ['c'],
'b_ndebug=if-release'],
meson_version: '>= 0.49.0')
-dav1d_soname_version = '5.0.0'
+dav1d_soname_version = '5.0.1'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
diff --git a/src/decode.c b/src/decode.c
index 197af98..6730b22 100644
--- a/src/decode.c
+++ b/src/decode.c
@@ -2526,7 +2526,7 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
t->a = f->a + col_sb128_start + tile_row * f->sb128w;
t->bx < ts->tiling.col_end; t->bx += sb_step)
{
- if (atomic_load_explicit(c->frame_thread.flush, memory_order_acquire))
+ if (atomic_load_explicit(c->flush, memory_order_acquire))
return 1;
if (decode_sb(t, root_bl, c->intra_edge.root[root_bl]))
return 1;
@@ -2557,7 +2557,7 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
t->lf_mask = f->lf.mask + sb128y * f->sb128w + col_sb128_start;
t->bx < ts->tiling.col_end; t->bx += sb_step)
{
- if (atomic_load_explicit(c->frame_thread.flush, memory_order_acquire))
+ if (atomic_load_explicit(c->flush, memory_order_acquire))
return 1;
if (root_bl == BL_128X128) {
t->cur_sb_cdef_idx_ptr = t->lf_mask->cdef_idx;
@@ -2859,7 +2859,8 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
const int lr_line_sz = ((f->sr_cur.p.p.w + 31) & ~31) << hbd;
if (lr_line_sz != f->lf.lr_line_sz) {
dav1d_freep_aligned(&f->lf.lr_lpf_line[0]);
- uint8_t *lr_ptr = dav1d_alloc_aligned(lr_line_sz * 3 * 12, 32);
+ const int num_lines = c->n_pfc > 1 ? f->sbh * (4 << f->seq_hdr->sb128) : 12;
+ uint8_t *lr_ptr = dav1d_alloc_aligned(lr_line_sz * num_lines * 3, 32);
if (!lr_ptr) {
f->lf.lr_line_sz = 0;
goto error;
@@ -2867,7 +2868,7 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
for (int pl = 0; pl <= 2; pl++) {
f->lf.lr_lpf_line[pl] = lr_ptr;
- lr_ptr += lr_line_sz * 12;
+ lr_ptr += lr_line_sz * num_lines;
}
f->lf.lr_line_sz = lr_line_sz;
@@ -2955,6 +2956,12 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
f->refpoc, f->mvs, f->refrefpoc, f->ref_mvs, f->n_tc);
if (ret < 0) goto error;
}
+
+ // create post-filtering tasks
+ if (c->n_pfc > 1)
+ if (dav1d_task_create_filter_sbrow(f))
+ goto error;
+
retval = DAV1D_ERR(EINVAL);
// setup dequant tables
@@ -3081,7 +3088,7 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
for (int n = 0; n < f->sb128w * f->frame_hdr->tiling.rows; n++)
reset_context(&f->a[n], !(f->frame_hdr->frame_type & 1), f->frame_thread.pass);
- if (f->n_tc == 1) {
+ if (f->n_tc == 1 || (c->n_pfc > 1 && f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows == 1)) {
Dav1dTileContext *const t = f->tc;
// no tile threading - we explicitly interleave tile/sbrow decoding
@@ -3108,7 +3115,6 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
}
for (int tile_col = 0; tile_col < f->frame_hdr->tiling.cols; tile_col++) {
t->ts = &f->ts[tile_row * f->frame_hdr->tiling.cols + tile_col];
-
if (dav1d_decode_tile_sbrow(t)) goto error;
}
if (f->frame_thread.pass <= 1 && f->frame_hdr->frame_type & 1) {
@@ -3116,10 +3122,24 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
}
// loopfilter + cdef + restoration
- if (f->frame_thread.pass != 1)
- f->bd_fn.filter_sbrow(f, sby);
- dav1d_thread_picture_signal(&f->sr_cur, (sby + 1) * f->sb_step * 4,
- progress_plane_type);
+ if (f->frame_thread.pass != 1) {
+ if (c->n_pfc == 1)
+ f->bd_fn.filter_sbrow(f, sby);
+ else {
+ pthread_mutex_lock(&f->lf.thread.pftd->lock);
+ if (f->lf.thread.npf != 0 && !f->lf.thread.done) {
+ Dav1dTask *const t = &f->lf.thread.tasks[sby * f->lf.thread.npf];
+ t->start = 1;
+ if (t->status == DAV1D_TASK_READY)
+ dav1d_task_schedule(f->lf.thread.pftd, t);
+ }
+ pthread_mutex_unlock(&f->lf.thread.pftd->lock);
+ }
+ }
+ if (c->n_pfc == 1 || f->frame_thread.pass == 1 || f->lf.thread.npf == 0)
+ dav1d_thread_picture_signal(&f->sr_cur,
+ (sby + 1) * f->sb_step * 4,
+ progress_plane_type);
}
}
} else {
@@ -3142,7 +3162,6 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
pthread_cond_broadcast(&f->tile_thread.cond);
pthread_mutex_unlock(&f->tile_thread.lock);
- // loopfilter + cdef + restoration
for (int tile_row = 0; tile_row < f->frame_hdr->tiling.rows; tile_row++) {
for (int sby = f->frame_hdr->tiling.row_start_sb[tile_row];
sby < f->frame_hdr->tiling.row_start_sb[tile_row + 1]; sby++)
@@ -3174,10 +3193,24 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
}
// loopfilter + cdef + restoration
- if (f->frame_thread.pass != 1)
- f->bd_fn.filter_sbrow(f, sby);
- dav1d_thread_picture_signal(&f->sr_cur, (sby + 1) * f->sb_step * 4,
- progress_plane_type);
+ if (f->frame_thread.pass != 1) {
+ if (c->n_pfc == 1)
+ f->bd_fn.filter_sbrow(f, sby);
+ else {
+ pthread_mutex_lock(&f->lf.thread.pftd->lock);
+ if (f->lf.thread.npf != 0 && !f->lf.thread.done) {
+ Dav1dTask *const t = &f->lf.thread.tasks[sby * f->lf.thread.npf];
+ t->start = 1;
+ if (t->status == DAV1D_TASK_READY)
+ dav1d_task_schedule(f->lf.thread.pftd, t);
+ }
+ pthread_mutex_unlock(&f->lf.thread.pftd->lock);
+ }
+ }
+ if (c->n_pfc == 1 || f->frame_thread.pass == 1 || f->lf.thread.npf == 0)
+ dav1d_thread_picture_signal(&f->sr_cur,
+ (sby + 1) * f->sb_step * 4,
+ progress_plane_type);
}
}
@@ -3222,6 +3255,17 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
retval = 0;
error:
+ if (c->n_pfc > 1) {
+ pthread_mutex_lock(&f->lf.thread.pftd->lock);
+ if (!f->lf.thread.done) {
+ if (retval != 0) {
+ f->lf.thread.done = -1;
+ pthread_cond_signal(&f->lf.thread.pftd->cond);
+ }
+ pthread_cond_wait(&f->lf.thread.cond, &f->lf.thread.pftd->lock);
+ }
+ pthread_mutex_unlock(&f->lf.thread.pftd->lock);
+ }
dav1d_thread_picture_signal(&f->sr_cur, retval == 0 ? UINT_MAX : FRAME_ERROR,
PLANE_TYPE_ALL);
for (int i = 0; i < 7; i++) {
@@ -3329,6 +3373,10 @@ int dav1d_submit_frame(Dav1dContext *const c) {
f->bd_fn.recon_b_inter = dav1d_recon_b_inter_##bd##bpc; \
f->bd_fn.recon_b_intra = dav1d_recon_b_intra_##bd##bpc; \
f->bd_fn.filter_sbrow = dav1d_filter_sbrow_##bd##bpc; \
+ f->bd_fn.filter_sbrow_deblock = dav1d_filter_sbrow_deblock_##bd##bpc; \
+ f->bd_fn.filter_sbrow_cdef = dav1d_filter_sbrow_cdef_##bd##bpc; \
+ f->bd_fn.filter_sbrow_resize = dav1d_filter_sbrow_resize_##bd##bpc; \
+ f->bd_fn.filter_sbrow_lr = dav1d_filter_sbrow_lr_##bd##bpc; \
f->bd_fn.backup_ipred_edge = dav1d_backup_ipred_edge_##bd##bpc; \
f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc
if (!f->seq_hdr->hbd) {
diff --git a/src/internal.h b/src/internal.h
index ea29d8f..0a71420 100644
--- a/src/internal.h
+++ b/src/internal.h
@@ -35,6 +35,8 @@
typedef struct Dav1dFrameContext Dav1dFrameContext;
typedef struct Dav1dTileState Dav1dTileState;
typedef struct Dav1dTileContext Dav1dTileContext;
+typedef struct Dav1dPostFilterContext Dav1dPostFilterContext;
+typedef struct Dav1dTask Dav1dTask;
#include "common/attributes.h"
@@ -76,6 +78,9 @@ struct Dav1dContext {
Dav1dFrameContext *fc;
unsigned n_fc;
+ Dav1dPostFilterContext *pfc;
+ unsigned n_pfc;
+
// cache of OBUs that make up a single frame before we submit them
// to a frame worker to be decoded
struct Dav1dTileGroup *tile;
@@ -99,15 +104,23 @@ struct Dav1dContext {
// decoded output picture queue
Dav1dData in;
Dav1dPicture out;
+ // flush is a pointer (to flush_mem) to prevent compiler errors about
+ // atomic_load() not taking const arguments
+ atomic_int flush_mem, *flush;
struct {
Dav1dThreadPicture *out_delayed;
unsigned next;
- // dummy is a pointer to prevent compiler errors about atomic_load()
- // not taking const arguments; the const attribute is not taken
- // from pointers
- atomic_int flush_mem, *flush;
} frame_thread;
+ // postfilter threading (refer to pfc[] for per-thread things)
+ struct PostFilterThreadData {
+ pthread_mutex_t lock;
+ pthread_cond_t cond;
+ struct Dav1dTask *tasks;
+ int frame_cnt;
+ int inited;
+ } postfilter_thread;
+
// reference/entropy state
Dav1dMemPool *segmap_pool;
Dav1dMemPool *refmvs_pool;
@@ -182,6 +195,10 @@ struct Dav1dFrameContext {
recon_b_intra_fn recon_b_intra;
recon_b_inter_fn recon_b_inter;
filter_sbrow_fn filter_sbrow;
+ filter_sbrow_fn filter_sbrow_deblock;
+ filter_sbrow_fn filter_sbrow_cdef;
+ filter_sbrow_fn filter_sbrow_resize;
+ filter_sbrow_fn filter_sbrow_lr;
backup_ipred_edge_fn backup_ipred_edge;
read_coef_blocks_fn read_coef_blocks;
} bd_fn;
@@ -238,6 +255,16 @@ struct Dav1dFrameContext {
pixel *p[3], *sr_p[3];
Av1Filter *mask_ptr, *prev_mask_ptr;
int restore_planes; // enum LrRestorePlanes
+
+ struct {
+ pthread_cond_t cond;
+ struct PostFilterThreadData *pftd;
+ struct Dav1dTask *tasks;
+ int num_tasks;
+ int npf;
+ int done;
+ int inited;
+ } thread;
} lf;
// threading (refer to tc[] for per-thread things)
@@ -353,4 +380,11 @@ struct Dav1dTileContext {
} tile_thread;
};
+struct Dav1dPostFilterContext {
+ Dav1dContext *c;
+ struct thread_data td;
+ int flushed;
+ int die;
+};
+
#endif /* DAV1D_SRC_INTERNAL_H */
diff --git a/src/lib.c b/src/lib.c
index af928d6..95efada 100644
--- a/src/lib.c
+++ b/src/lib.c
@@ -65,6 +65,7 @@ COLD const char *dav1d_version(void) {
COLD void dav1d_default_settings(Dav1dSettings *const s) {
s->n_frame_threads = 1;
s->n_tile_threads = 1;
+ s->n_postfilter_threads = 1;
s->apply_grain = 1;
s->allocator.cookie = NULL;
s->allocator.alloc_picture_callback = dav1d_default_picture_alloc;
@@ -100,6 +101,8 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
validate_input_or_ret(c_out != NULL, DAV1D_ERR(EINVAL));
validate_input_or_ret(s != NULL, DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s->n_postfilter_threads >= 1 &&
+ s->n_postfilter_threads <= DAV1D_MAX_POSTFILTER_THREADS, DAV1D_ERR(EINVAL));
validate_input_or_ret(s->n_tile_threads >= 1 &&
s->n_tile_threads <= DAV1D_MAX_TILE_THREADS, DAV1D_ERR(EINVAL));
validate_input_or_ret(s->n_frame_threads >= 1 &&
@@ -160,12 +163,49 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
s->frame_size_limit, c->frame_size_limit);
}
- c->frame_thread.flush = &c->frame_thread.flush_mem;
- atomic_init(c->frame_thread.flush, 0);
+ c->flush = &c->flush_mem;
+ atomic_init(c->flush, 0);
+
+ c->n_pfc = s->n_postfilter_threads;
c->n_fc = s->n_frame_threads;
c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * s->n_frame_threads, 32);
if (!c->fc) goto error;
memset(c->fc, 0, sizeof(*c->fc) * s->n_frame_threads);
+
+ if (c->n_pfc > 1) {
+ c->pfc = dav1d_alloc_aligned(sizeof(*c->pfc) * s->n_postfilter_threads, 32);
+ if (!c->pfc) goto error;
+ memset(c->pfc, 0, sizeof(*c->pfc) * s->n_postfilter_threads);
+ if (pthread_mutex_init(&c->postfilter_thread.lock, NULL)) goto error;
+ if (pthread_cond_init(&c->postfilter_thread.cond, NULL)) {
+ pthread_mutex_destroy(&c->postfilter_thread.lock);
+ goto error;
+ }
+ c->postfilter_thread.inited = 1;
+ for (int n = 0; n < s->n_frame_threads; n++) {
+ Dav1dFrameContext *const f = &c->fc[n];
+ if (pthread_cond_init(&f->lf.thread.cond, NULL)) goto error;
+ f->lf.thread.pftd = &c->postfilter_thread;
+ f->lf.thread.done = 1;
+ f->lf.thread.inited = 1;
+ }
+ for (int n = 0; n < s->n_postfilter_threads; ++n) {
+ Dav1dPostFilterContext *const pf = &c->pfc[n];
+ pf->c = c;
+ if (pthread_mutex_init(&pf->td.lock, NULL)) goto error;
+ if (pthread_cond_init(&pf->td.cond, NULL)) {
+ pthread_mutex_destroy(&pf->td.lock);
+ goto error;
+ }
+ if (pthread_create(&pf->td.thread, &thread_attr, dav1d_postfilter_task, pf)) {
+ pthread_cond_destroy(&c->postfilter_thread.cond);
+ pthread_mutex_destroy(&c->postfilter_thread.lock);
+ goto error;
+ }
+ pf->td.inited = 1;
+ }
+ }
+
if (c->n_fc > 1) {
c->frame_thread.out_delayed =
calloc(c->n_fc, sizeof(*c->frame_thread.out_delayed));
@@ -467,11 +507,17 @@ void dav1d_flush(Dav1dContext *const c) {
dav1d_ref_dec(&c->content_light_ref);
dav1d_ref_dec(&c->itut_t35_ref);
- if (c->n_fc == 1) return;
+ if (c->n_fc == 1 && c->n_pfc == 1) return;
- // mark each currently-running frame as flushing, so that we
- // exit out as quickly as the running thread checks this flag
- atomic_store(c->frame_thread.flush, 1);
+ // wait for threads to complete flushing
+ if (c->n_pfc > 1)
+ pthread_mutex_lock(&c->postfilter_thread.lock);
+ atomic_store(c->flush, 1);
+ if (c->n_pfc > 1) {
+ pthread_cond_broadcast(&c->postfilter_thread.cond);
+ pthread_mutex_unlock(&c->postfilter_thread.lock);
+ }
+ if (c->n_fc == 1) goto skip_ft_flush;
for (unsigned n = 0, next = c->frame_thread.next; n < c->n_fc; n++, next++) {
if (next == c->n_fc) next = 0;
Dav1dFrameContext *const f = &c->fc[next];
@@ -483,13 +529,31 @@ void dav1d_flush(Dav1dContext *const c) {
assert(!f->cur.data[0]);
}
pthread_mutex_unlock(&f->frame_thread.td.lock);
- Dav1dThreadPicture *const out_delayed = &c->frame_thread.out_delayed[next];
+ Dav1dThreadPicture *const out_delayed =
+ &c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0])
dav1d_thread_picture_unref(out_delayed);
}
- atomic_store(c->frame_thread.flush, 0);
-
c->frame_thread.next = 0;
+skip_ft_flush:
+ if (c->n_pfc > 1) {
+ for (unsigned i = 0; i < c->n_pfc; ++i) {
+ Dav1dPostFilterContext *const pf = &c->pfc[i];
+ pthread_mutex_lock(&pf->td.lock);
+ if (!pf->flushed)
+ pthread_cond_wait(&pf->td.cond, &pf->td.lock);
+ pf->flushed = 0;
+ pthread_mutex_unlock(&pf->td.lock);
+ }
+ pthread_mutex_lock(&c->postfilter_thread.lock);
+ c->postfilter_thread.tasks = NULL;
+ pthread_mutex_unlock(&c->postfilter_thread.lock);
+ for (unsigned i = 0; i < c->n_fc; ++i) {
+ freep(&c->fc[i].lf.thread.tasks);
+ c->fc[i].lf.thread.num_tasks = 0;
+ }
+ }
+ atomic_store(c->flush, 0);
}
COLD void dav1d_close(Dav1dContext **const c_out) {
@@ -503,6 +567,25 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
if (flush) dav1d_flush(c);
+ if (c->pfc) {
+ struct PostFilterThreadData *pftd = &c->postfilter_thread;
+ if (pftd->inited) {
+ pthread_mutex_lock(&pftd->lock);
+ for (unsigned n = 0; n < c->n_pfc && c->pfc[n].td.inited; n++)
+ c->pfc[n].die = 1;
+ pthread_cond_broadcast(&pftd->cond);
+ pthread_mutex_unlock(&pftd->lock);
+ for (unsigned n = 0; n < c->n_pfc && c->pfc[n].td.inited; n++) {
+ pthread_join(c->pfc[n].td.thread, NULL);
+ pthread_cond_destroy(&c->pfc[n].td.cond);
+ pthread_mutex_destroy(&c->pfc[n].td.lock);
+ }
+ pthread_cond_destroy(&pftd->cond);
+ pthread_mutex_destroy(&pftd->lock);
+ }
+ dav1d_free_aligned(c->pfc);
+ }
+
for (unsigned n = 0; c->fc && n < c->n_fc; n++) {
Dav1dFrameContext *const f = &c->fc[n];
@@ -554,6 +637,10 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
pthread_cond_destroy(&ts->tile_thread.cond);
pthread_mutex_destroy(&ts->tile_thread.lock);
}
+ if (f->lf.thread.inited) {
+ freep(&f->lf.thread.tasks);
+ pthread_cond_destroy(&f->lf.thread.cond);
+ }
dav1d_free_aligned(f->ts);
dav1d_free_aligned(f->tc);
dav1d_free_aligned(f->ipred_edge[0]);
diff --git a/src/lr_apply_tmpl.c b/src/lr_apply_tmpl.c
index 3ba4613..6fcfb67 100644
--- a/src/lr_apply_tmpl.c
+++ b/src/lr_apply_tmpl.c
@@ -48,31 +48,32 @@ static void backup_lpf(const Dav1dFrameContext *const f,
const pixel *src, const ptrdiff_t src_stride,
const int ss_ver, const int sb128,
int row, const int row_h, const int src_w,
- const int h, const int ss_hor)
+ const int h, const int ss_hor, const int pft)
{
const int dst_w = f->frame_hdr->super_res.enabled ?
(f->frame_hdr->width[1] + ss_hor) >> ss_hor : src_w;
// The first stripe of the frame is shorter by 8 luma pixel rows.
int stripe_h = (64 - 8 * !row) >> ss_ver;
+ src += (stripe_h - 2) * PXSTRIDE(src_stride);
- if (row) {
- const int top = 4 << sb128;
- // Copy the top part of the stored loop filtered pixels from the
- // previous sb row needed above the first stripe of this sb row.
- pixel_copy(&dst[PXSTRIDE(dst_stride) * 0],
- &dst[PXSTRIDE(dst_stride) * top], dst_w);
- pixel_copy(&dst[PXSTRIDE(dst_stride) * 1],
- &dst[PXSTRIDE(dst_stride) * (top + 1)], dst_w);
- pixel_copy(&dst[PXSTRIDE(dst_stride) * 2],
- &dst[PXSTRIDE(dst_stride) * (top + 2)], dst_w);
- pixel_copy(&dst[PXSTRIDE(dst_stride) * 3],
- &dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w);
+ if (!pft) {
+ if (row) {
+ const int top = 4 << sb128;
+ // Copy the top part of the stored loop filtered pixels from the
+ // previous sb row needed above the first stripe of this sb row.
+ pixel_copy(&dst[PXSTRIDE(dst_stride) * 0],
+ &dst[PXSTRIDE(dst_stride) * top], dst_w);
+ pixel_copy(&dst[PXSTRIDE(dst_stride) * 1],
+ &dst[PXSTRIDE(dst_stride) * (top + 1)], dst_w);
+ pixel_copy(&dst[PXSTRIDE(dst_stride) * 2],
+ &dst[PXSTRIDE(dst_stride) * (top + 2)], dst_w);
+ pixel_copy(&dst[PXSTRIDE(dst_stride) * 3],
+ &dst[PXSTRIDE(dst_stride) * (top + 3)], dst_w);
+ }
+ dst += 4 * PXSTRIDE(dst_stride);
}
- dst += 4 * PXSTRIDE(dst_stride);
- src += (stripe_h - 2) * PXSTRIDE(src_stride);
-
if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
while (row + stripe_h <= row_h) {
const int n_lines = 4 - (row + stripe_h + 1 == h);
@@ -107,9 +108,15 @@ static void backup_lpf(const Dav1dFrameContext *const f,
void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
/*const*/ pixel *const src[3], const int sby)
{
+ const int pft = f->c->n_pfc > 1;
const int offset = 8 * !!sby;
const ptrdiff_t *const src_stride = f->cur.stride;
const ptrdiff_t lr_stride = ((f->sr_cur.p.p.w + 31) & ~31) * sizeof(pixel);
+ pixel *const dst[3] = {
+ f->lf.lr_lpf_line[0] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride),
+ f->lf.lr_lpf_line[1] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride),
+ f->lf.lr_lpf_line[2] + pft * sby * (4 << f->seq_hdr->sb128) * PXSTRIDE(lr_stride)
+ };
// TODO Also check block level restore type to reduce copying.
const int restore_planes = f->lf.restore_planes;
@@ -119,9 +126,9 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
const int w = f->bw << 2;
const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h - 1);
const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset;
- backup_lpf(f, f->lf.lr_lpf_line[0], lr_stride,
+ backup_lpf(f, dst[0], lr_stride,
src[0] - offset * PXSTRIDE(src_stride[0]), src_stride[0],
- 0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0);
+ 0, f->seq_hdr->sb128, y_stripe, row_h, w, h, 0, pft);
}
if (restore_planes & (LR_RESTORE_U | LR_RESTORE_V)) {
const int ss_ver = f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
@@ -130,18 +137,16 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
const int w = f->bw << (2 - ss_hor);
const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h - 1);
const int offset_uv = offset >> ss_ver;
- const int y_stripe =
- (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
-
+ const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
if (restore_planes & LR_RESTORE_U) {
- backup_lpf(f, f->lf.lr_lpf_line[1], lr_stride,
+ backup_lpf(f, dst[1], lr_stride,
src[1] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
- ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
+ ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor, pft);
}
if (restore_planes & LR_RESTORE_V) {
- backup_lpf(f, f->lf.lr_lpf_line[2], lr_stride,
+ backup_lpf(f, dst[2], lr_stride,
src[2] - offset_uv * PXSTRIDE(src_stride[1]), src_stride[1],
- ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor);
+ ss_ver, f->seq_hdr->sb128, y_stripe, row_h, w, h, ss_hor, pft);
}
}
}
@@ -154,10 +159,10 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
const Dav1dDSPContext *const dsp = f->dsp;
const int chroma = !!plane;
const int ss_ver = chroma & (f->sr_cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420);
- const int sbrow_has_bottom = (edges & LR_HAVE_BOTTOM);
- const pixel *lpf = f->lf.lr_lpf_line[plane] + x;
const ptrdiff_t p_stride = f->sr_cur.p.stride[chroma];
const ptrdiff_t lpf_stride = sizeof(pixel) * ((f->sr_cur.p.p.w + 31) & ~31);
+ const int sby = (y + (y ? 8 << ss_ver : 0)) >> (6 - ss_ver + f->seq_hdr->sb128);
+ const pixel *lpf = f->lf.lr_lpf_line[plane] + (f->c->n_pfc > 1) * (sby * (4 << f->seq_hdr->sb128) - 4) * PXSTRIDE(lpf_stride) + x;
// The first stripe of the frame is shorter by 8 luma pixel rows.
int stripe_h = imin((64 - 8 * !y) >> ss_ver, row_h - y);
@@ -186,8 +191,8 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
}
while (y + stripe_h <= row_h) {
- // Change HAVE_BOTTOM bit in edges to (y + stripe_h != row_h)
- edges ^= (-(y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM;
+ // Change the HAVE_BOTTOM bit in edges to (sby + 1 != f->sbh || y + stripe_h != row_h)
+ edges ^= (-(sby + 1 != f->sbh || y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM;
if (wiener_fn) {
wiener_fn(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
filter, edges HIGHBD_CALL_SUFFIX);
@@ -198,7 +203,6 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
left += stripe_h;
y += stripe_h;
- if (y + stripe_h > row_h && sbrow_has_bottom) break;
p += stripe_h * PXSTRIDE(p_stride);
edges |= LR_HAVE_TOP;
stripe_h = imin(64 >> ss_ver, row_h - y);
@@ -242,8 +246,7 @@ static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
pixel pre_lr_border[2][128 + 8 /* maximum sbrow height is 128 + 8 rows offset */][4];
const Av1RestorationUnit *lr[2];
- enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) | LR_HAVE_RIGHT |
- (row_h < h ? LR_HAVE_BOTTOM : 0);
+ enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) | LR_HAVE_RIGHT;
int aligned_unit_pos = row_y & ~(unit_size - 1);
if (aligned_unit_pos && aligned_unit_pos + half_unit_size > h)
@@ -281,11 +284,13 @@ void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
const int offset_y = 8 * !!sby;
const ptrdiff_t *const dst_stride = f->sr_cur.p.stride;
const int restore_planes = f->lf.restore_planes;
+ const int not_last = sby + 1 < f->sbh;
if (restore_planes & LR_RESTORE_Y) {
const int h = f->sr_cur.p.p.h;
const int w = f->sr_cur.p.p.w;
- const int row_h = imin((sby + 1) << (6 + f->seq_hdr->sb128), h);
+ const int next_row_y = (sby + 1) << (6 + f->seq_hdr->sb128);
+ const int row_h = imin(next_row_y - 8 * not_last, h);
const int y_stripe = (sby << (6 + f->seq_hdr->sb128)) - offset_y;
lr_sbrow(f, dst[0] - offset_y * PXSTRIDE(dst_stride[0]), y_stripe, w,
h, row_h, 0);
@@ -295,10 +300,10 @@ void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
const int ss_hor = f->sr_cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int h = (f->sr_cur.p.p.h + ss_ver) >> ss_ver;
const int w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
- const int row_h = imin((sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128), h);
+ const int next_row_y = (sby + 1) << ((6 - ss_ver) + f->seq_hdr->sb128);
+ const int row_h = imin(next_row_y - (8 >> ss_ver) * not_last, h);
const int offset_uv = offset_y >> ss_ver;
- const int y_stripe =
- (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
+ const int y_stripe = (sby << ((6 - ss_ver) + f->seq_hdr->sb128)) - offset_uv;
if (restore_planes & LR_RESTORE_U)
lr_sbrow(f, dst[1] - offset_uv * PXSTRIDE(dst_stride[1]), y_stripe,
w, h, row_h, 1);
diff --git a/src/recon.h b/src/recon.h
index f84c8ab..9751810 100644
--- a/src/recon.h
+++ b/src/recon.h
@@ -65,6 +65,14 @@ decl_recon_b_inter_fn(dav1d_recon_b_inter_16bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_8bpc);
decl_filter_sbrow_fn(dav1d_filter_sbrow_16bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_8bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_deblock_16bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_cdef_8bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_cdef_16bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_resize_8bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_resize_16bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_lr_8bpc);
+decl_filter_sbrow_fn(dav1d_filter_sbrow_lr_16bpc);
decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_8bpc);
decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_16bpc);
diff --git a/src/recon_tmpl.c b/src/recon_tmpl.c
index 5a3e81d..9905914 100644
--- a/src/recon_tmpl.c
+++ b/src/recon_tmpl.c
@@ -1965,76 +1965,109 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
return 0;
}
-void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
- const int sbsz = f->sb_step, sbh = f->sbh;
-
- if (f->frame_hdr->loopfilter.level_y[0] ||
- f->frame_hdr->loopfilter.level_y[1])
- {
+// Deblocking stage for superblock row sby.
+// Runs the loop filter when either luma filter level is non-zero, then,
+// when f->lf.restore_planes is set, stores the loop-filtered pixels that
+// loop restoration reads later.
+void bytefn(dav1d_filter_sbrow_deblock)(Dav1dFrameContext*const f, const int sby) {
+ const int y = sby * f->sb_step * 4;
+ const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ // Plane pointers for this row, derived from sby as offsets from f->lf.p
+ // (chroma offsets are halved for 4:2:0).
+ pixel *const p[3] = {
+ f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
+ f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
+ f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
+ };
+ // Mask row index is sby when sb128 is set and sby / 2 otherwise.
+ Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
+ if (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]) {
int start_of_tile_row = 0;
if (f->frame_hdr->tiling.row_start_sb[f->lf.tile_row] == sby)
start_of_tile_row = f->lf.tile_row++;
- bytefn(dav1d_loopfilter_sbrow)(f, f->lf.p, f->lf.mask_ptr, sby,
- start_of_tile_row);
+ bytefn(dav1d_loopfilter_sbrow)(f, p, mask, sby, start_of_tile_row);
}
-
if (f->lf.restore_planes) {
// Store loop filtered pixels required by loop restoration
- bytefn(dav1d_lr_copy_lpf)(f, f->lf.p, sby);
+ bytefn(dav1d_lr_copy_lpf)(f, p, sby);
}
- if (f->seq_hdr->cdef) {
- if (sby) {
- const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
- pixel *p_up[3] = {
- f->lf.p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
- f->lf.p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
- f->lf.p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
- };
- bytefn(dav1d_cdef_brow)(f, p_up, f->lf.prev_mask_ptr,
- sby * sbsz - 2, sby * sbsz);
- }
- const int n_blks = sbsz - 2 * (sby + 1 < sbh);
- bytefn(dav1d_cdef_brow)(f, f->lf.p, f->lf.mask_ptr, sby * sbsz,
- imin(sby * sbsz + n_blks, f->bh));
- }
- if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
- const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
- for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
- const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
- const int h_start = 8 * !!sby >> ss_ver;
- const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
- pixel *dst = f->lf.sr_p[pl] - h_start * PXSTRIDE(dst_stride);
- const ptrdiff_t src_stride = f->cur.stride[!!pl];
- const pixel *src = f->lf.p[pl] - h_start * PXSTRIDE(src_stride);
- const int h_end = 4 * (sbsz - 2 * (sby + 1 < sbh)) >> ss_ver;
- const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
- const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
- const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
- const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;
-
- f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
- imin(img_h, h_end) + h_start, src_w,
- f->resize_step[!!pl], f->resize_start[!!pl]
- HIGHBD_CALL_SUFFIX);
- }
- }
- if (f->lf.restore_planes) {
- bytefn(dav1d_lr_sbrow)(f, f->lf.sr_p, sby);
+}
+
+// CDEF stage for superblock row sby.
+// For sby > 0 it first filters block rows [start - 2, start), i.e. the two
+// block rows (8 luma pixel rows) that the previous superblock row left
+// unfiltered, using that row's mask. It then filters this row's own block
+// rows, again stopping two block rows short unless this is the last
+// superblock row of the frame.
+void bytefn(dav1d_filter_sbrow_cdef)(Dav1dFrameContext *const f, const int sby) {
+    const int sbsz = f->sb_step;
+    const int y = sby * sbsz * 4;
+    const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    pixel *const p[3] = {
+        f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
+        f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
+        f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
+    };
+    Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
+    const int start = sby * sbsz;
+    if (sby) {
+        // Only computed for sby > 0: for the first row, (sby - 1) is negative
+        // and the resulting pointer would lie before f->lf.mask.
+        Av1Filter *prev_mask =
+            f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w;
+        pixel *p_up[3] = {
+            p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
+            p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
+            p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
+        };
+        bytefn(dav1d_cdef_brow)(f, p_up, prev_mask, start - 2, start);
     }
+    // Leave the last two block rows for the next superblock row's call,
+    // except on the final superblock row.
+    const int n_blks = sbsz - 2 * (sby + 1 < f->sbh);
+    const int end = imin(start + n_blks, f->bh);
+    bytefn(dav1d_cdef_brow)(f, p, mask, start, end);
+}
+// Resize stage for superblock row sby: calls f->dsp->mc.resize for each
+// plane, reading from f->cur-strided rows and writing f->sr_cur-strided rows.
+// Source and destination use the same row offset y; only the widths
+// (src_w vs dst_w) differ.
+void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) {
+ const int sbsz = f->sb_step;
+ const int y = sby * sbsz * 4;
const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
- f->lf.p[0] += sbsz * 4 * PXSTRIDE(f->cur.stride[0]);
- f->lf.p[1] += sbsz * 4 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
- f->lf.p[2] += sbsz * 4 * PXSTRIDE(f->cur.stride[1]) >> ss_ver;
- f->lf.sr_p[0] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[0]);
- f->lf.sr_p[1] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver;
- f->lf.sr_p[2] += sbsz * 4 * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver;
- f->lf.prev_mask_ptr = f->lf.mask_ptr;
- if ((sby & 1) || f->seq_hdr->sb128) {
- f->lf.mask_ptr += f->sb128w;
+ // Source rows for this superblock row.
+ const pixel *const p[3] = {
+ f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
+ f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
+ f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
+ };
+ // Destination rows for this superblock row.
+ pixel *const sr_p[3] = {
+ f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
+ f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
+ f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
+ };
+ const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
+ for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
+ const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+ // Start 8 luma rows above this superblock row, except for the first row.
+ const int h_start = 8 * !!sby >> ss_ver;
+ const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
+ pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride);
+ const ptrdiff_t src_stride = f->cur.stride[!!pl];
+ const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride);
+ // Stop 8 luma rows short of the row's end, except for the last row.
+ const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> ss_ver;
+ const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
+ const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
+ const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
+ // Rows left in the picture from this superblock row down; clamps h_end.
+ const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;
+
+ f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
+ imin(img_h, h_end) + h_start, src_w,
+ f->resize_step[!!pl], f->resize_start[!!pl]
+ HIGHBD_CALL_SUFFIX);
}
}
+// Loop-restoration stage for superblock row sby: computes the row's plane
+// pointers as offsets from f->lf.sr_p and passes them to dav1d_lr_sbrow.
+void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
+    const int is_420 = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int row = sby * f->sb_step * 4;
+    // Both chroma planes share stride[1], so they share one offset.
+    const ptrdiff_t luma_off = row * PXSTRIDE(f->sr_cur.p.stride[0]);
+    const ptrdiff_t chroma_off = row * PXSTRIDE(f->sr_cur.p.stride[1]) >> is_420;
+    pixel *const dst[3] = {
+        f->lf.sr_p[0] + luma_off,
+        f->lf.sr_p[1] + chroma_off,
+        f->lf.sr_p[2] + chroma_off
+    };
+    bytefn(dav1d_lr_sbrow)(f, dst, sby);
+}
+
+// Runs every post-filter stage for superblock row sby in order:
+// deblock (always called), then cdef, resize and loop restoration, each
+// only when the corresponding condition holds.
+void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
+    bytefn(dav1d_filter_sbrow_deblock)(f, sby);
+
+    const int use_cdef = f->seq_hdr->cdef;
+    if (use_cdef) {
+        bytefn(dav1d_filter_sbrow_cdef)(f, sby);
+    }
+
+    const int needs_resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
+    if (needs_resize) {
+        bytefn(dav1d_filter_sbrow_resize)(f, sby);
+    }
+
+    const int use_lr = f->lf.restore_planes != 0;
+    if (use_lr) {
+        bytefn(dav1d_filter_sbrow_lr)(f, sby);
+    }
+}
+
void bytefn(dav1d_backup_ipred_edge)(Dav1dTileContext *const t) {
const Dav1dFrameContext *const f = t->f;
Dav1dTileState *const ts = t->ts;
diff --git a/src/thread_task.c b/src/thread_task.c
index 6c1c139..9c2f49b 100644
--- a/src/thread_task.c
+++ b/src/thread_task.c
@@ -29,6 +29,137 @@
#include "src/thread_task.h"
+// Builds the post-filter task graph for frame f: one task per enabled stage
+// (deblock, cdef, resize, lr) for each of the f->sbh superblock rows.
+//
+// Each task has two kinds of links:
+//   last_deps[0] / next_deps[0]: previous / next stage in the same row
+//   last_deps[1] / next_deps[1]: same stage in the previous / next row
+//
+// Only the first enabled stage of row 0 is created READY. The first stage
+// of a row gets start == 0, and every later stage gets start == 1.
+//
+// Returns 0 on success (including when no stage is enabled) and -1 if the
+// task array cannot be grown. pftd->lock is taken while the graph is built
+// and is released on every return path.
+int dav1d_task_create_filter_sbrow(Dav1dFrameContext *const f) {
+    struct PostFilterThreadData *const pftd = f->lf.thread.pftd;
+    const int frame_idx = (int)(f - f->c->fc);
+
+    // The deblock task is also needed when loop restoration is enabled,
+    // because dav1d_filter_sbrow_deblock stores the pixels it reads.
+    const int has_deblock = f->frame_hdr->loopfilter.level_y[0] ||
+                            f->frame_hdr->loopfilter.level_y[1] ||
+                            f->lf.restore_planes;
+    const int has_cdef = f->seq_hdr->cdef;
+    const int has_resize = f->frame_hdr->width[0] != f->frame_hdr->width[1];
+    const int has_lr = !!f->lf.restore_planes;
+    f->lf.thread.npf = has_deblock + has_cdef + has_resize + has_lr;
+    if (f->lf.thread.npf == 0) return 0;
+
+    pthread_mutex_lock(&pftd->lock);
+
+    Dav1dTask *tasks = f->lf.thread.tasks;
+    int num_tasks = f->sbh * f->lf.thread.npf;
+    if (num_tasks > f->lf.thread.num_tasks) {
+        const size_t size = sizeof(Dav1dTask) * num_tasks;
+        tasks = realloc(f->lf.thread.tasks, size);
+        if (!tasks) {
+            // The old array is still owned by f->lf.thread.tasks.
+            // Release the lock so other users of pftd are not blocked.
+            pthread_mutex_unlock(&pftd->lock);
+            return -1;
+        }
+        memset(tasks, 0, size);
+        f->lf.thread.tasks = tasks;
+        f->lf.thread.num_tasks = num_tasks;
+    }
+
+// Appends a task for stage `task` of the current sby to tasks[], with all
+// dependency links cleared. Uses t, tasks, num_tasks, frame_cnt, frame_idx,
+// sby and f from the enclosing scope.
+#define create_task(task, ready_cond, start_cond) \
+    do { \
+        t = &tasks[num_tasks++]; \
+        t->status = (ready_cond) ? DAV1D_TASK_READY : DAV1D_TASK_DEFAULT; \
+        t->start = (start_cond); \
+        t->frame_id = frame_cnt; \
+        t->frame_idx = frame_idx; \
+        t->sby = sby; \
+        t->fn = f->bd_fn.filter_sbrow_##task; \
+        t->last_deps[0] = NULL; \
+        t->last_deps[1] = NULL; \
+        t->next_deps[0] = NULL; \
+        t->next_deps[1] = NULL; \
+        t->next_exec = NULL; \
+    } while (0)
+
+    Dav1dTask *last_sbrow_deblock = NULL;
+    Dav1dTask *last_sbrow_cdef = NULL;
+    Dav1dTask *last_sbrow_resize = NULL;
+    Dav1dTask *last_sbrow_lr = NULL;
+    // num_tasks is reused from here on as the fill index into tasks[].
+    num_tasks = 0;
+    const int frame_cnt = pftd->frame_cnt++;
+
+    for (int sby = 0; sby < f->sbh; ++sby) {
+        Dav1dTask *t;
+        // Most recent stage created for this row, if any.
+        Dav1dTask *last = NULL;
+        if (has_deblock) {
+            create_task(deblock, sby == 0, 0);
+            if (sby) {
+                t->last_deps[1] = last_sbrow_deblock;
+                last_sbrow_deblock->next_deps[1] = t;
+            }
+            last = t;
+            last_sbrow_deblock = t;
+        }
+        if (has_cdef) {
+            create_task(cdef, sby == 0 && !has_deblock, has_deblock);
+            if (has_deblock) {
+                t->last_deps[0] = last;
+                last->next_deps[0] = t;
+            }
+            if (sby) {
+                t->last_deps[1] = last_sbrow_cdef;
+                last_sbrow_cdef->next_deps[1] = t;
+            }
+            last = t;
+            last_sbrow_cdef = t;
+        }
+        if (has_resize) {
+            create_task(resize, sby == 0 && !last, !!last);
+            if (last) {
+                t->last_deps[0] = last;
+                last->next_deps[0] = t;
+            }
+            if (sby) {
+                t->last_deps[1] = last_sbrow_resize;
+                last_sbrow_resize->next_deps[1] = t;
+            }
+            last = t;
+            last_sbrow_resize = t;
+        }
+        if (has_lr) {
+            create_task(lr, sby == 0 && !last, !!last);
+            if (last) {
+                t->last_deps[0] = last;
+                last->next_deps[0] = t;
+            }
+            if (sby) {
+                t->last_deps[1] = last_sbrow_lr;
+                last_sbrow_lr->next_deps[1] = t;
+            }
+            last_sbrow_lr = t;
+        }
+    }
+    f->lf.thread.done = 0;
+    pthread_mutex_unlock(&pftd->lock);
+
+    return 0;
+}
+
+// Inserts t into the pending list pftd->tasks and signals pftd->cond.
+// The list is kept ordered by ascending sby, then ascending frame_id.
+// t goes after every entry that compares less than or equal to it.
+void dav1d_task_schedule(struct PostFilterThreadData *const pftd,
+                         Dav1dTask *const t)
+{
+    Dav1dTask **link = &pftd->tasks;
+    for (;;) {
+        Dav1dTask *const cur = *link;
+        if (!cur) break;
+        const int goes_first = cur->sby < t->sby ||
+            (cur->sby == t->sby && cur->frame_id <= t->frame_id);
+        if (!goes_first) break;
+        link = &cur->next_exec;
+    }
+    t->next_exec = *link;
+    *link = t;
+    pthread_cond_signal(&pftd->cond);
+}
+
+// Called when the dependency of kind dep_type of task t has finished.
+// If the other dependency (index !dep_type) is absent or already DONE,
+// t becomes READY and, when t->start is non-zero, is queued for execution.
+static inline void update_task(Dav1dTask *const t, const int dep_type,
+                               Dav1dFrameContext *const f)
+{
+    const Dav1dTask *const other = t->last_deps[!dep_type];
+    if (other && other->status != DAV1D_TASK_DONE)
+        return;
+
+    t->status = DAV1D_TASK_READY;
+    if (t->start)
+        dav1d_task_schedule(f->lf.thread.pftd, t);
+}
+
void *dav1d_frame_task(void *const data) {
Dav1dFrameContext *const f = data;
@@ -140,3 +271,98 @@ void *dav1d_tile_task(void *const data) {
return NULL;
}
+
+// Checks every frame context for a pending flush or abort, and wakes the
+// waiter on f->lf.thread.cond for each frame that has no task in flight.
+// dav1d_postfilter_task calls this with pftd->lock held.
+// Returns 0 while c->flush is set and 1 otherwise.
+static inline int handle_abortion(Dav1dPostFilterContext *const pf,
+ Dav1dContext *const c,
+ struct PostFilterThreadData *const pftd)
+{
+ const int flush = atomic_load_explicit(c->flush, memory_order_acquire);
+ if (flush) {
+ // Clear this worker's flushed flag while it processes the flush.
+ pthread_mutex_lock(&pf->td.lock);
+ pf->flushed = 0;
+ pthread_mutex_unlock(&pf->td.lock);
+ }
+ for (unsigned i = 0; i < c->n_fc; i++) {
+ Dav1dFrameContext *const f = &c->fc[i];
+ int send_signal;
+ // Flush: consider every frame that has tasks and is not yet done.
+ // Otherwise: only frames whose done flag is -1.
+ // NOTE(review): done == -1 is presumably an abort request set elsewhere;
+ // confirm against the writer of f->lf.thread.done.
+ if (flush) // TODO before merge, see if this can be safely merged
+ send_signal = f->lf.thread.done != 1 && f->lf.thread.num_tasks != 0;
+ else
+ send_signal = f->lf.thread.done == -1;
+ // Do not signal while any task of this frame is RUNNING, or is DONE
+ // but has not yet had start set to -1 by the worker that ran it.
+ for (int j = 0; send_signal && j < f->lf.thread.num_tasks; j++) {
+ Dav1dTask *const t = &f->lf.thread.tasks[j];
+ if (t->status == DAV1D_TASK_RUNNING ||
+ (t->status == DAV1D_TASK_DONE && t->start != -1))
+ send_signal = 0;
+ }
+ if (send_signal) {
+ if (!flush) {
+ // Unlink all of this frame's tasks from the pending list.
+ Dav1dTask **pt = &pftd->tasks;
+ while (*pt) {
+ if ((*pt)->frame_idx == i)
+ *pt = (*pt)->next_exec;
+ else
+ pt = &(*pt)->next_exec;
+ }
+ }
+ f->lf.thread.done = 1;
+ pthread_cond_signal(&f->lf.thread.cond);
+ }
+ }
+ if (flush) {
+ // Report that this worker has finished handling the flush.
+ pthread_mutex_lock(&pf->td.lock);
+ pf->flushed = 1;
+ pthread_cond_signal(&pf->td.cond);
+ pthread_mutex_unlock(&pf->td.lock);
+ }
+ return !flush;
+}
+
+// Post-filter worker thread entry point; data is a Dav1dPostFilterContext.
+// Repeatedly takes the head of the pending list pftd->tasks, runs it with
+// pftd->lock released, then updates dependent tasks and frame progress
+// with the lock held again. Exits when pf->die is set. Always returns NULL.
+void *dav1d_postfilter_task(void *data) {
+ Dav1dPostFilterContext *const pf = data;
+ Dav1dContext *const c = pf->c;
+ struct PostFilterThreadData *pftd = &c->postfilter_thread;
+
+ dav1d_set_thread_name("dav1d-postfilter");
+
+ int exec = 1;
+ pthread_mutex_lock(&pftd->lock);
+ for (;;) {
+ // Sleep only if the previous iteration found nothing to run.
+ if (!exec && !pf->die)
+ pthread_cond_wait(&pftd->cond, &pftd->lock);
+ // handle_abortion returns 0 while a flush is in progress.
+ if (!(exec = handle_abortion(pf, c, pftd))) continue;
+ if (pf->die) break;
+
+ Dav1dTask *const t = pftd->tasks;
+ if (!t) { exec = 0; continue; }
+ pftd->tasks = t->next_exec;
+ t->status = DAV1D_TASK_RUNNING;
+
+ // Run the filter stage without holding the scheduler lock.
+ pthread_mutex_unlock(&pftd->lock);
+ Dav1dFrameContext *const f = &c->fc[t->frame_idx];
+ t->fn(f, t->sby);
+ exec = 1;
+ pthread_mutex_lock(&pftd->lock);
+
+ // Wake the next stage in this row and the same stage in the next row.
+ if (t->next_deps[0])
+ update_task(t->next_deps[0], 0, f);
+ if (t->next_deps[1])
+ update_task(t->next_deps[1], 1, f);
+ t->status = DAV1D_TASK_DONE;
+ // No next stage in this row: this was the row's last stage, so report
+ // picture progress up to the bottom of this superblock row.
+ if (!t->next_deps[0]) {
+ const enum PlaneType progress_plane_type =
+ c->n_fc > 1 && f->frame_hdr->refresh_context ?
+ PLANE_TYPE_Y : PLANE_TYPE_ALL;
+ const int y = (t->sby + 1) * f->sb_step * 4;
+ dav1d_thread_picture_signal(&f->sr_cur, y, progress_plane_type);
+ // Last superblock row: mark the frame done and wake its waiter.
+ if (t->sby + 1 == f->sbh) {
+ f->lf.thread.done = 1;
+ pthread_cond_signal(&f->lf.thread.cond);
+ }
+ }
+ // handle_abortion treats a DONE task as finished only once start is -1.
+ t->start = -1;
+ }
+ pthread_mutex_unlock(&pftd->lock);
+
+ return NULL;
+}
diff --git a/src/thread_task.h b/src/thread_task.h
index 309a714..b3f3905 100644
--- a/src/thread_task.h
+++ b/src/thread_task.h
@@ -35,10 +35,33 @@
#define FRAME_ERROR (UINT_MAX - 1)
#define TILE_ERROR (INT_MAX - 1)
-int dav1d_decode_frame(Dav1dFrameContext *f);
+// Lifecycle state of a post-filter task.
+enum TaskStatus {
+ // Created, but not marked ready.
+ DAV1D_TASK_DEFAULT,
+ // Marked ready at creation or by update_task().
+ DAV1D_TASK_READY,
+ // Taken from the pending list by a worker and currently executing.
+ DAV1D_TASK_RUNNING,
+ // The task function has returned.
+ DAV1D_TASK_DONE,
+};
+
+// One post-filter stage applied to one superblock row of one frame.
+struct Dav1dTask {
+ enum TaskStatus status; // task status
+ // Non-zero: update_task() queues the task as soon as it becomes ready.
+ // Zero: update_task() only marks it ready. A worker sets it to -1 once
+ // it has finished processing the task.
+ int start; // frame thread start flag
+ // Index of the owning frame context in c->fc.
+ unsigned frame_idx; // frame thread id
+ // Value of the shared frame counter when the task was created;
+ // dav1d_task_schedule() uses it to order tasks with equal sby.
+ int frame_id; // frame ordering
+ int sby; // sbrow
+ filter_sbrow_fn fn; // task work
+ // [0]: previous stage in the same sbrow, [1]: same stage in the previous sbrow.
+ Dav1dTask *last_deps[2]; // dependencies
+ // [0]: next stage in the same sbrow, [1]: same stage in the next sbrow.
+ Dav1dTask *next_deps[2]; // dependent tasks
+ // Next entry in the pending list of tasks waiting to be executed.
+ Dav1dTask *next_exec; // tasks scheduling
+};
+
+int dav1d_task_create_filter_sbrow(Dav1dFrameContext *f);
+void dav1d_task_schedule(struct PostFilterThreadData *pftd, Dav1dTask *t);
+
void *dav1d_frame_task(void *data);
+void *dav1d_tile_task(void *data);
+void *dav1d_postfilter_task(void *data);
+int dav1d_decode_frame(Dav1dFrameContext *f);
int dav1d_decode_tile_sbrow(Dav1dTileContext *t);
-void *dav1d_tile_task(void *data);
#endif /* DAV1D_SRC_THREAD_TASK_H */
diff --git a/tools/dav1d_cli_parse.c b/tools/dav1d_cli_parse.c
index 96f81e0..35d8e17 100644
--- a/tools/dav1d_cli_parse.c
+++ b/tools/dav1d_cli_parse.c
@@ -51,6 +51,7 @@ enum {
ARG_REALTIME_CACHE,
ARG_FRAME_THREADS,
ARG_TILE_THREADS,
+ ARG_POSTFILTER_THREADS,
ARG_VERIFY,
ARG_FILM_GRAIN,
ARG_OPPOINT,
@@ -73,6 +74,7 @@ static const struct option long_opts[] = {
{ "realtimecache", 1, NULL, ARG_REALTIME_CACHE },
{ "framethreads", 1, NULL, ARG_FRAME_THREADS },
{ "tilethreads", 1, NULL, ARG_TILE_THREADS },
+ { "pfthreads", 1, NULL, ARG_POSTFILTER_THREADS },
{ "verify", 1, NULL, ARG_VERIFY },
{ "filmgrain", 1, NULL, ARG_FILM_GRAIN },
{ "oppoint", 1, NULL, ARG_OPPOINT },
@@ -117,6 +119,7 @@ static void usage(const char *const app, const char *const reason, ...) {
" --version/-v: print version and exit\n"
" --framethreads $num: number of frame threads (default: 1)\n"
" --tilethreads $num: number of tile threads (default: 1)\n"
+ " --pfthreads $num: number of postfilter threads (default: 1)\n"
" --filmgrain $num: enable film grain application (default: 1, except if muxer is md5)\n"
" --oppoint $num: select an operating point of a scalable AV1 bitstream (0 - 31)\n"
" --alllayers $num: output all spatial layers of a scalable AV1 bitstream (default: 1)\n"
@@ -295,6 +298,10 @@ void parse(const int argc, char *const *const argv,
lib_settings->n_tile_threads =
parse_unsigned(optarg, ARG_TILE_THREADS, argv[0]);
break;
+ case ARG_POSTFILTER_THREADS:
+ lib_settings->n_postfilter_threads =
+ parse_unsigned(optarg, ARG_POSTFILTER_THREADS, argv[0]);
+ break;
case ARG_VERIFY:
cli_settings->verify = optarg;
break;