Build: Add suffix to templated BITDEPTH files

Fix #96
author: Marvin Scholz <epirat07@gmail.com> 2018-10-25 17:45:12 +0300
committer: Ronald S. Bultje <rsbultje@gmail.com> 2018-10-25 19:51:31 +0300
commit: 46e2a2d0cc451e1d6bb929f80088f8a7b8940dd0 (patch)
tree: 002462d6840acd6551bb0b6b2dd5f4416db1ab84 /src/recon.c
parent: 367d785a4e70b3e43eee234b3c745b047e3fbd40 (diff)
1 files changed, 0 insertions, 1518 deletions
diff --git a/src/recon.c b/src/recon.c
deleted file mode 100644
index eb5bb2f..0000000
--- a/src/recon.c
+++ /dev/null
@@ -1,1518 +0,0 @@
-/*
- * Copyright © 2018, VideoLAN and dav1d authors
- * Copyright © 2018, Two Orioles, LLC
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- *    list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include <string.h>
-#include <stdio.h>
-
-#include "common/attributes.h"
-#include "common/bitdepth.h"
-#include "common/dump.h"
-#include "common/intops.h"
-#include "common/mem.h"
-
-#include "src/cdef_apply.h"
-#include "src/ipred_prepare.h"
-#include "src/lf_apply.h"
-#include "src/lr_apply.h"
-#include "src/recon.h"
-#include "src/scan.h"
-#include "src/tables.h"
-#include "src/wedge.h"
-
-static unsigned read_golomb(MsacContext *const msac) {
-    int len = 0;
-    unsigned val = 1;
-
-    while (!msac_decode_bool(msac, 128 << 7) && len < 32) len++;
-    while (len--) val = (val << 1) | msac_decode_bool(msac, 128 << 7);
-
-    return val - 1;
-}
-
-static int decode_coefs(Dav1dTileContext *const t,
-                        uint8_t *const a, uint8_t *const l,
-                        const enum RectTxfmSize tx, const enum BlockSize bs,
-                        const Av1Block *const b, const int intra,
-                        const int plane, coef *cf,
-                        enum TxfmType *const txtp, uint8_t *res_ctx)
-{
-    Dav1dTileState *const ts = t->ts;
-    const int chroma = !!plane;
-    const Dav1dFrameContext *const f = t->f;
-    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
-    const int dbg = DEBUG_BLOCK_INFO && plane && 0;
-
-    if (dbg) printf("Start: r=%d\n", ts->msac.rng);
-
-    // does this block have any non-zero coefficients
-    const int sctx = get_coef_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.p.layout);
-    const int all_skip =
-        msac_decode_bool_adapt(&ts->msac, ts->cdf.coef.skip[t_dim->ctx][sctx]);
-    if (dbg)
-    printf("Post-non-zero[%d][%d][%d]: r=%d\n",
-           t_dim->ctx, sctx, all_skip, ts->msac.rng);
-    if (all_skip) {
-        *res_ctx = 0x40;
-        *txtp = f->frame_hdr.segmentation.lossless[b->seg_id] ? WHT_WHT :
-                                                                DCT_DCT;
-        return -1;
-    }
-
-    // transform type (chroma: derived, luma: explicitly coded)
-    if (chroma) {
-        if (intra) {
-            *txtp = get_uv_intra_txtp(b->uv_mode, tx, &f->frame_hdr, b->seg_id);
-        } else {
-            const enum TxfmType y_txtp = *txtp;
-            *txtp = get_uv_inter_txtp(t_dim, y_txtp, &f->frame_hdr, b->seg_id);
-        }
-    } else {
-        const enum TxfmTypeSet set = get_ext_txtp_set(tx, !intra,
-                                                      &f->frame_hdr, b->seg_id);
-        const unsigned set_cnt = dav1d_tx_type_count[set];
-        unsigned idx;
-        if (set_cnt == 1) {
-            idx = 0;
-        } else {
-            const int set_idx = dav1d_tx_type_set_index[!intra][set];
-            const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
-                dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
-            uint16_t *const txtp_cdf = intra ?
-                       ts->cdf.m.txtp_intra[set_idx][t_dim->min][y_mode_nofilt] :
-                       ts->cdf.m.txtp_inter[set_idx][t_dim->min];
-            idx = msac_decode_symbol_adapt(&ts->msac, txtp_cdf, set_cnt);
-            if (dbg)
-            printf("Post-txtp[%d->%d][%d->%d][%d][%d->%d]: r=%d\n",
-                   set, set_idx, tx, t_dim->min, b->intra ? (int)y_mode_nofilt : -1,
-                   idx, dav1d_tx_types_per_set[set][idx], ts->msac.rng);
-        }
-        *txtp = dav1d_tx_types_per_set[set][idx];
-    }
-
-    // find end-of-block (eob)
-    int eob_bin;
-    const int tx2dszctx = imin(t_dim->lw, TX_32X32) + imin(t_dim->lh, TX_32X32);
-    const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
-    const int is_1d = tx_class != TX_CLASS_2D;
-    switch (tx2dszctx) {
-#define case_sz(sz, bin) \
-    case sz: { \
-        uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma][is_1d]; \
-        eob_bin = msac_decode_symbol_adapt(&ts->msac, eob_bin_cdf, 5 + sz); \
-        break; \
-    }
-    case_sz(0,   16);
-    case_sz(1,   32);
-    case_sz(2,   64);
-    case_sz(3,  128);
-    case_sz(4,  256);
-    case_sz(5,  512);
-    case_sz(6, 1024);
-#undef case_sz
-    }
-    if (dbg)
-    printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n",
-           16 << tx2dszctx, chroma, is_1d, eob_bin, ts->msac.rng);
-    int eob;
-    if (eob_bin > 1) {
-        eob = 1 << (eob_bin - 1);
-        uint16_t *const eob_hi_bit_cdf =
-            ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin];
-        const int eob_hi_bit = msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf);
-        if (dbg)
-        printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n",
-               t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng);
-        unsigned mask = eob >> 1;
-        if (eob_hi_bit) eob |= mask;
-        for (mask >>= 1; mask; mask >>= 1) {
-            const int eob_bit = msac_decode_bool(&ts->msac, 128 << 7);
-            if (eob_bit) eob |= mask;
-        }
-        if (dbg)
-        printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng);
-    } else {
-        eob = eob_bin;
-    }
-
-    // base tokens
-    uint16_t (*const br_cdf)[5] =
-        ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
-    const int16_t *const scan = dav1d_scans[tx][tx_class];
-    uint8_t levels[36 * 36];
-    ptrdiff_t stride = 4 * (imin(t_dim->h, 8) + 1);
-    memset(levels, 0, stride * 4 * (imin(t_dim->w, 8) + 1));
-    const int shift = 2 + imin(t_dim->lh, 3), mask = 4 * imin(t_dim->h, 8) - 1;
-    unsigned cul_level = 0;
-    for (int i = eob, is_last = 1; i >= 0; i--, is_last = 0) {
-        const int rc = scan[i], x = rc >> shift, y = rc & mask;
-
-        // lo tok
-        const int ctx = get_coef_nz_ctx(levels, i, rc, is_last, tx, tx_class);
-        uint16_t *const lo_cdf = is_last ?
-            ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][ctx] :
-            ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx];
-        int tok = msac_decode_symbol_adapt(&ts->msac, lo_cdf,
-                                           4 - is_last) + is_last;
-        if (dbg)
-        printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
-               t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng);
-        if (!tok) continue;
-
-        // hi tok
-        if (tok == 3) {
-            const int br_ctx = get_br_ctx(levels, rc, tx, tx_class);
-            do {
-                const int tok_br =
-                    msac_decode_symbol_adapt(&ts->msac, br_cdf[br_ctx], 4);
-                if (dbg)
-                printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n",
-                       imin(t_dim->ctx, 3), chroma, br_ctx,
-                       i, rc, tok_br, tok, ts->msac.rng);
-                tok += tok_br;
-                if (tok_br < 3) break;
-            } while (tok < 15);
-        }
-
-        levels[x * stride + y] = cf[rc] = tok;
-    }
-
-    // residual and sign
-    int dc_sign = 1;
-    const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
-    const uint8_t *const qm_tbl = f->qm[is_1d || *txtp == IDTX][tx][plane];
-    const int dq_shift = imax(0, t_dim->ctx - 2);
-    for (int i = 0; i <= eob; i++) {
-        const int rc = scan[i];
-        int tok = cf[rc];
-        if (!tok) continue;
-        int dq;
-
-        // sign
-        int sign;
-        if (i == 0) {
-            const int dc_sign_ctx = get_dc_sign_ctx(t_dim, a, l);
-            uint16_t *const dc_sign_cdf =
-                ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
-            sign = msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
-            if (dbg)
-            printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
-                   chroma, dc_sign_ctx, sign, ts->msac.rng);
-            dc_sign = sign ? 0 : 2;
-            dq = (dq_tbl[0] * qm_tbl[0] + 16) >> 5;
-        } else {
-            sign = msac_decode_bool(&ts->msac, 128 << 7);
-            if (dbg)
-            printf("Post-sign[%d=%d=%d]: r=%d\n", i, rc, sign, ts->msac.rng);
-            dq = (dq_tbl[1] * qm_tbl[rc] + 16) >> 5;
-        }
-
-        // residual
-        if (tok == 15) {
-            tok += read_golomb(&ts->msac);
-            if (dbg)
-            printf("Post-residual[%d=%d=%d->%d]: r=%d\n",
-                   i, rc, tok - 15, tok, ts->msac.rng);
-        }
-
-        // dequant
-        cul_level += tok;
-        tok *= dq;
-        tok >>= dq_shift;
-        cf[rc] = sign ? -tok : tok;
-    }
-
-    // context
-    *res_ctx = imin(cul_level, 63) | (dc_sign << 6);
-
-    return eob;
-}
-
-static void read_coef_tree(Dav1dTileContext *const t,
-                           const enum BlockSize bs, const Av1Block *const b,
-                           const enum RectTxfmSize ytx, const int depth,
-                           const uint16_t *const tx_split,
-                           const int x_off, const int y_off, pixel *dst)
-{
-    const Dav1dFrameContext *const f = t->f;
-    Dav1dTileState *const ts = t->ts;
-    const Dav1dDSPContext *const dsp = f->dsp;
-    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx];
-    const int txw = t_dim->w, txh = t_dim->h;
-
-    if (depth < 2 && tx_split[depth] & (1 << (y_off * 4 + x_off))) {
-        const enum RectTxfmSize sub = t_dim->sub;
-        const TxfmInfo *const sub_t_dim = &dav1d_txfm_dimensions[sub];
-        const int txsw = sub_t_dim->w, txsh = sub_t_dim->h;
-
-        read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
-                       x_off * 2 + 0, y_off * 2 + 0, dst);
-        t->bx += txsw;
-        if (txw >= txh && t->bx < f->bw)
-            read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
-                           y_off * 2 + 0, dst ? &dst[4 * txsw] : NULL);
-        t->bx -= txsw;
-        t->by += txsh;
-        if (txh >= txw && t->by < f->bh) {
-            if (dst)
-                dst += 4 * txsh * PXSTRIDE(f->cur.p.stride[0]);
-            read_coef_tree(t, bs, b, sub, depth + 1, tx_split,
-                           x_off * 2 + 0, y_off * 2 + 1, dst);
-            t->bx += txsw;
-            if (txw >= txh && t->bx < f->bw)
-                read_coef_tree(t, bs, b, sub, depth + 1, tx_split, x_off * 2 + 1,
-                               y_off * 2 + 1, dst ? &dst[4 * txsw] : NULL);
-            t->bx -= txsw;
-        }
-        t->by -= txsh;
-    } else {
-        const int bx4 = t->bx & 31, by4 = t->by & 31;
-        enum TxfmType txtp;
-        uint8_t cf_ctx;
-        int eob;
-        coef *cf;
-        struct CodedBlockInfo *cbi;
-
-        if (f->frame_thread.pass) {
-            cf = ts->frame_thread.cf;
-            ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
-            cbi = &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
-        } else {
-            cf = t->cf;
-        }
-        if (f->frame_thread.pass != 2) {
-            eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
-                               ytx, bs, b, 0, 0, cf, &txtp, &cf_ctx);
-            if (DEBUG_BLOCK_INFO)
-                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
-                       ytx, txtp, eob, ts->msac.rng);
-            memset(&t->a->lcoef[bx4], cf_ctx, imin(txw, f->bw - t->bx));
-            memset(&t->l.lcoef[by4], cf_ctx, imin(txh, f->bh - t->by));
-            for (int y = 0; y < txh; y++)
-                memset(&t->txtp_map[(by4 + y) * 32 + bx4], txtp, txw);
-            if (f->frame_thread.pass == 1) {
-                cbi->eob[0] = eob;
-                cbi->txtp[0] = txtp;
-            }
-        } else {
-            eob = cbi->eob[0];
-            txtp = cbi->txtp[0];
-        }
-        if (!(f->frame_thread.pass & 1)) {
-            assert(dst);
-            if (eob >= 0) {
-                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                    coef_dump(cf, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq");
-                dsp->itx.itxfm_add[ytx][txtp](dst, f->cur.p.stride[0], cf, eob);
-                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                    hex_dump(dst, f->cur.p.stride[0], t_dim->w * 4, t_dim->h * 4, "recon");
-            }
-        }
-    }
-}
-
-void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
-                                    const enum BlockSize bs, const Av1Block *const b)
-{
-    const Dav1dFrameContext *const f = t->f;
-    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int bx4 = t->bx & 31, by4 = t->by & 31;
-    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
-    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
-    const int bw4 = b_dim[0], bh4 = b_dim[1];
-    const int cbw4 = (bw4 + 1) >> ss_hor, cbh4 = (bh4 + 1) >> ss_ver;
-    const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
-                           (bw4 > ss_hor || t->bx & 1) &&
-                           (bh4 > ss_ver || t->by & 1);
-
-    if (b->skip) {
-        memset(&t->a->lcoef[bx4], 0x40, bw4);
-        memset(&t->l.lcoef[by4], 0x40, bh4);
-        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
-            memset(&t->a->ccoef[pl][cbx4], 0x40, cbw4);
-            memset(&t->l.ccoef[pl][cby4], 0x40, cbh4);
-        }
-        return;
-    }
-
-    Dav1dTileState *const ts = t->ts;
-    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
-    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
-    assert(f->frame_thread.pass == 1);
-    assert(!b->skip);
-    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
-    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
-
-    for (int init_y = 0; init_y < h4; init_y += 16) {
-        for (int init_x = 0; init_x < w4; init_x += 16) {
-            const int sub_h4 = imin(h4, 16 + init_y);
-            const int sub_w4 = imin(w4, init_x + 16);
-            int y_off = !!init_y, y, x;
-            for (y = init_y, t->by += init_y; y < sub_h4;
-                 y += t_dim->h, t->by += t_dim->h, y_off++)
-            {
-                struct CodedBlockInfo *const cbi =
-                    &f->frame_thread.cbi[t->by * f->b4_stride];
-                int x_off = !!init_x;
-                for (x = init_x, t->bx += init_x; x < sub_w4;
-                     x += t_dim->w, t->bx += t_dim->w, x_off++)
-                {
-                    if (!b->intra) {
-                        read_coef_tree(t, bs, b, b->max_ytx, 0, b->tx_split,
-                                       x_off, y_off, NULL);
-                    } else {
-                        uint8_t cf_ctx = 0x40;
-                        enum TxfmType txtp;
-                        const int eob = cbi[t->bx].eob[0] =
-                            decode_coefs(t, &t->a->lcoef[bx4 + x],
-                                         &t->l.lcoef[by4 + y], b->tx, bs, b, 1,
-                                         0, ts->frame_thread.cf, &txtp, &cf_ctx);
-                        if (DEBUG_BLOCK_INFO)
-                            printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
-                                   b->tx, txtp, eob, ts->msac.rng);
-                        cbi[t->bx].txtp[0] = txtp;
-                        ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
-                        memset(&t->a->lcoef[bx4 + x], cf_ctx,
-                               imin(t_dim->w, f->bw - t->bx));
-                        memset(&t->l.lcoef[by4 + y], cf_ctx,
-                               imin(t_dim->h, f->bh - t->by));
-                    }
-                }
-                t->bx -= x;
-            }
-            t->by -= y;
-
-            if (!has_chroma) continue;
-
-            const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
-            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
-            for (int pl = 0; pl < 2; pl++) {
-                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
-                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
-                {
-                    struct CodedBlockInfo *const cbi =
-                        &f->frame_thread.cbi[t->by * f->b4_stride];
-                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
-                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
-                    {
-                        uint8_t cf_ctx = 0x40;
-                        enum TxfmType txtp;
-                        if (!b->intra)
-                            txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 +
-                                                bx4 + (x << ss_hor)];
-                        const int eob = cbi[t->bx].eob[1 + pl] =
-                            decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
-                                         &t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
-                                         b, b->intra, 1 + pl, ts->frame_thread.cf,
-                                         &txtp, &cf_ctx);
-                        if (DEBUG_BLOCK_INFO)
-                            printf("Post-uv-cf-blk[pl=%d,tx=%d,"
-                                   "txtp=%d,eob=%d]: r=%d\n",
-                                   pl, b->uvtx, txtp, eob, ts->msac.rng);
-                        cbi[t->bx].txtp[1 + pl] = txtp;
-                        ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
-                        memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
-                               imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
-                        memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
-                               imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
-                    }
-                    t->bx -= x << ss_hor;
-                }
-                t->by -= y << ss_ver;
-            }
-        }
-    }
-}
-
-static void emu_edge(pixel *dst, const ptrdiff_t dst_stride,
-                     const pixel *ref, const ptrdiff_t ref_stride,
-                     const int bw, const int bh,
-                     const int iw, const int ih,
-                     const int x, const int y)
-{
-    // find offset in reference of visible block to copy
-    ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + iclip(x, 0, iw - 1);
-
-    // number of pixels to extend (left, right, top, bottom)
-    const int left_ext = iclip(-x, 0, bw - 1);
-    const int right_ext = iclip(x + bw - iw, 0, bw - 1);
-    assert(left_ext + right_ext < bw);
-    const int top_ext = iclip(-y, 0, bh - 1);
-    const int bottom_ext = iclip(y + bh - ih, 0, bh - 1);
-    assert(top_ext + bottom_ext < bh);
-
-    // copy visible portion first
-    pixel *blk = dst + top_ext * PXSTRIDE(dst_stride);
-    const int center_w = bw - left_ext - right_ext;
-    const int center_h = bh - top_ext - bottom_ext;
-    for (int y = 0; y < center_h; y++) {
-        pixel_copy(blk + left_ext, ref, center_w);
-        // extend left edge for this line
-        if (left_ext)
-            pixel_set(blk, blk[left_ext], left_ext);
-        // extend right edge for this line
-        if (right_ext)
-            pixel_set(blk + left_ext + center_w, blk[left_ext + center_w - 1],
-                      right_ext);
-        ref += PXSTRIDE(ref_stride);
-        blk += PXSTRIDE(dst_stride);
-    }
-
-    // copy top
-    blk = dst + top_ext * PXSTRIDE(dst_stride);
-    for (int y = 0; y < top_ext; y++) {
-        pixel_copy(dst, blk, bw);
-        dst += PXSTRIDE(dst_stride);
-    }
-
-    // copy bottom
-    dst += center_h * PXSTRIDE(dst_stride);
-    for (int y = 0; y < bottom_ext; y++) {
-        pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], bw);
-        dst += PXSTRIDE(dst_stride);
-    }
-}
-
-static void mc(Dav1dTileContext *const t,
-               pixel *const dst8, coef *const dst16, const ptrdiff_t dst_stride,
-               const int bw4, const int bh4,
-               const int bx, const int by, const int pl,
-               const mv mv, const Dav1dThreadPicture *const refp,
-               const enum Filter2d filter_2d)
-{
-    assert((dst8 != NULL) ^ (dst16 != NULL));
-    const Dav1dFrameContext *const f = t->f;
-    const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
-    const int mvx = mv.x, mvy = mv.y;
-    const int mx = mvx & (15 >> !ss_hor), my = mvy & (15 >> !ss_ver);
-    const int dx = bx * h_mul + (mvx >> (3 + ss_hor));
-    const int dy = by * v_mul + (mvy >> (3 + ss_ver));
-    ptrdiff_t ref_stride = refp->p.stride[!!pl];
-    const pixel *ref;
-    int w, h;
-
-    if (refp != &f->cur) { // i.e. not for intrabc
-        dav1d_thread_picture_wait(refp, dy + bh4 * v_mul + !!my * 4,
-                                  PLANE_TYPE_Y + !!pl);
-        w = (f->cur.p.p.w + ss_hor) >> ss_hor;
-        h = (f->cur.p.p.h + ss_ver) >> ss_ver;
-    } else {
-        w = f->bw * 4 >> ss_hor;
-        h = f->bh * 4 >> ss_ver;
-    }
-    if (dx < !!mx * 3 || dy < !!my * 3 ||
-        dx + bw4 * h_mul + !!mx * 4 > w ||
-        dy + bh4 * v_mul + !!my * 4 > h)
-    {
-        emu_edge(t->emu_edge, 160 * sizeof(pixel), refp->p.data[pl], ref_stride,
-                 bw4 * h_mul + !!mx * 7, bh4 * v_mul + !!my * 7, w, h,
-                 dx - !!mx * 3, dy - !!my * 3);
-        ref = &t->emu_edge[160 * !!my * 3 + !!mx * 3];
-        ref_stride = 160 * sizeof(pixel);
-    } else {
-        ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
-    }
-
-    if (dst8 != NULL) {
-        f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride, bw4 * h_mul,
-                                 bh4 * v_mul, mx << !ss_hor, my << !ss_ver);
-    } else {
-        f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride, bw4 * h_mul,
-                                  bh4 * v_mul, mx << !ss_hor, my << !ss_ver);
-    }
-}
-
-static void obmc(Dav1dTileContext *const t,
-                 pixel *const dst, const ptrdiff_t dst_stride,
-                 const uint8_t *const b_dim, const int pl,
-                 const int bx4, const int by4, const int w4, const int h4)
-{
-    assert(!(t->bx & 1) && !(t->by & 1));
-    const Dav1dFrameContext *const f = t->f;
-    const refmvs *const r = &f->mvs[t->by * f->b4_stride + t->bx];
-    pixel *const lap = t->scratch.lap;
-    static const uint8_t obmc_mask_2[2] = { 19,  0 };
-    static const uint8_t obmc_mask_4[4] = { 25, 14,  5,  0 };
-    static const uint8_t obmc_mask_8[8] = { 28, 22, 16, 11,  7,  3,  0,  0 };
-    static const uint8_t obmc_mask_16[16] = { 30, 27, 24, 21, 18, 15, 12, 10,
-                                               8,  6,  4,  3,  0,  0,  0,  0 };
-    static const uint8_t obmc_mask_32[32] = { 31, 29, 28, 26, 24, 23, 21, 20,
-                                              19, 17, 16, 14, 13, 12, 11,  9,
-                                               8,  7,  6,  5,  4,  4,  3,  2,
-                                               0,  0,  0,  0,  0,  0,  0,  0 };
-    static const uint8_t *const obmc_masks[] = {
-        obmc_mask_2, obmc_mask_4, obmc_mask_8, obmc_mask_16, obmc_mask_32
-    };
-    const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
-
-    if (t->by > t->ts->tiling.row_start &&
-        (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16))
-    {
-        for (int i = 0, x = 0; x < w4 && i < imin(b_dim[2], 4); ) {
-            // only odd blocks are considered for overlap handling, hence +1
-            const refmvs *const a_r = &r[x - f->b4_stride + 1];
-            const uint8_t *const a_b_dim =
-                dav1d_block_dimensions[sbtype_to_bs[a_r->sb_type]];
-
-            if (a_r->ref[0] > 0) {
-                mc(t, lap, NULL, 128 * sizeof(pixel),
-                   iclip(a_b_dim[0], 2, b_dim[0]), imin(b_dim[1], 16) >> 1,
-                   t->bx + x, t->by, pl, a_r->mv[0],
-                   &f->refp[a_r->ref[0] - 1],
-                   dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
-                f->dsp->mc.blend(&dst[x * h_mul], dst_stride,
-                                 lap, 128 * sizeof(pixel),
-                                 h_mul * iclip(a_b_dim[0], 2, b_dim[0]),
-                                 v_mul * imin(b_dim[1], 16) >> 1,
-                                 obmc_masks[imin(b_dim[3], 4) - ss_ver], 1);
-                i++;
-            }
-            x += imax(a_b_dim[0], 2);
-        }
-    }
-
-    if (t->bx > t->ts->tiling.col_start)
-        for (int i = 0, y = 0; y < h4 && i < imin(b_dim[3], 4); ) {
-            // only odd blocks are considered for overlap handling, hence +1
-            const refmvs *const l_r = &r[(y + 1) * f->b4_stride - 1];
-            const uint8_t *const l_b_dim =
-                dav1d_block_dimensions[sbtype_to_bs[l_r->sb_type]];
-
-            if (l_r->ref[0] > 0) {
-                mc(t, lap, NULL, 32 * sizeof(pixel),
-                   imin(b_dim[0], 16) >> 1,
-                   iclip(l_b_dim[1], 2, b_dim[1]),
-                   t->bx, t->by + y, pl, l_r->mv[0],
-                   &f->refp[l_r->ref[0] - 1],
-                   dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
-                f->dsp->mc.blend(&dst[y * v_mul * PXSTRIDE(dst_stride)], dst_stride,
-                                 lap, 32 * sizeof(pixel),
-                                 h_mul * imin(b_dim[0], 16) >> 1,
-                                 v_mul * iclip(l_b_dim[1], 2, b_dim[1]),
-                                 obmc_masks[imin(b_dim[2], 4) - ss_hor], 0);
-                i++;
-            }
-            y += imax(l_b_dim[1], 2);
-        }
-}
-
-static void warp_affine(Dav1dTileContext *const t,
-                        pixel *dst8, coef *dst16, const ptrdiff_t dstride,
-                        const uint8_t *const b_dim, const int pl,
-                        const Dav1dThreadPicture *const refp,
-                        const WarpedMotionParams *const wmp)
-{
-    assert((dst8 != NULL) ^ (dst16 != NULL));
-    const Dav1dFrameContext *const f = t->f;
-    const Dav1dDSPContext *const dsp = f->dsp;
-    const int ss_ver = !!pl && f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = !!pl && f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
-    assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
-    const int32_t *const mat = wmp->matrix;
-    const int width = (f->cur.p.p.w + ss_hor) >> ss_hor;
-    const int height = (f->cur.p.p.h + ss_ver) >> ss_ver;
-
-    for (int y = 0; y < b_dim[1] * v_mul; y += 8) {
-        for (int x = 0; x < b_dim[0] * h_mul; x += 8) {
-            // calculate transformation relative to center of 8x8 block in
-            // luma pixel units
-            const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
-            const int src_y = t->by * 4 + ((y + 4) << ss_ver);
-            const int mvx = (mat[2] * src_x + mat[3] * src_y + mat[0]) >> ss_hor;
-            const int mvy = (mat[4] * src_x + mat[5] * src_y + mat[1]) >> ss_ver;
-
-            const int dx = (mvx >> 16) - 4;
-            const int mx = ((mvx & 0xffff) - wmp->alpha * 4 -
-                                             wmp->beta  * 7) & ~0x3f;
-            const int dy = (mvy >> 16) - 4;
-            const int my = ((mvy & 0xffff) - wmp->gamma * 4 -
-                                             wmp->delta * 4) & ~0x3f;
-
-            const pixel *ref_ptr;
-            ptrdiff_t ref_stride = refp->p.stride[!!pl];
-
-            dav1d_thread_picture_wait(refp, dy + 4 + 8,
-                                      PLANE_TYPE_Y + !!pl);
-            if (dx < 3 || dx + 8 + 4 > width || dy < 3 || dy + 8 + 4 > height) {
-                emu_edge(t->emu_edge, 160 * sizeof(pixel), refp->p.data[pl],
-                         ref_stride, 15, 15, width, height, dx - 3, dy - 3);
-                ref_ptr = &t->emu_edge[160 * 3 + 3];
-                ref_stride = 160 * sizeof(pixel);
-            } else {
-                ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
-            }
-            if (dst16 != NULL)
-                dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
-                                 wmp->abcd, mx, my);
-            else
-                dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
-                                wmp->abcd, mx, my);
-        }
-        if (dst8) dst8  += 8 * PXSTRIDE(dstride);
-        else      dst16 += 8 * dstride;
-    }
-}
-
-void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize bs,
-                                 const enum EdgeFlags intra_edge_flags,
-                                 const Av1Block *const b)
-{ /* Reconstructs intra block b of size bs at (t->bx, t->by) into f->cur.
- * Positions and sizes named *4 are in units of 4 pixels (they are multiplied
- * by 4 below to form pixel offsets and sizes). w4/h4 are bw4/bh4 clipped
- * against f->bw/f->bh; ss_hor/ss_ver are the chroma shifts derived from the
- * pixel layout; cw4/ch4 and cbw4/cbh4 are the chroma equivalents (rounded up).
- *
- * The block is walked in sub-regions of at most 16x16 units. Per sub-region:
- *  - luma: optional palette prediction (b->pal_sz[0]), then for every
- *    transform block (b->tx) edge preparation + intra prediction, followed
- *    by the residual (itxfm_add) unless b->skip is set;
- *  - chroma (only if has_chroma): CfL prediction (b->uv_mode == CFL_PRED) or
- *    palette prediction (b->pal_sz[1]), then the same per-transform-block
- *    loop (b->uvtx) for both chroma planes.
- *
- * When f->frame_thread.pass is non-zero, coefficients, eob/txtp and palette
- * data are read from the frame_thread buffers; otherwise coefficients are
- * decoded here via decode_coefs() and the above/left coefficient contexts
- * (t->a, t->l) are updated.
- *
- * t->bx / t->by are temporarily advanced inside the loops and restored
- * after each loop.
- *
- * intra_edge_flags: block-level top-right / bottom-left availability bits;
- * they are narrowed per transform block before dav1d_prepare_intra_edges().
- */
-    Dav1dTileState *const ts = t->ts;
-    const Dav1dFrameContext *const f = t->f;
-    const Dav1dDSPContext *const dsp = f->dsp;
-    const int bx4 = t->bx & 31, by4 = t->by & 31; // low 5 bits: index into t->a / t->l arrays
-    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver; // chroma versions of bx4/by4
-    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
-    const int bw4 = b_dim[0], bh4 = b_dim[1];
-    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
-    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
-    const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
-                           (bw4 > ss_hor || t->bx & 1) &&
-                           (bh4 > ss_ver || t->by & 1);
-    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx]; // luma transform size
-    const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx]; // chroma transform size
-
-    // coefficient coding and prediction; edge_buf is the scratch buffer for the intra edge pixels
-    ALIGN_STK_32(pixel, edge_buf, 257,);
-    pixel *const edge = edge_buf + 128; // edge[0] = top-left; left pixels before it, top after (per the dumps below)
-    const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
-
-    for (int init_y = 0; init_y < h4; init_y += 16) { // 16 units = 64 pixels per step
-        for (int init_x = 0; init_x < w4; init_x += 16) {
-            if (b->pal_sz[0]) { // luma palette prediction
-                pixel *dst = ((pixel *) f->cur.p.data[0]) +
-                             4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) + t->bx);
-                const uint8_t *pal_idx;
-                if (f->frame_thread.pass) { // indices come from the frame-thread buffer
-                    pal_idx = ts->frame_thread.pal_idx;
-                    ts->frame_thread.pal_idx += bw4 * bh4 * 16; // one byte per luma pixel
-                } else {
-                    pal_idx = t->scratch.pal_idx;
-                }
-                const uint16_t *const pal = f->frame_thread.pass ?
-                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
-                                        ((t->bx >> 1) + (t->by & 1))][0] : t->pal[0];
-                f->dsp->ipred.pal_pred(dst, f->cur.p.stride[0], pal,
-                                       pal_idx, bw4 * 4, bh4 * 4);
-                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                    hex_dump(dst, PXSTRIDE(f->cur.p.stride[0]),
-                             bw4 * 4, bh4 * 4, "y-pal-pred");
-            }
-
-            const int sm_fl = sm_flag(t->a, bx4) | sm_flag(&t->l, by4);
-            const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 : // top-right usable for this sub-region?
-                              intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
-            const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 : // bottom-left usable for this sub-region?
-                              intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
-            int y, x;
-            const int sub_h4 = imin(h4, 16 + init_y); // end of this sub-region, clipped to the block
-            const int sub_w4 = imin(w4, init_x + 16);
-            for (y = init_y, t->by += init_y; y < sub_h4; // t->by advances in step with y
-                 y += t_dim->h, t->by += t_dim->h)
-            {
-                pixel *dst = ((pixel *) f->cur.p.data[0]) +
-                               4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) +
-                                    t->bx + init_x);
-                for (x = init_x, t->bx += init_x; x < sub_w4; // t->bx advances in step with x
-                     x += t_dim->w, t->bx += t_dim->w)
-                {
-                    if (b->pal_sz[0]) goto skip_y_pred; // already predicted by pal_pred above
-
-                    int angle = b->y_angle;
-                    const enum EdgeFlags edge_flags = // per-transform-block availability flags
-                        (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?
-                             0 : EDGE_I444_TOP_HAS_RIGHT) |
-                        ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?
-                             0 : EDGE_I444_LEFT_HAS_BOTTOM);
-                    const pixel *top_sb_edge = NULL;
-                    if (!(t->by & (f->sb_step - 1))) { // t->by aligned to sb_step: top edge taken from f->ipred_edge
-                        top_sb_edge = f->ipred_edge[0];
-                        const int sby = t->by >> f->sb_shift;
-                        top_sb_edge += f->sb128w * 128 * (sby - 1); // select entry sby - 1
-                    }
-                    const enum IntraPredMode m =
-                        bytefn(dav1d_prepare_intra_edges)(t->bx,
-                                                          t->bx > ts->tiling.col_start,
-                                                          t->by,
-                                                          t->by > ts->tiling.row_start,
-                                                          ts->tiling.col_end,
-                                                          ts->tiling.row_end,
-                                                          edge_flags, dst,
-                                                          f->cur.p.stride[0], top_sb_edge,
-                                                          b->y_mode, &angle,
-                                                          t_dim->w, t_dim->h, edge);
-                    dsp->ipred.intra_pred[m](dst, f->cur.p.stride[0], edge,
-                                             t_dim->w * 4, t_dim->h * 4,
-                                             angle | sm_fl);
-
-                    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
-                        hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
-                                 t_dim->h * 4, 2, "l");
-                        hex_dump(edge, 0, 1, 1, "tl");
-                        hex_dump(edge + 1, t_dim->w * 4,
-                                 t_dim->w * 4, 2, "t");
-                        hex_dump(dst, f->cur.p.stride[0],
-                                 t_dim->w * 4, t_dim->h * 4, "y-intra-pred");
-                    }
-
-                skip_y_pred: {} // residual handling follows for predicted and palette blocks alike
-                    if (!b->skip) {
-                        coef *cf;
-                        int eob;
-                        enum TxfmType txtp;
-                        if (f->frame_thread.pass) { // read stored coefs/eob/txtp (presumably from an earlier pass - confirm)
-                            cf = ts->frame_thread.cf;
-                            ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
-                            const struct CodedBlockInfo *const cbi =
-                                &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
-                            eob = cbi->eob[0];
-                            txtp = cbi->txtp[0];
-                        } else { // decode the coefficients now
-                            uint8_t cf_ctx;
-                            cf = t->cf;
-                            eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
-                                               &t->l.lcoef[by4 + y], b->tx, bs,
-                                               b, 1, 0, cf, &txtp, &cf_ctx);
-                            if (DEBUG_BLOCK_INFO)
-                                printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
-                                       b->tx, txtp, eob, ts->msac.rng);
-                            memset(&t->a->lcoef[bx4 + x], cf_ctx, // store cf_ctx in above ctx, clipped to frame width
-                                   imin(t_dim->w, f->bw - t->bx));
-                            memset(&t->l.lcoef[by4 + y], cf_ctx, // store cf_ctx in left ctx, clipped to frame height
-                                   imin(t_dim->h, f->bh - t->by));
-                        }
-                        if (eob >= 0) { // NOTE(review): negative eob appears to mean no coefficients - confirm
-                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                                coef_dump(cf, imin(t_dim->h, 8) * 4,
-                                          imin(t_dim->w, 8) * 4, 3, "dq");
-                            dsp->itx.itxfm_add[b->tx]
-                                              [txtp](dst,
-                                                     f->cur.p.stride[0],
-                                                     cf, eob);
-                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                                hex_dump(dst, f->cur.p.stride[0],
-                                         t_dim->w * 4, t_dim->h * 4, "recon");
-                        }
-                    } else if (!f->frame_thread.pass) { // skipped block: set the coefficient contexts to 0x40
-                        memset(&t->a->lcoef[bx4 + x], 0x40, t_dim->w);
-                        memset(&t->l.lcoef[by4 + y], 0x40, t_dim->h);
-                    }
-                    dst += 4 * t_dim->w;
-                }
-                t->bx -= x; // undo the advance made by the x loop
-            }
-            t->by -= y; // undo the advance made by the y loop
-
-            if (!has_chroma) continue;
-
-            const ptrdiff_t stride = f->cur.p.stride[1];
-
-            if (b->uv_mode == CFL_PRED) { // chroma predicted from the reconstructed luma (cfl_ac + cfl_pred)
-                assert(!init_x && !init_y);
-
-                int16_t *const ac = t->scratch.ac;
-                pixel *y_src = ((pixel *) f->cur.p.data[0]) + 4 * (t->bx & ~ss_hor) +
-                                 4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.p.stride[0]);
-                const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
-                                              (t->by >> ss_ver) * PXSTRIDE(stride));
-                pixel *const uv_dst[2] = { ((pixel *) f->cur.p.data[1]) + uv_off,
-                                           ((pixel *) f->cur.p.data[2]) + uv_off };
-
-                const int furthest_r = // (cw4 << ss_hor) rounded up to t_dim->w (power of 2 assumed)
-                    ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
-                const int furthest_b = // (ch4 << ss_ver) rounded up to t_dim->h (power of 2 assumed)
-                    ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
-                dsp->ipred.cfl_ac[f->cur.p.p.layout - 1]
-                                 [b->uvtx](ac, y_src, f->cur.p.stride[0],
-                                           cbw4 - (furthest_r >> ss_hor), // presumably right padding - confirm
-                                           cbh4 - (furthest_b >> ss_ver)); // presumably bottom padding - confirm
-                for (int pl = 0; pl < 2; pl++) {
-                    if (!b->cfl_alpha[pl]) continue; // zero alpha: plane is predicted by the generic loop below
-                    int angle = 0;
-                    const pixel *top_sb_edge = NULL;
-                    if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
-                        top_sb_edge = f->ipred_edge[pl + 1];
-                        const int sby = t->by >> f->sb_shift;
-                        top_sb_edge += f->sb128w * 128 * (sby - 1);
-                    }
-                    const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
-                    const int xstart = ts->tiling.col_start >> ss_hor;
-                    const int ystart = ts->tiling.row_start >> ss_ver;
-                    const enum IntraPredMode m =
-                        bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
-                                                          ypos, ypos > ystart,
-                                                          ts->tiling.col_end >> ss_hor,
-                                                          ts->tiling.row_end >> ss_ver,
-                                                          0, uv_dst[pl], stride,
-                                                          top_sb_edge, DC_PRED, &angle,
-                                                          uv_t_dim->w,
-                                                          uv_t_dim->h, edge);
-                    dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
-                                           uv_t_dim->w * 4,
-                                           uv_t_dim->h * 4,
-                                           ac, b->cfl_alpha[pl]);
-                }
-                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
-                    ac_dump(ac, 4*cbw4, 4*cbh4, "ac");
-                    hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred");
-                    hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");
-                }
-            } else if (b->pal_sz[1]) { // chroma palette prediction
-                ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
-                                           (t->by >> ss_ver) * PXSTRIDE(f->cur.p.stride[1]));
-                const uint8_t *pal_idx;
-                if (f->frame_thread.pass) {
-                    pal_idx = ts->frame_thread.pal_idx;
-                    ts->frame_thread.pal_idx += cbw4 * cbh4 * 16; // one byte per chroma pixel
-                } else {
-                    pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16]; // starts right after the bw4*bh4*16 luma entries
-                }
-                const uint16_t *const pal_u = f->frame_thread.pass ?
-                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
-                                        ((t->bx >> 1) + (t->by & 1))][1] : t->pal[1];
-                f->dsp->ipred.pal_pred(((pixel *) f->cur.p.data[1]) + uv_dstoff,
-                                       f->cur.p.stride[1], pal_u,
-                                       pal_idx, cbw4 * 4, cbh4 * 4);
-                const uint16_t *const pal_v = f->frame_thread.pass ?
-                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
-                                        ((t->bx >> 1) + (t->by & 1))][2] : t->pal[2];
-                f->dsp->ipred.pal_pred(((pixel *) f->cur.p.data[2]) + uv_dstoff,
-                                       f->cur.p.stride[1], pal_v,
-                                       pal_idx, cbw4 * 4, cbh4 * 4); // same index map as U, different palette
-                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
-                    hex_dump(((pixel *) f->cur.p.data[1]) + uv_dstoff,
-                             PXSTRIDE(f->cur.p.stride[1]),
-                             cbw4 * 4, cbh4 * 4, "u-pal-pred");
-                    hex_dump(((pixel *) f->cur.p.data[2]) + uv_dstoff,
-                             PXSTRIDE(f->cur.p.stride[1]),
-                             cbw4 * 4, cbh4 * 4, "v-pal-pred");
-                }
-            }
-
-            const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |
-                                 sm_uv_flag(&t->l, cby4);
-            const int uv_sb_has_tr = // chroma counterpart of sb_has_tr
-                ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
-                intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.p.layout - 1));
-            const int uv_sb_has_bl = // chroma counterpart of sb_has_bl
-                init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
-                intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.p.layout - 1));
-            const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
-            const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
-            for (int pl = 0; pl < 2; pl++) { // pl 0 = U, pl 1 = V (see the dump labels below)
-                for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4; // y in chroma units, t->by in luma units
-                     y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
-                {
-                    pixel *dst = ((pixel *) f->cur.p.data[1 + pl]) +
-                                   4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
-                                        ((t->bx + init_x) >> ss_hor));
-                    for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4; // x in chroma units, t->bx in luma units
-                         x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
-                    {
-                        if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||
-                            b->pal_sz[1])
-                        {
-                            goto skip_uv_pred; // already predicted above (CfL or palette)
-                        }
-
-                        int angle = b->uv_angle;
-                        // this probably looks weird because we're using
-                        // luma flags in a chroma loop, but that's because
-                        // prepare_intra_edges() expects luma flags as input
-                        const enum EdgeFlags edge_flags =
-                            (((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&
-                              (x + uv_t_dim->w >= sub_cw4)) ?
-                                 0 : EDGE_I444_TOP_HAS_RIGHT) |
-                            ((x > (init_x >> ss_hor) ||
-                              (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?
-                                 0 : EDGE_I444_LEFT_HAS_BOTTOM);
-                        const pixel *top_sb_edge = NULL;
-                        if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
-                            top_sb_edge = f->ipred_edge[1 + pl];
-                            const int sby = t->by >> f->sb_shift;
-                            top_sb_edge += f->sb128w * 128 * (sby - 1);
-                        }
-                        const enum IntraPredMode uv_mode = // CFL_PRED reaching here has zero alpha: use DC_PRED
-                             b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;
-                        const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
-                        const int xstart = ts->tiling.col_start >> ss_hor;
-                        const int ystart = ts->tiling.row_start >> ss_ver;
-                        const enum IntraPredMode m =
-                            bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
-                                                              ypos, ypos > ystart,
-                                                              ts->tiling.col_end >> ss_hor,
-                                                              ts->tiling.row_end >> ss_ver,
-                                                              edge_flags, dst, stride,
-                                                              top_sb_edge, uv_mode,
-                                                              &angle, uv_t_dim->w,
-                                                              uv_t_dim->h, edge);
-                        dsp->ipred.intra_pred[m](dst, stride, edge,
-                                                 uv_t_dim->w * 4,
-                                                 uv_t_dim->h * 4,
-                                                 angle | sm_uv_fl);
-                        if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
-                            hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
-                                     uv_t_dim->h * 4, 2, "l");
-                            hex_dump(edge, 0, 1, 1, "tl");
-                            hex_dump(edge + 1, uv_t_dim->w * 4,
-                                     uv_t_dim->w * 4, 2, "t");
-                            hex_dump(dst, stride, uv_t_dim->w * 4,
-                                     uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");
-                        }
-
-                    skip_uv_pred: {} // chroma residual handling, mirrors the luma path
-                        if (!b->skip) {
-                            enum TxfmType txtp;
-                            int eob;
-                            coef *cf;
-                            if (f->frame_thread.pass) { // read stored coefs/eob/txtp for plane pl + 1
-                                cf = ts->frame_thread.cf;
-                                ts->frame_thread.cf += uv_t_dim->w * uv_t_dim->h * 16;
-                                const struct CodedBlockInfo *const cbi =
-                                    &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
-                                eob = cbi->eob[pl + 1];
-                                txtp = cbi->txtp[pl + 1];
-                            } else { // decode the coefficients now
-                                uint8_t cf_ctx;
-                                cf = t->cf;
-                                eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
-                                                   &t->l.ccoef[pl][cby4 + y],
-                                                   b->uvtx, bs, b, 1, 1 + pl, cf,
-                                                   &txtp, &cf_ctx);
-                                if (DEBUG_BLOCK_INFO)
-                                    printf("Post-uv-cf-blk[pl=%d,tx=%d,"
-                                           "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
-                                           pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
-                                memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx, // clipped to the frame width in chroma units
-                                       imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor));
-                                memset(&t->l.ccoef[pl][cby4 + y], cf_ctx, // clipped to the frame height in chroma units
-                                       imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver));
-                            }
-                            if (eob >= 0) { // NOTE(review): negative eob appears to mean no coefficients - confirm
-                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                                    coef_dump(cf, uv_t_dim->h * 4,
-                                              uv_t_dim->w * 4, 3, "dq");
-                                dsp->itx.itxfm_add[b->uvtx]
-                                                  [txtp](dst, stride,
-                                                         cf, eob);
-                                if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                                    hex_dump(dst, stride, uv_t_dim->w * 4,
-                                             uv_t_dim->h * 4, "recon");
-                            }
-                        } else if (!f->frame_thread.pass) { // skipped block: set the chroma contexts to 0x40
-                            memset(&t->a->ccoef[pl][cbx4 + x], 0x40, uv_t_dim->w);
-                            memset(&t->l.ccoef[pl][cby4 + y], 0x40, uv_t_dim->h);
-                        }
-                        dst += uv_t_dim->w * 4;
-                    }
-                    t->bx -= x << ss_hor; // undo the advance made by the x loop (luma units)
-                }
-                t->by -= y << ss_ver; // undo the advance made by the y loop (luma units)
-            }
-        }
-    }
-}
-
-void bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize bs,
-                                 const Av1Block *const b)
-{
-    Dav1dTileState *const ts = t->ts;
-    const Dav1dFrameContext *const f = t->f;
-    const Dav1dDSPContext *const dsp = f->dsp;
-    const int bx4 = t->bx & 31, by4 = t->by & 31;
-    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
-    const uint8_t *const b_dim = dav1d_block_dimensions[bs];
-    const int bw4 = b_dim[0], bh4 = b_dim[1];
-    const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
-    const int has_chroma = f->seq_hdr.layout != DAV1D_PIXEL_LAYOUT_I400 &&
-                           (bw4 > ss_hor || t->bx & 1) &&
-                           (bh4 > ss_ver || t->by & 1);
-    const int chr_layout_idx = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
-                               DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.p.layout;
-
-    // prediction
-    const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
-    pixel *dst = ((pixel *) f->cur.p.data[0]) +
-        4 * (t->by * PXSTRIDE(f->cur.p.stride[0]) + t->bx);
-    const ptrdiff_t uvdstoff =
-        4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.p.stride[1]));
-    if (!(f->frame_hdr.frame_type & 1)) {
-        // intrabc
-        mc(t, dst, NULL, f->cur.p.stride[0],
-           bw4, bh4, t->bx, t->by, 0, b->mv[0], &f->cur, FILTER_2D_BILINEAR);
-        if (has_chroma) for (int pl = 1; pl < 3; pl++)
-            mc(t, ((pixel *) f->cur.p.data[pl]) + uvdstoff, NULL, f->cur.p.stride[1],
-               bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
-               t->bx & ~ss_hor, t->by & ~ss_ver,
-               pl, b->mv[0], &f->cur, FILTER_2D_BILINEAR);
-    } else if (b->comp_type == COMP_INTER_NONE) {
-        const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
-        const enum Filter2d filter_2d = b->filter2d;
-
-        if (imin(bw4, bh4) > 1 && !f->frame_hdr.force_integer_mv &&
-            ((b->inter_mode == GLOBALMV &&
-              f->frame_hdr.gmv[b->ref[0]].type > WM_TYPE_TRANSLATION) ||
-             (b->motion_mode == MM_WARP &&
-              t->warpmv.type > WM_TYPE_TRANSLATION)))
-        {
-            warp_affine(t, dst, NULL, f->cur.p.stride[0], b_dim, 0, refp,
-                        b->motion_mode == MM_WARP ? &t->warpmv :
-                            &f->frame_hdr.gmv[b->ref[0]]);
-        } else {
-            mc(t, dst, NULL, f->cur.p.stride[0],
-               bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, filter_2d);
-            if (b->motion_mode == MM_OBMC)
-                obmc(t, dst, f->cur.p.stride[0], b_dim, 0, bx4, by4, w4, h4);
-        }
-        if (b->interintra_type) {
-            ALIGN_STK_32(pixel, tl_edge_buf, 65,);
-            pixel *const tl_edge = tl_edge_buf + 32;
-            enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
-                                   SMOOTH_PRED : b->interintra_mode;
-            pixel *const tmp = t->scratch.interintra;
-            int angle = 0;
-            const pixel *top_sb_edge = NULL;
-            if (!(t->by & (f->sb_step - 1))) {
-                top_sb_edge = f->ipred_edge[0];
-                const int sby = t->by >> f->sb_shift;
-                top_sb_edge += f->sb128w * 128 * (sby - 1);
-            }
-            m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
-                                                  t->by, t->by > ts->tiling.row_start,
-                                                  ts->tiling.col_end, ts->tiling.row_end,
-                                                  0, dst, f->cur.p.stride[0], top_sb_edge,
-                                                  m, &angle, bw4, bh4, tl_edge);
-            dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
-                                     tl_edge, bw4 * 4, bh4 * 4, 0);
-            const uint8_t *const ii_mask =
-                b->interintra_type == INTER_INTRA_BLEND ?
-                     dav1d_ii_masks[bs][0][b->interintra_mode] :
-                     dav1d_wedge_masks[bs][0][0][b->wedge_idx];
-            dsp->mc.blend(dst, f->cur.p.stride[0], tmp, bw4 * 4 * sizeof(pixel),
-                          bw4 * 4, bh4 * 4, ii_mask, bw4 * 4);
-        }
-
-        if (!has_chroma) goto skip_inter_chroma_pred;
-
-        // sub8x8 derivation
-        int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
-        refmvs *r;
-        if (is_sub8x8) {
-            assert(ss_hor == 1);
-            r = &f->mvs[t->by * f->b4_stride + t->bx];
-            if (bw4 == 1) is_sub8x8 &= r[-1].ref[0] > 0;
-            if (bh4 == ss_ver) is_sub8x8 &= r[-f->b4_stride].ref[0] > 0;
-            if (bw4 == 1 && bh4 == ss_ver)
-                is_sub8x8 &= r[-(1 + f->b4_stride)].ref[0] > 0;
-        }
-
-        // chroma prediction
-        if (is_sub8x8) {
-            assert(ss_hor == 1);
-            int h_off = 0, v_off = 0;
-            if (bw4 == 1 && bh4 == ss_ver) {
-                for (int pl = 0; pl < 2; pl++)
-                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
-                       NULL, f->cur.p.stride[1],
-                       bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
-                       r[-(f->b4_stride + 1)].mv[0],
-                       &f->refp[r[-(f->b4_stride + 1)].ref[0] - 1],
-                       f->frame_thread.pass != 2 ? t->tl_4x4_filter :
-                           f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
-                v_off = 2 * PXSTRIDE(f->cur.p.stride[1]);
-                h_off = 2;
-            }
-            if (bw4 == 1) {
-                const enum Filter2d left_filter_2d =
-                    dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
-                for (int pl = 0; pl < 2; pl++)
-                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + v_off, NULL,
-                       f->cur.p.stride[1], bw4, bh4, t->bx - 1,
-                       t->by, 1 + pl, r[-1].mv[0], &f->refp[r[-1].ref[0] - 1],
-                       f->frame_thread.pass != 2 ? left_filter_2d :
-                           f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
-                h_off = 2;
-            }
-            if (bh4 == ss_ver) {
-                const enum Filter2d top_filter_2d =
-                    dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
-                for (int pl = 0; pl < 2; pl++)
-                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + h_off, NULL,
-                       f->cur.p.stride[1], bw4, bh4, t->bx, t->by - 1,
-                       1 + pl, r[-f->b4_stride].mv[0],
-                       &f->refp[r[-f->b4_stride].ref[0] - 1],
-                       f->frame_thread.pass != 2 ? top_filter_2d :
-                           f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
-                v_off = 2 * PXSTRIDE(f->cur.p.stride[1]);
-            }
-            for (int pl = 0; pl < 2; pl++)
-                mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.p.stride[1],
-                   bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0], refp, filter_2d);
-        } else {
-            if (imin(cbw4, cbh4) > 1 && !f->frame_hdr.force_integer_mv &&
-                ((b->inter_mode == GLOBALMV &&
-                  f->frame_hdr.gmv[b->ref[0]].type > WM_TYPE_TRANSLATION) ||
-                 (b->motion_mode == MM_WARP &&
-                  t->warpmv.type > WM_TYPE_TRANSLATION)))
-            {
-                for (int pl = 0; pl < 2; pl++)
-                    warp_affine(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff, NULL,
-                                f->cur.p.stride[1], b_dim, 1 + pl, refp,
-                                b->motion_mode == MM_WARP ? &t->warpmv :
-                                    &f->frame_hdr.gmv[b->ref[0]]);
-            } else {
-                for (int pl = 0; pl < 2; pl++) {
-                    mc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
-                       NULL, f->cur.p.stride[1],
-                       bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
-                       t->bx & ~ss_hor, t->by & ~ss_ver,
-                       1 + pl, b->mv[0], refp, filter_2d);
-                    if (b->motion_mode == MM_OBMC)
-                        obmc(t, ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff,
-                             f->cur.p.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
-                }
-            }
-            if (b->interintra_type) {
-                // FIXME for 8x32 with 4:2:2 subsampling, this probably does
-                // the wrong thing since it will select 4x16, not 4x32, as a
-                // transform size...
-                const uint8_t *const ii_mask =
-                    b->interintra_type == INTER_INTRA_BLEND ?
-                         dav1d_ii_masks[bs][chr_layout_idx][b->interintra_mode] :
-                         dav1d_wedge_masks[bs][chr_layout_idx][0][b->wedge_idx];
-
-                for (int pl = 0; pl < 2; pl++) {
-                    pixel *const tmp = t->scratch.interintra;
-                    pixel tl_edge_px[65], *const tl_edge = &tl_edge_px[32];
-                    enum IntraPredMode m =
-                        b->interintra_mode == II_SMOOTH_PRED ?
-                        SMOOTH_PRED : b->interintra_mode;
-                    int angle = 0;
-                    pixel *const uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff;
-                    const pixel *top_sb_edge = NULL;
-                    if (!(t->by & (f->sb_step - 1))) {
-                        top_sb_edge = f->ipred_edge[pl + 1];
-                        const int sby = t->by >> f->sb_shift;
-                        top_sb_edge += f->sb128w * 128 * (sby - 1);
-                    }
-                    m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor,
-                                                          (t->bx >> ss_hor) >
-                                                              (ts->tiling.col_start >> ss_hor),
-                                                          t->by >> ss_ver,
-                                                          (t->by >> ss_ver) >
-                                                              (ts->tiling.row_start >> ss_ver),
-                                                          ts->tiling.col_end >> ss_hor,
-                                                          ts->tiling.row_end >> ss_ver,
-                                                          0, uvdst, f->cur.p.stride[1],
-                                                          top_sb_edge, m,
-                                                          &angle, cbw4, cbh4, tl_edge);
-                    dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
-                                             tl_edge, cbw4 * 4, cbh4 * 4, 0);
-                    dsp->mc.blend(uvdst, f->cur.p.stride[1], tmp, cbw4 * 4 * sizeof(pixel),
-                                  cbw4 * 4, cbh4 * 4, ii_mask, cbw4 * 4);
-                }
-            }
-        }
-
-    skip_inter_chroma_pred: {}
-        t->tl_4x4_filter = filter_2d;
-    } else {
-        const enum Filter2d filter_2d = b->filter2d;
-        // Maximum super block size is 128x128
-        coef (*tmp)[128 * 128] = (coef (*)[128 * 128]) t->scratch.compinter;
-        int jnt_weight;
-        uint8_t *const seg_mask = t->scratch_seg_mask;
-        const uint8_t *mask;
-
-        for (int i = 0; i < 2; i++) {
-            const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
-
-            if (b->inter_mode == GLOBALMV_GLOBALMV && !f->frame_hdr.force_integer_mv &&
-                f->frame_hdr.gmv[b->ref[i]].type > WM_TYPE_TRANSLATION)
-            {
-                warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
-                            &f->frame_hdr.gmv[b->ref[i]]);
-            } else {
-                mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
-                   b->mv[i], refp, filter_2d);
-            }
-        }
-        switch (b->comp_type) {
-        case COMP_INTER_AVG:
-            dsp->mc.avg(dst, f->cur.p.stride[0], tmp[0], tmp[1],
-                        bw4 * 4, bh4 * 4);
-            break;
-        case COMP_INTER_WEIGHTED_AVG:
-            jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]];
-            dsp->mc.w_avg(dst, f->cur.p.stride[0], tmp[0], tmp[1],
-                          bw4 * 4, bh4 * 4, jnt_weight);
-            break;
-        case COMP_INTER_SEG:
-            dsp->mc.w_mask[chr_layout_idx](dst, f->cur.p.stride[0],
-                                           tmp[b->mask_sign], tmp[!b->mask_sign],
-                                           bw4 * 4, bh4 * 4, seg_mask, b->mask_sign);
-            mask = seg_mask;
-            break;
-        case COMP_INTER_WEDGE:
-            mask = dav1d_wedge_masks[bs][0][0][b->wedge_idx];
-            dsp->mc.mask(dst, f->cur.p.stride[0],
-                         tmp[b->mask_sign], tmp[!b->mask_sign],
-                         bw4 * 4, bh4 * 4, mask);
-            if (has_chroma)
-                mask = dav1d_wedge_masks[bs][chr_layout_idx][b->mask_sign][b->wedge_idx];
-            break;
-        }
-
-        // chroma
-        if (has_chroma) for (int pl = 0; pl < 2; pl++) {
-            for (int i = 0; i < 2; i++) {
-                const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
-                if (b->inter_mode == GLOBALMV_GLOBALMV &&
-                    imin(cbw4, cbh4) > 1 && !f->frame_hdr.force_integer_mv &&
-                    f->frame_hdr.gmv[b->ref[i]].type > WM_TYPE_TRANSLATION)
-                {
-                    warp_affine(t, NULL, tmp[i], bw4 * 2, b_dim, 1 + pl,
-                                refp, &f->frame_hdr.gmv[b->ref[i]]);
-                } else {
-                    mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
-                       1 + pl, b->mv[i], refp, filter_2d);
-                }
-            }
-            pixel *const uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff;
-            switch (b->comp_type) {
-            case COMP_INTER_AVG:
-                dsp->mc.avg(uvdst, f->cur.p.stride[1], tmp[0], tmp[1],
-                            bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver);
-                break;
-            case COMP_INTER_WEIGHTED_AVG:
-                dsp->mc.w_avg(uvdst, f->cur.p.stride[1], tmp[0], tmp[1],
-                              bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight);
-                break;
-            case COMP_INTER_WEDGE:
-            case COMP_INTER_SEG:
-                dsp->mc.mask(uvdst, f->cur.p.stride[1],
-                             tmp[b->mask_sign], tmp[!b->mask_sign],
-                             bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask);
-                break;
-            }
-        }
-    }
-
-    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
-        hex_dump(dst, f->cur.p.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
-        if (has_chroma) {
-            hex_dump(&((pixel *) f->cur.p.data[1])[uvdstoff], f->cur.p.stride[1],
-                     cbw4 * 4, cbh4 * 4, "u-pred");
-            hex_dump(&((pixel *) f->cur.p.data[2])[uvdstoff], f->cur.p.stride[1],
-                     cbw4 * 4, cbh4 * 4, "v-pred");
-        }
-    }
-
-    const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
-
-    if (b->skip) {
-        // reset coef contexts
-        memset(&t->a->lcoef[bx4], 0x40, w4);
-        memset(&t->l.lcoef[by4], 0x40, h4);
-        if (has_chroma) {
-            memset(&t->a->ccoef[0][cbx4], 0x40, cw4);
-            memset(&t->l.ccoef[0][cby4], 0x40, ch4);
-            memset(&t->a->ccoef[1][cbx4], 0x40, cw4);
-            memset(&t->l.ccoef[1][cby4], 0x40, ch4);
-        }
-        return;
-    }
-
-    const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
-    const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
-
-    for (int init_y = 0; init_y < bh4; init_y += 16) {
-        for (int init_x = 0; init_x < bw4; init_x += 16) {
-            // coefficient coding & inverse transforms
-            int y_off = !!init_y, y;
-            dst += PXSTRIDE(f->cur.p.stride[0]) * 4 * init_y;
-            for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16);
-                 y += ytx->h, y_off++)
-            {
-                int x, x_off = !!init_x;
-                for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16);
-                     x += ytx->w, x_off++)
-                {
-                    read_coef_tree(t, bs, b, b->max_ytx, 0, b->tx_split,
-                                   x_off, y_off, &dst[x * 4]);
-                    t->bx += ytx->w;
-                }
-                dst += PXSTRIDE(f->cur.p.stride[0]) * 4 * ytx->h;
-                t->bx -= x;
-                t->by += ytx->h;
-            }
-            dst -= PXSTRIDE(f->cur.p.stride[0]) * 4 * y;
-            t->by -= y;
-
-            // chroma coefs and inverse transform
-            if (has_chroma) for (int pl = 0; pl < 2; pl++) {
-                pixel *uvdst = ((pixel *) f->cur.p.data[1 + pl]) + uvdstoff +
-                    (PXSTRIDE(f->cur.p.stride[1]) * init_y * 4 >> ss_ver);
-                for (y = init_y >> ss_ver, t->by += init_y;
-                     y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h)
-                {
-                    int x;
-                    for (x = init_x >> ss_hor, t->bx += init_x;
-                         x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w)
-                    {
-                        coef *cf;
-                        int eob;
-                        enum TxfmType txtp;
-                        if (f->frame_thread.pass) {
-                            cf = ts->frame_thread.cf;
-                            ts->frame_thread.cf += uvtx->w * uvtx->h * 16;
-                            const struct CodedBlockInfo *const cbi =
-                                &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
-                            eob = cbi->eob[1 + pl];
-                            txtp = cbi->txtp[1 + pl];
-                        } else {
-                            uint8_t cf_ctx;
-                            cf = t->cf;
-                            txtp = t->txtp_map[(by4 + (y << ss_ver)) * 32 +
-                                                bx4 + (x << ss_hor)];
-                            eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
-                                               &t->l.ccoef[pl][cby4 + y],
-                                               b->uvtx, bs, b, 0, 1 + pl,
-                                               cf, &txtp, &cf_ctx);
-                            if (DEBUG_BLOCK_INFO)
-                                printf("Post-uv-cf-blk[pl=%d,tx=%d,"
-                                       "txtp=%d,eob=%d]: r=%d\n",
-                                       pl, b->uvtx, txtp, eob, ts->msac.rng);
-                            memset(&t->a->ccoef[pl][cbx4 + x], cf_ctx,
-                                   imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor));
-                            memset(&t->l.ccoef[pl][cby4 + y], cf_ctx,
-                                   imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver));
-                        }
-                        if (eob >= 0) {
-                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                                coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
-                            dsp->itx.itxfm_add[b->uvtx]
-                                              [txtp](&uvdst[4 * x],
-                                                     f->cur.p.stride[1],
-                                                     cf, eob);
-                            if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
-                                hex_dump(&uvdst[4 * x], f->cur.p.stride[1],
-                                         uvtx->w * 4, uvtx->h * 4, "recon");
-                        }
-                        t->bx += uvtx->w << ss_hor;
-                    }
-                    uvdst += PXSTRIDE(f->cur.p.stride[1]) * 4 * uvtx->h;
-                    t->bx -= x << ss_hor;
-                    t->by += uvtx->h << ss_ver;
-                }
-                t->by -= y << ss_ver;
-            }
-        }
-    }
-}
-
-void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) { // Post-filters superblock row `sby` of frame `f`: loop filter, CDEF and loop restoration (each only when enabled), then advances the per-row state in f->lf.
-    const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420; // 1 only for I420; used below to shift chroma row offsets
-    const int sbsz = f->sb_step, sbh = f->sbh;
-
-    if (f->frame_hdr.loopfilter.level_y[0] ||
-        f->frame_hdr.loopfilter.level_y[1])
-    {
-        int start_of_tile_row = 0;
-        if (f->frame_hdr.tiling.row_start_sb[f->lf.tile_row] == sby)
-            start_of_tile_row = f->lf.tile_row++; // post-increment: stores the tile-row index from before the increment (not a 0/1 flag); tile_row is only advanced inside this loop-filter branch
-        bytefn(dav1d_loopfilter_sbrow)(f, f->lf.p, f->lf.mask_ptr, sby,
-                                       start_of_tile_row);
-    }
-
-    if (f->seq_hdr.restoration) {
-        // Store loop filtered pixels required by loop restoration
-        bytefn(dav1d_lr_copy_lpf)(f, f->lf.p, sby);
-    }
-    if (f->seq_hdr.cdef) {
-        if (sby) { // not the first superblock row: first run CDEF on the 2 block rows just above this row
-            pixel *p_up[3] = { // plane pointers moved up by 8 luma pixel rows (chroma: 8 rows >> ss_ver); NOTE(review): presumably 8 pixel rows == the 2 block rows passed below - confirm
-                f->lf.p[0] - 8 * PXSTRIDE(f->cur.p.stride[0]),
-                f->lf.p[1] - (8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver),
-                f->lf.p[2] - (8 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver),
-            };
-            bytefn(dav1d_cdef_brow)(f, p_up, f->lf.prev_mask_ptr, // uses the mask pointer saved from the previous call
-                                    sby * sbsz - 2, sby * sbsz);
-        }
-        const int n_blks = sbsz - 2 * (sby + 1 < sbh); // hold back the last 2 block rows unless this is the final superblock row
-        bytefn(dav1d_cdef_brow)(f, f->lf.p, f->lf.mask_ptr, sby * sbsz,
-                                imin(sby * sbsz + n_blks, f->bh)); // end row is clamped to f->bh
-    }
-    if (f->seq_hdr.restoration) {
-        bytefn(dav1d_lr_sbrow)(f, f->lf.p, sby);
-    }
-
-    f->lf.p[0] += sbsz * 4 * PXSTRIDE(f->cur.p.stride[0]); // advance the luma pointer by sbsz * 4 pixel rows
-    f->lf.p[1] += sbsz * 4 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver; // >> binds tighter than +=, so the whole chroma offset is shifted by ss_ver
-    f->lf.p[2] += sbsz * 4 * PXSTRIDE(f->cur.p.stride[1]) >> ss_ver;
-    f->lf.prev_mask_ptr = f->lf.mask_ptr; // remembered for the deferred CDEF pass of the next call
-    if ((sby & 1) || f->seq_hdr.sb128) { // mask pointer advances every row when sb128 is set, otherwise only after odd rows
-        f->lf.mask_ptr += f->sb128w;
-    }
-}
-
-void bytefn(dav1d_backup_ipred_edge)(Dav1dTileContext *const t) { // Copies the bottom pixel row of the current superblock row, across this tile's columns, from the frame planes into f->ipred_edge.
-    const Dav1dFrameContext *const f = t->f;
-    Dav1dTileState *const ts = t->ts;
-    const int sby = t->by >> f->sb_shift; // superblock row index derived from the block row t->by
-    const int sby_off = f->sb128w * 128 * sby; // each superblock row gets its own f->sb128w * 128 entry slice of ipred_edge
-    const int x_off = ts->tiling.col_start; // tile's first column, in the same units as tiling.col_end (multiplied by 4 for pixels)
-
-    const pixel *const y =
-        ((const pixel *) f->cur.p.data[0]) + x_off * 4 +
-                    ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.p.stride[0]); // pixel row (t->by + sb_step) * 4 - 1, i.e. the row just before the next superblock row begins
-    pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y,
-               4 * (ts->tiling.col_end - x_off)); // luma: one row spanning the tile's width in pixels
-
-    if (f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I400) { // chroma planes are skipped for I400
-        const int ss_ver = f->cur.p.p.layout == DAV1D_PIXEL_LAYOUT_I420; // 1 only for I420
-        const int ss_hor = f->cur.p.p.layout != DAV1D_PIXEL_LAYOUT_I444; // 1 for every layout except I444
-
-        const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) +
-            (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.p.stride[1]); // same bottom-row position as luma, scaled by the subsampling shifts
-        for (int pl = 1; pl <= 2; pl++) // both chroma planes share uv_off and stride[1]
-            pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)],
-                       &((const pixel *) f->cur.p.data[pl])[uv_off],
-                       4 * (ts->tiling.col_end - x_off) >> ss_hor);
-    }
-}
author	Marvin Scholz <epirat07@gmail.com>	2018-10-25 17:45:12 +0300
committer	Ronald S. Bultje <rsbultje@gmail.com>	2018-10-25 19:51:31 +0300
commit	46e2a2d0cc451e1d6bb929f80088f8a7b8940dd0 (patch)
tree	002462d6840acd6551bb0b6b2dd5f4416db1ab84 /src/recon.c
parent	367d785a4e70b3e43eee234b3c745b047e3fbd40 (diff)