diff options
author | Henrik Gramner <gramner@twoorioles.com> | 2019-08-06 16:17:31 +0300 |
---|---|---|
committer | Henrik Gramner <henrik@gramner.com> | 2019-08-13 19:51:49 +0300 |
commit | e29fd5c0016fec27c88a36ac6f6eaaf416d91330 (patch) | |
tree | c0bb371a91af5abbe470751189e35dcee5f8314b | |
parent | a819653e1b71ea69c13faaa64c5bb89534ce2772 (diff) |
Add msac optimizations

* Eliminate the trailing zero after the CDF probabilities. We can
  reuse the count value as a terminator instead. This reduces the
  size of the CDF context by around 8%.
* Align the CDF arrays.
* Various other minor optimizations.
-rw-r--r-- | src/arm/64/msac.S | 25 | ||||
-rw-r--r-- | src/cdf.c | 121 | ||||
-rw-r--r-- | src/cdf.h | 148 | ||||
-rw-r--r-- | src/decode.c | 73 | ||||
-rw-r--r-- | src/internal.h | 6 | ||||
-rw-r--r-- | src/lib.c | 2 | ||||
-rw-r--r-- | src/msac.c | 31 | ||||
-rw-r--r-- | src/msac.h | 2 | ||||
-rw-r--r-- | src/recon_tmpl.c | 52 | ||||
-rw-r--r-- | src/tables.c | 8 | ||||
-rw-r--r-- | src/tables.h | 1 | ||||
-rw-r--r-- | src/x86/msac.asm | 30 | ||||
-rw-r--r-- | tests/checkasm/msac.c | 47 |
13 files changed, 273 insertions, 273 deletions
diff --git a/src/arm/64/msac.S b/src/arm/64/msac.S index b1bff59..81a6822 100644 --- a/src/arm/64/msac.S +++ b/src/arm/64/msac.S @@ -148,7 +148,7 @@ function msac_decode_symbol_adapt4_neon, export=1 add x8, x0, #RNG ld1_n v0, v1, x1, \sz, \n // cdf ld1r {v4\sz}, [x8] // rng - movrel x9, coeffs, 32 + movrel x9, coeffs, 30 sub x9, x9, x2, lsl #1 ushr_n v2, v3, v0, v1, #6, \sz, \n // cdf >> EC_PROB_SHIFT str h4, [sp, #14] // store original u = s->rng @@ -183,16 +183,24 @@ function msac_decode_symbol_adapt4_neon, export=1 // update_cdf ldrh w3, [x1, x2, lsl #1] // count = cdf[n_symbols] movi v5\szb, #0xff - cmp x2, #4 // set C if n_symbols >= 4 (n_symbols > 3) - mov w14, #4 - lsr w4, w3, #4 // count >> 4 +.if \n == 16 + mov w4, #-5 +.else + mvn w14, w2 + mov w4, #-4 + cmn w14, #3 // set C if n_symbols <= 2 +.endif urhadd_n v4, v5, v5, v5, v2, v3, \sz, \n // i >= val ? -1 : 32768 - adc w4, w4, w14 // (count >> 4) + (n_symbols > 3) + 4 - neg w4, w4 // -rate +.if \n == 16 + sub w4, w4, w3, lsr #4 // -((count >> 4) + 5) +.else + lsr w14, w3, #4 // count >> 4 + sbc w4, w4, w14 // -((count >> 4) + (n_symbols > 2) + 4) +.endif sub_n v4, v5, v4, v5, v0, v1, \sz, \n // (32768 - cdf[i]) or (-1 - cdf[i]) dup v6.8h, w4 // -rate - sub w3, w3, w3, lsr #5 // count - (count >= 32) + sub w3, w3, w3, lsr #5 // count - (count == 32) sub_n v0, v1, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 
1 : 0) sshl_n v4, v5, v4, v5, v6, v6, \sz, \n // ({32768,-1} - cdf[i]) >> rate add w3, w3, #1 // count + (count < 32) @@ -224,8 +232,7 @@ L(renorm2): b.ge 9f // refill - ldr x3, [x0, #BUF_POS] - ldr x4, [x0, #BUF_END] + ldp x3, x4, [x0] // BUF_POS, BUF_END add x5, x3, #8 cmp x5, x4 b.gt 2f @@ -34,6 +34,7 @@ #include "common/intops.h" #include "src/cdf.h" +#include "src/tables.h" #define AOM_ICDF(x) (32768-(x)) @@ -752,12 +753,11 @@ static const CdfMvComponent default_mv_component_cdf = { } }; - -static const uint16_t default_mv_joint_cdf[N_MV_JOINTS + 1] = { +static const uint16_t ALIGN(default_mv_joint_cdf[N_MV_JOINTS], 8) = { AOM_CDF4(4096, 11264, 19328) }; -static const uint16_t default_kf_y_mode_cdf[5][5][N_INTRA_PRED_MODES + 1 + 2] = { +static const uint16_t ALIGN(default_kf_y_mode_cdf[5][5][N_INTRA_PRED_MODES + 3], 32) = { { { AOM_CDF13(15588, 17027, 19338, 20218, 20682, 21110, 21825, 23244, 24189, 28165, 29093, 30466) }, @@ -3927,25 +3927,18 @@ void dav1d_cdf_thread_update(const Dav1dFrameHeader *const hdr, CdfContext *const dst, const CdfContext *const src) { - int i, j, k, l; - #define update_cdf_1d(n1d, name) \ do { \ - memcpy(dst->name, src->name, sizeof(*dst->name) * n1d); \ - assert(!dst->name[n1d - 1]); \ + memcpy(dst->name, src->name, sizeof(dst->name)); \ dst->name[n1d] = 0; \ } while (0) #define update_cdf_2d(n1d, n2d, name) \ - for (j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j]) + for (int j = 0; j < (n1d); j++) update_cdf_1d(n2d, name[j]) #define update_cdf_3d(n1d, n2d, n3d, name) \ - for (k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k]) + for (int k = 0; k < (n1d); k++) update_cdf_2d(n2d, n3d, name[k]) #define update_cdf_4d(n1d, n2d, n3d, n4d, name) \ - for (l = 0; l < (n1d); l++) update_cdf_3d(n2d, n3d, n4d, name[l]) -#define update_cdf_6d(n1d, n2d, n3d, n4d, n5d, n6d, name) \ - for (n = 0; n < (n1d); n++) \ - for (m = 0; m < (n2d); m++) \ - update_cdf_4d(n3d, n4d, n5d, n6d, name[n][m]) + for (int l = 0; l < (n1d); l++) 
update_cdf_3d(n2d, n3d, n4d, name[l]) #define update_bit_0d(name) \ do { \ @@ -3954,65 +3947,57 @@ void dav1d_cdf_thread_update(const Dav1dFrameHeader *const hdr, } while (0) #define update_bit_1d(n1d, name) \ - for (i = 0; i < (n1d); i++) update_bit_0d(name[i]) + for (int i = 0; i < (n1d); i++) update_bit_0d(name[i]) #define update_bit_2d(n1d, n2d, name) \ - for (j = 0; j < (n1d); j++) update_bit_1d(n2d, name[j]) + for (int j = 0; j < (n1d); j++) update_bit_1d(n2d, name[j]) #define update_bit_3d(n1d, n2d, n3d, name) \ - for (k = 0; k < (n1d); k++) update_bit_2d(n2d, n3d, name[k]) + for (int k = 0; k < (n1d); k++) update_bit_2d(n2d, n3d, name[k]) update_bit_1d(N_BS_SIZES, m.use_filter_intra); - update_cdf_1d(5, m.filter_intra); - update_cdf_3d(2, N_INTRA_PRED_MODES, N_UV_INTRA_PRED_MODES - !k, m.uv_mode); - update_cdf_2d(8, 7, m.angle_delta); - update_cdf_3d(N_TX_SIZES - 1, 3, imin(k + 2, 3), m.txsz); - update_cdf_3d(2, N_INTRA_PRED_MODES, 7, m.txtp_intra1); - update_cdf_3d(3, N_INTRA_PRED_MODES, 5, m.txtp_intra2); + update_cdf_1d(4, m.filter_intra); + update_cdf_3d(2, N_INTRA_PRED_MODES, N_UV_INTRA_PRED_MODES - 1 - !k, m.uv_mode); + update_cdf_2d(8, 6, m.angle_delta); + update_cdf_3d(N_TX_SIZES - 1, 3, imin(k + 1, 2), m.txsz); + update_cdf_3d(2, N_INTRA_PRED_MODES, 6, m.txtp_intra1); + update_cdf_3d(3, N_INTRA_PRED_MODES, 4, m.txtp_intra2); update_bit_1d(3, m.skip); - static const uint8_t n_partitions[N_BL_LEVELS] = { - [BL_128X128] = N_PARTITIONS - 2, - [BL_64X64] = N_PARTITIONS, - [BL_32X32] = N_PARTITIONS, - [BL_16X16] = N_PARTITIONS, - [BL_8X8] = N_SUB8X8_PARTITIONS, - }; - update_cdf_3d(N_BL_LEVELS, 4, n_partitions[k], m.partition); + update_cdf_3d(N_BL_LEVELS, 4, dav1d_partition_type_count[k], m.partition); update_bit_2d(N_TX_SIZES, 13, coef.skip); - update_cdf_3d(2, 2, 5, coef.eob_bin_16); - update_cdf_3d(2, 2, 6, coef.eob_bin_32); - update_cdf_3d(2, 2, 7, coef.eob_bin_64); - update_cdf_3d(2, 2, 8, coef.eob_bin_128); - update_cdf_3d(2, 2, 9, 
coef.eob_bin_256); - update_cdf_2d(2, 10, coef.eob_bin_512); - update_cdf_2d(2, 11, coef.eob_bin_1024); + update_cdf_3d(2, 2, 4, coef.eob_bin_16); + update_cdf_3d(2, 2, 5, coef.eob_bin_32); + update_cdf_3d(2, 2, 6, coef.eob_bin_64); + update_cdf_3d(2, 2, 7, coef.eob_bin_128); + update_cdf_3d(2, 2, 8, coef.eob_bin_256); + update_cdf_2d(2, 9, coef.eob_bin_512); + update_cdf_2d(2, 10, coef.eob_bin_1024); update_bit_3d(N_TX_SIZES, 2, 11 /*22*/, coef.eob_hi_bit); - update_cdf_4d(N_TX_SIZES, 2, 4, 3, coef.eob_base_tok); - update_cdf_4d(N_TX_SIZES, 2, 41 /*42*/, 4, coef.base_tok); + update_cdf_4d(N_TX_SIZES, 2, 4, 2, coef.eob_base_tok); + update_cdf_4d(N_TX_SIZES, 2, 41 /*42*/, 3, coef.base_tok); update_bit_2d(2, 3, coef.dc_sign); - update_cdf_4d(4, 2, 21, 4, coef.br_tok); - update_cdf_2d(3, DAV1D_MAX_SEGMENTS, m.seg_id); - update_cdf_1d(8, m.cfl_sign); - update_cdf_2d(6, 16, m.cfl_alpha); + update_cdf_4d(4, 2, 21, 3, coef.br_tok); + update_cdf_2d(3, DAV1D_MAX_SEGMENTS - 1, m.seg_id); + update_cdf_1d(7, m.cfl_sign); + update_cdf_2d(6, 15, m.cfl_alpha); update_bit_0d(m.restore_wiener); update_bit_0d(m.restore_sgrproj); - update_cdf_1d(3, m.restore_switchable); - update_cdf_1d(4, m.delta_q); - update_cdf_2d(5, 4, m.delta_lf); + update_cdf_1d(2, m.restore_switchable); + update_cdf_1d(3, m.delta_q); + update_cdf_2d(5, 3, m.delta_lf); update_bit_2d(7, 3, m.pal_y); update_bit_1d(2, m.pal_uv); - update_cdf_3d(2, 7, 7, m.pal_sz); - update_cdf_4d(2, 7, 5, k + 2, m.color_map); - + update_cdf_3d(2, 7, 6, m.pal_sz); + update_cdf_4d(2, 7, 5, k + 1, m.color_map); update_bit_2d(7, 3, m.txpart); - update_cdf_2d(2, 16, m.txtp_inter1); - update_cdf_1d(12, m.txtp_inter2); + update_cdf_2d(2, 15, m.txtp_inter1); + update_cdf_1d(11, m.txtp_inter2); update_bit_1d(4, m.txtp_inter3); if (!(hdr->frame_type & 1)) { update_bit_0d(m.intrabc); - update_cdf_1d(N_MV_JOINTS, dmv.joint); - for (k = 0; k < 2; k++) { - update_cdf_1d(11, dmv.comp[k].classes); + update_cdf_1d(N_MV_JOINTS - 1, dmv.joint); + 
for (int k = 0; k < 2; k++) { + update_cdf_1d(10, dmv.comp[k].classes); update_bit_0d(dmv.comp[k].class0); update_bit_1d(10, dmv.comp[k].classN); update_bit_0d(dmv.comp[k].sign); @@ -4021,20 +4006,20 @@ void dav1d_cdf_thread_update(const Dav1dFrameHeader *const hdr, } update_bit_1d(3, m.skip_mode); - update_cdf_2d(4, N_INTRA_PRED_MODES, m.y_mode); - update_cdf_3d(2, 8, DAV1D_N_SWITCHABLE_FILTERS, m.filter); + update_cdf_2d(4, N_INTRA_PRED_MODES - 1, m.y_mode); + update_cdf_3d(2, 8, DAV1D_N_SWITCHABLE_FILTERS - 1, m.filter); update_bit_1d(6, m.newmv_mode); update_bit_1d(2, m.globalmv_mode); update_bit_1d(6, m.refmv_mode); update_bit_1d(3, m.drl_bit); - update_cdf_2d(8, N_COMP_INTER_PRED_MODES, m.comp_inter_mode); + update_cdf_2d(8, N_COMP_INTER_PRED_MODES - 1, m.comp_inter_mode); update_bit_1d(4, m.intra); update_bit_1d(5, m.comp); update_bit_1d(5, m.comp_dir); update_bit_1d(6, m.jnt_comp); update_bit_1d(6, m.mask_comp); update_bit_1d(9, m.wedge_comp); - update_cdf_2d(9, 16, m.wedge_idx); + update_cdf_2d(9, 15, m.wedge_idx); update_bit_2d(6, 3, m.ref); update_bit_2d(3, 3, m.comp_fwd_ref); update_bit_2d(2, 3, m.comp_bwd_ref); @@ -4042,17 +4027,17 @@ void dav1d_cdf_thread_update(const Dav1dFrameHeader *const hdr, update_bit_1d(3, m.seg_pred); update_bit_1d(4, m.interintra); update_bit_1d(7, m.interintra_wedge); - update_cdf_2d(4, 4, m.interintra_mode); - update_cdf_2d(N_BS_SIZES, 3, m.motion_mode); + update_cdf_2d(4, 3, m.interintra_mode); + update_cdf_2d(N_BS_SIZES, 2, m.motion_mode); update_bit_1d(N_BS_SIZES, m.obmc); - update_cdf_1d(N_MV_JOINTS, mv.joint); - for (k = 0; k < 2; k++) { - update_cdf_1d(11, mv.comp[k].classes); + update_cdf_1d(N_MV_JOINTS - 1, mv.joint); + for (int k = 0; k < 2; k++) { + update_cdf_1d(10, mv.comp[k].classes); update_bit_0d(mv.comp[k].class0); update_bit_1d(10, mv.comp[k].classN); - update_cdf_2d(2, 4, mv.comp[k].class0_fp); - update_cdf_1d(4, mv.comp[k].classN_fp); + update_cdf_2d(2, 3, mv.comp[k].class0_fp); + update_cdf_1d(3, 
mv.comp[k].classN_fp); update_bit_0d(mv.comp[k].class0_hp); update_bit_0d(mv.comp[k].classN_hp); update_bit_0d(mv.comp[k].sign); @@ -4062,7 +4047,7 @@ void dav1d_cdf_thread_update(const Dav1dFrameHeader *const hdr, /* * CDF threading wrappers. */ -static inline int get_qcat_idx(int q) { +static inline int get_qcat_idx(const int q) { if (q <= 20) return 0; if (q <= 60) return 1; if (q <= 120) return 2; @@ -4089,7 +4074,7 @@ void dav1d_cdf_thread_copy(CdfContext *const dst, const CdfThreadContext *const } int dav1d_cdf_thread_alloc(CdfThreadContext *const cdf, - struct thread_data *const t) + struct thread_data *const t) { cdf->ref = dav1d_ref_create(sizeof(CdfContext) + (t != NULL) * sizeof(atomic_uint)); @@ -37,94 +37,94 @@ /* Buffers padded to [8] or [16] for SIMD where needed. */ typedef struct CdfModeContext { - uint16_t y_mode[4][N_INTRA_PRED_MODES + 1 + 2]; - uint16_t use_filter_intra[N_BS_SIZES][2]; - uint16_t filter_intra[5 + 1]; - uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 1 + 1]; - uint16_t angle_delta[8][8]; - uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1]; - uint16_t newmv_mode[6][2]; - uint16_t globalmv_mode[2][2]; - uint16_t refmv_mode[6][2]; - uint16_t drl_bit[3][2]; - uint16_t comp_inter_mode[8][N_COMP_INTER_PRED_MODES + 1]; - uint16_t intra[4][2]; - uint16_t comp[5][2]; - uint16_t comp_dir[5][2]; - uint16_t jnt_comp[6][2]; - uint16_t mask_comp[6][2]; - uint16_t wedge_comp[9][2]; - uint16_t wedge_idx[9][16 + 1]; - uint16_t interintra[7][2]; - uint16_t interintra_mode[4][5]; - uint16_t interintra_wedge[7][2]; - uint16_t ref[6][3][2]; - uint16_t comp_fwd_ref[3][3][2]; - uint16_t comp_bwd_ref[2][3][2]; - uint16_t comp_uni_ref[3][3][2]; - uint16_t txsz[N_TX_SIZES - 1][3][4]; - uint16_t txpart[7][3][2]; - uint16_t txtp_inter1[2][16 + 1]; - uint16_t txtp_inter2[12 + 1 + 3]; - uint16_t txtp_inter3[4][2]; - uint16_t txtp_intra1[2][N_INTRA_PRED_MODES][7 + 1]; - uint16_t txtp_intra2[3][N_INTRA_PRED_MODES][5 + 1 + 2]; - uint16_t 
skip[3][2]; - uint16_t skip_mode[3][2]; - uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 1 + 5]; - uint16_t seg_pred[3][2]; - uint16_t seg_id[3][DAV1D_MAX_SEGMENTS + 1]; - uint16_t cfl_sign[8 + 1]; - uint16_t cfl_alpha[6][16 + 1]; - uint16_t restore_wiener[2]; - uint16_t restore_sgrproj[2]; - uint16_t restore_switchable[3 + 1]; - uint16_t delta_q[4 + 1]; - uint16_t delta_lf[5][4 + 1]; - uint16_t obmc[N_BS_SIZES][2]; - uint16_t motion_mode[N_BS_SIZES][3 + 1]; - uint16_t pal_y[7][3][2]; - uint16_t pal_uv[2][2]; - uint16_t pal_sz[2][7][7 + 1]; - uint16_t color_map[2][7][5][8 + 1]; - uint16_t intrabc[2]; + ALIGN(uint16_t y_mode[4][N_INTRA_PRED_MODES + 3], 32); + ALIGN(uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 2], 32); + ALIGN(uint16_t wedge_idx[9][16], 32); + ALIGN(uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 6], 32); + ALIGN(uint16_t cfl_alpha[6][16], 32); + ALIGN(uint16_t txtp_inter1[2][16], 32); + ALIGN(uint16_t txtp_inter2[12 + 4], 32); + ALIGN(uint16_t txtp_intra1[2][N_INTRA_PRED_MODES][7 + 1], 16); + ALIGN(uint16_t txtp_intra2[3][N_INTRA_PRED_MODES][5 + 3], 16); + ALIGN(uint16_t cfl_sign[8], 16); + ALIGN(uint16_t angle_delta[8][8], 16); + ALIGN(uint16_t filter_intra[5 + 3], 16); + ALIGN(uint16_t comp_inter_mode[8][N_COMP_INTER_PRED_MODES], 16); + ALIGN(uint16_t seg_id[3][DAV1D_MAX_SEGMENTS], 16); + ALIGN(uint16_t pal_sz[2][7][7 + 1], 16); + ALIGN(uint16_t color_map[2][7][5][8], 16); + ALIGN(uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1], 8); + ALIGN(uint16_t txsz[N_TX_SIZES - 1][3][4], 8); + ALIGN(uint16_t motion_mode[N_BS_SIZES][3 + 1], 8); + ALIGN(uint16_t delta_q[4], 8); + ALIGN(uint16_t delta_lf[5][4], 8); + ALIGN(uint16_t interintra_mode[4][4], 8); + ALIGN(uint16_t restore_switchable[3 + 1], 8); + ALIGN(uint16_t restore_wiener[2], 4); + ALIGN(uint16_t restore_sgrproj[2], 4); + ALIGN(uint16_t interintra[7][2], 4); + ALIGN(uint16_t interintra_wedge[7][2], 4); + ALIGN(uint16_t txtp_inter3[4][2], 4); + ALIGN(uint16_t 
use_filter_intra[N_BS_SIZES][2], 4); + ALIGN(uint16_t newmv_mode[6][2], 4); + ALIGN(uint16_t globalmv_mode[2][2], 4); + ALIGN(uint16_t refmv_mode[6][2], 4); + ALIGN(uint16_t drl_bit[3][2], 4); + ALIGN(uint16_t intra[4][2], 4); + ALIGN(uint16_t comp[5][2], 4); + ALIGN(uint16_t comp_dir[5][2], 4); + ALIGN(uint16_t jnt_comp[6][2], 4); + ALIGN(uint16_t mask_comp[6][2], 4); + ALIGN(uint16_t wedge_comp[9][2], 4); + ALIGN(uint16_t ref[6][3][2], 4); + ALIGN(uint16_t comp_fwd_ref[3][3][2], 4); + ALIGN(uint16_t comp_bwd_ref[2][3][2], 4); + ALIGN(uint16_t comp_uni_ref[3][3][2], 4); + ALIGN(uint16_t txpart[7][3][2], 4); + ALIGN(uint16_t skip[3][2], 4); + ALIGN(uint16_t skip_mode[3][2], 4); + ALIGN(uint16_t seg_pred[3][2], 4); + ALIGN(uint16_t obmc[N_BS_SIZES][2], 4); + ALIGN(uint16_t pal_y[7][3][2], 4); + ALIGN(uint16_t pal_uv[2][2], 4); + ALIGN(uint16_t intrabc[2], 4); } CdfModeContext; typedef struct CdfCoefContext { - uint16_t skip[N_TX_SIZES][13][2]; - uint16_t eob_bin_16[2][2][6]; - uint16_t eob_bin_32[2][2][7 + 1]; - uint16_t eob_bin_64[2][2][8]; - uint16_t eob_bin_128[2][2][9]; - uint16_t eob_bin_256[2][2][10 + 6]; - uint16_t eob_bin_512[2][11 + 5]; - uint16_t eob_bin_1024[2][12 + 4]; - uint16_t eob_hi_bit[N_TX_SIZES][2][11 /*22*/][2]; - uint16_t eob_base_tok[N_TX_SIZES][2][4][4]; - uint16_t base_tok[N_TX_SIZES][2][41][5]; - uint16_t dc_sign[2][3][2]; - uint16_t br_tok[4 /*5*/][2][21][5]; + ALIGN(uint16_t eob_bin_16[2][2][5 + 3], 16); + ALIGN(uint16_t eob_bin_32[2][2][6 + 2], 16); + ALIGN(uint16_t eob_bin_64[2][2][7 + 1], 16); + ALIGN(uint16_t eob_bin_128[2][2][8 + 0], 16); + ALIGN(uint16_t eob_bin_256[2][2][9 + 7], 32); + ALIGN(uint16_t eob_bin_512[2][10 + 6], 32); + ALIGN(uint16_t eob_bin_1024[2][11 + 5], 32); + ALIGN(uint16_t eob_base_tok[N_TX_SIZES][2][4][4], 8); + ALIGN(uint16_t base_tok[N_TX_SIZES][2][41][4], 8); + ALIGN(uint16_t br_tok[4 /*5*/][2][21][4], 8); + ALIGN(uint16_t eob_hi_bit[N_TX_SIZES][2][11 /*22*/][2], 4); + ALIGN(uint16_t skip[N_TX_SIZES][13][2], 
4); + ALIGN(uint16_t dc_sign[2][3][2], 4); } CdfCoefContext; typedef struct CdfMvComponent { - uint16_t classes[11 + 1 + 4]; - uint16_t class0[2]; - uint16_t classN[10][2]; - uint16_t class0_fp[2][4 + 1]; - uint16_t classN_fp[4 + 1]; - uint16_t class0_hp[2]; - uint16_t classN_hp[2]; - uint16_t sign[2]; + ALIGN(uint16_t classes[11 + 5], 32); + ALIGN(uint16_t class0_fp[2][4], 8); + ALIGN(uint16_t classN_fp[4], 8); + ALIGN(uint16_t class0_hp[2], 4); + ALIGN(uint16_t classN_hp[2], 4); + ALIGN(uint16_t class0[2], 4); + ALIGN(uint16_t classN[10][2], 4); + ALIGN(uint16_t sign[2], 4); } CdfMvComponent; typedef struct CdfMvContext { CdfMvComponent comp[2]; - uint16_t joint[N_MV_JOINTS + 1]; + ALIGN(uint16_t joint[N_MV_JOINTS], 8); } CdfMvContext; typedef struct CdfContext { CdfModeContext m; - uint16_t kfym[5][5][N_INTRA_PRED_MODES + 1 + 2]; + ALIGN(uint16_t kfym[5][5][N_INTRA_PRED_MODES + 3], 32); CdfCoefContext coef; CdfMvContext mv, dmv; } CdfContext; diff --git a/src/decode.c b/src/decode.c index fe99c40..24f1af6 100644 --- a/src/decode.c +++ b/src/decode.c @@ -81,14 +81,14 @@ static int read_mv_component_diff(Dav1dTileContext *const t, const int have_hp = f->frame_hdr->hp; const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->sign); const int cl = dav1d_msac_decode_symbol_adapt16(&ts->msac, - mv_comp->classes, 11); + mv_comp->classes, 10); int up, fp, hp; if (!cl) { up = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->class0); if (have_fp) { fp = dav1d_msac_decode_symbol_adapt4(&ts->msac, - mv_comp->class0_fp[up], 4); + mv_comp->class0_fp[up], 3); hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->class0_hp) : 1; } else { @@ -102,7 +102,7 @@ static int read_mv_component_diff(Dav1dTileContext *const t, mv_comp->classN[n]) << n; if (have_fp) { fp = dav1d_msac_decode_symbol_adapt4(&ts->msac, - mv_comp->classN_fp, 4); + mv_comp->classN_fp, 3); hp = have_hp ? 
dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->classN_hp) : 1; } else { @@ -120,7 +120,7 @@ static void read_mv_residual(Dav1dTileContext *const t, mv *const ref_mv, CdfMvContext *const mv_cdf, const int have_fp) { switch (dav1d_msac_decode_symbol_adapt4(&t->ts->msac, t->ts->cdf.mv.joint, - N_MV_JOINTS)) + N_MV_JOINTS - 1)) { case MV_JOINT_HV: ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp); @@ -380,7 +380,7 @@ static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b, Dav1dTileState *const ts = t->ts; const Dav1dFrameContext *const f = t->f; const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac, - ts->cdf.m.pal_sz[pl][sz_ctx], 7) + 2; + ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2; uint16_t cache[16], used_cache[8]; int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4]; int n_cache = 0; @@ -586,7 +586,7 @@ static void read_pal_indices(Dav1dTileContext *const t, Dav1dTileState *const ts = t->ts; const ptrdiff_t stride = bw4 * 4; pal_idx[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]); - uint16_t (*const color_map_cdf)[8 + 1] = + uint16_t (*const color_map_cdf)[8] = ts->cdf.m.color_map[pl][b->pal_sz[pl] - 2]; uint8_t (*const order)[8] = t->scratch.pal_order; uint8_t *const ctx = t->scratch.pal_ctx; @@ -597,7 +597,7 @@ static void read_pal_indices(Dav1dTileContext *const t, order_palette(pal_idx, stride, i, first, last, order, ctx); for (int j = first, m = 0; j >= last; j--, m++) { const int color_idx = dav1d_msac_decode_symbol_adapt8(&ts->msac, - color_map_cdf[ctx[m]], b->pal_sz[pl]); + color_map_cdf[ctx[m]], b->pal_sz[pl] - 1); pal_idx[(i - j) * stride + j] = order[m][color_idx]; } } @@ -813,7 +813,7 @@ static int decode_b(Dav1dTileContext *const t, &seg_ctx, f->cur_segmap, f->b4_stride); const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.seg_id[seg_ctx], - DAV1D_MAX_SEGMENTS); + DAV1D_MAX_SEGMENTS - 1); const unsigned last_active_seg_id = 
f->frame_hdr->segmentation.seg_data.last_active_segid; b->seg_id = neg_deinterleave(diff, pred_seg_id, @@ -885,7 +885,7 @@ static int decode_b(Dav1dTileContext *const t, } else { const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.seg_id[seg_ctx], - DAV1D_MAX_SEGMENTS); + DAV1D_MAX_SEGMENTS - 1); const unsigned last_active_seg_id = f->frame_hdr->segmentation.seg_data.last_active_segid; b->seg_id = neg_deinterleave(diff, pred_seg_id, @@ -933,7 +933,7 @@ static int decode_b(Dav1dTileContext *const t, if (have_delta_q) { int delta_q = dav1d_msac_decode_symbol_adapt4(&ts->msac, - ts->cdf.m.delta_q, 4); + ts->cdf.m.delta_q, 3); if (delta_q == 3) { const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3); delta_q = dav1d_msac_decode_bools(&ts->msac, n_bits) + @@ -954,7 +954,7 @@ static int decode_b(Dav1dTileContext *const t, for (int i = 0; i < n_lfs; i++) { int delta_lf = dav1d_msac_decode_symbol_adapt4(&ts->msac, - ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 4); + ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 3); if (delta_lf == 3) { const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3); delta_lf = dav1d_msac_decode_bools(&ts->msac, n_bits) + @@ -1019,7 +1019,7 @@ static int decode_b(Dav1dTileContext *const t, ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]] [dav1d_intra_mode_context[t->l.mode[by4]]]; b->y_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, ymode_cdf, - N_INTRA_PRED_MODES); + N_INTRA_PRED_MODES - 1); if (DEBUG_BLOCK_INFO) printf("Post-ymode[%d]: r=%d\n", b->y_mode, ts->msac.rng); @@ -1028,7 +1028,7 @@ static int decode_b(Dav1dTileContext *const t, b->y_mode <= VERT_LEFT_PRED) { uint16_t *const acdf = ts->cdf.m.angle_delta[b->y_mode - VERT_PRED]; - const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 7); + const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6); b->y_angle = angle - 3; } else { b->y_angle = 0; @@ -1039,20 +1039,20 @@ static int 
decode_b(Dav1dTileContext *const t, cbw4 == 1 && cbh4 == 1 : !!(cfl_allowed_mask & (1 << bs)); uint16_t *const uvmode_cdf = ts->cdf.m.uv_mode[cfl_allowed][b->y_mode]; b->uv_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, uvmode_cdf, - N_UV_INTRA_PRED_MODES - !cfl_allowed); + N_UV_INTRA_PRED_MODES - 1 - !cfl_allowed); if (DEBUG_BLOCK_INFO) printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng); if (b->uv_mode == CFL_PRED) { #define SIGN(a) (!!(a) + ((a) > 0)) const int sign = dav1d_msac_decode_symbol_adapt8(&ts->msac, - ts->cdf.m.cfl_sign, 8) + 1; + ts->cdf.m.cfl_sign, 7) + 1; const int sign_u = sign * 0x56 >> 8, sign_v = sign - sign_u * 3; assert(sign_u == sign / 3); if (sign_u) { const int ctx = (sign_u == 2) * 3 + sign_v; b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt16(&ts->msac, - ts->cdf.m.cfl_alpha[ctx], 16) + 1; + ts->cdf.m.cfl_alpha[ctx], 15) + 1; if (sign_u == 1) b->cfl_alpha[0] = -b->cfl_alpha[0]; } else { b->cfl_alpha[0] = 0; @@ -1060,7 +1060,7 @@ static int decode_b(Dav1dTileContext *const t, if (sign_v) { const int ctx = (sign_v == 2) * 3 + sign_u; b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt16(&ts->msac, - ts->cdf.m.cfl_alpha[ctx], 16) + 1; + ts->cdf.m.cfl_alpha[ctx], 15) + 1; if (sign_v == 1) b->cfl_alpha[1] = -b->cfl_alpha[1]; } else { b->cfl_alpha[1] = 0; @@ -1073,7 +1073,7 @@ static int decode_b(Dav1dTileContext *const t, b->uv_mode <= VERT_LEFT_PRED) { uint16_t *const acdf = ts->cdf.m.angle_delta[b->uv_mode - VERT_PRED]; - const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 7); + const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6); b->uv_angle = angle - 3; } else { b->uv_angle = 0; @@ -1114,7 +1114,7 @@ static int decode_b(Dav1dTileContext *const t, if (is_filter) { b->y_mode = FILTER_PRED; b->y_angle = dav1d_msac_decode_symbol_adapt4(&ts->msac, - ts->cdf.m.filter_intra, 5); + ts->cdf.m.filter_intra, 4); } if (DEBUG_BLOCK_INFO) printf("Post-filterintramode[%d/%d]: r=%d\n", @@ -1157,7 +1157,7 @@ 
static int decode_b(Dav1dTileContext *const t, const int tctx = get_tx_ctx(t->a, &t->l, t_dim, by4, bx4); uint16_t *const tx_cdf = ts->cdf.m.txsz[t_dim->max - 1][tctx]; int depth = dav1d_msac_decode_symbol_adapt4(&ts->msac, tx_cdf, - imin(t_dim->max + 1, 3)); + imin(t_dim->max, 2)); while (depth--) { b->tx = t_dim->sub; @@ -1479,7 +1479,7 @@ static int decode_b(Dav1dTileContext *const t, b->inter_mode = dav1d_msac_decode_symbol_adapt8(&ts->msac, ts->cdf.m.comp_inter_mode[ctx], - N_COMP_INTER_PRED_MODES); + N_COMP_INTER_PRED_MODES - 1); if (DEBUG_BLOCK_INFO) printf("Post-compintermode[%d,ctx=%d,n_mvs=%d]: r=%d\n", b->inter_mode, ctx, n_mvs, ts->msac.rng); @@ -1587,7 +1587,7 @@ static int decode_b(Dav1dTileContext *const t, ts->cdf.m.wedge_comp[ctx]); if (b->comp_type == COMP_INTER_WEDGE) b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac, - ts->cdf.m.wedge_idx[ctx], 16); + ts->cdf.m.wedge_idx[ctx], 15); } else { b->comp_type = COMP_INTER_SEG; } @@ -1742,14 +1742,14 @@ static int decode_b(Dav1dTileContext *const t, { b->interintra_mode = dav1d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.interintra_mode[ii_sz_grp], - N_INTER_INTRA_PRED_MODES); + N_INTER_INTRA_PRED_MODES - 1); const int wedge_ctx = dav1d_wedge_ctx_lut[bs]; b->interintra_type = INTER_INTRA_BLEND + dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.interintra_wedge[wedge_ctx]); if (b->interintra_type == INTER_INTRA_WEDGE) b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac, - ts->cdf.m.wedge_idx[wedge_ctx], 16); + ts->cdf.m.wedge_idx[wedge_ctx], 15); } else { b->interintra_type = INTER_INTRA_NONE; } @@ -1782,7 +1782,7 @@ static int decode_b(Dav1dTileContext *const t, b->motion_mode = allow_warp ? 
dav1d_msac_decode_symbol_adapt4(&ts->msac, - ts->cdf.m.motion_mode[bs], 3) : + ts->cdf.m.motion_mode[bs], 2) : dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.obmc[bs]); if (b->motion_mode == MM_WARP) { has_subpel_filter = 0; @@ -1822,7 +1822,7 @@ static int decode_b(Dav1dTileContext *const t, by4, bx4); filter[0] = dav1d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.filter[0][ctx1], - DAV1D_N_SWITCHABLE_FILTERS); + DAV1D_N_SWITCHABLE_FILTERS - 1); if (f->seq_hdr->dual_filter) { const int ctx2 = get_filter_ctx(t->a, &t->l, comp, 1, b->ref[0], by4, bx4); @@ -1831,7 +1831,7 @@ static int decode_b(Dav1dTileContext *const t, filter[0], ctx1, ts->msac.rng); filter[1] = dav1d_msac_decode_symbol_adapt4(&ts->msac, ts->cdf.m.filter[1][ctx2], - DAV1D_N_SWITCHABLE_FILTERS); + DAV1D_N_SWITCHABLE_FILTERS - 1); if (DEBUG_BLOCK_INFO) printf("Post-subpel_filter2[%d,ctx=%d]: r=%d\n", filter[1], ctx2, ts->msac.rng); @@ -2022,9 +2022,8 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl, const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx]; bp = b->bl == bl ? b->bp : PARTITION_SPLIT; } else { - const unsigned n_part = bl == BL_8X8 ? N_SUB8X8_PARTITIONS : - bl == BL_128X128 ? N_PARTITIONS - 2 : N_PARTITIONS; - bp = dav1d_msac_decode_symbol_adapt16(&t->ts->msac, pc, n_part); + bp = dav1d_msac_decode_symbol_adapt16(&t->ts->msac, pc, + dav1d_partition_type_count[bl]); if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 && (bp == PARTITION_V || bp == PARTITION_V4 || bp == PARTITION_T_LEFT_SPLIT || bp == PARTITION_T_RIGHT_SPLIT)) @@ -2380,7 +2379,7 @@ static void read_restoration_info(Dav1dTileContext *const t, if (frame_type == DAV1D_RESTORATION_SWITCHABLE) { const int filter = dav1d_msac_decode_symbol_adapt4(&ts->msac, - ts->cdf.m.restore_switchable, 3); + ts->cdf.m.restore_switchable, 2); lr->type = filter ? filter == 2 ? 
DAV1D_RESTORATION_SGRPROJ : DAV1D_RESTORATION_WIENER : DAV1D_RESTORATION_NONE; @@ -2636,9 +2635,13 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { goto error; } } + Dav1dTileState *ts_new = dav1d_alloc_aligned(sizeof(*f->ts) * n_ts, 32); + if (!ts_new) goto error; if (n_ts > f->n_ts) { - Dav1dTileState *ts_new = realloc(f->ts, sizeof(*f->ts) * n_ts); - if (!ts_new) goto error; + if (f->ts) { + memcpy(ts_new, f->ts, sizeof(*f->ts) * f->n_ts); + dav1d_free_aligned(f->ts); + } f->ts = ts_new; for (int n = f->n_ts; n < n_ts; f->n_ts = ++n) { Dav1dTileState *const ts = &f->ts[n]; @@ -2654,9 +2657,9 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) { pthread_cond_destroy(&ts->tile_thread.cond); pthread_mutex_destroy(&ts->tile_thread.lock); } + memcpy(ts_new, f->ts, sizeof(*f->ts) * n_ts); + dav1d_free_aligned(f->ts); f->n_ts = n_ts; - Dav1dTileState *ts_new = realloc(f->ts, sizeof(*f->ts) * n_ts); - if (!ts_new) goto error; f->ts = ts_new; } } diff --git a/src/internal.h b/src/internal.h index 45863d2..86caee4 100644 --- a/src/internal.h +++ b/src/internal.h @@ -241,14 +241,14 @@ struct Dav1dFrameContext { }; struct Dav1dTileState { + CdfContext cdf; + MsacContext msac; + struct { int col_start, col_end, row_start, row_end; // in 4px units int col, row; // in tile units } tiling; - CdfContext cdf; - MsacContext msac; - atomic_int progress; // in sby units, TILE_ERROR after a decoding error struct { pthread_mutex_t lock; @@ -502,7 +502,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) { pthread_cond_destroy(&ts->tile_thread.cond); pthread_mutex_destroy(&ts->tile_thread.lock); } - free(f->ts); + dav1d_free_aligned(f->ts); dav1d_free_aligned(f->tc); dav1d_free_aligned(f->ipred_edge[0]); free(f->a); @@ -116,42 +116,39 @@ int dav1d_msac_decode_subexp(MsacContext *const s, const int ref, /* Decodes a symbol given an inverse cumulative distribution function (CDF) * table in Q15. 
*/ -static unsigned decode_symbol(MsacContext *const s, const uint16_t *const cdf, - const size_t n_symbols) +unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s, + uint16_t *const cdf, + const size_t n_symbols) { - const unsigned c = s->dif >> (EC_WIN_SIZE - 16); - unsigned u, v = s->rng, r = s->rng >> 8, ret = 0; + const unsigned c = s->dif >> (EC_WIN_SIZE - 16), r = s->rng >> 8; + unsigned u, v = s->rng, val = -1; - assert(!cdf[n_symbols - 1]); + assert(n_symbols <= 15); + assert(cdf[n_symbols] <= 32); do { + val++; u = v; - v = r * (cdf[ret++] >> EC_PROB_SHIFT); + v = r * (cdf[val] >> EC_PROB_SHIFT); v >>= 7 - EC_PROB_SHIFT; - v += EC_MIN_PROB * (int) (n_symbols - ret); + v += EC_MIN_PROB * ((unsigned)n_symbols - val); } while (c < v); assert(u <= s->rng); ctx_norm(s, s->dif - ((ec_win)v << (EC_WIN_SIZE - 16)), u - v); - return ret - 1; -} -unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s, - uint16_t *const cdf, - const size_t n_symbols) -{ - const unsigned val = decode_symbol(s, cdf, n_symbols); if (s->allow_update_cdf) { const unsigned count = cdf[n_symbols]; - const int rate = ((count >> 4) | 4) + (n_symbols > 3); + const unsigned rate = 4 + (count >> 4) + (n_symbols > 2); unsigned i; for (i = 0; i < val; i++) cdf[i] += (32768 - cdf[i]) >> rate; - for (; i < n_symbols - 1; i++) + for (; i < n_symbols; i++) cdf[i] -= cdf[i] >> rate; cdf[n_symbols] = count + (count < 32); } + return val; } @@ -163,7 +160,7 @@ unsigned dav1d_msac_decode_bool_adapt_c(MsacContext *const s, if (s->allow_update_cdf) { // update_cdf() specialized for boolean CDFs const unsigned count = cdf[1]; - const int rate = (count >> 4) | 4; + const int rate = 4 + (count >> 4); if (bit) cdf[0] += (32768 - cdf[0]) >> rate; else @@ -60,7 +60,7 @@ unsigned dav1d_msac_decode_bool_equi_c(MsacContext *s); unsigned dav1d_msac_decode_bool_c(MsacContext *s, unsigned f); int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k); -/* Supported n_symbols ranges: 
adapt4: 1-5, adapt8: 1-8, adapt16: 4-16 */ +/* Supported n_symbols ranges: adapt4: 1-4, adapt8: 1-7, adapt16: 3-15 */ #ifndef dav1d_msac_decode_symbol_adapt4 #define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt_c #endif diff --git a/src/recon_tmpl.c b/src/recon_tmpl.c index 7194c73..71e7e80 100644 --- a/src/recon_tmpl.c +++ b/src/recon_tmpl.c @@ -104,11 +104,11 @@ static int decode_coefs(Dav1dTileContext *const t, dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode; if (f->frame_hdr->reduced_txtp_set || t_dim->min == TX_16X16) { idx = dav1d_msac_decode_symbol_adapt4(&ts->msac, - ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 5); + ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 4); *txtp = dav1d_tx_types_per_set[idx + 0]; } else { idx = dav1d_msac_decode_symbol_adapt8(&ts->msac, - ts->cdf.m.txtp_intra1[t_dim->min][y_mode_nofilt], 7); + ts->cdf.m.txtp_intra1[t_dim->min][y_mode_nofilt], 6); *txtp = dav1d_tx_types_per_set[idx + 5]; } if (dbg) @@ -121,11 +121,11 @@ static int decode_coefs(Dav1dTileContext *const t, *txtp = (idx - 1) & IDTX; /* idx ? 
DCT_DCT : IDTX */ } else if (t_dim->min == TX_16X16) { idx = dav1d_msac_decode_symbol_adapt16(&ts->msac, - ts->cdf.m.txtp_inter2, 12); + ts->cdf.m.txtp_inter2, 11); *txtp = dav1d_tx_types_per_set[idx + 12]; } else { idx = dav1d_msac_decode_symbol_adapt16(&ts->msac, - ts->cdf.m.txtp_inter1[t_dim->min], 16); + ts->cdf.m.txtp_inter1[t_dim->min], 15); *txtp = dav1d_tx_types_per_set[idx + 24]; } if (dbg) @@ -143,7 +143,7 @@ static int decode_coefs(Dav1dTileContext *const t, #define case_sz(sz, bin, ns, is_1d) \ case sz: { \ uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \ - eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 5 + sz); \ + eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \ break; \ } case_sz(0, 16, 4, [is_1d]); @@ -175,7 +175,7 @@ static int decode_coefs(Dav1dTileContext *const t, } // base tokens - uint16_t (*const br_cdf)[5] = + uint16_t (*const br_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma]; const int16_t *const scan = dav1d_scans[tx][tx_class]; int dc_tok; @@ -193,7 +193,7 @@ static int decode_coefs(Dav1dTileContext *const t, const int ctx = 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4); uint16_t *const lo_cdf = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][ctx]; - int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 3); + int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 2); int tok = 1 + tok_br; if (dbg) printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", @@ -209,26 +209,26 @@ static int decode_coefs(Dav1dTileContext *const t, const int br_ctx = get_br_ctx(levels, 1, tx_class, x, y, stride); tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, - br_cdf[br_ctx], 4); + br_cdf[br_ctx], 3); tok = 3 + tok_br; dbg_print_hi_tok(eob, tok + tok_br, tok_br); if (tok_br == 3) { tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, - br_cdf[br_ctx], 4); + br_cdf[br_ctx], 3); tok = 6 + tok_br; dbg_print_hi_tok(eob, tok + tok_br, tok_br); if (tok_br == 3) { 
tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, br_cdf[br_ctx], - 4); + 3); tok = 9 + tok_br; dbg_print_hi_tok(eob, tok + tok_br, tok_br); if (tok_br == 3) { tok = 12 + dav1d_msac_decode_symbol_adapt4(&ts->msac, br_cdf[br_ctx], - 4); + 3); dbg_print_hi_tok(eob, tok + tok_br, tok_br); } } @@ -244,7 +244,7 @@ static int decode_coefs(Dav1dTileContext *const t, // lo tok const int ctx = get_coef_nz_ctx(levels, tx, tx_class, x, y, stride); uint16_t *const lo_cdf = ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx]; - int tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 4); + int tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 3); if (dbg) printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng); @@ -254,26 +254,26 @@ static int decode_coefs(Dav1dTileContext *const t, const int br_ctx = get_br_ctx(levels, 1, tx_class, x, y, stride); int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, - br_cdf[br_ctx], 4); + br_cdf[br_ctx], 3); tok = 3 + tok_br; dbg_print_hi_tok(i, tok + tok_br, tok_br); if (tok_br == 3) { tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, - br_cdf[br_ctx], 4); + br_cdf[br_ctx], 3); tok = 6 + tok_br; dbg_print_hi_tok(i, tok + tok_br, tok_br); if (tok_br == 3) { tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, br_cdf[br_ctx], - 4); + 3); tok = 9 + tok_br; dbg_print_hi_tok(i, tok + tok_br, tok_br); if (tok_br == 3) { tok = 12 + dav1d_msac_decode_symbol_adapt4(&ts->msac, br_cdf[br_ctx], - 4); + 3); dbg_print_hi_tok(i, tok + tok_br, tok_br); } } @@ -287,7 +287,7 @@ static int decode_coefs(Dav1dTileContext *const t, const int ctx = (tx_class != TX_CLASS_2D) ? 
get_coef_nz_ctx(levels, tx, tx_class, 0, 0, stride) : 0; uint16_t *const lo_cdf = ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx]; - dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 4); + dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 3); if (dbg) printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); @@ -302,27 +302,27 @@ static int decode_coefs(Dav1dTileContext *const t, const int br_ctx = get_br_ctx(levels, 0, tx_class, 0, 0, stride); int tok_br = - dav1d_msac_decode_symbol_adapt4(&ts->msac, br_cdf[br_ctx], 4); + dav1d_msac_decode_symbol_adapt4(&ts->msac, br_cdf[br_ctx], 3); dc_tok = 3 + tok_br; dbg_print_hi_tok(dc_tok + tok_br, tok_br); if (tok_br == 3) { tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, - br_cdf[br_ctx], 4); + br_cdf[br_ctx], 3); dc_tok = 6 + tok_br; dbg_print_hi_tok(dc_tok + tok_br, tok_br); if (tok_br == 3) { tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, br_cdf[br_ctx], - 4); + 3); dc_tok = 9 + tok_br; dbg_print_hi_tok(dc_tok + tok_br, tok_br); if (tok_br == 3) { dc_tok = 12 + dav1d_msac_decode_symbol_adapt4(&ts->msac, br_cdf[br_ctx], - 4); + 3); dbg_print_hi_tok(dc_tok + tok_br, tok_br); } } @@ -332,7 +332,7 @@ static int decode_coefs(Dav1dTileContext *const t, } } else { // dc-only uint16_t *const lo_cdf = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][0]; - int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 3); + int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 2); dc_tok = 1 + tok_br; if (dbg) printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", @@ -345,24 +345,24 @@ static int decode_coefs(Dav1dTileContext *const t, printf("Post-dc_hi_tok[%d][%d][0][%d->%d]: r=%d\n", \ imin(t_dim->ctx, 3), chroma, tok_br, dc_tok, ts->msac.rng); - tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, br_cdf[0], 4); + tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, br_cdf[0], 3); dc_tok = 3 + tok_br; dbg_print_hi_tok(dc_tok + tok_br, tok_br); if (tok_br == 3) 
{ - tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, br_cdf[0], 4); + tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, br_cdf[0], 3); dc_tok = 6 + tok_br; dbg_print_hi_tok(dc_tok + tok_br, tok_br); if (tok_br == 3) { tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, - br_cdf[0], 4); + br_cdf[0], 3); dc_tok = 9 + tok_br; dbg_print_hi_tok(dc_tok + tok_br, tok_br); if (tok_br == 3) { dc_tok = 12 + dav1d_msac_decode_symbol_adapt4(&ts->msac, - br_cdf[0], 4); + br_cdf[0], 3); dbg_print_hi_tok(dc_tok + tok_br, tok_br); } } diff --git a/src/tables.c b/src/tables.c index 4f8c623..38ea6aa 100644 --- a/src/tables.c +++ b/src/tables.c @@ -225,6 +225,14 @@ const uint8_t /* enum InterPredMode */ [NEARMV_NEWMV] = { NEARMV, NEWMV }, }; +const uint8_t dav1d_partition_type_count[N_BL_LEVELS] = { + [BL_128X128] = N_PARTITIONS - 3, + [BL_64X64] = N_PARTITIONS - 1, + [BL_32X32] = N_PARTITIONS - 1, + [BL_16X16] = N_PARTITIONS - 1, + [BL_8X8] = N_SUB8X8_PARTITIONS - 1, +}; + const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40] = { /* Intra2 */ IDTX, DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST, diff --git a/src/tables.h b/src/tables.h index fb5b3f8..749bfdd 100644 --- a/src/tables.h +++ b/src/tables.h @@ -52,6 +52,7 @@ extern const uint8_t /* enum TxfmType */ extern const uint8_t /* enum InterPredMode */ dav1d_comp_inter_pred_modes[N_COMP_INTER_PRED_MODES][2]; +extern const uint8_t dav1d_partition_type_count[N_BL_LEVELS]; extern const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40]; extern const uint8_t dav1d_filter_mode_to_y_mode[5]; diff --git a/src/x86/msac.asm b/src/x86/msac.asm index c19e159..b896f74 100644 --- a/src/x86/msac.asm +++ b/src/x86/msac.asm @@ -88,7 +88,7 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6 movp m3, [t0+msac.dif] mov t3d, [t0+msac.update_cdf] mov t4d, t2d - neg t2 + not t2 ; -(n_symbols + 1) pshuflw m2, m2, q0000 movd [buf+12], m2 pand m2, [rax] @@ -112,15 +112,15 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6 pcmpeqw m2, m2 mov t2d, t3d shr 
t3d, 4 - cmp t4d, 4 - sbb t3d, -5 ; (count >> 4) + (n_symbols > 3) + 4 + cmp t4d, 3 + sbb t3d, -5 ; (count >> 4) + (n_symbols > 2) + 4 cmp t2d, 32 adc t2d, 0 ; count + (count < 32) movd m3, t3d pavgw m2, m1 ; i >= val ? -1 : 32768 psubw m2, m0 ; for (i = 0; i < val; i++) psubw m0, m1 ; cdf[i] += (32768 - cdf[i]) >> rate; - psraw m2, m3 ; for (; i < n_symbols - 1; i++) + psraw m2, m3 ; for (; i < n_symbols; i++) paddw m0, m2 ; cdf[i] += (( -1 - cdf[i]) >> rate) + 1; movq [t1], m0 mov [t1+t4*2], t2w @@ -214,11 +214,11 @@ cglobal msac_decode_symbol_adapt8, 0, 6, 6 DECODE_SYMBOL_ADAPT_INIT LEA rax, pw_0xff00 movd m2, [t0+msac.rng] - movu m1, [t1] + mova m1, [t1] movp m3, [t0+msac.dif] mov t3d, [t0+msac.update_cdf] mov t4d, t2d - neg t2 + not t2 pshuflw m2, m2, q0000 movd [buf+12], m2 punpcklqdq m2, m2 @@ -242,7 +242,7 @@ cglobal msac_decode_symbol_adapt8, 0, 6, 6 pcmpeqw m2, m2 mov t2d, t3d shr t3d, 4 - cmp t4d, 4 ; may be called with n_symbols < 4 + cmp t4d, 3 ; may be called with n_symbols <= 2 sbb t3d, -5 cmp t2d, 32 adc t2d, 0 @@ -252,7 +252,7 @@ cglobal msac_decode_symbol_adapt8, 0, 6, 6 psubw m0, m1 psraw m2, m3 paddw m0, m2 - movu [t1], m0 + mova [t1], m0 mov [t1+t4*2], t2w jmp m(msac_decode_symbol_adapt4).renorm @@ -260,12 +260,12 @@ cglobal msac_decode_symbol_adapt16, 0, 6, 6 DECODE_SYMBOL_ADAPT_INIT LEA rax, pw_0xff00 movd m4, [t0+msac.rng] - movu m2, [t1] - movu m3, [t1+16] + mova m2, [t1] + mova m3, [t1+16] movp m5, [t0+msac.dif] mov t3d, [t0+msac.update_cdf] mov t4d, t2d - neg t2 + not t2 %if WIN64 sub rsp, 48 ; need 36 bytes, shadow space is only 32 %endif @@ -288,8 +288,8 @@ cglobal msac_decode_symbol_adapt16, 0, 6, 6 punpcklqdq m5, m5 paddw m3, m4 mova [buf], m2 - mova [buf+16], m3 psubusw m2, m5 + mova [buf+16], m3 psubusw m3, m5 pxor m4, m4 pcmpeqw m2, m4 @@ -301,7 +301,7 @@ cglobal msac_decode_symbol_adapt16, 0, 6, 6 movzx t3d, word [t1+t4*2] pcmpeqw m4, m4 mova m5, m4 - lea t2d, [t3+80] ; only support n_symbols >= 4 + lea t2d, [t3+80] ; only support 
n_symbols > 2 shr t2d, 4 cmp t3d, 32 adc t3d, 0 @@ -316,8 +316,8 @@ cglobal msac_decode_symbol_adapt16, 0, 6, 6 psraw m5, m2 paddw m0, m4 paddw m1, m5 - movu [t1], m0 - movu [t1+16], m1 + mova [t1], m0 + mova [t1+16], m1 mov [t1+t4*2], t3w .renorm: tzcnt eax, eax diff --git a/tests/checkasm/msac.c b/tests/checkasm/msac.c index b2d21b5..3b75055 100644 --- a/tests/checkasm/msac.c +++ b/tests/checkasm/msac.c @@ -51,12 +51,14 @@ typedef struct { decode_bool_fn bool; } MsacDSPContext; -static void randomize_cdf(uint16_t *const cdf, int n) { - for (int i = 16; i > n; i--) - cdf[i] = rnd(); /* randomize padding */ - cdf[n] = cdf[n-1] = 0; - while (--n > 0) - cdf[n-1] = cdf[n] + rnd() % (32768 - cdf[n] - n) + 1; +static void randomize_cdf(uint16_t *const cdf, const int n) { + int i; + for (i = 15; i > n; i--) + cdf[i] = rnd(); // padding + cdf[i] = 0; // count + do { + cdf[i - 1] = cdf[i] + rnd() % (32768 - cdf[i] - i) + 1; + } while (--i > 0); } /* memcmp() on structs can have weird behavior due to padding etc. 
*/ @@ -69,7 +71,7 @@ static int msac_cmp(const MsacContext *const a, const MsacContext *const b) { static void msac_dump(unsigned c_res, unsigned a_res, const MsacContext *const a, const MsacContext *const b, const uint16_t *const cdf_a, const uint16_t *const cdf_b, - int num_cdf) + const int num_cdf) { if (c_res != a_res) fprintf(stderr, "c_res %u a_res %u\n", c_res, a_res); @@ -86,16 +88,15 @@ static void msac_dump(unsigned c_res, unsigned a_res, if (a->allow_update_cdf) fprintf(stderr, "allow_update_cdf %d vs %d\n", a->allow_update_cdf, b->allow_update_cdf); - if (cdf_a != NULL && cdf_b != NULL && - memcmp(cdf_a, cdf_b, sizeof(*cdf_a) * num_cdf)) { + if (num_cdf && memcmp(cdf_a, cdf_b, sizeof(*cdf_a) * (num_cdf + 1))) { fprintf(stderr, "cdf:\n"); - for (int i = 0; i < num_cdf; i++) + for (int i = 0; i <= num_cdf; i++) fprintf(stderr, " %5u", cdf_a[i]); fprintf(stderr, "\n"); - for (int i = 0; i < num_cdf; i++) + for (int i = 0; i <= num_cdf; i++) fprintf(stderr, " %5u", cdf_b[i]); fprintf(stderr, "\n"); - for (int i = 0; i < num_cdf; i++) + for (int i = 0; i <= num_cdf; i++) fprintf(stderr, " %c", cdf_a[i] != cdf_b[i] ? 
'x' : '.'); fprintf(stderr, "\n"); } @@ -105,7 +106,7 @@ static void msac_dump(unsigned c_res, unsigned a_res, if (check_func(c->symbol_adapt##n, "msac_decode_symbol_adapt%d", n)) { \ for (int cdf_update = 0; cdf_update <= 1; cdf_update++) { \ for (int ns = n_min; ns <= n_max; ns++) { \ - dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update); \ + dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update); \ s_a = s_c; \ randomize_cdf(cdf[0], ns); \ memcpy(cdf[1], cdf[0], sizeof(*cdf)); \ @@ -117,26 +118,24 @@ static void msac_dump(unsigned c_res, unsigned a_res, { \ if (fail()) \ msac_dump(c_res, a_res, &s_c, &s_a, \ - cdf[0], cdf[1], ns + 1); \ + cdf[0], cdf[1], ns); \ } \ } \ - if (cdf_update && ns == n) \ - bench_new(&s_a, cdf[0], n); \ + if (cdf_update && ns == n - 1) \ + bench_new(&s_a, cdf[1], ns); \ } \ } \ } \ } while (0) static void check_decode_symbol(MsacDSPContext *const c, uint8_t *const buf) { - /* Use an aligned CDF buffer for more consistent benchmark - * results, and a misaligned one for checking correctness. */ - ALIGN_STK_16(uint16_t, cdf, 2, [17]); + ALIGN_STK_32(uint16_t, cdf, 2, [16]); MsacContext s_c, s_a; declare_func(unsigned, MsacContext *s, uint16_t *cdf, size_t n_symbols); - CHECK_SYMBOL_ADAPT( 4, 1, 5); - CHECK_SYMBOL_ADAPT( 8, 1, 8); - CHECK_SYMBOL_ADAPT(16, 4, 16); + CHECK_SYMBOL_ADAPT( 4, 1, 4); + CHECK_SYMBOL_ADAPT( 8, 1, 7); + CHECK_SYMBOL_ADAPT(16, 3, 15); report("decode_symbol"); } @@ -158,11 +157,11 @@ static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) { memcmp(cdf[0], cdf[1], sizeof(*cdf))) { if (fail()) - msac_dump(c_res, a_res, &s_c, &s_a, cdf[0], cdf[1], 2); + msac_dump(c_res, a_res, &s_c, &s_a, cdf[0], cdf[1], 1); } } if (cdf_update) - bench_new(&s_a, cdf[0]); + bench_new(&s_a, cdf[1]); } } |