From a029d6892c5c39f4cda629d4a3b676ef2e8288f6 Mon Sep 17 00:00:00 2001
From: Henrik Gramner <gramner@twoorioles.com>
Date: Mon, 18 Jul 2022 15:17:00 +0200
Subject: Adjust inlining attributes on some functions

The code size increase of inlining every call to certain functions
isn't a worthwhile trade-off, and most compilers actually ends up
overriding those particular inlining hints anyway.

In some cases it's also better to split the function into separate
luma and chroma functions.
---
 src/decode.c  | 56 ++++++++++++++++++++++++++++++++++++--------------------
 src/lf_mask.c | 53 +++++++++++++++++++++++++++++++----------------------
 2 files changed, 67 insertions(+), 42 deletions(-)

diff --git a/src/decode.c b/src/decode.c
index b44f157..8a1ef7c 100644
--- a/src/decode.c
+++ b/src/decode.c
@@ -749,9 +749,9 @@ static inline void splat_intraref(const Dav1dContext *const c,
     c->refmvs_dsp.splat_mv(&t->rt.r[(t->by & 31) + 5], &tmpl, t->bx, bw4, bh4);
 }
 
-static inline void mc_lowest_px(int *const dst, const int by4, const int bh4,
-                                const int mvy, const int ss_ver,
-                                const struct ScalableMotionParams *const smp)
+static void mc_lowest_px(int *const dst, const int by4, const int bh4,
+                         const int mvy, const int ss_ver,
+                         const struct ScalableMotionParams *const smp)
 {
     const int v_mul = 4 >> ss_ver;
     if (!smp->scale) {
@@ -766,14 +766,11 @@ static inline void mc_lowest_px(int *const dst, const int by4, const int bh4,
     }
 }
 
-static inline void affine_lowest_px(Dav1dTaskContext *const t,
-                                    int *const dst, const int is_chroma,
-                                    const uint8_t *const b_dim,
-                                    const Dav1dWarpedMotionParams *const wmp)
+static ALWAYS_INLINE void affine_lowest_px(Dav1dTaskContext *const t, int *const dst,
+                                           const uint8_t *const b_dim,
+                                           const Dav1dWarpedMotionParams *const wmp,
+                                           const int ss_ver, const int ss_hor)
 {
-    const Dav1dFrameContext *const f = t->f;
-    const int ss_ver = is_chroma && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = is_chroma && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
     const int h_mul = 4 >> ss_hor, v_mul = 4 >> ss_ver;
     assert(!((b_dim[0] * h_mul) & 7) && !((b_dim[1] * v_mul) & 7));
     const int32_t *const mat = wmp->matrix;
@@ -792,6 +789,25 @@ static inline void affine_lowest_px(Dav1dTaskContext *const t,
     }
 }
 
+static NOINLINE void affine_lowest_px_luma(Dav1dTaskContext *const t, int *const dst,
+                                           const uint8_t *const b_dim,
+                                           const Dav1dWarpedMotionParams *const wmp)
+{
+    affine_lowest_px(t, dst, b_dim, wmp, 0, 0);
+}
+
+static NOINLINE void affine_lowest_px_chroma(Dav1dTaskContext *const t, int *const dst,
+                                             const uint8_t *const b_dim,
+                                             const Dav1dWarpedMotionParams *const wmp)
+{
+    const Dav1dFrameContext *const f = t->f;
+    assert(f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400);
+    if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I444)
+        affine_lowest_px_luma(t, dst, b_dim, wmp);
+    else
+        affine_lowest_px(t, dst, b_dim, wmp, f->cur.p.layout & DAV1D_PIXEL_LAYOUT_I420, 1);
+}
+
 static void obmc_lowest_px(Dav1dTaskContext *const t,
                            int (*const dst)[2], const int is_chroma,
                            const uint8_t *const b_dim,
@@ -2150,9 +2166,9 @@ static int decode_b(Dav1dTaskContext *const t,
                 ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
                  (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
             {
-                affine_lowest_px(t, &lowest_px[b->ref[0]][0], 0, b_dim,
-                                 b->motion_mode == MM_WARP ? &t->warpmv :
-                                     &f->frame_hdr->gmv[b->ref[0]]);
+                affine_lowest_px_luma(t, &lowest_px[b->ref[0]][0], b_dim,
+                                      b->motion_mode == MM_WARP ? &t->warpmv :
+                                      &f->frame_hdr->gmv[b->ref[0]]);
             } else {
                 mc_lowest_px(&lowest_px[b->ref[0]][0], t->by, bh4, b->mv[0].y,
                              0, &f->svc[b->ref[0]][1]);
@@ -2203,9 +2219,9 @@ static int decode_b(Dav1dTaskContext *const t,
                         ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
                          (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
                     {
-                        affine_lowest_px(t, &lowest_px[b->ref[0]][1], 1, b_dim,
-                                         b->motion_mode == MM_WARP ? &t->warpmv :
-                                            &f->frame_hdr->gmv[b->ref[0]]);
+                        affine_lowest_px_chroma(t, &lowest_px[b->ref[0]][1], b_dim,
+                                                b->motion_mode == MM_WARP ? &t->warpmv :
+                                                &f->frame_hdr->gmv[b->ref[0]]);
                     } else {
                         mc_lowest_px(&lowest_px[b->ref[0]][1],
                                      t->by & ~ss_ver, bh4 << (bh4 == ss_ver),
@@ -2220,8 +2236,8 @@ static int decode_b(Dav1dTaskContext *const t,
             // y
             for (int i = 0; i < 2; i++) {
                 if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
-                    affine_lowest_px(t, &lowest_px[b->ref[i]][0], 0, b_dim,
-                                     &f->frame_hdr->gmv[b->ref[i]]);
+                    affine_lowest_px_luma(t, &lowest_px[b->ref[i]][0], b_dim,
+                                          &f->frame_hdr->gmv[b->ref[i]]);
                 } else {
                     mc_lowest_px(&lowest_px[b->ref[i]][0], t->by, bh4,
                                  b->mv[i].y, 0, &f->svc[b->ref[i]][1]);
@@ -2233,8 +2249,8 @@ static int decode_b(Dav1dTaskContext *const t,
                 if (b->inter_mode == GLOBALMV_GLOBALMV &&
                     imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
                 {
-                    affine_lowest_px(t, &lowest_px[b->ref[i]][1], 1, b_dim,
-                                     &f->frame_hdr->gmv[b->ref[i]]);
+                    affine_lowest_px_chroma(t, &lowest_px[b->ref[i]][1], b_dim,
+                                            &f->frame_hdr->gmv[b->ref[i]]);
                 } else {
                     mc_lowest_px(&lowest_px[b->ref[i]][1], t->by, bh4,
                                  b->mv[i].y, ss_ver, &f->svc[b->ref[i]][1]);
diff --git a/src/lf_mask.c b/src/lf_mask.c
index 411c884..91fe4a0 100644
--- a/src/lf_mask.c
+++ b/src/lf_mask.c
@@ -212,13 +212,13 @@ static inline void mask_edges_intra(uint16_t (*const masks)[32][3][2],
 #undef set_ctx
 }
 
-static inline void mask_edges_chroma(uint16_t (*const masks)[32][2][2],
-                                     const int cby4, const int cbx4,
-                                     const int cw4, const int ch4,
-                                     const int skip_inter,
-                                     const enum RectTxfmSize tx,
-                                     uint8_t *const a, uint8_t *const l,
-                                     const int ss_hor, const int ss_ver)
+static void mask_edges_chroma(uint16_t (*const masks)[32][2][2],
+                              const int cby4, const int cbx4,
+                              const int cw4, const int ch4,
+                              const int skip_inter,
+                              const enum RectTxfmSize tx,
+                              uint8_t *const a, uint8_t *const l,
+                              const int ss_hor, const int ss_ver)
 {
     const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
     const int twl4 = t_dim->lw, thl4 = t_dim->lh;
@@ -424,16 +424,14 @@ void dav1d_calc_eih(Av1FilterLUT *const lim_lut, const int filter_sharpness) {
     lim_lut->sharp[1] = sharp ? 9 - sharp : 0xff;
 }
 
-static inline void calc_lf_value(uint8_t (*const lflvl_values)[2],
-                                 const int is_chroma, const int base_lvl,
-                                 const int lf_delta, const int seg_delta,
-                                 const Dav1dLoopfilterModeRefDeltas *const mr_delta)
+static void calc_lf_value(uint8_t (*const lflvl_values)[2],
+                          const int base_lvl, const int lf_delta,
+                          const int seg_delta,
+                          const Dav1dLoopfilterModeRefDeltas *const mr_delta)
 {
     const int base = iclip(iclip(base_lvl + lf_delta, 0, 63) + seg_delta, 0, 63);
 
-    if (!base_lvl && is_chroma) {
-        memset(lflvl_values, 0, 8 * 2);
-    } else if (!mr_delta) {
+    if (!mr_delta) {
         memset(lflvl_values, base, 8 * 2);
     } else {
         const int sh = base >= 32;
@@ -449,6 +447,17 @@ static inline void calc_lf_value(uint8_t (*const lflvl_values)[2],
     }
 }
 
+static inline void calc_lf_value_chroma(uint8_t (*const lflvl_values)[2],
+                                        const int base_lvl, const int lf_delta,
+                                        const int seg_delta,
+                                        const Dav1dLoopfilterModeRefDeltas *const mr_delta)
+{
+    if (!base_lvl)
+        memset(lflvl_values, 0, 8 * 2);
+    else
+        calc_lf_value(lflvl_values, base_lvl, lf_delta, seg_delta, mr_delta);
+}
+
 void dav1d_calc_lf_values(uint8_t (*const lflvl_values)[4][8][2],
                           const Dav1dFrameHeader *const hdr,
                           const int8_t lf_delta[4])
@@ -467,16 +476,16 @@ void dav1d_calc_lf_values(uint8_t (*const lflvl_values)[4][8][2],
         const Dav1dSegmentationData *const segd =
             hdr->segmentation.enabled ? &hdr->segmentation.seg_data.d[s] : NULL;
 
-        calc_lf_value(lflvl_values[s][0], 0, hdr->loopfilter.level_y[0],
+        calc_lf_value(lflvl_values[s][0], hdr->loopfilter.level_y[0],
                       lf_delta[0], segd ? segd->delta_lf_y_v : 0, mr_deltas);
-        calc_lf_value(lflvl_values[s][1], 0, hdr->loopfilter.level_y[1],
+        calc_lf_value(lflvl_values[s][1], hdr->loopfilter.level_y[1],
                       lf_delta[hdr->delta.lf.multi ? 1 : 0],
                       segd ? segd->delta_lf_y_h : 0, mr_deltas);
-        calc_lf_value(lflvl_values[s][2], 1, hdr->loopfilter.level_u,
-                      lf_delta[hdr->delta.lf.multi ? 2 : 0],
-                      segd ? segd->delta_lf_u : 0, mr_deltas);
-        calc_lf_value(lflvl_values[s][3], 1, hdr->loopfilter.level_v,
-                      lf_delta[hdr->delta.lf.multi ? 3 : 0],
-                      segd ? segd->delta_lf_v : 0, mr_deltas);
+        calc_lf_value_chroma(lflvl_values[s][2], hdr->loopfilter.level_u,
+                             lf_delta[hdr->delta.lf.multi ? 2 : 0],
+                             segd ? segd->delta_lf_u : 0, mr_deltas);
+        calc_lf_value_chroma(lflvl_values[s][3], hdr->loopfilter.level_v,
+                             lf_delta[hdr->delta.lf.multi ? 3 : 0],
+                             segd ? segd->delta_lf_v : 0, mr_deltas);
     }
 }
-- 
cgit v1.2.3