From ee98592ba27aa0c958778eef998ed89a664e252d Mon Sep 17 00:00:00 2001
From: David Conrad <david_conrad@apple.com>
Date: Tue, 7 Jun 2022 15:03:36 -0700
Subject: Fix overflow when saturating dequantized coefficients clipped to 0

It's possible to encode a large coefficient that becomes 0 after
the clipping in dequant (Abs( dq ) & 0xFFFFFF), e.g. 0x1000000
After that &0xFFFFFF, coeffs are saturated in the range of
[-(1 << (bitdepth+7)), 1 << (bitdepth+7))

dav1d implements this saturation via umin(dq - sign, cf_max), then applies
the sign afterwards via xor. However, for dq = 0 and sign = 1, this step
evaulates to umin(UINT_MAX, cf_max) == cf_max instead of the expected 0.

So instead, do unsigned saturate as umin(dq, cf_max + sign),
then apply sign via (sign ? -dq : dq)
On arm this is the same number of instructions, since cneg exists and is used
On x86 this requires an additional instruction, but this isn't a
latency-critical path
---
 src/recon_tmpl.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/src/recon_tmpl.c b/src/recon_tmpl.c
index 0ed4169..5f49910 100644
--- a/src/recon_tmpl.c
+++ b/src/recon_tmpl.c
@@ -591,7 +591,7 @@ static int decode_coefs(Dav1dTaskContext *const t,
     const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
     const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL;
     const int dq_shift = imax(0, t_dim->ctx - 2);
-    const unsigned cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
+    const int cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
     unsigned cul_level, dc_sign_level;
 
     if (!dc_tok) {
@@ -608,7 +608,7 @@ static int decode_coefs(Dav1dTaskContext *const t,
         printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
                chroma, dc_sign_ctx, dc_sign, ts->msac.rng);
 
-    unsigned dc_dq = dq_tbl[0];
+    int dc_dq = dq_tbl[0];
     dc_sign_level = (dc_sign - 1) & (2 << 6);
 
     if (qm_tbl) {
@@ -628,7 +628,8 @@ static int decode_coefs(Dav1dTaskContext *const t,
         }
         cul_level = dc_tok;
         dc_dq >>= dq_shift;
-        cf[0] = (coef) (umin(dc_dq - dc_sign, cf_max) ^ -dc_sign);
+        dc_dq = umin(dc_dq, cf_max + dc_sign);
+        cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);
 
         if (rc) ac_qm: {
             const unsigned ac_dq = dq_tbl[1];
@@ -638,6 +639,7 @@ static int decode_coefs(Dav1dTaskContext *const t,
                     printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
                 const unsigned rc_tok = cf[rc];
                 unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5;
+                int dq_sat;
 
                 if (rc_tok >= (15 << 11)) {
                     tok = read_golomb(&ts->msac) + 15;
@@ -654,7 +656,8 @@ static int decode_coefs(Dav1dTaskContext *const t,
                 }
                 cul_level += tok;
                 dq >>= dq_shift;
-                cf[rc] = (coef) (umin(dq - sign, cf_max) ^ -sign);
+                dq_sat = umin(dq, cf_max + sign);
+                cf[rc] = (coef) (sign ? -dq_sat : dq_sat);
 
                 rc = rc_tok & 0x3ff;
             } while (rc);
@@ -669,13 +672,13 @@ static int decode_coefs(Dav1dTaskContext *const t,
 
             dc_tok &= 0xfffff;
             dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift;
-            dc_dq = umin(dc_dq - dc_sign, cf_max);
+            dc_dq = umin(dc_dq, cf_max + dc_sign);
         } else {
-            dc_dq = ((dc_dq * dc_tok) >> dq_shift) - dc_sign;
+            dc_dq = ((dc_dq * dc_tok) >> dq_shift);
             assert(dc_dq <= cf_max);
         }
         cul_level = dc_tok;
-        cf[0] = (coef) (dc_dq ^ -dc_sign);
+        cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);
 
         if (rc) ac_noqm: {
             const unsigned ac_dq = dq_tbl[1];
@@ -684,7 +687,8 @@ static int decode_coefs(Dav1dTaskContext *const t,
                 if (dbg)
                     printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
                 const unsigned rc_tok = cf[rc];
-                unsigned tok, dq;
+                unsigned tok;
+                int dq;
 
                 // residual
                 if (rc_tok >= (15 << 11)) {
@@ -698,15 +702,15 @@ static int decode_coefs(Dav1dTaskContext *const t,
 
                     // dequant, see 7.12.3
                     dq = ((ac_dq * tok) & 0xffffff) >> dq_shift;
-                    dq = umin(dq - sign, cf_max);
+                    dq = umin(dq, cf_max + sign);
                 } else {
                     // cannot exceed cf_max, so we can avoid the clipping
                     tok = rc_tok >> 11;
-                    dq = ((ac_dq * tok) >> dq_shift) - sign;
+                    dq = ((ac_dq * tok) >> dq_shift);
                     assert(dq <= cf_max);
                 }
                 cul_level += tok;
-                cf[rc] = (coef) (dq ^ -sign);
+                cf[rc] = (coef) (sign ? -dq : dq);
 
                 rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob
             } while (rc);
-- 
cgit v1.2.3