Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/videolan/dav1d.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
authorHenrik Gramner <gramner@twoorioles.com>2019-05-09 00:14:22 +0300
committerHenrik Gramner <henrik@gramner.com>2019-05-09 00:21:09 +0300
commit11da4086e59f81393255a1c1786a990f9fa565b9 (patch)
treee6be8aa426e540aea296fe15dc06a5d59e1d8b3b
parent94e30ef935a489d0bbd2e4c18bbdf7dde15544bd (diff)
Fix buffer overflow in 64x16 ssse3 idct
With frame threading enabled the code could previously clobber the coefficients of the next block. Update the checkasm test to check for this.
-rw-r--r--  src/x86/itx_ssse3.asm   | 68
-rw-r--r--  tests/checkasm/itx.c    | 13
2 files changed, 58 insertions, 23 deletions
diff --git a/src/x86/itx_ssse3.asm b/src/x86/itx_ssse3.asm
index 9682a80..13c1269 100644
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -6097,7 +6097,7 @@ ALIGN function_align
-cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
+cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
LEA r5, $$
%endif
@@ -6186,7 +6186,9 @@ cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
%endmacro
cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
- mov r3, 2
+ mov r3d, 2
+ mov [rsp+gprsize*2+16*67], dstq
+ lea dstq, [rsp+gprsize+16*68]
.pass1_loop:
LOAD_4ROWS coeffq+32*0, 32*8
@@ -6277,7 +6279,7 @@ cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(idct_8x8_internal).pass1_end1
.pass1_end4:
- SAVE_8ROWS coeffq+32*32, 32
+ SAVE_8ROWS dstq+32*0, 32
LOAD_8ROWS rsp+gprsize+16*43, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
@@ -6285,7 +6287,7 @@ cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(idct_8x8_internal).pass1_end1
.pass1_end5:
- SAVE_8ROWS coeffq+32*40, 32
+ SAVE_8ROWS dstq+32*8, 32
LOAD_8ROWS rsp+gprsize+16*51, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
@@ -6293,7 +6295,7 @@ cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(idct_8x8_internal).pass1_end1
.pass1_end6:
- SAVE_8ROWS coeffq+32*48, 32
+ SAVE_8ROWS dstq+32*16, 32
LOAD_8ROWS rsp+gprsize+16*59, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
@@ -6301,20 +6303,20 @@ cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(idct_8x8_internal).pass1_end1
.pass1_end7:
- SAVE_8ROWS coeffq+32*56, 32
+ SAVE_8ROWS dstq+32*24, 32
add coeffq, 16
- dec r3
+ add dstq, 16
+ dec r3d
jg .pass1_loop
.pass2:
+ mov dstq, [rsp+gprsize*2+16*67]
sub coeffq, 32
- mov r3, 8
- lea r4, [dstq+8]
- mov [rsp+gprsize*2+16*67], r4
+ mov r3d, 4
.pass2_loop:
- mov [rsp+gprsize*1+16*67], r3
+ mov [rsp+gprsize*1+16*67], r3d
LOAD_4ROWS coeffq+16*0, 32*2
LOAD_4ROWS_H coeffq+16*1, 32*2
@@ -6341,13 +6343,47 @@ cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
add coeffq, 16*16
- mov r3, [rsp+gprsize*1+16*67]
+ mov r3d, [rsp+gprsize*1+16*67]
mov dstq, [rsp+gprsize*2+16*67]
- lea r4, [dstq+8]
- mov [rsp+gprsize*2+16*67], r4
-
- dec r3
+ add dstq, 8
+ mov [rsp+gprsize*2+16*67], dstq
+ dec r3d
jg .pass2_loop
+
+ mov r3d, 4
+ lea coeffq, [rsp+gprsize+16*68]
+.pass2_loop2:
+ mov [rsp+gprsize*1+16*67], r3d
+
+ LOAD_4ROWS coeffq+16*0, 32*2
+ LOAD_4ROWS_H coeffq+16*1, 32*2
+ call m(idct_8x8_internal).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_4ROWS coeffq+16*2, 32*2
+ LOAD_4ROWS_H coeffq+16*3, 32*2
+ call m(idct_16x8_internal).main
+
+ mov r3, dstq
+ lea tx2q, [o(m(idct_64x16_internal).end2)]
+ lea dstq, [dstq+strideq*8]
+ jmp m(idct_8x8_internal).end
+
+.end2:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(m(idct_64x16_internal).end3)]
+ mov dstq, r3
+ jmp m(idct_8x8_internal).end
+
+.end3:
+
+ add coeffq, 16*16
+ mov r3d, [rsp+gprsize*1+16*67]
+ mov dstq, [rsp+gprsize*2+16*67]
+ add dstq, 8
+ mov [rsp+gprsize*2+16*67], dstq
+ dec r3d
+ jg .pass2_loop2
ret
diff --git a/tests/checkasm/itx.c b/tests/checkasm/itx.c
index 9254491..cdf787b 100644
--- a/tests/checkasm/itx.c
+++ b/tests/checkasm/itx.c
@@ -158,6 +158,8 @@ static int copy_subcoefs(coef *coeff,
eob += rnd() % (n - eob - 1);
for (n = eob + 1; n < sw * sh; n++)
coeff[scan[n]] = 0;
+ for (; n < 32 * 32; n++)
+ coeff[n] = rnd();
return eob;
}
@@ -224,7 +226,7 @@ void bitfn(checkasm_check_itx)(void) {
Dav1dInvTxfmDSPContext c;
bitfn(dav1d_itx_dsp_init)(&c);
- ALIGN_STK_32(coef, coeff, 3, [32 * 32]);
+ ALIGN_STK_32(coef, coeff, 2, [32 * 32]);
ALIGN_STK_32(pixel, c_dst, 64 * 64,);
ALIGN_STK_32(pixel, a_dst, 64 * 64,);
@@ -245,7 +247,6 @@ void bitfn(checkasm_check_itx)(void) {
const enum RectTxfmSize tx = txfm_size_order[i];
const int w = dav1d_txfm_dimensions[tx].w * 4;
const int h = dav1d_txfm_dimensions[tx].h * 4;
- const int sw = imin(w, 32), sh = imin(h, 32);
const int subsh_max = subsh_iters[imax(dav1d_txfm_dimensions[tx].lw,
dav1d_txfm_dimensions[tx].lh)];
@@ -263,24 +264,22 @@ void bitfn(checkasm_check_itx)(void) {
const int bitdepth_max = 0xff;
#endif
const int eob = ftx(coeff[0], tx, txtp, w, h, subsh, bitdepth_max);
+ memcpy(coeff[1], coeff[0], sizeof(*coeff));
for (int j = 0; j < w * h; j++)
c_dst[j] = a_dst[j] = rnd() & bitdepth_max;
- memcpy(coeff[1], coeff[0], sw * sh * sizeof(**coeff));
- memcpy(coeff[2], coeff[0], sw * sh * sizeof(**coeff));
-
call_ref(c_dst, w * sizeof(*c_dst), coeff[0], eob
HIGHBD_TAIL_SUFFIX);
call_new(a_dst, w * sizeof(*c_dst), coeff[1], eob
HIGHBD_TAIL_SUFFIX);
if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst)) ||
- memcmp(coeff[0], coeff[1], sw * sh * sizeof(**coeff)))
+ memcmp(coeff[0], coeff[1], sizeof(*coeff)))
{
fail();
}
- bench_new(a_dst, w * sizeof(*c_dst), coeff[2], eob
+ bench_new(a_dst, w * sizeof(*c_dst), coeff[0], eob
HIGHBD_TAIL_SUFFIX);
}
report("add_%dx%d", w, h);