Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/videolan/dav1d.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
authorHenrik Gramner <gramner@twoorioles.com>2019-05-09 00:14:22 +0300
committerHenrik Gramner <henrik@gramner.com>2019-05-09 00:21:09 +0300
commit11da4086e59f81393255a1c1786a990f9fa565b9 (patch)
treee6be8aa426e540aea296fe15dc06a5d59e1d8b3b
parent94e30ef935a489d0bbd2e4c18bbdf7dde15544bd (diff)
Fix buffer overflow in 64x16 ssse3 idct
With frame threading enabled the code could previously clobber the coefficients of the next block. Update the checkasm test to check for this.
-rw-r--r--  src/x86/itx_ssse3.asm   | 68
-rw-r--r--  tests/checkasm/itx.c    | 13
2 files changed, 58 insertions, 23 deletions
diff --git a/src/x86/itx_ssse3.asm b/src/x86/itx_ssse3.asm
index 9682a80..13c1269 100644
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -6097,7 +6097,7 @@ ALIGN function_align
-cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
+cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2
%if ARCH_X86_32
LEA r5, $$
%endif
@@ -6186,7 +6186,9 @@ cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
%endmacro
cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
- mov r3, 2
+ mov r3d, 2
+ mov [rsp+gprsize*2+16*67], dstq
+ lea dstq, [rsp+gprsize+16*68]
.pass1_loop:
LOAD_4ROWS coeffq+32*0, 32*8
@@ -6277,7 +6279,7 @@ cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(idct_8x8_internal).pass1_end1
.pass1_end4:
- SAVE_8ROWS coeffq+32*32, 32
+ SAVE_8ROWS dstq+32*0, 32
LOAD_8ROWS rsp+gprsize+16*43, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
@@ -6285,7 +6287,7 @@ cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(idct_8x8_internal).pass1_end1
.pass1_end5:
- SAVE_8ROWS coeffq+32*40, 32
+ SAVE_8ROWS dstq+32*8, 32
LOAD_8ROWS rsp+gprsize+16*51, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
@@ -6293,7 +6295,7 @@ cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(idct_8x8_internal).pass1_end1
.pass1_end6:
- SAVE_8ROWS coeffq+32*48, 32
+ SAVE_8ROWS dstq+32*16, 32
LOAD_8ROWS rsp+gprsize+16*59, 16
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_8192)]
@@ -6301,20 +6303,20 @@ cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(idct_8x8_internal).pass1_end1
.pass1_end7:
- SAVE_8ROWS coeffq+32*56, 32
+ SAVE_8ROWS dstq+32*24, 32
add coeffq, 16
- dec r3
+ add dstq, 16
+ dec r3d
jg .pass1_loop
.pass2:
+ mov dstq, [rsp+gprsize*2+16*67]
sub coeffq, 32
- mov r3, 8
- lea r4, [dstq+8]
- mov [rsp+gprsize*2+16*67], r4
+ mov r3d, 4
.pass2_loop:
- mov [rsp+gprsize*1+16*67], r3
+ mov [rsp+gprsize*1+16*67], r3d
LOAD_4ROWS coeffq+16*0, 32*2
LOAD_4ROWS_H coeffq+16*1, 32*2
@@ -6341,13 +6343,47 @@ cglobal idct_64x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
add coeffq, 16*16
- mov r3, [rsp+gprsize*1+16*67]
+ mov r3d, [rsp+gprsize*1+16*67]
mov dstq, [rsp+gprsize*2+16*67]
- lea r4, [dstq+8]
- mov [rsp+gprsize*2+16*67], r4
-
- dec r3
+ add dstq, 8
+ mov [rsp+gprsize*2+16*67], dstq
+ dec r3d
jg .pass2_loop
+
+ mov r3d, 4
+ lea coeffq, [rsp+gprsize+16*68]
+.pass2_loop2:
+ mov [rsp+gprsize*1+16*67], r3d
+
+ LOAD_4ROWS coeffq+16*0, 32*2
+ LOAD_4ROWS_H coeffq+16*1, 32*2
+ call m(idct_8x8_internal).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_4ROWS coeffq+16*2, 32*2
+ LOAD_4ROWS_H coeffq+16*3, 32*2
+ call m(idct_16x8_internal).main
+
+ mov r3, dstq
+ lea tx2q, [o(m(idct_64x16_internal).end2)]
+ lea dstq, [dstq+strideq*8]
+ jmp m(idct_8x8_internal).end
+
+.end2:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(m(idct_64x16_internal).end3)]
+ mov dstq, r3
+ jmp m(idct_8x8_internal).end
+
+.end3:
+
+ add coeffq, 16*16
+ mov r3d, [rsp+gprsize*1+16*67]
+ mov dstq, [rsp+gprsize*2+16*67]
+ add dstq, 8
+ mov [rsp+gprsize*2+16*67], dstq
+ dec r3d
+ jg .pass2_loop2
ret
diff --git a/tests/checkasm/itx.c b/tests/checkasm/itx.c
index 9254491..cdf787b 100644
--- a/tests/checkasm/itx.c
+++ b/tests/checkasm/itx.c
@@ -158,6 +158,8 @@ static int copy_subcoefs(coef *coeff,
eob += rnd() % (n - eob - 1);
for (n = eob + 1; n < sw * sh; n++)
coeff[scan[n]] = 0;
+ for (; n < 32 * 32; n++)
+ coeff[n] = rnd();
return eob;
}
@@ -224,7 +226,7 @@ void bitfn(checkasm_check_itx)(void) {
Dav1dInvTxfmDSPContext c;
bitfn(dav1d_itx_dsp_init)(&c);
- ALIGN_STK_32(coef, coeff, 3, [32 * 32]);
+ ALIGN_STK_32(coef, coeff, 2, [32 * 32]);
ALIGN_STK_32(pixel, c_dst, 64 * 64,);
ALIGN_STK_32(pixel, a_dst, 64 * 64,);
@@ -245,7 +247,6 @@ void bitfn(checkasm_check_itx)(void) {
const enum RectTxfmSize tx = txfm_size_order[i];
const int w = dav1d_txfm_dimensions[tx].w * 4;
const int h = dav1d_txfm_dimensions[tx].h * 4;
- const int sw = imin(w, 32), sh = imin(h, 32);
const int subsh_max = subsh_iters[imax(dav1d_txfm_dimensions[tx].lw,
dav1d_txfm_dimensions[tx].lh)];
@@ -263,24 +264,22 @@ void bitfn(checkasm_check_itx)(void) {
const int bitdepth_max = 0xff;
#endif
const int eob = ftx(coeff[0], tx, txtp, w, h, subsh, bitdepth_max);
+ memcpy(coeff[1], coeff[0], sizeof(*coeff));
for (int j = 0; j < w * h; j++)
c_dst[j] = a_dst[j] = rnd() & bitdepth_max;
- memcpy(coeff[1], coeff[0], sw * sh * sizeof(**coeff));
- memcpy(coeff[2], coeff[0], sw * sh * sizeof(**coeff));
-
call_ref(c_dst, w * sizeof(*c_dst), coeff[0], eob
HIGHBD_TAIL_SUFFIX);
call_new(a_dst, w * sizeof(*c_dst), coeff[1], eob
HIGHBD_TAIL_SUFFIX);
if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst)) ||
- memcmp(coeff[0], coeff[1], sw * sh * sizeof(**coeff)))
+ memcmp(coeff[0], coeff[1], sizeof(*coeff)))
{
fail();
}
- bench_new(a_dst, w * sizeof(*c_dst), coeff[2], eob
+ bench_new(a_dst, w * sizeof(*c_dst), coeff[0], eob
HIGHBD_TAIL_SUFFIX);
}
report("add_%dx%d", w, h);