diff options
author | Martin Storsjö <martin@martin.st> | 2021-09-01 11:05:45 +0300 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2021-09-03 14:36:15 +0300 |
commit | a4e139b5494458e68272b313809eaa1ffb012d1c (patch) | |
tree | 171400658564ff58d1a876cfcf7d97ab263abba7 /src/arm | |
parent | 0f9cd468146e37718e69bd0fbd335431471ccc0e (diff) |
arm32: filmgrain: Add NEON implementation of gen_grain for 16 bpc
Relative speedup over C code:
Cortex                               A7     A8     A9    A53    A72    A73
gen_grain_uv_ar0_16bpc_420_neon:   5.05   6.71   5.42   4.95   6.45   9.59
gen_grain_uv_ar0_16bpc_422_neon:   5.54   7.18   6.29   5.45   6.55   8.80
gen_grain_uv_ar0_16bpc_444_neon:   6.64   8.07   6.70   6.89   7.16   9.98
gen_grain_uv_ar1_16bpc_420_neon:   3.22   2.16   2.58   3.51   3.16   4.68
gen_grain_uv_ar1_16bpc_422_neon:   3.24   2.26   2.73   3.83   3.36   4.65
gen_grain_uv_ar1_16bpc_444_neon:   3.48   2.41   2.85   4.32   3.69   4.90
gen_grain_uv_ar2_16bpc_420_neon:   3.29   2.90   2.92   4.14   3.48   4.59
gen_grain_uv_ar2_16bpc_422_neon:   3.35   3.01   3.13   4.50   3.61   4.50
gen_grain_uv_ar2_16bpc_444_neon:   3.66   3.55   3.32   5.15   3.87   4.93
gen_grain_uv_ar3_16bpc_420_neon:   3.39   3.79   3.60   4.67   4.04   4.70
gen_grain_uv_ar3_16bpc_422_neon:   3.39   4.04   3.96   4.93   4.16   4.65
gen_grain_uv_ar3_16bpc_444_neon:   3.79   4.47   4.36   5.54   4.59   5.07
gen_grain_y_ar0_16bpc_neon:        5.05   5.26   6.97   5.47   5.95   8.59
gen_grain_y_ar1_16bpc_neon:        2.35   1.72   2.07   3.53   3.16   3.47
gen_grain_y_ar2_16bpc_neon:        3.02   2.70   2.88   4.19   3.57   4.03
gen_grain_y_ar3_16bpc_neon:        3.49   3.18   3.69   5.01   3.99   4.50
Diffstat (limited to 'src/arm')
-rw-r--r-- | src/arm/32/film_grain16.S | 1188 | ||||
-rw-r--r-- | src/arm/film_grain_init_tmpl.c | 5 |
2 files changed, 1188 insertions, 5 deletions
diff --git a/src/arm/32/film_grain16.S b/src/arm/32/film_grain16.S index 42805e5..6c36cac 100644 --- a/src/arm/32/film_grain16.S +++ b/src/arm/32/film_grain16.S @@ -30,6 +30,1194 @@ #include "src/arm/asm-offsets.h" #define GRAIN_WIDTH 82 +#define GRAIN_HEIGHT 73 + +#define SUB_GRAIN_WIDTH 44 +#define SUB_GRAIN_HEIGHT 38 + +.macro increment_seed steps, shift=1 + lsr r11, r2, #3 + lsr r12, r2, #12 + lsr lr, r2, #1 + eor r11, r2, r11 // (r >> 0) ^ (r >> 3) + eor r12, r12, lr // (r >> 12) ^ (r >> 1) + eor r11, r11, r12 // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1) +.if \shift + lsr r2, r2, #\steps +.endif + and r11, r11, #((1 << \steps) - 1) // bit +.if \shift + orr r2, r2, r11, lsl #(16 - \steps) // *state +.else + orr r2, r2, r11, lsl #16 // *state +.endif +.endm + +.macro read_rand dest, bits, age + ubfx \dest, r2, #16 - \bits - \age, #\bits +.endm + +.macro read_shift_rand dest, bits + ubfx \dest, r2, #17 - \bits, #\bits + lsr r2, r2, #1 +.endm + +// special calling convention: +// r2 holds seed +// r3 holds dav1d_gaussian_sequence +// clobbers r11-r12 +// returns in d0-d1 +function get_gaussian_neon + push {r5-r6,lr} + increment_seed 4 + read_rand r5, 11, 3 + read_rand r6, 11, 2 + add r5, r3, r5, lsl #1 + add r6, r3, r6, lsl #1 + vld1.16 {d0[0]}, [r5] + read_rand r5, 11, 1 + vld1.16 {d0[1]}, [r6] + add r5, r3, r5, lsl #1 + read_rand r6, 11, 0 + increment_seed 4 + add r6, r3, r6, lsl #1 + vld1.16 {d0[2]}, [r5] + read_rand r5, 11, 3 + vld1.16 {d0[3]}, [r6] + add r5, r3, r5, lsl #1 + read_rand r6, 11, 2 + vld1.16 {d1[0]}, [r5] + add r6, r3, r6, lsl #1 + read_rand r5, 11, 1 + vld1.16 {d1[1]}, [r6] + read_rand r6, 11, 0 + add r5, r3, r5, lsl #1 + add r6, r3, r6, lsl #1 + vld1.16 {d1[2]}, [r5] + vld1.16 {d1[3]}, [r6] + pop {r5-r6,pc} +endfunc + +function get_grain_2_neon + push {r11,lr} + increment_seed 2 + read_rand r11, 11, 1 + read_rand r12, 11, 0 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d0[0]}, [r11] + vld1.16 {d0[1]}, [r12] + vrshl.s16 d0, 
d0, d30 + pop {r11,pc} +endfunc + +.macro get_grain_2 dst + bl get_grain_2_neon +.ifnc \dst, d0 + vmov \dst, d0 +.endif +.endm + +function get_grain_4_neon + push {r11,lr} + increment_seed 4 + read_rand r11, 11, 3 + read_rand r12, 11, 2 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d0[0]}, [r11] + read_rand r11, 11, 1 + vld1.16 {d0[1]}, [r12] + read_rand r12, 11, 0 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d0[2]}, [r11] + vld1.16 {d0[3]}, [r12] + vrshl.s16 d0, d0, d30 + pop {r11,pc} +endfunc + +.macro get_grain_4 dst + bl get_grain_4_neon +.ifnc \dst, d0 + vmov \dst, d0 +.endif +.endm + +// r1 holds the number of entries to produce +// r6, r8 and r10 hold the previous output entries +// q0 holds the vector of produced entries +// q1 holds the input vector of sums from above +.macro output_lag n +function output_lag\n\()_neon + push {r0, lr} +.if \n == 1 + mvn lr, r5 // grain_min = ~grain_max +.else + mov r0, #1 + mov lr, #1 + sub r7, r7, #1 + sub r9, r9, #1 + lsl r0, r0, r7 + lsl lr, lr, r9 + add r7, r7, #1 + add r9, r9, #1 +.endif +1: + read_shift_rand r12, 11 + vmov.32 r11, d2[0] + lsl r12, r12, #1 + vext.8 q0, q0, q0, #2 + ldrsh r12, [r3, r12] +.if \n == 1 + mla r11, r6, r4, r11 // sum (above) + *coeff * prev output + add r6, r11, r8 // 1 << (ar_coeff_shift - 1) + add r12, r12, r10 + asr r6, r6, r7 // >> ar_coeff_shift + asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift) + add r6, r6, r12 + cmp r6, r5 +.elseif \n == 2 + mla r11, r8, r4, r11 // sum (above) + *coeff * prev output 1 + mla r11, r6, r10, r11 // += *coeff * prev output 2 + mov r8, r6 + add r6, r11, r0 // 1 << (ar_coeff_shift - 1) + add r12, r12, lr // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1) + asr r6, r6, r7 // >> ar_coeff_shift + asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift) + add r6, r6, r12 + push {lr} + cmp r6, r5 + mvn lr, r5 // grain_min = ~grain_max +.else + push {r1-r3} + sbfx r1, r4, #0, #8 + sbfx r2, r4, #8, #8 
+ sbfx r3, r4, #16, #8 + mla r11, r10, r1, r11 // sum (above) + *coeff * prev output 1 + mla r11, r8, r2, r11 // sum (above) + *coeff * prev output 2 + mla r11, r6, r3, r11 // += *coeff * prev output 3 + pop {r1-r3} + mov r10, r8 + mov r8, r6 + + add r6, r11, r0 // 1 << (ar_coeff_shift - 1) + add r12, r12, lr // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1) + asr r6, r6, r7 // >> ar_coeff_shift + asr r12, r12, r9 // >> (4 - bitdepth_min_8 + grain_scale_shift) + add r6, r6, r12 + push {lr} + cmp r6, r5 + mvn lr, r5 // grain_min = ~grain_max +.endif + it gt + movgt r6, r5 + cmp r6, lr + it lt + movlt r6, lr +.if \n >= 2 + pop {lr} +.endif + subs r1, r1, #1 + vext.8 q1, q1, q1, #4 + vmov.16 d1[3], r6 + bgt 1b + pop {r0, pc} +endfunc +.endm + +output_lag 1 +output_lag 2 +output_lag 3 + + +function sum_lag1_above_neon + sub r12, r0, #1*GRAIN_WIDTH*2 - 16 + vld1.16 {q10}, [r12] // load top right + + vext.8 q0, q8, q9, #14 // top left, top mid + vext.8 q1, q9, q10, #2 // top left, top mid + + vmull.s16 q2, d18, d28 + vmlal.s16 q2, d0, d27 + vmlal.s16 q2, d2, d29 + vmull.s16 q3, d19, d28 + vmlal.s16 q3, d1, d27 + vmlal.s16 q3, d3, d29 + + vmov q8, q9 + vmov q9, q10 + + bx lr +endfunc + +.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff +.ifc \lag\()_\edge, lag3_left + bl sum_lag3_left_above_neon +.else + bl sum_\lag\()_above_neon +.endif +.ifc \type, uv_420 + vpush {q6-q7} + add r12, r11, #GRAIN_WIDTH*2 + vld1.16 {q0, q1}, [r11]! + vld1.16 {q6, q7}, [r12]! + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vpadd.i16 d12, d12, d13 + vpadd.i16 d13, d14, d15 + vadd.i16 q0, q0, q6 + vpop {q6-q7} + vrshr.s16 q0, q0, #2 +.endif +.ifc \type, uv_422 + vld1.16 {q0, q1}, [r11]! + vpadd.i16 d0, d0, d1 + vpadd.i16 d1, d2, d3 + vrshr.s16 q0, q0, #1 +.endif +.ifc \type, uv_444 + vld1.16 {q0}, [r11]! 
+.endif +.if \uv_layout +.ifnb \uv_coeff + vdup.8 d13, \uv_coeff + vmovl.s8 q6, d13 +.endif + vmlal.s16 q2, d0, d13 + vmlal.s16 q3, d1, d13 +.endif +.if \uv_layout && \elems == 8 + b sum_\lag\()_y_\edge\()_start +.elseif \uv_layout == 444 && \elems == 7 + b sum_\lag\()_y_\edge\()_start +.elseif \uv_layout == 422 && \elems == 1 + b sum_\lag\()_uv_420_\edge\()_start +.else +sum_\lag\()_\type\()_\edge\()_start: + push {r11} +.if \elems > 4 +.ifc \edge, left + increment_seed 4 + read_rand r11, 11, 3 + read_rand r12, 11, 2 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d1[1]}, [r11] + read_rand r11, 11, 1 + vld1.16 {d1[2]}, [r12] + add r11, r3, r11, lsl #1 + vld1.16 {d1[3]}, [r11] + lsl r2, r2, #1 // shift back the state as if we'd done increment_seed with shift=0 + vrshl.s16 d1, d1, d30 + vext.8 q2, q2, q2, #12 +.ifc \lag, lag3 + vmov.s16 r10, d1[1] +.endif +.ifnc \lag, lag1 + vmov.s16 r8, d1[2] +.endif + vmov.s16 r6, d1[3] + + vmov q1, q2 + mov r1, #1 + bl output_\lag\()_neon +.else + increment_seed 4, shift=0 + vmov q1, q2 + mov r1, #4 + bl output_\lag\()_neon +.endif + + increment_seed 4, shift=0 + vmov q1, q3 +.ifc \edge, right + mov r1, #3 + bl output_\lag\()_neon + read_shift_rand r12, 11 + add r12, r3, r12, lsl #1 + vld1.16 {d2[0]}, [r12] + vrshl.s16 d2, d2, d30 + vext.8 q0, q0, q1, #2 +.else + mov r1, #4 + bl output_\lag\()_neon +.endif +.else + // elems == 1 + increment_seed 4, shift=0 + vmov q1, q2 + mov r1, #1 + bl output_\lag\()_neon + lsr r2, r2, #3 + + read_rand r11, 11, 2 + read_rand r12, 11, 1 + add r11, r3, r11, lsl #1 + add r12, r3, r12, lsl #1 + vld1.16 {d2[0]}, [r11] + read_rand r11, 11, 0 + vld1.16 {d2[1]}, [r12] + add r11, r3, r11, lsl #1 + vld1.16 {d2[2]}, [r11] + vrshl.s16 d2, d2, d30 + vext.8 q0, q0, q1, #14 +.endif + vst1.16 {q0}, [r0]! 
+ pop {r11} + pop {r1, pc} +.endif +.endm + +.macro sum_lag1_func type, uv_layout, edge, elems=8 +function sum_\type\()_lag1_\edge\()_neon + push {r1, lr} +.ifc \edge, left + sub r12, r0, #1*GRAIN_WIDTH*2 + vld1.8 {q9}, [r12] // load the previous block right above +.endif + sum_lag_n_body lag1, \type, \uv_layout, \edge, \elems +endfunc +.endm + +sum_lag1_func y, 0, left +sum_lag1_func y, 0, mid +sum_lag1_func y, 0, right, 7 +sum_lag1_func uv_444, 444, left +sum_lag1_func uv_444, 444, mid +sum_lag1_func uv_444, 444, right, 7 +sum_lag1_func uv_422, 422, left +sum_lag1_func uv_422, 422, mid +sum_lag1_func uv_422, 422, right, 1 +sum_lag1_func uv_420, 420, left +sum_lag1_func uv_420, 420, mid +sum_lag1_func uv_420, 420, right, 1 + + +function sum_lag2_above_neon + push {lr} + sub r12, r0, #2*GRAIN_WIDTH*2 - 16 + sub lr, r0, #1*GRAIN_WIDTH*2 - 16 + vld1.16 {q10}, [r12] // load top right + vld1.16 {q13}, [lr] + + vdup.8 d10, d28[0] + vext.8 q0, q8, q9, #12 // top left, top mid + vdup.8 d12, d28[1] + vext.8 q1, q8, q9, #14 + vdup.8 d14, d28[3] + vext.8 q4, q9, q10, #2 // top mid, top right + vmovl.s8 q5, d10 + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + + vmull.s16 q2, d0, d10 + vmlal.s16 q2, d2, d12 + vmlal.s16 q2, d8, d14 + vmull.s16 q3, d1, d10 + vmlal.s16 q3, d3, d12 + vmlal.s16 q3, d9, d14 + + vdup.8 d10, d28[4] + vext.8 q0, q9, q10, #4 // top mid, top right + vdup.8 d12, d28[5] + vext.8 q1, q11, q12, #12 // top left, top mid + vdup.8 d14, d28[6] + vext.8 q4, q11, q12, #14 + vmovl.s8 q5, d10 + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + + vmlal.s16 q2, d0, d10 + vmlal.s16 q2, d2, d12 + vmlal.s16 q2, d8, d14 + vmlal.s16 q3, d1, d10 + vmlal.s16 q3, d3, d12 + vmlal.s16 q3, d9, d14 + + vdup.8 d10, d29[0] + vext.8 q0, q12, q13, #2 // top mid, top right + vdup.8 d12, d29[1] + vext.8 q1, q12, q13, #4 + + vdup.8 d14, d28[2] + vdup.8 d8, d28[7] + + vmovl.s8 q5, d10 + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vmovl.s8 q4, d8 + + vmlal.s16 q2, d0, d10 + vmlal.s16 q2, d2, d12 + vmlal.s16 q2, 
d18, d14 + vmlal.s16 q2, d24, d8 + vmlal.s16 q3, d1, d10 + vmlal.s16 q3, d3, d12 + vmlal.s16 q3, d19, d14 + vmlal.s16 q3, d25, d8 + + vmov q8, q9 + vmov q9, q10 + + vmov q11, q12 + vmov q12, q13 + + pop {pc} +endfunc + +.macro sum_lag2_func type, uv_layout, edge, elems=8 +function sum_\type\()_lag2_\edge\()_neon + push {r1, lr} +.ifc \edge, left + sub r12, r0, #2*GRAIN_WIDTH*2 + sub lr, r0, #1*GRAIN_WIDTH*2 + vld1.16 {q9}, [r12] // load the previous block right above + vld1.16 {q12}, [lr] +.endif + sum_lag_n_body lag2, \type, \uv_layout, \edge, \elems, uv_coeff=d29[4] +endfunc +.endm + +sum_lag2_func y, 0, left +sum_lag2_func y, 0, mid +sum_lag2_func y, 0, right, 7 +sum_lag2_func uv_444, 444, left +sum_lag2_func uv_444, 444, mid +sum_lag2_func uv_444, 444, right, 7 +sum_lag2_func uv_422, 422, left +sum_lag2_func uv_422, 422, mid +sum_lag2_func uv_422, 422, right, 1 +sum_lag2_func uv_420, 420, left +sum_lag2_func uv_420, 420, mid +sum_lag2_func uv_420, 420, right, 1 + + +function sum_lag3_left_above_neon + // A separate codepath for the left edge, to avoid reading outside + // of the edge of the buffer. 
+ sub r12, r0, #3*GRAIN_WIDTH*2 + vld1.8 {q11, q12}, [r12] + vext.8 q12, q11, q12, #10 + vext.8 q11, q11, q11, #10 + b sum_lag3_above_start +endfunc + +function sum_lag3_above_neon + movw r12, #(3*GRAIN_WIDTH + 3)*2 + sub r12, r0, r12 + vld1.8 {q11, q12}, [r12] + +sum_lag3_above_start: + vdup.8 d12, d26[0] + vext.8 q1, q11, q12, #2 + vdup.8 d14, d26[1] + vext.8 q4, q11, q12, #4 + vdup.8 d16, d26[2] + vext.8 q5, q11, q12, #6 + vdup.8 d18, d26[3] + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vmovl.s8 q8, d16 + vmovl.s8 q9, d18 + + movw r12, #(2*GRAIN_WIDTH + 3)*2 + sub r12, r0, r12 + + vmull.s16 q2, d22, d12 + vmlal.s16 q2, d2, d14 + vmlal.s16 q2, d8, d16 + vmlal.s16 q2, d10, d18 + vmull.s16 q3, d23, d12 + vmlal.s16 q3, d3, d14 + vmlal.s16 q3, d9, d16 + vmlal.s16 q3, d11, d18 + + vdup.8 d12, d26[4] + vext.8 q0, q11, q12, #8 + vdup.8 d14, d26[5] + vext.8 q1, q11, q12, #10 + vdup.8 d16, d26[6] + vext.8 q4, q11, q12, #12 + vld1.8 {q11, q12}, [r12] + vdup.8 d18, d26[7] + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vmovl.s8 q8, d16 + vmovl.s8 q9, d18 + + vmlal.s16 q2, d0, d12 + vmlal.s16 q2, d2, d14 + vmlal.s16 q2, d8, d16 + vmlal.s16 q2, d22, d18 + vmlal.s16 q3, d1, d12 + vmlal.s16 q3, d3, d14 + vmlal.s16 q3, d9, d16 + vmlal.s16 q3, d23, d18 + + vdup.8 d12, d27[0] + vext.8 q0, q11, q12, #2 + vdup.8 d14, d27[1] + vext.8 q1, q11, q12, #4 + vdup.8 d16, d27[2] + vext.8 q4, q11, q12, #6 + vdup.8 d18, d27[3] + vext.8 q5, q11, q12, #8 + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vmovl.s8 q8, d16 + vmovl.s8 q9, d18 + + sub r12, r0, #(1*GRAIN_WIDTH + 3)*2 + + vmlal.s16 q2, d0, d12 + vmlal.s16 q2, d2, d14 + vmlal.s16 q2, d8, d16 + vmlal.s16 q2, d10, d18 + vmlal.s16 q3, d1, d12 + vmlal.s16 q3, d3, d14 + vmlal.s16 q3, d9, d16 + vmlal.s16 q3, d11, d18 + + vdup.8 d12, d27[4] + vext.8 q0, q11, q12, #10 + vdup.8 d14, d27[5] + vext.8 q1, q11, q12, #12 + vld1.8 {q11, q12}, [r12] + vdup.8 d16, d27[6] + vdup.8 d18, d27[7] + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vext.8 q5, q11, q12, #2 + vmovl.s8 q8, d16 + 
vmovl.s8 q9, d18 + + vmlal.s16 q2, d0, d12 + vmlal.s16 q2, d2, d14 + vmlal.s16 q2, d22, d16 + vmlal.s16 q2, d10, d18 + vmlal.s16 q3, d1, d12 + vmlal.s16 q3, d3, d14 + vmlal.s16 q3, d23, d16 + vmlal.s16 q3, d11, d18 + + vdup.8 d12, d28[0] + vext.8 q0, q11, q12, #4 + vdup.8 d14, d28[1] + vext.8 q1, q11, q12, #6 + vdup.8 d16, d28[2] + vext.8 q4, q11, q12, #8 + vdup.8 d18, d28[3] + vext.8 q5, q11, q12, #10 + vmovl.s8 q6, d12 + vmovl.s8 q7, d14 + vmovl.s8 q8, d16 + vmovl.s8 q9, d18 + + vmlal.s16 q2, d0, d12 + vmlal.s16 q2, d2, d14 + vmlal.s16 q2, d8, d16 + vmlal.s16 q2, d10, d18 + vmlal.s16 q3, d1, d12 + vmlal.s16 q3, d3, d14 + vmlal.s16 q3, d9, d16 + vmlal.s16 q3, d11, d18 + + vdup.8 d12, d28[4] + vext.8 q0, q11, q12, #12 + vmovl.s8 q6, d12 + + vmlal.s16 q2, d0, d12 + vmlal.s16 q3, d1, d12 + + bx lr +endfunc + +.macro sum_lag3_func type, uv_layout, edge, elems=8 +function sum_\type\()_lag3_\edge\()_neon + push {r1, lr} + sum_lag_n_body lag3, \type, \uv_layout, \edge, \elems, uv_coeff=d29[0] +endfunc +.endm + +sum_lag3_func y, 0, left +sum_lag3_func y, 0, mid +sum_lag3_func y, 0, right, 7 +sum_lag3_func uv_444, 444, left +sum_lag3_func uv_444, 444, mid +sum_lag3_func uv_444, 444, right, 7 +sum_lag3_func uv_422, 422, left +sum_lag3_func uv_422, 422, mid +sum_lag3_func uv_422, 422, right, 1 +sum_lag3_func uv_420, 420, left +sum_lag3_func uv_420, 420, mid +sum_lag3_func uv_420, 420, right, 1 + +function generate_grain_rows_neon + push {r10-r11,lr} +1: + mov r10, #80 +2: + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + subs r10, r10, #8 + vst1.16 {q0}, [r0]! + bgt 2b + get_grain_2 d0 + subs r1, r1, #1 + vst1.32 {d0[0]}, [r0]! + bgt 1b + pop {r10-r11,pc} +endfunc + +function generate_grain_rows_44_neon + push {r10-r11,lr} +1: + mov r10, #40 +2: + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 + subs r10, r10, #8 + vst1.16 {q0}, [r0]! 
+ bgt 2b + get_grain_4 d0 + subs r1, r1, #1 + vst1.16 {d0}, [r0] + add r0, r0, #GRAIN_WIDTH*2-80 + bgt 1b + pop {r10-r11,pc} +endfunc + +function gen_grain_uv_444_lag0_neon + vld1.16 {q3}, [r11]! +gen_grain_uv_lag0_8_start: + push {r11,lr} + bl get_gaussian_neon + vrshl.s16 q0, q0, q15 +gen_grain_uv_lag0_8_add: + vand q3, q3, q1 + vmull.s16 q2, d6, d22 + vmull.s16 q3, d7, d22 + vrshl.s32 q2, q2, q12 + vrshl.s32 q3, q3, q12 + vqmovn.s32 d4, q2 + vqmovn.s32 d5, q3 + vqadd.s16 q2, q2, q0 + vmin.s16 q2, q2, q9 + vmax.s16 q2, q2, q10 + vst1.16 {q2}, [r0]! + pop {r11,pc} +endfunc + +function gen_grain_uv_420_lag0_8_neon + add r12, r11, #GRAIN_WIDTH*2 + vld1.16 {q2,q3}, [r11]! + vld1.16 {q4,q5}, [r12] + vpadd.i16 d4, d4, d5 + vpadd.i16 d5, d6, d7 + vpadd.i16 d8, d8, d9 + vpadd.i16 d9, d10, d11 + vadd.i16 q2, q2, q4 + vrshr.s16 q3, q2, #2 + b gen_grain_uv_lag0_8_start +endfunc + +function gen_grain_uv_422_lag0_8_neon + vld1.16 {q2,q3}, [r11]! + vpadd.i16 d4, d4, d5 + vpadd.i16 d5, d6, d7 + vrshr.s16 q3, q2, #1 + b gen_grain_uv_lag0_8_start +endfunc + +function gen_grain_uv_420_lag0_4_neon + add r12, r11, #GRAIN_WIDTH*2 + vld1.16 {q2}, [r11] + vld1.16 {q0}, [r12] + add r11, r11, #32 + vpadd.i16 d4, d4, d5 + vpadd.i16 d0, d0, d1 + vadd.i16 d4, d4, d0 + vrshr.s16 d6, d4, #2 + push {r11,lr} + get_grain_4 d0 + b gen_grain_uv_lag0_8_add +endfunc + +function gen_grain_uv_422_lag0_4_neon + vld1.16 {q2}, [r11] + add r11, r11, #32 + vpadd.i16 d4, d4, d5 + vrshr.s16 d6, d4, #1 + push {r11,lr} + get_grain_4 d0 + b gen_grain_uv_lag0_8_add +endfunc + +.macro gen_grain_82 type +function generate_grain_\type\()_16bpc_neon, export=1 + push {r4-r11,lr} + +.ifc \type, uv_444 + ldr r4, [sp, #36] + mov r12, r3 + mov lr, #28 + add r11, r1, #3*GRAIN_WIDTH*2 + mov r1, r2 + mul r12, r12, lr + clz lr, r4 +.else + clz lr, r2 +.endif + movrel r3, X(gaussian_sequence) + sub lr, lr, #24 // -bitdepth_min_8 + ldr r2, [r1, #FGD_SEED] + ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] +.ifc \type, y + add r4, r1, 
#FGD_AR_COEFFS_Y +.else + add r4, r1, #FGD_AR_COEFFS_UV +.endif + add r9, r9, lr // grain_scale_shift - bitdepth_min_8 + adr r5, L(gen_grain_\type\()_tbl) + ldr r6, [r1, #FGD_AR_COEFF_LAG] + add r9, r9, #4 + ldr r6, [r5, r6, lsl #2] + vdup.16 q15, r9 // 4 - bitdepth_min_8 + data->grain_scale_shift + add r5, r5, r6 + vneg.s16 q15, q15 + +.ifc \type, uv_444 + push {lr} + cmp r12, #0 + movw r10, #0x49d8 + movw lr, #0xb524 + // Intentionally using a separate register instead of moveq with an + // immediate constant, to avoid armv8 deprecated it instruction forms. + it eq + moveq r10, lr + add r4, r4, r12 // Add offset to ar_coeffs_uv[1] + eor r2, r2, r10 + pop {lr} +.endif + + ldr r7, [r1, #FGD_AR_COEFF_SHIFT] + neg lr, lr // bitdepth_min_8 + mov r8, #1 + mov r10, #1 + lsl r8, r8, r7 // 1 << ar_coeff_shift + lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) + lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) + lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) + + bx r5 + + .align 2 +L(gen_grain_\type\()_tbl): + .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + +L(generate_grain_\type\()_lag0): +.ifc \type, y + mov r1, #GRAIN_HEIGHT + bl generate_grain_rows_neon +.else + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + mvn r6, r5 // grain_min = ~grain_max + + mov r1, #3 + bl generate_grain_rows_neon + mov r1, #GRAIN_HEIGHT-3 + + vdup.32 q12, r7 + vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] + vmov.i8 q0, #0 + vmov.i8 q1, #255 + vdup.16 q9, r5 + vdup.16 q10, r6 + vext.8 q13, q0, q1, #10 + vext.8 q14, q1, q0, #2 + vneg.s32 q12, q12 + vmovl.s8 q11, d22 + +1: + vmov q1, q13 + bl gen_grain_uv_444_lag0_neon // 8 + vmov.i8 q1, #255 + bl 
gen_grain_uv_444_lag0_neon // 16 + bl gen_grain_uv_444_lag0_neon // 24 + bl gen_grain_uv_444_lag0_neon // 32 + bl gen_grain_uv_444_lag0_neon // 40 + bl gen_grain_uv_444_lag0_neon // 48 + bl gen_grain_uv_444_lag0_neon // 56 + bl gen_grain_uv_444_lag0_neon // 64 + bl gen_grain_uv_444_lag0_neon // 72 + vmov q1, q14 + bl gen_grain_uv_444_lag0_neon // 80 + get_grain_2 d16 + subs r1, r1, #1 + add r11, r11, #4 + vst1.32 {d16[0]}, [r0]! + bgt 1b +.endif + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag1): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {d27[]}, [r4]! // ar_coeffs_y[0] + vld1.8 {d28[]}, [r4]! // ar_coeffs_y[1] + vld1.8 {d29[]}, [r4] // ar_coeffs_y[2] +.ifc \type, y + ldrsb r4, [r4, #1] // ar_coeffs_y[3] +.else + add r4, r4, #2 +.endif + + mov r1, #3 +.ifc \type, uv_444 + vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] + ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] +.endif + bl generate_grain_rows_neon + vmovl.s8 q13, d27 + vmovl.s8 q12, d29 + vmovl.s8 q14, d28 + vmov d29, d24 +.ifc \type, uv_444 + vmovl.s8 q6, d13 +.endif + + mov r1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag1_left_neon // 8 + bl sum_\type\()_lag1_mid_neon // 16 + bl sum_\type\()_lag1_mid_neon // 24 + bl sum_\type\()_lag1_mid_neon // 32 + bl sum_\type\()_lag1_mid_neon // 40 + bl sum_\type\()_lag1_mid_neon // 48 + bl sum_\type\()_lag1_mid_neon // 56 + bl sum_\type\()_lag1_mid_neon // 64 + bl sum_\type\()_lag1_mid_neon // 72 + bl sum_\type\()_lag1_right_neon // 80 + get_grain_2 d16 + subs r1, r1, #1 +.ifc \type, uv_444 + add r11, r11, #4 +.endif + vst1.32 {d16[0]}, [r0]! 
+ bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag2): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {d28,d29}, [r4] // ar_coeffs_y[0-11], ar_coeffs_uv[0-12] + + vmov.s8 r4, d29[2] + vmov.s8 r10, d29[3] + + mov r1, #3 + bl generate_grain_rows_neon + + mov r1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag2_left_neon // 8 + bl sum_\type\()_lag2_mid_neon // 16 + bl sum_\type\()_lag2_mid_neon // 24 + bl sum_\type\()_lag2_mid_neon // 32 + bl sum_\type\()_lag2_mid_neon // 40 + bl sum_\type\()_lag2_mid_neon // 48 + bl sum_\type\()_lag2_mid_neon // 56 + bl sum_\type\()_lag2_mid_neon // 64 + bl sum_\type\()_lag2_mid_neon // 72 + bl sum_\type\()_lag2_right_neon // 80 + get_grain_2 d16 + subs r1, r1, #1 +.ifc \type, uv_444 + add r11, r11, #4 +.endif + vst1.32 {d16[0]}, [r0]! + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag3): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] + + vmov.u8 r4, d28[5] + vmov.u8 r10, d28[6] + vmov.u8 r12, d28[7] + + orr r4, r4, r10, lsl #8 + orr r4, r4, r12, lsl #16 + + mov r1, #3 + vpush {d26} + bl generate_grain_rows_neon + vpop {d26} + + mov r1, #GRAIN_HEIGHT - 3 +1: + bl sum_\type\()_lag3_left_neon // 8 + bl sum_\type\()_lag3_mid_neon // 16 + bl sum_\type\()_lag3_mid_neon // 24 + bl sum_\type\()_lag3_mid_neon // 32 + bl sum_\type\()_lag3_mid_neon // 40 + bl sum_\type\()_lag3_mid_neon // 48 + bl sum_\type\()_lag3_mid_neon // 56 + bl sum_\type\()_lag3_mid_neon // 64 + bl sum_\type\()_lag3_mid_neon // 72 + bl sum_\type\()_lag3_right_neon // 80 + get_grain_2 d16 + subs r1, r1, #1 +.ifc \type, uv_444 + add r11, r11, #4 +.endif + vst1.32 {d16[0]}, [r0]! 
+ bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +gen_grain_82 y +gen_grain_82 uv_444 + +.macro set_height dst, type +.ifc \type, uv_420 + mov \dst, #SUB_GRAIN_HEIGHT-3 +.else + mov \dst, #GRAIN_HEIGHT-3 +.endif +.endm + +.macro increment_y_ptr reg, type +.ifc \type, uv_420 + add \reg, \reg, #2*GRAIN_WIDTH*2-(6*32) +.else + sub \reg, \reg, #6*32-GRAIN_WIDTH*2 +.endif +.endm + +.macro gen_grain_44 type +function generate_grain_\type\()_16bpc_neon, export=1 + push {r4-r11,lr} + + ldr r4, [sp, #36] + mov r12, r3 + movw r11, #(3*GRAIN_WIDTH-3)*2 + mov lr, #28 + add r11, r1, r11 + mov r1, r2 + mul r12, r12, lr + clz lr, r4 + + movrel r3, X(gaussian_sequence) + sub lr, lr, #24 // -bitdepth_min_8 + ldr r2, [r1, #FGD_SEED] + ldr r9, [r1, #FGD_GRAIN_SCALE_SHIFT] + add r4, r1, #FGD_AR_COEFFS_UV + add r9, r9, lr // grain_scale_shift - bitdepth_min_8 + adr r5, L(gen_grain_\type\()_tbl) + ldr r6, [r1, #FGD_AR_COEFF_LAG] + add r9, r9, #4 + ldr r6, [r5, r6, lsl #2] + vdup.16 q15, r9 // 4 - bitdepth_min_8 + data->grain_scale_shift + add r5, r5, r6 + vneg.s16 q15, q15 + + push {lr} + cmp r12, #0 + movw r10, #0x49d8 + movw lr, #0xb524 + // Intentionally using a separate register instead of moveq with an + // immediate constant, to avoid armv8 deprecated it instruction forms. 
+ it eq + moveq r10, lr + add r4, r4, r12 // Add offset to ar_coeffs_uv[1] + eor r2, r2, r10 + pop {lr} + + ldr r7, [r1, #FGD_AR_COEFF_SHIFT] + neg lr, lr + mov r8, #1 + mov r10, #1 + lsl r8, r8, r7 // 1 << ar_coeff_shift + lsl r10, r10, r9 // 1 << (4 + data->grain_scale_shift) + lsr r8, r8, #1 // 1 << (ar_coeff_shift - 1) + lsr r10, r10, #1 // 1 << (4 + data->grain_scale_shift - 1) + bx r5 + + .align 2 +L(gen_grain_\type\()_tbl): + .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB + +L(generate_grain_\type\()_lag0): +.ifc \type, uv_420 + vpush {q4-q5} +.endif + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + mvn r6, r5 // grain_min = ~grain_max + + mov r1, #3 + bl generate_grain_rows_44_neon + set_height r1, \type + + vdup.32 q12, r7 + vld1.8 {d22[]}, [r4] // ar_coeffs_uv[0] + vmov.i8 q0, #0 + vmov.i8 q1, #255 + vdup.16 q9, r5 + vdup.16 q10, r6 + vext.8 q13, q0, q1, #10 + vext.8 q14, q1, q0, #14 + vneg.s32 q12, q12 + vmovl.s8 q11, d22 + +1: + vmov q1, q13 + bl gen_grain_\type\()_lag0_8_neon // 8 + vmov.i8 q1, #255 + bl gen_grain_\type\()_lag0_8_neon // 16 + bl gen_grain_\type\()_lag0_8_neon // 24 + bl gen_grain_\type\()_lag0_8_neon // 32 + bl gen_grain_\type\()_lag0_8_neon // 40 + vmov q1, q14 + bl gen_grain_\type\()_lag0_4_neon // 44 + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH*2-6*16 + bgt 1b + +.ifc \type, uv_420 + vpop {q4-q5} +.endif + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag1): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {d27[]}, [r4]! // ar_coeffs_uv[0] + vld1.8 {d28[]}, [r4]! 
// ar_coeffs_uv[1] + vld1.8 {d29[]}, [r4] // ar_coeffs_uv[2] + add r4, r4, #2 + + mov r1, #3 + vld1.8 {d13[]}, [r4] // ar_coeffs_uv[4] + ldrsb r4, [r4, #-1] // ar_coeffs_uv[3] + bl generate_grain_rows_44_neon + vmovl.s8 q13, d27 + vmovl.s8 q12, d29 + vmovl.s8 q14, d28 + vmov d29, d24 + vmovl.s8 q6, d13 + + set_height r1, \type +1: + bl sum_\type\()_lag1_left_neon // 8 + bl sum_\type\()_lag1_mid_neon // 16 + bl sum_\type\()_lag1_mid_neon // 24 + bl sum_\type\()_lag1_mid_neon // 32 + bl sum_\type\()_lag1_mid_neon // 40 + bl sum_\type\()_lag1_right_neon // 44 + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH*2-6*16 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag2): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {d28,d29}, [r4] // ar_coeffs_uv[0-12] + + vmov.s8 r4, d29[2] + vmov.s8 r10, d29[3] + + mov r1, #3 + bl generate_grain_rows_44_neon + + set_height r1, \type +1: + bl sum_\type\()_lag2_left_neon // 8 + bl sum_\type\()_lag2_mid_neon // 16 + bl sum_\type\()_lag2_mid_neon // 24 + bl sum_\type\()_lag2_mid_neon // 32 + bl sum_\type\()_lag2_mid_neon // 40 + bl sum_\type\()_lag2_right_neon // 44 + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH*2-6*16 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} + +L(generate_grain_\type\()_lag3): + vpush {q4-q7} + mov r5, #128 + lsl r5, r5, lr // 128 << bitdepth_min_8 + sub r5, r5, #1 // (128 << bitdepth_min_8) - 1 + vld1.8 {q13, q14}, [r4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24] + + vmov.u8 r4, d28[5] + vmov.u8 r10, d28[6] + vmov.u8 r12, d28[7] + + orr r4, r4, r10, lsl #8 + orr r4, r4, r12, lsl #16 + + mov r1, #3 + bl generate_grain_rows_44_neon + + set_height r1, \type +1: + bl sum_\type\()_lag3_left_neon // 8 + bl sum_\type\()_lag3_mid_neon // 16 + bl sum_\type\()_lag3_mid_neon // 24 + bl sum_\type\()_lag3_mid_neon // 32 + bl sum_\type\()_lag3_mid_neon // 40 + bl 
sum_\type\()_lag3_right_neon // 44 + subs r1, r1, #1 + increment_y_ptr r11, \type + add r0, r0, #GRAIN_WIDTH*2-6*16 + bgt 1b + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +gen_grain_44 uv_420 +gen_grain_44 uv_422 .macro gather_interleaved dst1, dst2, src1, src2, src3, src4, off vmov.u16 r11, \src1[0+\off] diff --git a/src/arm/film_grain_init_tmpl.c b/src/arm/film_grain_init_tmpl.c index 29d7faf..3a41602 100644 --- a/src/arm/film_grain_init_tmpl.c +++ b/src/arm/film_grain_init_tmpl.c @@ -31,8 +31,6 @@ #include "src/film_grain.h" #include "asm-offsets.h" -#if ARCH_AARCH64 || BITDEPTH == 8 - CHECK_OFFSET(Dav1dFilmGrainData, seed, FGD_SEED); CHECK_OFFSET(Dav1dFilmGrainData, ar_coeff_lag, FGD_AR_COEFF_LAG); CHECK_OFFSET(Dav1dFilmGrainData, ar_coeffs_y, FGD_AR_COEFFS_Y); @@ -60,7 +58,6 @@ void BF(dav1d_generate_grain_uv_ ## suff, neon)(entry buf[][GRAIN_WIDTH], \ GEN_GRAIN_UV(420); GEN_GRAIN_UV(422); GEN_GRAIN_UV(444); -#endif // Use ptrdiff_t instead of int for the last few parameters, to get the // same layout of parameters on the stack across platforms. @@ -209,12 +206,10 @@ COLD void bitfn(dav1d_film_grain_dsp_init_arm)(Dav1dFilmGrainDSPContext *const c if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; -#if ARCH_AARCH64 || BITDEPTH == 8 c->generate_grain_y = BF(dav1d_generate_grain_y, neon); c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, neon); c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, neon); c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_generate_grain_uv_444, neon); -#endif c->fgy_32x32xn = fgy_32x32xn_neon; c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_neon; |