arm64: filmgrain: Deduplicate the output_lag functions

No practical difference in generated code (or the size of it), but less source code to handle.
author: Martin Storsjö <martin@martin.st> 2021-08-12 15:17:32 +0300
committer: Martin Storsjö <martin@martin.st> 2021-08-13 18:16:47 +0300
commit: 54a22a4f7c97fdb1770d11157e7214d12e94b686 (patch)
tree: 23bef3dea3a00c27abc603ee5219f641b620bc4b
parent: caa2ede5f76f7e994e78c15f548123ed89eea6d4 (diff)
1 files changed, 23 insertions, 63 deletions
diff --git a/src/arm/64/film_grain.S b/src/arm/64/film_grain.S
index ff1fa6c..95b9360 100644
--- a/src/arm/64/film_grain.S
+++ b/src/arm/64/film_grain.S
@@ -207,17 +207,30 @@ endfunc
 .endm
 
 // w15 holds the number of entries to produce
-// w14 holds the previous output entry
+// w14 (lag1), w14/w16 (lag2) or w14/w16/w17 (lag3) hold the previous output entries
 // v0 holds the vector of produced entries
 // v1 holds the input vector of sums from above
-function output_lag1_neon
+.macro output_lag n
+function output_lag\n\()_neon
 1:
         read_shift_rand x13, 11
         mov             w11, v1.s[0]
         ldrsh           w12, [x3, x13, lsl #1]
         ext             v0.16b,  v0.16b,  v0.16b,  #1
-        madd            w14, w14, w4,  w11        // sum (above) + *coeff * prev output
-        add             w14, w14, w8              // 1 << (ar_coeff_shift - 1)
+.if \n == 1
+        madd            w11, w14, w4,  w11        // sum (above) + *coeff * prev output
+.elseif \n == 2
+        madd            w11, w16, w4,  w11        // sum (above) + *coeff * prev output 1
+        madd            w11, w14, w17, w11        // += *coeff * prev output 2
+        mov             w16, w14
+.else
+        madd            w11, w17, w4,  w11        // sum (above) + *coeff * prev output 1
+        madd            w11, w16, w20, w11        // += *coeff * prev output 2
+        madd            w11, w14, w21, w11        // += *coeff * prev output 3
+        mov             w17, w16
+        mov             w16, w14
+.endif
+        add             w14, w11, w8              // 1 << (ar_coeff_shift - 1)
         asr             w14, w14, w7              // >> ar_coeff_shift
         add             w12, w12, w10
         asr             w12, w12, w9              // >> (4 + grain_scale_shift)
@@ -232,6 +245,12 @@ function output_lag1_neon
         b.gt            1b
         ret
 endfunc
+.endm
+
+output_lag 1
+output_lag 2
+output_lag 3
+
 
 function sum_lag1_above_neon
         smull           v2.8h,   v3.8b,   v28.8b
@@ -408,34 +427,6 @@ sum_lag1_func uv_420, 420, right, 9
         sum_lag1        uv_420, \dst, \left, \mid, \right, \edge
 .endm
 
-// w15 holds the number of entries to produce
-// w14 and w16 hold the previous output entries
-// v0 holds the vector of produced entries
-// v1 holds the input vector of sums from above
-function output_lag2_neon
-1:
-        read_shift_rand x13, 11
-        mov             w11, v1.s[0]
-        ldrsh           w12, [x3, x13, lsl #1]
-        ext             v0.16b,  v0.16b,  v0.16b,  #1
-        madd            w11, w16, w4,  w11        // sum (above) + *coeff * prev output 1
-        madd            w11, w14, w17, w11        // += *coeff * prev output 2
-        mov             w16, w14
-        add             w14, w11, w8              // 1 << (ar_coeff_shift - 1)
-        asr             w14, w14, w7              // >> ar_coeff_shift
-        add             w12, w12, w10
-        asr             w12, w12, w9              // >> (4 + grain_scale_shift)
-        add             w14, w14, w12
-        cmp             w14, w5
-        csel            w14, w14, w5,  le
-        cmp             w14, w6
-        csel            w14, w14, w6,  ge
-        subs            w15, w15, #1
-        ext             v1.16b,  v1.16b,  v1.16b,  #4
-        ins             v0.b[15], w14
-        b.gt            1b
-        ret
-endfunc
 
 function sum_lag2_above_neon
         sub             x12, x0,  #2*GRAIN_WIDTH - 16
@@ -672,37 +663,6 @@ sum_lag2_func uv_420, 420, mid
 sum_lag2_func uv_420, 420, right, 9
 
 
-// w15 holds the number of entries to produce
-// w14, w16 and w17 hold the previous output entries
-// v0 holds the vector of produced entries
-// v1 holds the input vector of sums from above
-function output_lag3_neon
-1:
-        read_shift_rand x13, 11
-        mov             w11, v1.s[0]
-        ldrsh           w12, [x3, x13, lsl #1]
-        ext             v0.16b,  v0.16b,  v0.16b,  #1
-        madd            w11, w17, w4,  w11        // sum (above) + *coeff * prev output 1
-        madd            w11, w16, w20, w11        // sum (above) + *coeff * prev output 2
-        madd            w11, w14, w21, w11        // += *coeff * prev output 3
-        mov             w17, w16
-        mov             w16, w14
-        add             w14, w11, w8              // 1 << (ar_coeff_shift - 1)
-        asr             w14, w14, w7              // >> ar_coeff_shift
-        add             w12, w12, w10
-        asr             w12, w12, w9              // >> (4 + grain_scale_shift)
-        add             w14, w14, w12
-        cmp             w14, w5
-        csel            w14, w14, w5,  le
-        cmp             w14, w6
-        csel            w14, w14, w6,  ge
-        subs            w15, w15, #1
-        ext             v1.16b,  v1.16b,  v1.16b,  #4
-        ins             v0.b[15], w14
-        b.gt            1b
-        ret
-endfunc
-
 function sum_lag3_above_neon
         sub             x11, x0,  #3*GRAIN_WIDTH - 16
         sub             x12, x0,  #2*GRAIN_WIDTH - 16
author	Martin Storsjö <martin@martin.st>	2021-08-12 15:17:32 +0300
committer	Martin Storsjö <martin@martin.st>	2021-08-13 18:16:47 +0300
commit	54a22a4f7c97fdb1770d11157e7214d12e94b686 (patch)
tree	23bef3dea3a00c27abc603ee5219f641b620bc4b
parent	caa2ede5f76f7e994e78c15f548123ed89eea6d4 (diff)