diff options
author | Martin Storsjö <martin@martin.st> | 2021-06-04 14:50:11 +0300 |
---|---|---|
committer | Martin Storsjö <martin@martin.st> | 2021-06-05 22:47:57 +0300 |
commit | 3e044a7dc5126dd91318dc03915a7f6932e9bb6f (patch) | |
tree | 9e9b30bfc17e5ebc72b91837b4b52c84c31475ae /src/arm | |
parent | a8b13fc11020f7af579288808fcbcf2610b63da3 (diff) |
arm64: filmgrain16: Use sqrdmulh for the scaling*grain multiplication
Before:                   Cortex A53     A72     A73  Apple M1
fgy_32x32xn_16bpc_neon:      10396.8  8150.8  8718.3      19.5
After:
fgy_32x32xn_16bpc_neon:       9665.1  7558.8  7652.8      19.5
Diffstat (limited to 'src/arm')
-rw-r--r-- | src/arm/64/film_grain16.S | 45 |
1 files changed, 13 insertions, 32 deletions
diff --git a/src/arm/64/film_grain16.S b/src/arm/64/film_grain16.S index f79812c..d3d08dc 100644 --- a/src/arm/64/film_grain16.S +++ b/src/arm/64/film_grain16.S @@ -110,6 +110,7 @@ function fgy_32x32_16bpc_neon, export=1 stp d10, d11, [sp, #32] stp d12, d13, [sp, #48] str d14, [sp, #64] + eor w4, w4, #15 // 15 - scaling_shift ldr w11, [x6, #8] // offsets[1][0] ldr w13, [x6, #4] // offsets[0][1] ldr w15, [x6, #12] // offsets[1][1] @@ -122,8 +123,7 @@ function fgy_32x32_16bpc_neon, export=1 mov x9, #GRAIN_WIDTH*2 // grain_lut stride neg w10, w10 // bitdepth_min_8 - neg w4, w4 - dup v29.4s, w4 // -scaling_shift + dup v29.8h, w4 // 15 - scaling_shift dup v27.8h, w10 // bitdepth_min_8 movrel x16, overlap_coeffs_0 @@ -268,7 +268,7 @@ L(loop_\ox\oy): smax v19.8h, v19.8h, v25.8h .endif - uxtl v4.8h, v6.8b // scaling + uxtl v4.8h, v6.8b // scaling .if \ox && !\oy sqrshrn v20.4h, v20.4s, #5 .endif @@ -281,37 +281,18 @@ L(loop_\ox\oy): smax v20.4h, v20.4h, v25.4h .endif uxtl2 v7.8h, v7.16b - .if \ox && !\oy - smull v20.4s, v20.4h, v4.4h // scaling * grain -.else - smull v20.4s, v16.4h, v4.4h + ins v16.d[0], v20.d[0] .endif - smull2 v21.4s, v16.8h, v4.8h - smull v22.4s, v17.4h, v5.4h - smull2 v23.4s, v17.8h, v5.8h - smull v16.4s, v18.4h, v6.4h - smull2 v17.4s, v18.8h, v6.8h - smull v18.4s, v19.4h, v7.4h - smull2 v19.4s, v19.8h, v7.8h - - srshl v20.4s, v20.4s, v29.4s // round2(scaling * grain, scaling_shift) - srshl v21.4s, v21.4s, v29.4s - srshl v22.4s, v22.4s, v29.4s - srshl v23.4s, v23.4s, v29.4s - srshl v16.4s, v16.4s, v29.4s - srshl v17.4s, v17.4s, v29.4s - srshl v18.4s, v18.4s, v29.4s - srshl v19.4s, v19.4s, v29.4s - - sqxtn v20.4h, v20.4s - sqxtn2 v20.8h, v21.4s - sqxtn v21.4h, v22.4s - sqxtn2 v21.8h, v23.4s - sqxtn v22.4h, v16.4s - sqxtn2 v22.8h, v17.4s - sqxtn v23.4h, v18.4s - sqxtn2 v23.8h, v19.4s + ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift) + ushl v5.8h, v5.8h, v29.8h + ushl v6.8h, v6.8h, v29.8h + ushl v7.8h, v7.8h, v29.8h + + sqrdmulh v20.8h, 
v16.8h, v4.8h // round2((scaling << (15 - scaling_shift)) * grain, 15) + sqrdmulh v21.8h, v17.8h, v5.8h + sqrdmulh v22.8h, v18.8h, v6.8h + sqrdmulh v23.8h, v19.8h, v7.8h usqadd v0.8h, v20.8h // *src + noise usqadd v1.8h, v21.8h |