Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/videolan/dav1d.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/src/arm
diff options
context:
space:
mode:
author: Martin Storsjö <martin@martin.st> 2021-06-04 14:50:11 +0300
committer: Martin Storsjö <martin@martin.st> 2021-06-05 22:47:57 +0300
commit: 3e044a7dc5126dd91318dc03915a7f6932e9bb6f (patch)
tree: 9e9b30bfc17e5ebc72b91837b4b52c84c31475ae /src/arm
parent: a8b13fc11020f7af579288808fcbcf2610b63da3 (diff)
arm64: filmgrain16: Use sqrdmulh for the scaling*grain multiplication
Before:                     Cortex A53      A72      A73  Apple M1
fgy_32x32xn_16bpc_neon:        10396.8   8150.8   8718.3      19.5
After:
fgy_32x32xn_16bpc_neon:         9665.1   7558.8   7652.8      19.5
Diffstat (limited to 'src/arm')
-rw-r--r-- src/arm/64/film_grain16.S 45
1 file changed, 13 insertions, 32 deletions
diff --git a/src/arm/64/film_grain16.S b/src/arm/64/film_grain16.S
index f79812c..d3d08dc 100644
--- a/src/arm/64/film_grain16.S
+++ b/src/arm/64/film_grain16.S
@@ -110,6 +110,7 @@ function fgy_32x32_16bpc_neon, export=1
stp d10, d11, [sp, #32]
stp d12, d13, [sp, #48]
str d14, [sp, #64]
+ eor w4, w4, #15 // 15 - scaling_shift
ldr w11, [x6, #8] // offsets[1][0]
ldr w13, [x6, #4] // offsets[0][1]
ldr w15, [x6, #12] // offsets[1][1]
@@ -122,8 +123,7 @@ function fgy_32x32_16bpc_neon, export=1
mov x9, #GRAIN_WIDTH*2 // grain_lut stride
neg w10, w10 // bitdepth_min_8
- neg w4, w4
- dup v29.4s, w4 // -scaling_shift
+ dup v29.8h, w4 // 15 - scaling_shift
dup v27.8h, w10 // bitdepth_min_8
movrel x16, overlap_coeffs_0
@@ -268,7 +268,7 @@ L(loop_\ox\oy):
smax v19.8h, v19.8h, v25.8h
.endif
- uxtl v4.8h, v6.8b // scaling
+ uxtl v4.8h, v6.8b // scaling
.if \ox && !\oy
sqrshrn v20.4h, v20.4s, #5
.endif
@@ -281,37 +281,18 @@ L(loop_\ox\oy):
smax v20.4h, v20.4h, v25.4h
.endif
uxtl2 v7.8h, v7.16b
-
.if \ox && !\oy
- smull v20.4s, v20.4h, v4.4h // scaling * grain
-.else
- smull v20.4s, v16.4h, v4.4h
+ ins v16.d[0], v20.d[0]
.endif
- smull2 v21.4s, v16.8h, v4.8h
- smull v22.4s, v17.4h, v5.4h
- smull2 v23.4s, v17.8h, v5.8h
- smull v16.4s, v18.4h, v6.4h
- smull2 v17.4s, v18.8h, v6.8h
- smull v18.4s, v19.4h, v7.4h
- smull2 v19.4s, v19.8h, v7.8h
-
- srshl v20.4s, v20.4s, v29.4s // round2(scaling * grain, scaling_shift)
- srshl v21.4s, v21.4s, v29.4s
- srshl v22.4s, v22.4s, v29.4s
- srshl v23.4s, v23.4s, v29.4s
- srshl v16.4s, v16.4s, v29.4s
- srshl v17.4s, v17.4s, v29.4s
- srshl v18.4s, v18.4s, v29.4s
- srshl v19.4s, v19.4s, v29.4s
-
- sqxtn v20.4h, v20.4s
- sqxtn2 v20.8h, v21.4s
- sqxtn v21.4h, v22.4s
- sqxtn2 v21.8h, v23.4s
- sqxtn v22.4h, v16.4s
- sqxtn2 v22.8h, v17.4s
- sqxtn v23.4h, v18.4s
- sqxtn2 v23.8h, v19.4s
+ ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift)
+ ushl v5.8h, v5.8h, v29.8h
+ ushl v6.8h, v6.8h, v29.8h
+ ushl v7.8h, v7.8h, v29.8h
+
+ sqrdmulh v20.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift)) * grain, 15)
+ sqrdmulh v21.8h, v17.8h, v5.8h
+ sqrdmulh v22.8h, v18.8h, v6.8h
+ sqrdmulh v23.8h, v19.8h, v7.8h
usqadd v0.8h, v20.8h // *src + noise
usqadd v1.8h, v21.8h