Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/videolan/dav1d.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Martin Storsjö <martin@martin.st> 2019-01-10 00:27:00 +0300
committer: Jean-Baptiste Kempf <jb@videolan.org> 2019-01-24 19:28:05 +0300
commit: 72af9329c0c003f68639301be33d4632147245b6 (patch)
tree: dac4a456e162b57ea6207b75b61eba2aed5631ca /src/arm/64/mc.S
parent: fc5a3728144c62b634bb6fb036a6da47ee9bdf8f (diff)
arm64: mc: Simplify the 8tap_2w_hv code slightly
Before:                            Cortex A53   Snapdragon 835
mc_8tap_regular_w2_hv_8bpc_neon:        415.0            286.9
After:
mc_8tap_regular_w2_hv_8bpc_neon:        399.1            269.9
Diffstat (limited to 'src/arm/64/mc.S')
-rw-r--r-- src/arm/64/mc.S 41
1 files changed, 16 insertions, 25 deletions
diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S
index a09d218..7a544a5 100644
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -1307,21 +1307,19 @@ L(\type\()_8tap_hv):
ext v29.16b, v28.16b, v28.16b, #2
mul v28.4h, v28.4h, v0.4h
mul v29.4h, v29.4h, v0.4h
- addv h28, v28.4h
- addv h29, v29.4h
- trn1 v16.4h, v28.4h, v29.4h
- srshr v16.4h, v16.4h, #2
+ addp v28.4h, v28.4h, v29.4h
+ addp v16.4h, v28.4h, v28.4h
+ srshr v16.4h, v16.4h, #2
bl L(\type\()_8tap_filter_2)
trn1 v16.2s, v16.2s, v28.2s
- trn1 v17.2s, v28.2s, v30.2s
- mov v18.8b, v30.8b
+ mov v17.8b, v28.8b
2:
bl L(\type\()_8tap_filter_2)
- trn1 v18.2s, v18.2s, v28.2s
- trn1 v19.2s, v28.2s, v30.2s
+ ext v18.8b, v17.8b, v28.8b, #4
+ mov v19.8b, v28.8b
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
@@ -1335,7 +1333,6 @@ L(\type\()_8tap_hv):
b.le 0f
mov v16.8b, v18.8b
mov v17.8b, v19.8b
- mov v18.8b, v30.8b
b 2b
280: // 2x8, 2x16, 2x32 hv
@@ -1355,28 +1352,24 @@ L(\type\()_8tap_hv):
ext v29.16b, v28.16b, v28.16b, #2
mul v28.4h, v28.4h, v0.4h
mul v29.4h, v29.4h, v0.4h
- addv h28, v28.4h
- addv h29, v29.4h
- trn1 v16.4h, v28.4h, v29.4h
- srshr v16.4h, v16.4h, #2
+ addp v28.4h, v28.4h, v29.4h
+ addp v16.4h, v28.4h, v28.4h
+ srshr v16.4h, v16.4h, #2
bl L(\type\()_8tap_filter_2)
trn1 v16.2s, v16.2s, v28.2s
- trn1 v17.2s, v28.2s, v30.2s
- mov v18.8b, v30.8b
+ mov v17.8b, v28.8b
bl L(\type\()_8tap_filter_2)
- trn1 v18.2s, v18.2s, v28.2s
- trn1 v19.2s, v28.2s, v30.2s
- mov v20.8b, v30.8b
+ ext v18.8b, v17.8b, v28.8b, #4
+ mov v19.8b, v28.8b
bl L(\type\()_8tap_filter_2)
- trn1 v20.2s, v20.2s, v28.2s
- trn1 v21.2s, v28.2s, v30.2s
- mov v22.8b, v30.8b
+ ext v20.8b, v19.8b, v28.8b, #4
+ mov v21.8b, v28.8b
28:
bl L(\type\()_8tap_filter_2)
- trn1 v22.2s, v22.2s, v28.2s
- trn1 v23.2s, v28.2s, v30.2s
+ ext v22.8b, v21.8b, v28.8b, #4
+ mov v23.8b, v28.8b
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
@@ -1398,7 +1391,6 @@ L(\type\()_8tap_hv):
mov v19.8b, v21.8b
mov v20.8b, v22.8b
mov v21.8b, v23.8b
- mov v22.8b, v30.8b
b 28b
0:
@@ -1420,7 +1412,6 @@ L(\type\()_8tap_filter_2):
mla v27.4h, v30.4h, v0.h[2]
mla v27.4h, v31.4h, v0.h[3]
srshr v28.4h, v27.4h, #2
- trn2 v30.2s, v28.2s, v28.2s
ret
.endif