Fix overflow in 8-bit NEON ADST

In 8-bit ADST, it's possible that the final Round2(x[0], 12) can exceed 16 bits signed. Specifically, in section 7.13.2.6, "Inverse ADST4 process", the precision requirement is: "It is a requirement of bitstream conformance that all values stored in the s and x arrays by this process are representable by a signed integer using r + 12 bits of precision." For 8 bits, r is 16 for both row and column, so x[] can be 28-bit signed. For values in [134215680, 134217727] (within 2047 of the maximum 28-bit value), the final Round2(x[0], 12) evaluates to 32768, exceeding 16 bits signed. So switch to using sqrshrn, which saturates to 16 bits signed. This is a continuation of: Commit b53ff29d80a21180e5ad9bbe39a02541151f4f53 arm: itx: Do clipping in all narrowing downshifts
author: David Conrad <david_conrad@apple.com> 2022-06-08 23:50:02 +0300
committer: David Conrad <david_conrad@apple.com> 2022-09-15 03:28:22 +0300
commit: 1bdb776c71d615e9a1a4890bbc53f979e225e150 (patch)
tree: a465f7e5ab722725fe4aa36bdb73e151a933c980
parent: 08c708015ec372b6c28d341cba7bbc86843cc17b (diff)
1 files changed, 12 insertions, 12 deletions
diff --git a/src/arm/64/itx.S b/src/arm/64/itx.S
index c9650e9..b1b2f8f 100644
--- a/src/arm/64/itx.S
+++ b/src/arm/64/itx.S
@@ -483,10 +483,10 @@ endfunc
         add             \o1\().4s, v5.4s,     v7.4s
         sub             \o3\().4s, \o3\().4s, v7.4s
 
-        rshrn           \o0\().4h, \o0\().4s, #12
-        rshrn           \o2\().4h, \o2\().4s, #12
-        rshrn           \o1\().4h, \o1\().4s, #12
-        rshrn           \o3\().4h, \o3\().4s, #12
+        sqrshrn         \o0\().4h, \o0\().4s, #12
+        sqrshrn         \o2\().4h, \o2\().4s, #12
+        sqrshrn         \o1\().4h, \o1\().4s, #12
+        sqrshrn         \o3\().4h, \o3\().4s, #12
 .endm
 
 function inv_adst_4h_x4_neon, export=1
@@ -538,21 +538,21 @@ endfunc
         sub             v4.4s,   v4.4s,   v2.4s // out3
         sub             v5.4s,   v5.4s,   v3.4s
 
-        rshrn           v18.4h,  v18.4s, #12
-        rshrn2          v18.8h,  v19.4s, #12
+        sqrshrn         v18.4h,  v18.4s, #12
+        sqrshrn2        v18.8h,  v19.4s, #12
 
-        rshrn           \o0\().4h, v16.4s, #12
-        rshrn2          \o0\().8h, v17.4s, #12
+        sqrshrn         \o0\().4h, v16.4s, #12
+        sqrshrn2        \o0\().8h, v17.4s, #12
 
 .ifc \o2, v17
         mov             v17.16b,   v18.16b
 .endif
 
-        rshrn           \o1\().4h, v6.4s,  #12
-        rshrn2          \o1\().8h, v7.4s,  #12
+        sqrshrn         \o1\().4h, v6.4s,  #12
+        sqrshrn2        \o1\().8h, v7.4s,  #12
 
-        rshrn           \o3\().4h, v4.4s,  #12
-        rshrn2          \o3\().8h, v5.4s,  #12
+        sqrshrn         \o3\().4h, v4.4s,  #12
+        sqrshrn2        \o3\().8h, v5.4s,  #12
 .endm
 
 function inv_adst_8h_x4_neon, export=1
author	David Conrad <david_conrad@apple.com>	2022-06-08 23:50:02 +0300
committer	David Conrad <david_conrad@apple.com>	2022-09-15 03:28:22 +0300
commit	1bdb776c71d615e9a1a4890bbc53f979e225e150 (patch)
tree	a465f7e5ab722725fe4aa36bdb73e151a933c980
parent	08c708015ec372b6c28d341cba7bbc86843cc17b (diff)