From f32690a298badbf2df66319e9b38236ad3d3e321 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?=
Date: Thu, 23 Feb 2017 23:33:58 +0200
Subject: aarch64: vp9lpf: Use dup+rev16+uzp1 instead of dup+lsr+dup+trn1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is one cycle faster in total, and three instructions fewer.

Before: vp9_loop_filter_mix2_v_44_16_neon: 123.2
After: vp9_loop_filter_mix2_v_44_16_neon: 122.2

This is cherrypicked from libav commit 3bf9c48320f25f3d5557485b0202f22ae60748b0.

Signed-off-by: Martin Storsjö
---
 libavcodec/aarch64/vp9lpf_neon.S | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

(limited to 'libavcodec/aarch64')

diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index a9eea7f951..0878763020 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -162,18 +162,15 @@
         dup             v2\sz,  w3                    // I
         dup             v3\sz,  w4                    // H
 .else
-        dup             v0.8b,  w2                    // E
-        dup             v2.8b,  w3                    // I
-        dup             v3.8b,  w4                    // H
-        lsr             w5,     w2,  #8
-        lsr             w6,     w3,  #8
-        lsr             w7,     w4,  #8
-        dup             v1.8b,  w5                    // E
-        dup             v4.8b,  w6                    // I
-        dup             v5.8b,  w7                    // H
-        trn1            v0.2d,  v0.2d,  v1.2d
-        trn1            v2.2d,  v2.2d,  v4.2d
-        trn1            v3.2d,  v3.2d,  v5.2d
+        dup             v0.8h,  w2                    // E
+        dup             v2.8h,  w3                    // I
+        dup             v3.8h,  w4                    // H
+        rev16           v1.16b, v0.16b                // E
+        rev16           v4.16b, v2.16b                // I
+        rev16           v5.16b, v3.16b                // H
+        uzp1            v0.16b, v0.16b, v1.16b
+        uzp1            v2.16b, v2.16b, v4.16b
+        uzp1            v3.16b, v3.16b, v5.16b
 .endif

         uabd            v4\sz,  v20\sz, v21\sz        // abs(p3 - p2)
-- cgit v1.2.3