From 648b422e171d5eab18f6c6fd346e4050d717b936 Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Mon, 21 Oct 2019 15:56:47 +0800
Subject: avcodec/mips: msa optimizations for vc1dsp

Performance of WMV3 decoding has speed up from 3.66x to 5.23x tested on 3A4000.

Reviewed-by: Shiyou Yin <yinshiyou-hf@loongson.cn>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
---
 libavutil/mips/generic_macros_msa.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'libavutil/mips')

diff --git a/libavutil/mips/generic_macros_msa.h b/libavutil/mips/generic_macros_msa.h
index c25509e483..267d4e6ca5 100644
--- a/libavutil/mips/generic_macros_msa.h
+++ b/libavutil/mips/generic_macros_msa.h
@@ -299,6 +299,7 @@
 #define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
 #define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__)
 #define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)
+#define LD_SW4(...) LD_V4(v4i32, __VA_ARGS__)
 
 #define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
 {                                                                 \
@@ -337,6 +338,7 @@
 #define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
 #define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__)
 #define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)
+#define LD_SW8(...) LD_V8(v4i32, __VA_ARGS__)
 
 #define LD_V16(RTYPE, psrc, stride,                                   \
                out0, out1, out2, out3, out4, out5, out6, out7,        \
@@ -1382,6 +1384,7 @@
             out4, out5, out6, out7);                              \
 }
 #define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
+#define ILVR_B8_SW(...) ILVR_B8(v4i32, __VA_ARGS__)
 
 /* Description : Interleave right half of halfword elements from vectors
    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
-- 
cgit v1.2.3