From e1c84856bb7d804e74904ba117a2ca9700211082 Mon Sep 17 00:00:00 2001
From: Lynne <dev@lynne.ee>
Date: Fri, 14 Feb 2020 19:55:00 +0000
Subject: lavu/tx: improve 3-point fixed precision

There's just no reason not to when it's so easy (albeit messy), and the
current 3-point transform also reduces the precision of all
non-power-of-two transforms that use it.
---
 libavutil/tx_priv.h     |  4 ----
 libavutil/tx_template.c | 23 ++++++++++++++++++-----
 2 files changed, 18 insertions(+), 9 deletions(-)

(limited to 'libavutil')

diff --git a/libavutil/tx_priv.h b/libavutil/tx_priv.h
index 6fabea2d4d..e0d980abfb 100644
--- a/libavutil/tx_priv.h
+++ b/libavutil/tx_priv.h
@@ -47,8 +47,6 @@ typedef void FFTComplex;
 
 #if defined(TX_FLOAT) || defined(TX_DOUBLE)
 
-#define MUL(x, y) ((x)*(y))
-
 #define CMUL(dre, dim, are, aim, bre, bim) do {                                \
         (dre) = (are) * (bre) - (aim) * (bim);                                 \
         (dim) = (are) * (bim) + (aim) * (bre);                                 \
@@ -65,8 +63,6 @@ typedef void FFTComplex;
 
 #elif defined(TX_INT32)
 
-#define MUL(x, y) ((int32_t)(((int64_t)(x) * (int64_t)(y) + 0x40000000) >> 31))
-
 /* Properly rounds the result */
 #define CMUL(dre, dim, are, aim, bre, bim) do {                                \
         int64_t accu;                                                          \
diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c
index f30f3bf5b6..69158e07f9 100644
--- a/libavutil/tx_template.c
+++ b/libavutil/tx_template.c
@@ -131,6 +131,9 @@ static av_always_inline void fft3(FFTComplex *out, FFTComplex *in,
                                   ptrdiff_t stride)
 {
     FFTComplex tmp[2];
+#ifdef TX_INT32
+    int64_t mtmp[4];
+#endif
 
     BF(tmp[0].re, tmp[1].im, in[1].im, in[2].im);
     BF(tmp[0].im, tmp[1].re, in[1].re, in[2].re);
@@ -138,15 +141,25 @@ static av_always_inline void fft3(FFTComplex *out, FFTComplex *in,
     out[0*stride].re = in[0].re + tmp[1].re;
     out[0*stride].im = in[0].im + tmp[1].im;
 
-    tmp[0].re = MUL(TX_NAME(ff_cos_53)[0].re, tmp[0].re);
-    tmp[0].im = MUL(TX_NAME(ff_cos_53)[0].im, tmp[0].im);
-    tmp[1].re = MUL(TX_NAME(ff_cos_53)[1].re, tmp[1].re);
-    tmp[1].im = MUL(TX_NAME(ff_cos_53)[1].re, tmp[1].im);
-
+#ifdef TX_INT32
+    mtmp[0] = (int64_t)TX_NAME(ff_cos_53)[0].re * tmp[0].re;
+    mtmp[1] = (int64_t)TX_NAME(ff_cos_53)[0].im * tmp[0].im;
+    mtmp[2] = (int64_t)TX_NAME(ff_cos_53)[1].re * tmp[1].re;
+    mtmp[3] = (int64_t)TX_NAME(ff_cos_53)[1].re * tmp[1].im;
+    out[1*stride].re = in[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31);
+    out[1*stride].im = in[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31);
+    out[2*stride].re = in[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31);
+    out[2*stride].im = in[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31);
+#else
+    tmp[0].re = TX_NAME(ff_cos_53)[0].re * tmp[0].re;
+    tmp[0].im = TX_NAME(ff_cos_53)[0].im * tmp[0].im;
+    tmp[1].re = TX_NAME(ff_cos_53)[1].re * tmp[1].re;
+    tmp[1].im = TX_NAME(ff_cos_53)[1].re * tmp[1].im;
     out[1*stride].re = in[0].re - tmp[1].re + tmp[0].re;
     out[1*stride].im = in[0].im - tmp[1].im - tmp[0].im;
     out[2*stride].re = in[0].re - tmp[1].re - tmp[0].re;
     out[2*stride].im = in[0].im - tmp[1].im + tmp[0].im;
+#endif
 }
 
 #define DECL_FFT5(NAME, D0, D1, D2, D3, D4)                                                       \
-- 
cgit v1.2.3