From e1c84856bb7d804e74904ba117a2ca9700211082 Mon Sep 17 00:00:00 2001 From: Lynne Date: Fri, 14 Feb 2020 19:55:00 +0000 Subject: lavu/tx: improve 3-point fixed precision There's just no reason not to when it's so easy (albeit messy) and it's also reducing the precision of all non-power-of-two transforms that use it. --- libavutil/tx_priv.h | 4 ---- libavutil/tx_template.c | 23 ++++++++++++++++++----- 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'libavutil') diff --git a/libavutil/tx_priv.h b/libavutil/tx_priv.h index 6fabea2d4d..e0d980abfb 100644 --- a/libavutil/tx_priv.h +++ b/libavutil/tx_priv.h @@ -47,8 +47,6 @@ typedef void FFTComplex; #if defined(TX_FLOAT) || defined(TX_DOUBLE) -#define MUL(x, y) ((x)*(y)) - #define CMUL(dre, dim, are, aim, bre, bim) do { \ (dre) = (are) * (bre) - (aim) * (bim); \ (dim) = (are) * (bim) + (aim) * (bre); \ @@ -65,8 +63,6 @@ typedef void FFTComplex; #elif defined(TX_INT32) -#define MUL(x, y) ((int32_t)(((int64_t)(x) * (int64_t)(y) + 0x40000000) >> 31)) - /* Properly rounds the result */ #define CMUL(dre, dim, are, aim, bre, bim) do { \ int64_t accu; \ diff --git a/libavutil/tx_template.c b/libavutil/tx_template.c index f30f3bf5b6..69158e07f9 100644 --- a/libavutil/tx_template.c +++ b/libavutil/tx_template.c @@ -131,6 +131,9 @@ static av_always_inline void fft3(FFTComplex *out, FFTComplex *in, ptrdiff_t stride) { FFTComplex tmp[2]; +#ifdef TX_INT32 + int64_t mtmp[4]; +#endif BF(tmp[0].re, tmp[1].im, in[1].im, in[2].im); BF(tmp[0].im, tmp[1].re, in[1].re, in[2].re); @@ -138,15 +141,25 @@ static av_always_inline void fft3(FFTComplex *out, FFTComplex *in, out[0*stride].re = in[0].re + tmp[1].re; out[0*stride].im = in[0].im + tmp[1].im; - tmp[0].re = MUL(TX_NAME(ff_cos_53)[0].re, tmp[0].re); - tmp[0].im = MUL(TX_NAME(ff_cos_53)[0].im, tmp[0].im); - tmp[1].re = MUL(TX_NAME(ff_cos_53)[1].re, tmp[1].re); - tmp[1].im = MUL(TX_NAME(ff_cos_53)[1].re, tmp[1].im); - +#ifdef TX_INT32 + mtmp[0] = 
(int64_t)TX_NAME(ff_cos_53)[0].re * tmp[0].re; + mtmp[1] = (int64_t)TX_NAME(ff_cos_53)[0].im * tmp[0].im; + mtmp[2] = (int64_t)TX_NAME(ff_cos_53)[1].re * tmp[1].re; + mtmp[3] = (int64_t)TX_NAME(ff_cos_53)[1].re * tmp[1].im; + out[1*stride].re = in[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31); + out[1*stride].im = in[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31); + out[2*stride].re = in[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31); + out[2*stride].im = in[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31); +#else + tmp[0].re = TX_NAME(ff_cos_53)[0].re * tmp[0].re; + tmp[0].im = TX_NAME(ff_cos_53)[0].im * tmp[0].im; + tmp[1].re = TX_NAME(ff_cos_53)[1].re * tmp[1].re; + tmp[1].im = TX_NAME(ff_cos_53)[1].re * tmp[1].im; out[1*stride].re = in[0].re - tmp[1].re + tmp[0].re; out[1*stride].im = in[0].im - tmp[1].im - tmp[0].im; out[2*stride].re = in[0].re - tmp[1].re - tmp[0].re; out[2*stride].im = in[0].im - tmp[1].im + tmp[0].im; +#endif } #define DECL_FFT5(NAME, D0, D1, D2, D3, D4) \ -- cgit v1.2.3