itx: clip according to spec, fixes #103, #158

This does not adjust the AVX2 asm. The asm clips in many places to the required range (16-bit signed) for performance reason. No mismatch observed with coefs generated by the forward transform in checkasm in 10 thousand runs.
author: Janne Grunau <janne-vlc@jannau.net> 2018-11-13 01:59:02 +0300
committer: Janne Grunau <janne-vlc@jannau.net> 2018-11-20 20:36:23 +0300
commit: 5098b23ab3a98dc2202449dd3bd0c06902c4ff64 (patch)
tree: 676dabb88aada0059f7cd8116c989754c8923aa2 /src/itx_tmpl.c
parent: c627f16f5ae671c4583b6c85006ce648224814e3 (diff)
1 files changed, 16 insertions, 9 deletions
diff --git a/src/itx_tmpl.c b/src/itx_tmpl.c
index 0fb37f1..1d35146 100644
--- a/src/itx_tmpl.c
+++ b/src/itx_tmpl.c
@@ -40,7 +40,7 @@
 #include "src/itx_1d.c"
 
 typedef void (*itx_1d_fn)(const coef *in, ptrdiff_t in_s,
-                          coef *out, ptrdiff_t out_s);
+                          coef *out, ptrdiff_t out_s, const int range);
 
 static void NOINLINE
 inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
@@ -54,6 +54,9 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
     // Maximum value for h and w is 64
     coef tmp[4096 /* w * h */], out[64 /* h */], in_mem[64 /* w */];
     const int is_rect2 = w * 2 == h || h * 2 == w;
+    const int row_clip_max = (1 << (BITDEPTH + 8 - 1)) - 1;
+    const int col_clip_max = (1 << (imax(BITDEPTH + 6, 16) - 1)) -1;
+    const int col_clip_min = -col_clip_max - 1;
 
     if (w != sw) memset(&in_mem[sw], 0, (w - sw) * sizeof(*in_mem));
     const int rnd1 = (1 << shift1) >> 1;
@@ -64,18 +67,19 @@ inv_txfm_add_c(pixel *dst, const ptrdiff_t stride,
                 if (is_rect2)
                     in_mem[j] = (in_mem[j] * 2896 + 2048) >> 12;
             }
-            first_1d_fn(in_mem, 1, &tmp[i * w], 1);
+            first_1d_fn(in_mem, 1, &tmp[i * w], 1, row_clip_max);
         } else {
-            first_1d_fn(&coeff[i], sh, &tmp[i * w], 1);
+            first_1d_fn(&coeff[i], sh, &tmp[i * w], 1, row_clip_max);
         }
         for (j = 0; j < w; j++)
-            tmp[i * w + j] = (tmp[i * w + j] + (rnd1)) >> shift1;
+            tmp[i * w + j] = iclip((tmp[i * w + j] + (rnd1)) >> shift1,
+                                   col_clip_min, col_clip_max);
     }
 
     if (h != sh) memset(&tmp[sh * w], 0, w * (h - sh) * sizeof(*tmp));
     const int rnd2 = (1 << shift2) >> 1;
     for (i = 0; i < w; i++) {
-        second_1d_fn(&tmp[i], w, out, 1);
+        second_1d_fn(&tmp[i], w, out, 1, col_clip_max);
         for (j = 0; j < h; j++)
             dst[i + j * PXSTRIDE(stride)] =
                 iclip_pixel(dst[i + j * PXSTRIDE(stride)] +
@@ -145,15 +149,18 @@ inv_txfm_fn64(64, 64, 2, 4)
 static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
                                        coef *const coeff, const int eob)
 {
-    int i, j;
+    const int col_clip_max = (1 << (imax(BITDEPTH + 6, 16) - 1)) -1;
+    const int col_clip_min = -col_clip_max - 1;
     coef tmp[4 * 4], out[4];
 
-    for (i = 0; i < 4; i++)
+    for (int i = 0; i < 4; i++)
         inv_wht4_1d(&coeff[i], 4, &tmp[i * 4], 1, 0);
+    for (int k = 0; k < 4 * 4; k++)
+        tmp[k] = iclip(tmp[k], col_clip_min, col_clip_max);
 
-    for (i = 0; i < 4; i++) {
+    for (int i = 0; i < 4; i++) {
         inv_wht4_1d(&tmp[i], 4, out, 1, 1);
-        for (j = 0; j < 4; j++)
+        for (int j = 0; j < 4; j++)
             dst[i + j * PXSTRIDE(stride)] =
                 iclip_pixel(dst[i + j * PXSTRIDE(stride)] + out[j]);
     }
author	Janne Grunau <janne-vlc@jannau.net>	2018-11-13 01:59:02 +0300
committer	Janne Grunau <janne-vlc@jannau.net>	2018-11-20 20:36:23 +0300
commit	5098b23ab3a98dc2202449dd3bd0c06902c4ff64 (patch)
tree	676dabb88aada0059f7cd8116c989754c8923aa2 /src/itx_tmpl.c
parent	c627f16f5ae671c4583b6c85006ce648224814e3 (diff)