Build: Add suffix to templated BITDEPTH files

Fix #96
author: Marvin Scholz <epirat07@gmail.com> 2018-10-25 17:45:12 +0300
committer: Ronald S. Bultje <rsbultje@gmail.com> 2018-10-25 19:51:31 +0300
commit: 46e2a2d0cc451e1d6bb929f80088f8a7b8940dd0 (patch)
tree: 002462d6840acd6551bb0b6b2dd5f4416db1ab84 /src/loopfilter_tmpl.c
parent: 367d785a4e70b3e43eee234b3c745b047e3fbd40 (diff)
1 files changed, 246 insertions, 0 deletions
diff --git a/src/loopfilter_tmpl.c b/src/loopfilter_tmpl.c
new file mode 100644
index 0000000..f1d1432
--- /dev/null
+++ b/src/loopfilter_tmpl.c
@@ -0,0 +1,246 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <stdlib.h>
+
+#include "common/attributes.h"
+#include "common/intops.h"
+
+#include "src/loopfilter.h"
+
+static NOINLINE void
+loop_filter(pixel *dst, int E, int I, int H,
+            const ptrdiff_t stridea, const ptrdiff_t strideb, const int wd)
+{
+    const int F = 1 << (BITDEPTH - 8);
+    E <<= BITDEPTH - 8;
+    I <<= BITDEPTH - 8;
+    H <<= BITDEPTH - 8;
+
+    for (int i = 0; i < 4; i++, dst += stridea) {
+        int p6, p5, p4, p3, p2;
+        int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
+        int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
+        int q2, q3, q4, q5, q6;
+        int fm, flat8out, flat8in;
+
+        fm = abs(p1 - p0) <= I && abs(q1 - q0) <= I &&
+             abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E;
+
+        if (wd > 4) {
+            p2 = dst[strideb * -3];
+            q2 = dst[strideb * +2];
+
+            fm &= abs(p2 - p1) <= I && abs(q2 - q1) <= I;
+
+            if (wd > 6) {
+                p3 = dst[strideb * -4];
+                q3 = dst[strideb * +3];
+
+                fm &= abs(p3 - p2) <= I && abs(q3 - q2) <= I;
+            }
+        }
+        if (!fm) continue;
+
+        if (wd >= 16) {
+            p6 = dst[strideb * -7];
+            p5 = dst[strideb * -6];
+            p4 = dst[strideb * -5];
+            q4 = dst[strideb * +4];
+            q5 = dst[strideb * +5];
+            q6 = dst[strideb * +6];
+
+            flat8out = abs(p6 - p0) <= F && abs(p5 - p0) <= F &&
+                       abs(p4 - p0) <= F && abs(q4 - q0) <= F &&
+                       abs(q5 - q0) <= F && abs(q6 - q0) <= F;
+        }
+
+        if (wd >= 6)
+            flat8in = abs(p2 - p0) <= F && abs(p1 - p0) <= F &&
+                      abs(q1 - q0) <= F && abs(q2 - q0) <= F;
+
+        if (wd >= 8)
+            flat8in &= abs(p3 - p0) <= F && abs(q3 - q0) <= F;
+
+        if (wd >= 16 && (flat8out & flat8in)) {
+            dst[strideb * -6] = (p6 + p6 + p6 + p6 + p6 + p6 * 2 + p5 * 2 +
+                                 p4 * 2 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
+            dst[strideb * -5] = (p6 + p6 + p6 + p6 + p6 + p5 * 2 + p4 * 2 +
+                                 p3 * 2 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
+            dst[strideb * -4] = (p6 + p6 + p6 + p6 + p5 + p4 * 2 + p3 * 2 +
+                                 p2 * 2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
+            dst[strideb * -3] = (p6 + p6 + p6 + p5 + p4 + p3 * 2 + p2 * 2 +
+                                 p1 * 2 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
+            dst[strideb * -2] = (p6 + p6 + p5 + p4 + p3 + p2 * 2 + p1 * 2 +
+                                 p0 * 2 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
+            dst[strideb * -1] = (p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
+                                 q0 * 2 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
+            dst[strideb * +0] = (p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
+                                 q1 * 2 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
+            dst[strideb * +1] = (p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
+                                 q2 * 2 + q3 + q4 + q5 + q6 + q6 + 8) >> 4;
+            dst[strideb * +2] = (p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 +
+                                 q3 * 2 + q4 + q5 + q6 + q6 + q6 + 8) >> 4;
+            dst[strideb * +3] = (p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 +
+                                 q4 * 2 + q5 + q6 + q6 + q6 + q6 + 8) >> 4;
+            dst[strideb * +4] = (p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 +
+                                 q5 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
+            dst[strideb * +5] = (p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 +
+                                 q6 * 2 + q6 + q6 + q6 + q6 + q6 + 8) >> 4;
+        } else if (wd >= 8 && flat8in) {
+            dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
+            dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
+            dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
+            dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
+            dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
+            dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
+        } else if (wd == 6 && flat8in) {
+            dst[strideb * -2] = (p2 + 2 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3;
+            dst[strideb * -1] = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3;
+            dst[strideb * +0] = (p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3;
+            dst[strideb * +1] = (p0 + 2 * q0 + 2 * q1 + 2 * q2 + q2 + 4) >> 3;
+        } else {
+            const int hev = abs(p1 - p0) > H || abs(q1 - q0) > H;
+
+#define iclip_diff(v) iclip(v, -128 * (1 << (BITDEPTH - 8)), \
+                                128 * (1 << (BITDEPTH - 8)) - 1)
+
+            if (hev) {
+                int f = iclip_diff(p1 - q1), f1, f2;
+                f = iclip_diff(3 * (q0 - p0) + f);
+
+                f1 = imin(f + 4, (128 << (BITDEPTH - 8)) - 1) >> 3;
+                f2 = imin(f + 3, (128 << (BITDEPTH - 8)) - 1) >> 3;
+
+                dst[strideb * -1] = iclip_pixel(p0 + f2);
+                dst[strideb * +0] = iclip_pixel(q0 - f1);
+            } else {
+                int f = iclip_diff(3 * (q0 - p0)), f1, f2;
+
+                f1 = imin(f + 4, (128 << (BITDEPTH - 8)) - 1) >> 3;
+                f2 = imin(f + 3, (128 << (BITDEPTH - 8)) - 1) >> 3;
+
+                dst[strideb * -1] = iclip_pixel(p0 + f2);
+                dst[strideb * +0] = iclip_pixel(q0 - f1);
+
+                f = (f1 + 1) >> 1;
+                dst[strideb * -2] = iclip_pixel(p1 + f);
+                dst[strideb * +1] = iclip_pixel(q1 - f);
+            }
+#undef iclip_diff
+        }
+    }
+}
+
+static void loop_filter_h_sb128y_c(pixel *dst, const ptrdiff_t stride,
+                                   const uint32_t *const vmask,
+                                   const uint8_t (*l)[4], ptrdiff_t b4_stride,
+                                   const Av1FilterLUT *lut, const int h)
+{
+    const unsigned vm = (vmask[0] | vmask[1] | vmask[2]) & ((1ULL << h) - 1);
+    for (unsigned y = 1; vm & ~(y - 1);
+         y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
+    {
+        if (vm & y) {
+            const int L = l[0][0] ? l[0][0] : l[-1][0];
+            if (!L) continue;
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = (vmask[2] & y) ? 2 : !!(vmask[1] & y);
+            loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 << idx);
+        }
+    }
+}
+
+static void loop_filter_v_sb128y_c(pixel *dst, const ptrdiff_t stride,
+                                   const uint32_t *const vmask,
+                                   const uint8_t (*l)[4], ptrdiff_t b4_stride,
+                                   const Av1FilterLUT *lut, const int w)
+{
+    const unsigned vm = vmask[0] | vmask[1] | vmask[2];
+    for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
+        if (vm & x) {
+            const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
+            if (!L) continue;
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = (vmask[2] & x) ? 2 : !!(vmask[1] & x);
+            loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 << idx);
+        }
+    }
+}
+
+static void loop_filter_h_sb128uv_c(pixel *dst, const ptrdiff_t stride,
+                                    const uint32_t *const vmask,
+                                    const uint8_t (*l)[4], ptrdiff_t b4_stride,
+                                    const Av1FilterLUT *lut, const int h)
+{
+    const unsigned vm = (vmask[0] | vmask[1]) & ((1ULL << h) - 1);
+    for (unsigned y = 1; vm & ~(y - 1);
+         y <<= 1, dst += 4 * PXSTRIDE(stride), l += b4_stride)
+    {
+        if (vm & y) {
+            const int L = l[0][0] ? l[0][0] : l[-1][0];
+            if (!L) continue;
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = !!(vmask[1] & y);
+            loop_filter(dst, E, I, H, PXSTRIDE(stride), 1, 4 + 2 * idx);
+        }
+    }
+}
+
+static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
+                                    const uint32_t *const vmask,
+                                    const uint8_t (*l)[4], ptrdiff_t b4_stride,
+                                    const Av1FilterLUT *lut, const int w)
+{
+    const unsigned vm = vmask[0] | vmask[1];
+    for (unsigned x = 1; vm & ~(x - 1); x <<= 1, dst += 4, l++) {
+        if (vm & x) {
+            const int L = l[0][0] ? l[0][0] : l[-b4_stride][0];
+            if (!L) continue;
+            const int H = L >> 4;
+            const int E = lut->e[L], I = lut->i[L];
+            const int idx = !!(vmask[1] & x);
+            loop_filter(dst, E, I, H, 1, PXSTRIDE(stride), 4 + 2 * idx);
+        }
+    }
+}
+
+void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
+    c->loop_filter_sb[0][0] = loop_filter_h_sb128y_c;
+    c->loop_filter_sb[0][1] = loop_filter_v_sb128y_c;
+    c->loop_filter_sb[1][0] = loop_filter_h_sb128uv_c;
+    c->loop_filter_sb[1][1] = loop_filter_v_sb128uv_c;
+
+#if HAVE_ASM && ARCH_X86
+    bitfn(dav1d_loop_filter_dsp_init_x86)(c);
+#endif
+}
author	Marvin Scholz <epirat07@gmail.com>	2018-10-25 17:45:12 +0300
committer	Ronald S. Bultje <rsbultje@gmail.com>	2018-10-25 19:51:31 +0300
commit	46e2a2d0cc451e1d6bb929f80088f8a7b8940dd0 (patch)
tree	002462d6840acd6551bb0b6b2dd5f4416db1ab84 /src/loopfilter_tmpl.c
parent	367d785a4e70b3e43eee234b3c745b047e3fbd40 (diff)