/*
 * Copyright 2011-2016 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#ifdef WITH_NANOVDB
#  define NANOVDB_USE_INTRINSICS
#  include <nanovdb/NanoVDB.h>
#  include <nanovdb/util/SampleFromVoxels.h>
#endif

CCL_NAMESPACE_BEGIN

/* Make template functions private so symbols don't conflict between kernels with different
 * instruction sets. */
namespace {

/* Evaluate the four uniform cubic B-spline basis weights for the fractional offset `t` and
 * store them in `u[0]`..`u[3]`. In the cubic interpolators of this file these weight the
 * previous, current, next and next-next sample respectively. The four weights sum to 1.
 *
 * The first three polynomials are written in nested (Horner) form. The trailing `(void)0`
 * makes the macro require a terminating semicolon at the point of use. */
#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
  { \
    u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \
    u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \
    u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \
    u[3] = (1.0f / 6.0f) * t * t * t; \
  } \
  (void)0

/* Split `x` into an integer part and a remainder.
 *
 * The integer part is `float_to_int(x)`, lowered by one when `x` is negative; it is written to
 * `*ix`. The return value is `x` minus that integer.
 *
 * NOTE(review): assuming `float_to_int` truncates toward zero, a negative whole number such as
 * -1.0 produces the index -2 and a remainder of 1.0 rather than -1 and 0.0 -- confirm that
 * callers are fine with this. */
ccl_device_inline float frac(float x, int *ix)
{
  int whole = float_to_int(x);
  if (x < 0.0f) {
    whole -= 1;
  }
  *ix = whole;
  return x - (float)whole;
}

template<typename T> struct TextureInterpolator {

  /* Texel conversion: each overload below turns one stored texel into a float4.
   * Single channel texels are replicated to RGB and get an alpha of 1. */

  /* float4 texels are returned as they are. */
  static ccl_always_inline float4 read(float4 r)
  {
    return r;
  }

  /* 8-bit, four channels: every channel is scaled by 1/255. */
  static ccl_always_inline float4 read(uchar4 r)
  {
    const float scale = 1.0f / 255.0f;
    return make_float4(r.x * scale, r.y * scale, r.z * scale, r.w * scale);
  }

  /* 8-bit, single channel: scaled by 1/255. */
  static ccl_always_inline float4 read(uchar r)
  {
    const float value = r * (1.0f / 255.0f);
    return make_float4(value, value, value, 1.0f);
  }

  /* float, single channel. */
  static ccl_always_inline float4 read(float r)
  {
    /* TODO(dingto): Optimize this, so interpolation
     * happens on float instead of float4 */
    return make_float4(r, r, r, 1.0f);
  }

  /* half, four channels: converted with half4_to_float4(). */
  static ccl_always_inline float4 read(half4 r)
  {
    return half4_to_float4(r);
  }

  /* half, single channel: converted with half_to_float(). */
  static ccl_always_inline float4 read(half r)
  {
    const float value = half_to_float(r);
    return make_float4(value, value, value, 1.0f);
  }

  /* 16-bit, single channel: scaled by 1/65535. */
  static ccl_always_inline float4 read(uint16_t r)
  {
    const float value = r * (1.0f / 65535.0f);
    return make_float4(value, value, value, 1.0f);
  }

  /* 16-bit, four channels: every channel is scaled by 1/65535. */
  static ccl_always_inline float4 read(ushort4 r)
  {
    const float scale = 1.0f / 65535.0f;
    return make_float4(r.x * scale, r.y * scale, r.z * scale, r.w * scale);
  }

  /* Fetch the texel at integer coordinates (x, y) of a `width` x `height` image stored row by
   * row in `data`, converted to float4.
   *
   * Coordinates outside of the image yield (0, 0, 0, 0). The filtered 2D lookups rely on this
   * for EXTENSION_CLIP, where they pass unclamped neighbor indices.
   *
   * The flattened index is computed in 64 bits: `y * width` in plain `int` overflows (which is
   * undefined behavior) once the image holds more than INT_MAX texels. */
  static ccl_always_inline float4 read(const T *data, int x, int y, int width, int height)
  {
    if (x < 0 || y < 0 || x >= width || y >= height) {
      return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    }
    const int64_t index = (int64_t)y * width + x;
    return read(data[index]);
  }

  /* Wrap `x` around so that, for a positive `width`, the result lies in [0, width).
   * Negative inputs wrap as well. */
  static ccl_always_inline int wrap_periodic(int x, int width)
  {
    const int remainder = x % width;
    /* `%` keeps the sign of the dividend, so negative remainders are moved up one period. */
    return (remainder < 0) ? remainder + width : remainder;
  }

  /* Restrict `x` to the index range of an axis with `width` entries, i.e. 0 .. width - 1. */
  static ccl_always_inline int wrap_clamp(int x, int width)
  {
    const int last = width - 1;
    return clamp(x, 0, last);
  }

  /* ********  2D interpolation ******** */

  /* Nearest-texel 2D lookup at normalized coordinates (x, y).
   *
   * Extension handling:
   * - EXTENSION_REPEAT: the texel index wraps around on both axes.
   * - EXTENSION_CLIP: coordinates outside of [0, 1] yield (0, 0, 0, 0), everything else is
   *   handled like EXTENSION_EXTEND.
   * - EXTENSION_EXTEND: the texel index is clamped to the image.
   * Any other extension value asserts and yields (0, 0, 0, 0).
   *
   * The flattened texel index is computed in 64 bits, `iy * width` in plain `int` overflows
   * (undefined behavior) for images with more than INT_MAX texels. */
  static ccl_always_inline float4 interp_closest(const TextureInfo &info, float x, float y)
  {
    const T *data = (const T *)info.data;
    const int width = info.width;
    const int height = info.height;
    int ix, iy;
    /* Only the integer texel index is of interest, the fractional part is discarded. */
    frac(x * (float)width, &ix);
    frac(y * (float)height, &iy);
    switch (info.extension) {
      case EXTENSION_REPEAT:
        ix = wrap_periodic(ix, width);
        iy = wrap_periodic(iy, height);
        break;
      case EXTENSION_CLIP:
        if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
          return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
        }
        ATTR_FALLTHROUGH;
      case EXTENSION_EXTEND:
        ix = wrap_clamp(ix, width);
        iy = wrap_clamp(iy, height);
        break;
      default:
        kernel_assert(0);
        return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    }
    const int64_t index = (int64_t)iy * width + ix;
    return read(data[index]);
  }

  /* Bilinear 2D lookup at normalized coordinates (x, y).
   *
   * The four texels around the sample position are blended with weights derived from the
   * fractional part of the texel space coordinate (after subtracting half a texel).
   * With EXTENSION_CLIP the neighbor indices are left as they are, texels outside of the image
   * then contribute (0, 0, 0, 0) through the bounds checked read(). */
  static ccl_always_inline float4 interp_linear(const TextureInfo &info, float x, float y)
  {
    const T *texels = (const T *)info.data;
    const int width = info.width;
    const int height = info.height;

    /* Base texel and blend factors. */
    int x0, y0;
    const float fx = frac(x * (float)width - 0.5f, &x0);
    const float fy = frac(y * (float)height - 0.5f, &y0);

    /* Neighbor texel on each axis. */
    int x1, y1;
    switch (info.extension) {
      case EXTENSION_REPEAT:
        x0 = wrap_periodic(x0, width);
        y0 = wrap_periodic(y0, height);
        x1 = wrap_periodic(x0 + 1, width);
        y1 = wrap_periodic(y0 + 1, height);
        break;
      case EXTENSION_CLIP:
        x1 = x0 + 1;
        y1 = y0 + 1;
        break;
      case EXTENSION_EXTEND:
        /* The neighbor is derived from the unclamped base index, so clamp the base last. */
        x1 = wrap_clamp(x0 + 1, width);
        y1 = wrap_clamp(y0 + 1, height);
        x0 = wrap_clamp(x0, width);
        y0 = wrap_clamp(y0, height);
        break;
      default:
        kernel_assert(0);
        return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    }

    const float gx = 1.0f - fx;
    const float gy = 1.0f - fy;
    return gy * gx * read(texels, x0, y0, width, height) +
           gy * fx * read(texels, x1, y0, width, height) +
           fy * gx * read(texels, x0, y1, width, height) +
           fy * fx * read(texels, x1, y1, width, height);
  }

  /* Bicubic 2D lookup at normalized coordinates (x, y).
   *
   * A 4x4 neighborhood of texels around the sample position is blended, using the weights
   * produced by SET_CUBIC_SPLINE_WEIGHTS for the fractional position on each axis.
   * Index naming: `p*` is the previous texel, `i*` the base texel, `n*` the next one and `nn*`
   * the one after that. */
  static ccl_always_inline float4 interp_cubic(const TextureInfo &info, float x, float y)
  {
    const T *data = (const T *)info.data;
    const int width = info.width;
    const int height = info.height;
    int ix, iy, nix, niy;
    /* Base texel index and fractional position, after subtracting half a texel. */
    const float tx = frac(x * (float)width - 0.5f, &ix);
    const float ty = frac(y * (float)height - 0.5f, &iy);
    int pix, piy, nnix, nniy;
    switch (info.extension) {
      case EXTENSION_REPEAT:
        /* All indices wrap around. */
        ix = wrap_periodic(ix, width);
        iy = wrap_periodic(iy, height);
        pix = wrap_periodic(ix - 1, width);
        piy = wrap_periodic(iy - 1, height);
        nix = wrap_periodic(ix + 1, width);
        niy = wrap_periodic(iy + 1, height);
        nnix = wrap_periodic(ix + 2, width);
        nniy = wrap_periodic(iy + 2, height);
        break;
      case EXTENSION_CLIP:
        /* Indices are left unclamped, the bounds checked read() yields zero for texels outside
         * of the image. */
        pix = ix - 1;
        piy = iy - 1;
        nix = ix + 1;
        niy = iy + 1;
        nnix = ix + 2;
        nniy = iy + 2;
        break;
      case EXTENSION_EXTEND:
        /* Neighbors are derived from the unclamped base index, so the base index itself is
         * clamped last. */
        pix = wrap_clamp(ix - 1, width);
        piy = wrap_clamp(iy - 1, height);
        nix = wrap_clamp(ix + 1, width);
        niy = wrap_clamp(iy + 1, height);
        nnix = wrap_clamp(ix + 2, width);
        nniy = wrap_clamp(iy + 2, height);
        ix = wrap_clamp(ix, width);
        iy = wrap_clamp(iy, height);
        break;
      default:
        kernel_assert(0);
        return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    }
    /* Column and row indices of the 4x4 neighborhood. */
    const int xc[4] = {pix, ix, nix, nnix};
    const int yc[4] = {piy, iy, niy, nniy};
    float u[4], v[4];
    /* Helper macros to keep the code at a reasonable size and let the compiler inline all
     * of the multiplications. DATA fetches one texel of the neighborhood, TERM is one row of
     * the neighborhood weighted along x by `u` and then scaled by the row weight `v[col]`.
     */
#define DATA(x, y) (read(data, xc[x], yc[y], width, height))
#define TERM(col) \
  (v[col] * \
   (u[0] * DATA(0, col) + u[1] * DATA(1, col) + u[2] * DATA(2, col) + u[3] * DATA(3, col)))

    SET_CUBIC_SPLINE_WEIGHTS(u, tx);
    SET_CUBIC_SPLINE_WEIGHTS(v, ty);

    /* Actual interpolation. */
    return TERM(0) + TERM(1) + TERM(2) + TERM(3);
#undef TERM
#undef DATA
  }

  /* 2D lookup entry point: picks the filter requested by `info.interpolation`.
   * Textures without data yield (0, 0, 0, 0). Every interpolation mode other than closest and
   * linear is handled by the cubic filter. */
  static ccl_always_inline float4 interp(const TextureInfo &info, float x, float y)
  {
    if (UNLIKELY(!info.data)) {
      return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    }
    if (info.interpolation == INTERPOLATION_CLOSEST) {
      return interp_closest(info, x, y);
    }
    if (info.interpolation == INTERPOLATION_LINEAR) {
      return interp_linear(info, x, y);
    }
    return interp_cubic(info, x, y);
  }

  /* ********  3D interpolation ******** */

  /* Nearest-voxel 3D lookup at normalized coordinates (x, y, z).
   *
   * Extension handling:
   * - EXTENSION_REPEAT: the voxel index wraps around on every axis.
   * - EXTENSION_CLIP: coordinates outside of [0, 1] yield (0, 0, 0, 0), everything else is
   *   handled like EXTENSION_EXTEND.
   * - EXTENSION_EXTEND: the voxel index is clamped to the volume.
   * Any other extension value asserts and yields (0, 0, 0, 0).
   *
   * The flattened voxel index is computed in 64 bits: `iz * width * height` in plain `int`
   * overflows (undefined behavior) for volumes with more than INT_MAX voxels. */
  static ccl_always_inline float4 interp_3d_closest(const TextureInfo &info,
                                                    float x,
                                                    float y,
                                                    float z)
  {
    int width = info.width;
    int height = info.height;
    int depth = info.depth;
    int ix, iy, iz;

    /* Only the integer voxel index is of interest, the fractional part is discarded. */
    frac(x * (float)width, &ix);
    frac(y * (float)height, &iy);
    frac(z * (float)depth, &iz);

    switch (info.extension) {
      case EXTENSION_REPEAT:
        ix = wrap_periodic(ix, width);
        iy = wrap_periodic(iy, height);
        iz = wrap_periodic(iz, depth);
        break;
      case EXTENSION_CLIP:
        if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
          return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
        }
        ATTR_FALLTHROUGH;
      case EXTENSION_EXTEND:
        ix = wrap_clamp(ix, width);
        iy = wrap_clamp(iy, height);
        iz = wrap_clamp(iz, depth);
        break;
      default:
        kernel_assert(0);
        return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    }

    const T *data = (const T *)info.data;
    const int64_t index = ix + (int64_t)iy * width + (int64_t)iz * width * height;
    return read(data[index]);
  }

  /* Trilinear 3D lookup at normalized coordinates (x, y, z).
   *
   * The eight voxels around the sample position are blended with weights derived from the
   * fractional part of the voxel space coordinate (after subtracting half a voxel).
   *
   * Extension handling:
   * - EXTENSION_REPEAT: all indices wrap around.
   * - EXTENSION_CLIP: coordinates outside of [0, 1] yield (0, 0, 0, 0), everything else is
   *   handled like EXTENSION_EXTEND.
   * - EXTENSION_EXTEND: all indices are clamped to the volume.
   * Any other extension value asserts and yields (0, 0, 0, 0).
   *
   * Row and slice offsets are computed in 64 bits: `iz * width * height` in plain `int`
   * overflows (undefined behavior) for volumes with more than INT_MAX voxels. */
  static ccl_always_inline float4 interp_3d_linear(const TextureInfo &info,
                                                   float x,
                                                   float y,
                                                   float z)
  {
    int width = info.width;
    int height = info.height;
    int depth = info.depth;
    int ix, iy, iz;
    int nix, niy, niz;

    float tx = frac(x * (float)width - 0.5f, &ix);
    float ty = frac(y * (float)height - 0.5f, &iy);
    float tz = frac(z * (float)depth - 0.5f, &iz);

    switch (info.extension) {
      case EXTENSION_REPEAT:
        ix = wrap_periodic(ix, width);
        iy = wrap_periodic(iy, height);
        iz = wrap_periodic(iz, depth);

        nix = wrap_periodic(ix + 1, width);
        niy = wrap_periodic(iy + 1, height);
        niz = wrap_periodic(iz + 1, depth);
        break;
      case EXTENSION_CLIP:
        if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
          return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
        }
        ATTR_FALLTHROUGH;
      case EXTENSION_EXTEND:
        /* Neighbors are derived from the unclamped base index, so the base index itself is
         * clamped last. */
        nix = wrap_clamp(ix + 1, width);
        niy = wrap_clamp(iy + 1, height);
        niz = wrap_clamp(iz + 1, depth);

        ix = wrap_clamp(ix, width);
        iy = wrap_clamp(iy, height);
        iz = wrap_clamp(iz, depth);
        break;
      default:
        kernel_assert(0);
        return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    }

    /* Offsets of the two rows and the two slices that take part in the blend. */
    const int64_t row = (int64_t)iy * width;
    const int64_t nrow = (int64_t)niy * width;
    const int64_t slice = (int64_t)iz * width * height;
    const int64_t nslice = (int64_t)niz * width * height;

    const T *data = (const T *)info.data;
    float4 r;

    /* Slice at iz. */
    r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) * read(data[ix + row + slice]);
    r += (1.0f - tz) * (1.0f - ty) * tx * read(data[nix + row + slice]);
    r += (1.0f - tz) * ty * (1.0f - tx) * read(data[ix + nrow + slice]);
    r += (1.0f - tz) * ty * tx * read(data[nix + nrow + slice]);

    /* Slice at niz. */
    r += tz * (1.0f - ty) * (1.0f - tx) * read(data[ix + row + nslice]);
    r += tz * (1.0f - ty) * tx * read(data[nix + row + nslice]);
    r += tz * ty * (1.0f - tx) * read(data[ix + nrow + nslice]);
    r += tz * ty * tx * read(data[nix + nrow + nslice]);

    return r;
  }

  /* Tricubic 3D lookup at normalized coordinates (x, y, z).
   *
   * A 4x4x4 neighborhood of voxels around the sample position is blended, using the weights
   * produced by SET_CUBIC_SPLINE_WEIGHTS for the fractional position on each axis.
   * Index naming: `p*` is the previous voxel, `i*` the base voxel, `n*` the next one and `nn*`
   * the one after that.
   *
   * Extension handling:
   * - EXTENSION_REPEAT: all indices wrap around.
   * - EXTENSION_CLIP: coordinates outside of [0, 1] yield (0, 0, 0, 0), everything else is
   *   handled like EXTENSION_EXTEND.
   * - EXTENSION_EXTEND: all indices are clamped to the volume.
   * Any other extension value asserts and yields (0, 0, 0, 0).
   *
   * Row and slice offsets are stored as 64 bit integers: `width * height * iz` in plain `int`
   * overflows (undefined behavior) for volumes with more than INT_MAX voxels. */

  /* TODO(sergey): For some unspeakable reason both GCC-6 and Clang-3.9 are
   * causing stack overflow issue in this function unless it is inlined.
   *
   * Only happens for AVX2 kernel and global __KERNEL_SSE__ vectorization
   * enabled.
   */
#if defined(__GNUC__) || defined(__clang__)
  static ccl_always_inline
#else
  static ccl_never_inline
#endif
      float4
      interp_3d_cubic(const TextureInfo &info, float x, float y, float z)
  {
    int width = info.width;
    int height = info.height;
    int depth = info.depth;
    int ix, iy, iz;
    int nix, niy, niz;
    /* Tricubic b-spline interpolation. */
    const float tx = frac(x * (float)width - 0.5f, &ix);
    const float ty = frac(y * (float)height - 0.5f, &iy);
    const float tz = frac(z * (float)depth - 0.5f, &iz);
    int pix, piy, piz, nnix, nniy, nniz;

    switch (info.extension) {
      case EXTENSION_REPEAT:
        ix = wrap_periodic(ix, width);
        iy = wrap_periodic(iy, height);
        iz = wrap_periodic(iz, depth);

        pix = wrap_periodic(ix - 1, width);
        piy = wrap_periodic(iy - 1, height);
        piz = wrap_periodic(iz - 1, depth);

        nix = wrap_periodic(ix + 1, width);
        niy = wrap_periodic(iy + 1, height);
        niz = wrap_periodic(iz + 1, depth);

        nnix = wrap_periodic(ix + 2, width);
        nniy = wrap_periodic(iy + 2, height);
        nniz = wrap_periodic(iz + 2, depth);
        break;
      case EXTENSION_CLIP:
        if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
          return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
        }
        ATTR_FALLTHROUGH;
      case EXTENSION_EXTEND:
        /* Neighbors are derived from the unclamped base index, so the base index itself is
         * clamped last. */
        pix = wrap_clamp(ix - 1, width);
        piy = wrap_clamp(iy - 1, height);
        piz = wrap_clamp(iz - 1, depth);

        nix = wrap_clamp(ix + 1, width);
        niy = wrap_clamp(iy + 1, height);
        niz = wrap_clamp(iz + 1, depth);

        nnix = wrap_clamp(ix + 2, width);
        nniy = wrap_clamp(iy + 2, height);
        nniz = wrap_clamp(iz + 2, depth);

        ix = wrap_clamp(ix, width);
        iy = wrap_clamp(iy, height);
        iz = wrap_clamp(iz, depth);
        break;
      default:
        kernel_assert(0);
        return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    }

    /* Per axis offsets into the flattened voxel array: x indices as they are, y indices scaled
     * by the row length and z indices scaled by the slice size. */
    const int64_t row_stride = width;
    const int64_t slice_stride = (int64_t)width * height;
    const int xc[4] = {pix, ix, nix, nnix};
    const int64_t yc[4] = {row_stride * piy, row_stride * iy, row_stride * niy, row_stride * nniy};
    const int64_t zc[4] = {
        slice_stride * piz, slice_stride * iz, slice_stride * niz, slice_stride * nniz};
    float u[4], v[4], w[4];

    /* Helper macros to keep the code at a reasonable size and let the compiler inline all
     * of the multiplications. DATA fetches one voxel of the neighborhood, COL_TERM is one row
     * weighted along x and scaled by its y weight, ROW_TERM is one slice scaled by its z weight.
     */
#define DATA(x, y, z) (read(data[xc[x] + yc[y] + zc[z]]))
#define COL_TERM(col, row) \
  (v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \
             u[3] * DATA(3, col, row)))
#define ROW_TERM(row) \
  (w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row)))

    SET_CUBIC_SPLINE_WEIGHTS(u, tx);
    SET_CUBIC_SPLINE_WEIGHTS(v, ty);
    SET_CUBIC_SPLINE_WEIGHTS(w, tz);

    /* Actual interpolation. */
    const T *data = (const T *)info.data;
    return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);

#undef COL_TERM
#undef ROW_TERM
#undef DATA
  }

  /* 3D lookup entry point.
   *
   * `interp` selects the filter; passing INTERPOLATION_NONE falls back to the interpolation
   * stored in `info`. Every mode other than closest and linear is handled by the cubic filter.
   * Textures without data yield (0, 0, 0, 0). */
  static ccl_always_inline float4
  interp_3d(const TextureInfo &info, float x, float y, float z, InterpolationType interp)
  {
    if (UNLIKELY(!info.data)) {
      return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    }

    switch ((interp != INTERPOLATION_NONE) ? interp : info.interpolation) {
      case INTERPOLATION_LINEAR:
        return interp_3d_linear(info, x, y, z);
      case INTERPOLATION_CLOSEST:
        return interp_3d_closest(info, x, y, z);
      default:
        return interp_3d_cubic(info, x, y, z);
    }
  }
};

#ifdef WITH_NANOVDB
template<typename T> struct NanoVDBInterpolator {

  typedef typename nanovdb::NanoGrid<T>::AccessorType AccessorType;

  /* Grid value conversion: both overloads return a float4 with an alpha of 1. */

  /* Scalar grids: the value is replicated to RGB. */
  static ccl_always_inline float4 read(float r)
  {
    const float value = r;
    return make_float4(value, value, value, 1.0f);
  }

  /* Vector grids: the three components map to RGB. */
  static ccl_always_inline float4 read(nanovdb::Vec3f r)
  {
    const float red = r[0];
    const float green = r[1];
    const float blue = r[2];
    return make_float4(red, green, blue, 1.0f);
  }

  /* Order 0 lookup: samples the grid through nanovdb::SampleFromVoxels at (x, y, z). */
  static ccl_always_inline float4 interp_3d_closest(const AccessorType &acc,
                                                    float x,
                                                    float y,
                                                    float z)
  {
    typedef nanovdb::SampleFromVoxels<AccessorType, 0, false> ClosestSampler;
    const nanovdb::Vec3f position(x, y, z);
    return read(ClosestSampler(acc)(position));
  }

  /* Order 1 lookup: samples the grid through nanovdb::SampleFromVoxels, with the position
   * moved back by half a voxel on every axis. */
  static ccl_always_inline float4 interp_3d_linear(const AccessorType &acc,
                                                   float x,
                                                   float y,
                                                   float z)
  {
    typedef nanovdb::SampleFromVoxels<AccessorType, 1, false> LinearSampler;
    const nanovdb::Vec3f position(x - 0.5f, y - 0.5f, z - 0.5f);
    return read(LinearSampler(acc)(position));
  }

#  if defined(__GNUC__) || defined(__clang__)
  static ccl_always_inline
#  else
  static ccl_never_inline
#  endif
      float4
      interp_3d_cubic(const AccessorType &acc, float x, float y, float z)
  {
    /* Tricubic lookup on a NanoVDB grid: blends the 4x4x4 neighborhood of voxels around
     * (x, y, z), fetched one by one through the accessor, with the weights produced by
     * SET_CUBIC_SPLINE_WEIGHTS. The coordinates are used as they are (minus half a voxel),
     * no extension mode is applied to the neighbor indices.
     * Index naming: `p*` is the previous voxel, `i*` the base voxel, `n*` the next one and
     * `nn*` the one after that. */
    int ix, iy, iz;
    int nix, niy, niz;
    int pix, piy, piz;
    int nnix, nniy, nniz;
    /* Tricubic b-spline interpolation. */
    const float tx = frac(x - 0.5f, &ix);
    const float ty = frac(y - 0.5f, &iy);
    const float tz = frac(z - 0.5f, &iz);
    pix = ix - 1;
    piy = iy - 1;
    piz = iz - 1;
    nix = ix + 1;
    niy = iy + 1;
    niz = iz + 1;
    nnix = ix + 2;
    nniy = iy + 2;
    nniz = iz + 2;

    /* Voxel coordinates of the neighborhood along each axis. */
    const int xc[4] = {pix, ix, nix, nnix};
    const int yc[4] = {piy, iy, niy, nniy};
    const int zc[4] = {piz, iz, niz, nniz};
    float u[4], v[4], w[4];

    /* Helper macros to keep the code at a reasonable size and let the compiler inline all
     * of the multiplications. DATA fetches one voxel of the neighborhood, COL_TERM is one row
     * weighted along x and scaled by its y weight, ROW_TERM is one slice scaled by its z weight.
     */
#  define DATA(x, y, z) (read(acc.getValue(nanovdb::Coord(xc[x], yc[y], zc[z]))))
#  define COL_TERM(col, row) \
    (v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \
               u[3] * DATA(3, col, row)))
#  define ROW_TERM(row) \
    (w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row)))

    SET_CUBIC_SPLINE_WEIGHTS(u, tx);
    SET_CUBIC_SPLINE_WEIGHTS(v, ty);
    SET_CUBIC_SPLINE_WEIGHTS(w, tz);

    /* Actual interpolation. */
    return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);

#  undef COL_TERM
#  undef ROW_TERM
#  undef DATA
  }

  /* 3D lookup entry point for NanoVDB grids.
   *
   * `info.data` is interpreted as a pointer to a `nanovdb::NanoGrid<T>`, all lookups go through
   * an accessor obtained from that grid. `interp` selects the filter; passing
   * INTERPOLATION_NONE falls back to the interpolation stored in `info`. Every mode other than
   * closest and linear is handled by the cubic filter.
   *
   * Textures without data yield (0, 0, 0, 0), matching TextureInterpolator::interp_3d, instead
   * of dereferencing a null grid pointer. */
  static ccl_always_inline float4
  interp_3d(const TextureInfo &info, float x, float y, float z, InterpolationType interp)
  {
    if (UNLIKELY(!info.data)) {
      return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
    }

    using namespace nanovdb;

    NanoGrid<T> *const grid = (NanoGrid<T> *)info.data;
    AccessorType acc = grid->getAccessor();

    switch ((interp == INTERPOLATION_NONE) ? info.interpolation : interp) {
      case INTERPOLATION_CLOSEST:
        return interp_3d_closest(acc, x, y, z);
      case INTERPOLATION_LINEAR:
        return interp_3d_linear(acc, x, y, z);
      default:
        return interp_3d_cubic(acc, x, y, z);
    }
  }
};
#endif

#undef SET_CUBIC_SPLINE_WEIGHTS

/* Sample the 2D image texture `id` at normalized coordinates (x, y).
 *
 * The texture description is fetched from `__texture_info` and the lookup is forwarded to the
 * TextureInterpolator instantiation that matches the stored texel type. An unknown texel type
 * asserts and yields the TEX_IMAGE_MISSING_* color. */
ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y)
{
  const TextureInfo &info = kernel_tex_fetch(__texture_info, id);

  switch (info.data_type) {
    /* Single channel texels. */
    case IMAGE_DATA_TYPE_BYTE:
      return TextureInterpolator<uchar>::interp(info, x, y);
    case IMAGE_DATA_TYPE_USHORT:
      return TextureInterpolator<uint16_t>::interp(info, x, y);
    case IMAGE_DATA_TYPE_HALF:
      return TextureInterpolator<half>::interp(info, x, y);
    case IMAGE_DATA_TYPE_FLOAT:
      return TextureInterpolator<float>::interp(info, x, y);
    /* Four channel texels. */
    case IMAGE_DATA_TYPE_BYTE4:
      return TextureInterpolator<uchar4>::interp(info, x, y);
    case IMAGE_DATA_TYPE_USHORT4:
      return TextureInterpolator<ushort4>::interp(info, x, y);
    case IMAGE_DATA_TYPE_HALF4:
      return TextureInterpolator<half4>::interp(info, x, y);
    case IMAGE_DATA_TYPE_FLOAT4:
      return TextureInterpolator<float4>::interp(info, x, y);
    default:
      break;
  }

  /* Texel type without a 2D interpolator. */
  assert(0);
  return make_float4(
      TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A);
}

/* Sample the 3D image texture `id` at position `P`.
 *
 * The texture description is fetched from `__texture_info`. When it has `use_transform_3d`
 * set, `P` is first transformed by `transform_3d`. The lookup is then forwarded to the
 * interpolator that matches the stored data type (NanoVDB grids only when built with
 * WITH_NANOVDB). `interp` is passed through to the interpolator. An unknown data type asserts
 * and yields the TEX_IMAGE_MISSING_* color. */
ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg,
                                             int id,
                                             float3 P,
                                             InterpolationType interp)
{
  const TextureInfo &info = kernel_tex_fetch(__texture_info, id);

  /* Position in the space expected by the interpolators. */
  const float3 pos = info.use_transform_3d ? transform_point(&info.transform_3d, P) : P;

  switch (info.data_type) {
    /* Single channel texels. */
    case IMAGE_DATA_TYPE_BYTE:
      return TextureInterpolator<uchar>::interp_3d(info, pos.x, pos.y, pos.z, interp);
    case IMAGE_DATA_TYPE_USHORT:
      return TextureInterpolator<uint16_t>::interp_3d(info, pos.x, pos.y, pos.z, interp);
    case IMAGE_DATA_TYPE_HALF:
      return TextureInterpolator<half>::interp_3d(info, pos.x, pos.y, pos.z, interp);
    case IMAGE_DATA_TYPE_FLOAT:
      return TextureInterpolator<float>::interp_3d(info, pos.x, pos.y, pos.z, interp);
    /* Four channel texels. */
    case IMAGE_DATA_TYPE_BYTE4:
      return TextureInterpolator<uchar4>::interp_3d(info, pos.x, pos.y, pos.z, interp);
    case IMAGE_DATA_TYPE_USHORT4:
      return TextureInterpolator<ushort4>::interp_3d(info, pos.x, pos.y, pos.z, interp);
    case IMAGE_DATA_TYPE_HALF4:
      return TextureInterpolator<half4>::interp_3d(info, pos.x, pos.y, pos.z, interp);
    case IMAGE_DATA_TYPE_FLOAT4:
      return TextureInterpolator<float4>::interp_3d(info, pos.x, pos.y, pos.z, interp);
#ifdef WITH_NANOVDB
    /* NanoVDB grids. */
    case IMAGE_DATA_TYPE_NANOVDB_FLOAT:
      return NanoVDBInterpolator<float>::interp_3d(info, pos.x, pos.y, pos.z, interp);
    case IMAGE_DATA_TYPE_NANOVDB_FLOAT3:
      return NanoVDBInterpolator<nanovdb::Vec3f>::interp_3d(info, pos.x, pos.y, pos.z, interp);
#endif
    default:
      break;
  }

  /* Data type without a 3D interpolator. */
  assert(0);
  return make_float4(
      TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A);
}

} /* Namespace. */

CCL_NAMESPACE_END