/* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

#ifndef __UTIL_HALF_H__
#define __UTIL_HALF_H__

#include "util/math.h"
#include "util/types.h"

#if !defined(__KERNEL_GPU__) && defined(__KERNEL_SSE2__)
#  include "util/simd.h"
#endif

CCL_NAMESPACE_BEGIN

/* Half Floats */

#if defined(__KERNEL_METAL__)

ccl_device_inline float half_to_float(half h_in)
{
  float f;
  union {
    half h;
    uint16_t s;
  } val;
  val.h = h_in;

  /* Move the sign bit from bit 15 to bit 31, re-bias the exponent from 15 to
   * 127 (0x1C000 = 112 << 10, added before the shift into bits 23-30), and
   * widen the 10-bit mantissa to 23 bits. Zero, denormal, NaN and inf inputs
   * are not special-cased here. */
  *((ccl_private int *)&f) = ((val.s & 0x8000) << 16) | (((val.s & 0x7c00) + 0x1C000) << 13) |
                             ((val.s & 0x03FF) << 13);

  return f;
}

#else

/* CUDA, HIP and oneAPI have their own half data types, no need to define one here. */
#  if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__) && !defined(__KERNEL_ONEAPI__)
/* Implementing this as a class rather than a typedef so that the compiler can tell it apart from
 * unsigned shorts. */
class half {
 public:
  half() : v(0)
  {
  }
  half(const unsigned short &i) : v(i)
  {
  }
  operator unsigned short()
  {
    return v;
  }
  half &operator=(const unsigned short &i)
  {
    v = i;
    return *this;
  }

 private:
  unsigned short v;
};
#  endif

struct half4 {
  half x, y, z, w;
};
#endif

/* Conversion to/from half float for image textures
 *
 * Simplified float to half for fast sampling on processors without a native
 * instruction, eliminating any NaN and inf values. */
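
/* For reference, a worked example of the fallback conversion below: 1.0f has
 * float bits 0x3F800000 (sign 0, biased exponent 127, mantissa 0) and maps to
 * half bits 0x3C00 (sign 0, biased exponent 15, mantissa 0), i.e. 1.0 in the
 * IEEE 754 binary16 layout. */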

ccl_device_inline half float_to_half_image(float f)
{
#if defined(__KERNEL_METAL__) || defined(__KERNEL_ONEAPI__)
  return half(min(f, 65504.0f));
#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
  return __float2half(min(f, 65504.0f));
#else
  const uint u = __float_as_uint(f);
  /* Sign bit, shifted to its position. */
  uint sign_bit = u & 0x80000000;
  sign_bit >>= 16;
  /* Exponent. */
  uint exponent_bits = u & 0x7f800000;
  /* Non-sign bits. */
  uint value_bits = u & 0x7fffffff;
  value_bits >>= 13;     /* Align mantissa on MSB. */
  value_bits -= 0x1c000; /* Adjust bias: 0x1c000 = (127 - 15) << 10. */
  /* Flush-to-zero: inputs below 2^-14, the smallest normal half. */
  value_bits = (exponent_bits < 0x38800000) ? 0 : value_bits;
  /* Clamp-to-max: inputs at or above 2^16, including inf and NaN, map to 65504. */
  value_bits = (exponent_bits > 0x47000000) ? 0x7bff : value_bits;
  /* Denormals-as-zero. */
  value_bits = (exponent_bits == 0 ? 0 : value_bits);
  /* Re-insert sign bit and return. */
  return (value_bits | sign_bit);
#endif
}
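
/* Note on the fallback above: NaN and infinity carry exponent bits above the
 * clamp threshold and so map to +/-65504 (0x7bff with the sign re-inserted),
 * and denormal floats flush to zero, which is the "eliminating any NaN and
 * inf values" behavior described above. */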

ccl_device_inline float half_to_float_image(half h)
{
#if defined(__KERNEL_METAL__)
  return half_to_float(h);
#elif defined(__KERNEL_ONEAPI__)
  return float(h);
#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
  return __half2float(h);
#else
  /* Inverse of the bit trick in float_to_half_image(): move the sign back to
   * bit 31 and re-bias the exponent by adding 112 (0x1C000 = 112 << 10). */
  const int x = ((h & 0x8000) << 16) | (((h & 0x7c00) + 0x1C000) << 13) | ((h & 0x03FF) << 13);
  return __int_as_float(x);
#endif
}

ccl_device_inline float4 half4_to_float4_image(const half4 h)
{
  /* Disabled because it gives different results than half_to_float_image();
   * can we modify float_to_half_image() so the conversion results are
   * identical? */
#if 0 /* defined(__KERNEL_AVX2__) */
  /* CPU: F16C (requires an AVX2 build). */
  __m128i x = _mm_castpd_si128(_mm_load_sd((const double *)&h));
  return float4(_mm_cvtph_ps(x));
#endif

  const float4 f = make_float4(half_to_float_image(h.x),
                               half_to_float_image(h.y),
                               half_to_float_image(h.z),
                               half_to_float_image(h.w));
  return f;
}

/* Conversion to half float texture for display.
 *
 * Simplified float to half for fast display texture conversion on processors
 * without a native instruction. Assumes no negative, NaN or inf values, and
 * flushes denormals to zero. */
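
/* For reference, a worked example of the fallback path below: 0.5f has float
 * bits 0x3F000000; subtracting the bias difference (112 << 23) gives
 * 0x07000000, and shifting right by 13 yields half bits 0x3800, i.e. 0.5 in
 * binary16. An out-of-range input such as 1.0e6f is clamped to 65504.0f first
 * and converts to 0x7BFF, the maximum finite half. */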

ccl_device_inline half float_to_half_display(const float f)
{
#if defined(__KERNEL_METAL__) || defined(__KERNEL_ONEAPI__)
  return half(min(f, 65504.0f));
#elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
  return __float2half(min(f, 65504.0f));
#else
  /* Clamp to [0, 65504], the maximum finite half value. */
  const int x = __float_as_int((f > 0.0f) ? ((f < 65504.0f) ? f : 65504.0f) : 0.0f);
  const int absolute = x & 0x7FFFFFFF;
  /* Re-bias the exponent from 127 to 15: adding 0xC8000000 is the same as
   * subtracting 112 << 23. */
  const int Z = absolute + 0xC8000000;
  /* Flush inputs below 2^-14, the smallest normal half, to zero. */
  const int result = (absolute < 0x38800000) ? 0 : Z;
  /* Shift into the 16-bit half layout; the sign bit is always zero here. */
  const int rshift = (result >> 13);
  return (rshift & 0x7FFF);
#endif
}

ccl_device_inline half4 float4_to_half4_display(const float4 f)
{
#ifdef __KERNEL_SSE2__
  /* CPU: SSE and AVX. */
  ssef x = min(max(load4f(f), 0.0f), 65504.0f);
#  ifdef __KERNEL_AVX2__
  ssei rpack = _mm_cvtps_ph(x, 0);
#  else
  /* Same bit trick as the scalar float_to_half_display() path, vectorized. */
  ssei absolute = cast(x) & 0x7FFFFFFF;
  ssei Z = absolute + 0xC8000000;
  ssei result = andnot(absolute < 0x38800000, Z);
  ssei rshift = (result >> 13) & 0x7FFF;
  /* Pack the four 32-bit results into the low four 16-bit lanes. */
  ssei rpack = _mm_packs_epi32(rshift, rshift);
#  endif
  half4 h;
  _mm_storel_pi((__m64 *)&h, _mm_castsi128_ps(rpack));
  return h;
#else
  /* GPU and scalar fallback. */
  const half4 h = {float_to_half_display(f.x),
                   float_to_half_display(f.y),
                   float_to_half_display(f.z),
                   float_to_half_display(f.w)};
  return h;
#endif
}

CCL_NAMESPACE_END

#endif /* __UTIL_HALF_H__ */