src/common/definitions.h (github.com/marian-nmt/marian.git)

#pragma once

#include "common/logging.h"
#include "common/shape.h"
#include "common/intrusive_ptr.h"

#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

#define THREAD_GUARD(body) [&]() { body; }() // test whether THREAD_GUARD is necessary; remove if no problems occur.
#define NodeOp(op) [=]() { op; }
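
// Illustrative sketch (not from the original header; x and printOp are made-up names):
// THREAD_GUARD(body) runs `body` immediately inside a [&] lambda, while NodeOp(op)
// builds a deferred [=] lambda that can be stored and invoked later.
//
// int x = 41;
// THREAD_GUARD(x += 1);                          // executes right away, x == 42
// auto printOp = NodeOp(std::cout << x << "\n"); // captures x by value, runs later
// printOp();                                     // prints 42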

// helper macro to disable optimization (gcc only)
// To use this, just insert DONT_OPTIMIZE right before the function definition
// (e.g. where the "static" keyword would go).
#ifdef __GNUC__
#define DONT_OPTIMIZE __attribute__((optimize("O0")))
#else
#define DONT_OPTIMIZE // silently ignore on Visual Studio, where this is less of a problem
#endif
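
// Illustrative sketch (dumpValues is a hypothetical helper, not part of Marian):
// compiled at -O0, the loop below is kept as written and is easy to step through
// in a debugger.
//
// DONT_OPTIMIZE static void dumpValues(const float* data, size_t n) {
//   for(size_t i = 0; i < n; ++i)
//     std::cerr << data[i] << " ";
//   std::cerr << std::endl;
// }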

// Use these macros to enable faster floating-point math. Put them around one
// or more functions.
//
// Usage:
// MARIAN_FFAST_MATH_BEGIN
// void LayerNormalization(float *arg) { *arg += 1.0; }
// void SomethingElse() {}
// MARIAN_FFAST_MATH_END
//
// ffast-math allows the compiler to assume associative arithmetic and finite
// values.
//
// Associative arithmetic is particularly important for vectorizing, e.g., a sum:
//   for (const float f : range) sum += f;
// Without ffast-math, the sum will be done one value at a time.  On x86 it
// still uses vector math, but only uses the first slot and wastes the rest.
//
// With ffast-math, the compiler can sum in batches of 4, 8, or 16 floats.
// Also, it can run multiple adds in parallel; e.g., vaddps has latency 4 and
// throughput 0.5 on Skylake, so multiple vector adds can run at once.
//
// On average, a vectorized sum is more numerically stable because it sums in
// batches. Vectorized floats can still produce NaNs and infs (remember even
// scalar operations are implemented with vector instructions).
//
// Allowing the compiler to assume finite values means functions like isnan or
// isinf do not work as expected. Do not enable this for a function that
// depends upon fully standard float behavior.
//
// It can also change the sign of zeros.
//
// Fast math also makes results more architecture-dependent because different
// register widths mean different results. They also depend more on the compiler
// and the compiler version. For example, clang <= 10 does not support the
// float_control pragma below, so it will still be conservative.
//
// There is a more conservative option for just associativity:
// llvm introduced "#pragma clang fp reassociate" that goes inside a function.
// However, llvm <11 considers that pragma an error, so we'd need some ugly
// version test (which they don't recommend) or a compilation test. Moreover,
// it has to be inside the function to keep scope.
// gcc supports "-fassociative-math", which has to be outside a function.
// I didn't find an MSVC equivalent.
#if defined(_MSC_VER)
#define MARIAN_FFAST_MATH_BEGIN __pragma(float_control(precise, off, push))
#define MARIAN_FFAST_MATH_END __pragma(float_control(pop))
#elif defined(__clang__)
#define MARIAN_FFAST_MATH_BEGIN _Pragma("float_control(precise, off, push)")
#define MARIAN_FFAST_MATH_END _Pragma("float_control(pop)")
#elif defined(__GNUC__)
// Also available as __attribute__((optimize("-ffast-math"))) but done as pragmas for consistency
#define MARIAN_FFAST_MATH_BEGIN _Pragma("GCC push_options") _Pragma("GCC optimize(\"-ffast-math\")")
#define MARIAN_FFAST_MATH_END _Pragma("GCC pop_options")
#endif
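
// Illustrative sketch (sumAll is a hypothetical function, not part of Marian): with
// the relaxed model the compiler may re-associate the additions and sum several
// floats per instruction, as described above.
//
// MARIAN_FFAST_MATH_BEGIN
// static inline float sumAll(const float* data, size_t n) {
//   float sum = 0.f;
//   for(size_t i = 0; i < n; ++i)
//     sum += data[i]; // may be vectorized into SIMD batches under -ffast-math
//   return sum;
// }
// MARIAN_FFAST_MATH_END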

namespace marian {

// Type to be used for all index types, e.g. for integer tensors for the rows operator.
// size_t would seem to be the natural choice over uint32_t, but it usually has 8 bytes
// while uint32_t has 4 bytes. This type will often be exchanged between CPU and GPU.
// Using the smaller type minimizes bandwidth at little cost.
typedef uint32_t IndexType;
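
// Illustrative sketch (rowIndices is a made-up name): a list of row indices as it
// might be stored in an integer tensor, at 4 bytes per index instead of 8.
//
// std::vector<IndexType> rowIndices = {0, 2, 5, 5};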

// @TODO: come up with a better short name. "I..." stands for interface now. Here it stands
// for "intrusive". Not a good overlap.
template <class T>
using IPtr = IntrusivePtr<T>;

template <class T>
using UPtr = std::unique_ptr<T>;

// @TODO: come up with a better short name. "I..." stands for interface now.
template <class T>
using IWeak = T*;

template <class T>
using Ptr = std::shared_ptr<T>;

template <class T>
using Weak = std::weak_ptr<T>;

/** @brief Creates a shared_ptr of any type, passing all arguments to a matching
 * constructor */
template <class T, typename... Args>
Ptr<T> New(Args&&... args) {
  return Ptr<T>(new T(std::forward<Args>(args)...));
}

template <class T>
Ptr<T> New(Ptr<T> p) {
  return Ptr<T>(p);
}
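
// Illustrative sketch (not from the original header): New<T> forwards its arguments
// to T's constructor and returns a Ptr<T>, i.e. a std::shared_ptr<T>.
//
// Ptr<std::vector<float>> v = New<std::vector<float>>(8, 0.f); // vector of 8 zeros
// Ptr<std::vector<float>> w = New(v);                          // shares ownership with v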

/** @brief Creates an IntrusivePtr of any type, passing all arguments to a matching
 * constructor */
template <class T, typename... Args>
IPtr<T> INew(Args&&... args) {
  return IPtr<T>(new T(std::forward<Args>(args)...));
}

template <class T>
IPtr<T> INew(Ptr<T> p) {
  return IPtr<T>(p);
}

/// enum class DeviceType: defines which device is used for computation
enum class DeviceType : size_t { gpu = 0, cpu = 1 };

struct DeviceId {
  size_t no{0};
  DeviceType type{DeviceType::gpu};

  DeviceId() : no{0}, type{DeviceType::gpu} {}
  DeviceId(size_t no_, DeviceType type_) : no(no_), type(type_) {}

  std::string typeAsString() const {
    return (type == DeviceType::gpu ? "gpu" : "cpu");
  }

  operator std::string() const {
    return typeAsString() + std::to_string(no);
  }

  friend std::ostream& operator<<(std::ostream& out, DeviceId deviceId) {
    out << std::string(deviceId);
    return out;
  }

  friend bool operator==(DeviceId id1, DeviceId id2) {
    return id1.no == id2.no && id1.type == id2.type;
  }
  friend bool operator!=(DeviceId id1, DeviceId id2) { return !(id1 == id2); }
};

// predefine a couple of devices for easier manual use
const DeviceId CPU0{0, DeviceType::cpu};
const DeviceId CPU1{1, DeviceType::cpu};
const DeviceId CPU2{2, DeviceType::cpu};
const DeviceId CPU3{3, DeviceType::cpu};
const DeviceId CPU4{4, DeviceType::cpu};
const DeviceId CPU5{5, DeviceType::cpu};
const DeviceId CPU6{6, DeviceType::cpu};
const DeviceId CPU7{7, DeviceType::cpu};

const DeviceId GPU0{0, DeviceType::gpu};
const DeviceId GPU1{1, DeviceType::gpu};
const DeviceId GPU2{2, DeviceType::gpu};
const DeviceId GPU3{3, DeviceType::gpu};
const DeviceId GPU4{4, DeviceType::gpu};
const DeviceId GPU5{5, DeviceType::gpu};
const DeviceId GPU6{6, DeviceType::gpu};
const DeviceId GPU7{7, DeviceType::gpu};
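
// Illustrative sketch (not from the original header): a DeviceId converts to strings
// such as "cpu0" or "gpu3" and supports equality comparison.
//
// DeviceId device{3, DeviceType::gpu};
// std::cout << device << std::endl;         // prints "gpu3"
// bool sameAsPredefined = (device == GPU3); // true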

// There are many of these small objects, hence use IntrusivePtr
class TensorBase;
typedef IPtr<TensorBase> Tensor;

// There are many of these small objects, hence use IntrusivePtr
template <class DataType>
class Chainable;
typedef IPtr<Chainable<Tensor>> Expr;

class OptimizerBase;
typedef Ptr<OptimizerBase> OptimizerBasePtr;

class ClipperBase;
typedef Ptr<ClipperBase> ClipperBasePtr;

class RunBase;
typedef Ptr<RunBase> RunBasePtr;


const float NEMATUS_LN_EPS = 1e-5f;
}  // namespace marian