github.com/marian-nmt/marian.git
author    Qianqian Zhu <qianqian.zhu@hotmail.com>    2021-02-28 11:07:19 +0300
committer GitHub <noreply@github.com>                2021-02-28 11:07:19 +0300
commit    2a9c0bb3773c32c20fce74a8b0f7149478ebd8cb (patch)
tree      cf0af697c565ae9a1fb6ff09fcc2ce466ef6dcc2 /src
parent    f88ded2ba8fc9c452717674542fcaef231b3f3e9 (diff)
Add graph documentations (#788)
* add API docs for expression_graph.h
* change API docs to doxygen-readable format
* add API docs for node_initializers
* update doxygen configure file
* add hyperlinks and remove layers section from graph documentation
* fixing typos and links on graph doc
Diffstat (limited to 'src')
-rwxr-xr-x  src/common/definitions.h        |   1
-rw-r--r--  src/common/shape.h              |   8
-rw-r--r--  src/common/types.h              |  61
-rw-r--r--  src/graph/expression_graph.cpp  |  16
-rw-r--r--  src/graph/expression_graph.h    | 336
-rwxr-xr-x  src/graph/node.cpp              |  11
-rw-r--r--  src/graph/node.h                |  20
-rwxr-xr-x  src/graph/node_initializers.cpp |  11
-rwxr-xr-x  src/graph/node_initializers.h   | 206
-rw-r--r--  src/graph/node_operators.h      |  16
-rwxr-xr-x  src/tensors/tensor.h            |   6
11 files changed, 535 insertions, 157 deletions
diff --git a/src/common/definitions.h b/src/common/definitions.h
index 0bc14ba1..d2cf8aa4 100755
--- a/src/common/definitions.h
+++ b/src/common/definitions.h
@@ -127,6 +127,7 @@ IPtr<T> INew(Ptr<T> p) {
return IPtr<T>(p);
}
+/// enum class DeviceType: defines which device is used for computation
enum class DeviceType : size_t { gpu = 0, cpu = 1 };
struct DeviceId {
diff --git a/src/common/shape.h b/src/common/shape.h
index 0b633e4d..7b1841e5 100644
--- a/src/common/shape.h
+++ b/src/common/shape.h
@@ -28,6 +28,14 @@ struct Slice // Python-like slice/index descriptor
};
typedef std::vector<Slice> Slices;
+/**
+ * Shape class mainly defines the shape or dimensionality of the node.
+ * Basically, Shape is a wrapper around a std::vector. Its size is the number
+ * of dimensions. E.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3.
+ * When an index is negative, the real index is size() + index.
+ * It implements the most common functions demanded by operations, e.g., resize(),
+ * slice(), and broadcast().
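+ *
+ * A minimal usage sketch (the dimensions are illustrative):
+ * @code
+ * Shape shape = {2, 3};   // 2D matrix with dim[0]=2 and dim[1]=3
+ * int rows = shape[0];    // 2
+ * int cols = shape[-1];   // 3; a negative index is resolved as size() + index
+ * @endcode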
+ */
struct Shape {
private:
std::vector<int> shape_;
diff --git a/src/common/types.h b/src/common/types.h
index 0f70bb22..9ce57527 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -131,7 +131,7 @@ do { \
default: ABORT("Unknown type {}", type); \
} \
} while(0)
-
+/// namespace marian
namespace marian {
// small struct to enable templating based on types use for packing
@@ -247,36 +247,37 @@ constexpr inline size_t operator+(size_t val, TypeClass typeClass) {
}
// @TODO: rename to ElementType when things become stable, so it's easier to review
+/// enum class Type: stores all supported data types in Marian
enum class Type : size_t {
- int8 = TypeClass::signed_type + 1u,
- int16 = TypeClass::signed_type + 2u,
- int32 = TypeClass::signed_type + 4u,
- int64 = TypeClass::signed_type + 8u,
-
- uint8 = TypeClass::unsigned_type + 1u,
- uint16 = TypeClass::unsigned_type + 2u,
- uint32 = TypeClass::unsigned_type + 4u,
- uint64 = TypeClass::unsigned_type + 8u,
-
- float16 = TypeClass::float_type + 2u,
- float32 = TypeClass::float_type + 4u,
- float64 = TypeClass::float_type + 8u,
-
- packed16 = TypeClass::packed_type + 2u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint16) is meaningless.
- packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, // special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless.
- packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, // special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless.
-
- intgemm8 = TypeClass::intgemm_type + 1u, // Int8 quantized (not packed) matrices for intgemm
- intgemm16 = TypeClass::intgemm_type + 2u, // Int16 quantized (not packed) matrices for intgemm
-
- intgemm8ssse3 = TypeClass::intgemm_type + 1u + TypeClass::ssse3_type, // Int8 quantized and packed (ssse3) matrices for intgemm
- intgemm8avx2 = TypeClass::intgemm_type + 1u + TypeClass::avx2_type, // Int8 quantized and packed (avx2) matrices for intgemm
- intgemm8avx512 = TypeClass::intgemm_type + 1u + TypeClass::avx512_type, // Int8 quantized and packed (avx512) matrices for intgemm
- intgemm8avx512vnni = TypeClass::intgemm_type + 1u + TypeClass::avx512_type + 4096u, // Int8 quantized and packed (avx512) matrices for intgemm. VNNI algorithm
-
- intgemm16sse2 = TypeClass::intgemm_type + 2u + TypeClass::sse2_type, // Int16 quantized and packed (sse2) matrices for intgemm
- intgemm16avx2 = TypeClass::intgemm_type + 2u + TypeClass::avx2_type, // Int16 quantized and packed (avx2) matrices for intgemm
- intgemm16avx512 = TypeClass::intgemm_type + 2u + TypeClass::avx512_type, // Int16 quantized and packed (avx512) matrices for intgemm
+ int8 = TypeClass::signed_type + 1u, ///< int8 type
+ int16 = TypeClass::signed_type + 2u, ///< int16 type
+ int32 = TypeClass::signed_type + 4u, ///< int32 type
+ int64 = TypeClass::signed_type + 8u, ///< int64 type
+
+ uint8 = TypeClass::unsigned_type + 1u, ///< uint8 type
+ uint16 = TypeClass::unsigned_type + 2u, ///< uint16 type
+ uint32 = TypeClass::unsigned_type + 4u, ///< uint32 type
+ uint64 = TypeClass::unsigned_type + 8u, ///< uint64 type
+
+ float16 = TypeClass::float_type + 2u, ///< float16 type
+ float32 = TypeClass::float_type + 4u, ///< float32 type
+ float64 = TypeClass::float_type + 8u, ///< float64 type
+
+ packed16 = TypeClass::packed_type + 2u, ///< special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint16) is meaningless.
+ packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, ///< special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
+ packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, ///< special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
+
+ intgemm8 = TypeClass::intgemm_type + 1u, ///< Int8 quantized (not packed) matrices for intgemm
+ intgemm16 = TypeClass::intgemm_type + 2u, ///< Int16 quantized (not packed) matrices for intgemm
+
+ intgemm8ssse3 = TypeClass::intgemm_type + 1u + TypeClass::ssse3_type, ///< Int8 quantized and packed (ssse3) matrices for intgemm
+ intgemm8avx2 = TypeClass::intgemm_type + 1u + TypeClass::avx2_type, ///< Int8 quantized and packed (avx2) matrices for intgemm
+ intgemm8avx512 = TypeClass::intgemm_type + 1u + TypeClass::avx512_type, ///< Int8 quantized and packed (avx512) matrices for intgemm
+ intgemm8avx512vnni = TypeClass::intgemm_type + 1u + TypeClass::avx512_type + 4096u, ///< Int8 quantized and packed (avx512) matrices for intgemm. VNNI algorithm
+
+ intgemm16sse2 = TypeClass::intgemm_type + 2u + TypeClass::sse2_type, ///< Int16 quantized and packed (sse2) matrices for intgemm
+ intgemm16avx2 = TypeClass::intgemm_type + 2u + TypeClass::avx2_type, ///< Int16 quantized and packed (avx2) matrices for intgemm
+ intgemm16avx512 = TypeClass::intgemm_type + 2u + TypeClass::avx512_type, ///< Int16 quantized and packed (avx512) matrices for intgemm
};
static inline size_t operator&(TypeClass typeClass, Type type) {
diff --git a/src/graph/expression_graph.cpp b/src/graph/expression_graph.cpp
index 7d42e811..827fb3ed 100644
--- a/src/graph/expression_graph.cpp
+++ b/src/graph/expression_graph.cpp
@@ -30,7 +30,7 @@ Expr ExpressionGraph::add(Expr node) {
} else {
node->setId(count_++);
- // record in foward graph
+ // record in forward graph
nodesForward_.push_back(node);
// record in backward graph if training, and keep track of roots
@@ -143,6 +143,11 @@ void ExpressionGraph::forward(std::list<Expr>& forwardTape, bool finalPass) {
if(inferenceOnly_)
v->children().clear();
+ // If checkpointing is disabled, keep the memory for forward signals for all nodes.
+ // If checkpointing is enabled:
+ // (a) In the forward pass before the backward pass, free the memory for the nodes in the subtape to save memory.
+ // (b) In the forward calls during the backward pass, keep the memory in the current subtape to accelerate
+ // gradient computation.
if(checkpointing_ && !finalPass) {
auto subtape = v->getSubtape();
if(subtape) {
@@ -171,12 +176,14 @@ void ExpressionGraph::backward(bool reset, float clipValue) {
ABORT("Aborting");
}
+ // allocates memory and initialises gradients for parameters
for(auto kvParams : paramsByElementType_) {
kvParams.second->allocateBackward();
if(reset)
kvParams.second->set_zero_adjoint();
}
+ // for top nodes: allocates memory and initialises gradients to 1
for(auto&& v : topNodes_)
v->init_dependent();
@@ -186,13 +193,16 @@ void ExpressionGraph::backward(bool reset, float clipValue) {
bool firstNaN = true;
while(!nodesBackward_.empty()) {
- auto v = nodesBackward_.back();
- nodesBackward_.pop_back();
+ auto v = nodesBackward_.back(); // return the last element
+ nodesBackward_.pop_back(); // remove the last element
+ // for non-top nodes: allocates memory and initialises gradients to 0
for(auto&& child : v->children())
if(child->trainable() && child->type() != "param")
child->set_zero_adjoint();
+ // if using gradient checkpointing,
+ // recompute the forward pass from checkpoint to the root
if(checkpointing_ && v->getSubtape()) {
forward(*v->getSubtape(), /*finalPass=*/true);
}
diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h
index b4f0c1e2..f03f189d 100644
--- a/src/graph/expression_graph.h
+++ b/src/graph/expression_graph.h
@@ -16,9 +16,18 @@
namespace marian {
+/**
+ * Create an expression node of any type, and pass all
+ * arguments to any available constructor.
+ * E.g., to create a ConstantNode, use `Expression<ConstantNode>(...)`.
+ */
template <class T, typename... Args>
Expr Expression(Args&&... args);
+/**
+ * The whole tensor set in the graph.
+ * Holds all tensor objects (memory and nodes) for a graph.
+ */
class Tensors {
private:
Ptr<TensorAllocator> tensors_;
@@ -27,8 +36,8 @@ private:
typedef std::unordered_map<size_t, std::vector<WExpr>> WeakMemory;
typedef std::unordered_map<size_t, std::vector<Expr>> Memory;
- Ptr<WeakMemory> shortterm_;
- Ptr<Memory> longterm_;
+ Ptr<WeakMemory> shortterm_; // holds all nodes for a graph
+ Ptr<Memory> longterm_; // holds memoized nodes
public:
Tensors(Ptr<Backend> backend)
@@ -112,97 +121,145 @@ public:
typedef std::map<Type, Ptr<Parameters>> ElementTypeParamsMap; // keep it sorted, hence map not unordered map
+/**
+ * Main implementation of a computation graph.
+ * Keeps a record of data (tensors) and all operations. Each operation in a computation graph is a Node.
+ * Each Node defines its forward and backward steps.
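+ *
+ * A minimal usage sketch (illustrative only; dimensions, names and values are made up,
+ * and further operators are declared elsewhere in Marian):
+ * @code
+ * auto graph = New<ExpressionGraph>();
+ * graph->setDevice({0, DeviceType::cpu});   // device no. 0 on the CPU backend
+ * graph->reserveWorkspaceMB(128);           // workspace for forward/backward tensors
+ *
+ * auto W = graph->param("W", {2, 3}, inits::glorotUniform());
+ * auto x = graph->constant({3, 1}, inits::fromVector(std::vector<float>({1.f, 2.f, 3.f})));
+ * // ... connect W and x with operators to form the actual model ...
+ *
+ * graph->forward();    // run the forward pass
+ * graph->backward();   // run the backward pass (training only)
+ * @endcode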
+ */
class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
- size_t count_{0};
+ size_t count_{0}; // counter for nodes in the graph; holds the current node index
- std::unordered_set<Expr> topNodes_; // current set of roots. In the end, all but one must have been consumed.
+ std::unordered_set<Expr> topNodes_; // current set of roots. In the end, all but one must have been consumed
protected: // (these are protected, not private, for ONNX exporting)
- std::list<Expr> nodesForward_;
- std::list<Expr> nodesBackward_;
+ std::list<Expr> nodesForward_; ///< contains all nodes used for forward()
+ std::list<Expr> nodesBackward_; ///< contains trainable nodes used for backward()
- // Holds memory and expressions that correspond to temporary expressions.
- // This gets cleared before a new graph is built.
+ /**
+ * A shared pointer to the tensor objects in the graph.
+ * Holds memory and nodes that correspond to tensors in a graph.
+ * Since operations will result in new tensors, this attribute is used
+ * to allocate memory for new tensors during forward() and backward().
+ * This gets cleared before a new graph is built.
+ */
Ptr<Tensors> tensors_;
private:
std::unordered_map<size_t, std::vector<Expr>> memoized_;
- Type defaultElementType_{Type::float32}; // Type used for storing parameters, currently all parameters have to have the same type
+ Type defaultElementType_{Type::float32}; // Type used for storing parameters, currently all parameters have to have the same type
- bool inferenceOnly_{false};
+ bool inferenceOnly_{false}; // a flag indicating whether the graph is used for inference only
- bool checkpointing_{false}; // use gradient checkpointing if true
+ bool checkpointing_{false}; // use gradient checkpointing if true
- bool reloaded_{false};
+ bool reloaded_{false}; // a flag indicating whether the graph is reloaded: true if the graph has loaded parameters via the load() function
- bool throwNaN_{false};
+ bool throwNaN_{false}; // a flag indicating whether the graph throws a NaN exception
protected:
// Delete, copy and move constructors
ExpressionGraph(const ExpressionGraph&) = delete;
ExpressionGraph(ExpressionGraph&&) = delete;
- // Holds memory and expressions that correspond to graph parameters
- // Now we can have multiple types of parameters in a separate parameters object per value type.
- // This is currently only accessible through private functions during loading, will abort during training
- // when params() is called (e.g. optimizer) and there is more or other types than the default parameter type.
- // Currently the only usecase is inference. Trying to access params() for non-default parameter type is going
- // to abort. Inference does not need to access a whole set of parameters.
+ /**
+ * A map that holds memory and nodes corresponding to graph parameters.
+ * The key is the element Type and the mapped value is the set of parameter objects of that type.
+ * Now we can have multiple types of parameters in a separate parameters object per value type.
+ * This is currently only accessible through private functions during loading, and will abort during training
+ * when params() is called (e.g. by the optimizer) and there are more or other types than the default parameter type.
+ * Currently the only use case is inference. Trying to access params() for a non-default parameter type is going
+ * to abort. Inference does not need to access a whole set of parameters.
+ */
ElementTypeParamsMap paramsByElementType_;
- Ptr<Backend> backend_;
-
- std::string namespace_;
+ Ptr<Backend> backend_; ///< a shared pointer to the backend for the graph
+ std::string namespace_; ///< the namespace of the graph. Each graph has its own unique namespace.
public:
- /** @brief Constructs a new expression graph
- *
- * Constructor should be used as New<ExpressionGraph>()
- */
+ /** Constructs a new expression graph. Constructor should be used as New<ExpressionGraph>(). */
ExpressionGraph(bool inference = false);
+ /** Destructor. Clear everything related to the graph except memoized nodes. */
virtual ~ExpressionGraph() {
clear();
for(auto kvParams : paramsByElementType_)
kvParams.second->clear();
}
+ /**
+ * Set device options used to run the graph.
+ * @param deviceId a struct type which stores device no. (size_t)
+ * and device type (DeviceType::cpu or DeviceType::gpu)
+ * @param device a pointer to the device
+ */
virtual void setDevice(DeviceId deviceId = {0, DeviceType::gpu},
Ptr<Device> device = nullptr);
+ /**
+ * Get device info for the graph.
+ * @return deviceId a struct type which stores device no. (size_t)
+ * and device type (DeviceType::cpu or DeviceType::gpu)
+ */
DeviceId getDeviceId() { return backend_->getDeviceId(); }
+ /**
+ * Get backend pointer for the graph.
+ * @return Ptr<Backend> pointer to backend
+ */
Ptr<Backend> getBackend() { return backend_; }
+ /** Set whether the graph is used for inference only */
void setInference(bool inference) { inferenceOnly_ = inference; }
+
+ /** Check whether the graph is used for inference only (true) or not */
bool isInference() { return inferenceOnly_; }
+ /**
+ * Set whether the graph uses gradient checkpointing.
+ * <a href="https://github.com/cybertronai/gradient-checkpointing">Gradient Checkpointing</a>
+ * works by trading compute for memory: it reruns a forward-pass segment for each checkpointed segment during the backward pass.
+ */
void setCheckpointing(bool checkpointing) { checkpointing_ = checkpointing; }
+
+ /** Check whether the graph uses gradient checkpointing or not */
bool isCheckpointing() { return checkpointing_; }
+ /**
+ * Set namespace (std::string) for the graph.
+ * Each graph has its own unique namespace, which is used to form the name of a parameter object.
+ */
void switchParams(const std::string& newNamespace) {
namespace_ = newNamespace;
}
+ /**
+ * Copy all parameter objects from one graph to the current graph.
+ * @param graph a pointer to a graph object
+ */
virtual void copyParams(Ptr<ExpressionGraph> graph) {
for(auto p : *graph->params())
param(p->name(), p->shape(), inits::fromTensor(p->val()), p->value_type());
- forward(); // this will allocate parameters, execute the intializers and therefore copy parameter values
+ forward(); // this will allocate parameters, execute the initializers and therefore copy parameter values
}
+ /**
+ * Preallocate workspace memory (MB) for the graph.
+ * Sets the size of the memory available for the forward and backward step of the training procedure.
+ * This does not include model size and optimizer parameters that are allocated outside the workspace.
+ */
void reserveWorkspaceMB(size_t num) {
size_t bytes = num * 1024 * 1024 - 1;
tensors_->reserve(bytes);
}
+ /** Copy tensor objects from one graph to the current graph */
void reuseWorkspace(Ptr<ExpressionGraph> graph) {
tensors_ = graph->tensors_;
}
/**
- * @brief Performs backpropogation on this expression graph.
- *
- * Backpropogation is implemented by performing first the forward pass and
+ * Performs backpropagation on this expression graph.
+ * Backpropagation is implemented by performing first the forward pass and
* then the backward pass of algorithmic differentiation (AD) on the nodes of
* the graph.
*/
@@ -211,6 +268,12 @@ public:
backward();
}
+ /**
+ * Perform one backpropagation process on the graph to test
+ * whether the graph workspace fits into a given workspace memory.
+ * This function is used for searching for the maximum batch size
+ * that fits into the given workspace memory.
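+ *
+ * A minimal sketch of a batch-size search (buildGraphForBatch() is a hypothetical
+ * helper that clears the graph and rebuilds it for the candidate batch size):
+ * @code
+ * size_t batchSize = 512;
+ * buildGraphForBatch(graph, batchSize);
+ * while(batchSize > 1 && !graph->fits()) {
+ *   batchSize /= 2;                        // shrink the batch until the workspace suffices
+ *   buildGraphForBatch(graph, batchSize);
+ * }
+ * @endcode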
+ */
bool fits() {
try {
tensors_->throwAtReallocation(true);
@@ -223,19 +286,50 @@ public:
return true;
}
+ /**
+ * Check whether the memory allocated for a tensor object contains a NaN or infinite value.
+ * @param t a Tensor object
+ * @param isNaN a bool that holds whether the tensor contains a NaN value (passed by reference)
+ * @param isInf a bool that holds whether the tensor contains an infinite value (passed by reference)
+ */
void checkNaN(Tensor t, bool& isNaN, bool& isInf);
+ /**
+ * Perform the forward pass on the nodes of the graph.
+ * The forward pass refers to the calculation process.
+ * It traverses through all nodes from input layer to output layer.
+ */
void forward() {
for(auto kvParams : paramsByElementType_)
kvParams.second->allocateForward();
forwardNext();
}
+ /**
+ * Perform the forward pass without memory allocation for parameters.
+ * Helper function for forward().
+ */
void forwardNext();
+
+ /**
+ * Perform the forward pass on the given nodes with a finalPass flag.
+ * Helper function for forward() and backward().
+ * @param forwardTape the list of nodes used for the forward pass (passed by reference)
+ * @param finalPass a bool which controls whether nodes should be freed with gradient-checkpointing
+ */
void forward(std::list<Expr>& forwardTape, bool finalPass);
+ /**
+ * Perform the backward pass on the trainable nodes of the graph.
+ * The backward pass refers to the process of propagating the error gradients back through the graph.
+ * It traverses through all nodes from output layer to input layer.
+ */
void backward(bool reset = true, float clipValue = 0.f);
+ /**
+ * Generate graph layout in Graphviz format for visualisation.
+ * @return a string representing the graph layout in Graphviz (dot) format
+ */
std::string graphviz() {
std::stringstream ss;
ss << "digraph ExpressionGraph {" << std::endl;
@@ -253,6 +347,10 @@ public:
return ss.str();
}
+ /**
+ * Write graph layout in Graphviz format to a file.
+ * @param filename a string specifying the file to which the graph layout is written
+ */
void graphviz(const std::string& filename) {
std::ofstream dot(filename);
dot << graphviz();
@@ -345,6 +443,18 @@ private:
}
public:
+
+ /**
+ * Construct a parameter node in the graph.
+ * @param pname a string that holds the name of the parameter node
+ * @param shape a struct that defines the shape of the parameter tensor,
+ * e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
+ * @param init a pointer to a NodeInitializer object, e.g., inits::zeros()
+ * @param elementType a scoped enumerator (enum class) that defines the element type, e.g., Type::float16
+ * @param fixed a bool that specifies whether the parameter object is fixed (not trainable) or not.
+ * The default is false, which means the parameter is trainable.
+ * @return a pointer to the parameter node
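+ *
+ * A minimal usage sketch (the name and dimensions are illustrative):
+ * @code
+ * auto W = graph->param("W", {512, 512}, inits::glorotUniform(), Type::float32);
+ * auto b = graph->param("b", {1, 512}, inits::zeros());
+ * @endcode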
+ */
Expr param(const std::string& pname,
const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
@@ -354,6 +464,17 @@ public:
return param(pname, shape, init, elementType, fixed, /*typeSpecified=*/true);
}
+ /**
+ * Construct a parameter node in the graph without a specified type;
+ * the type is set to defaultElementType_.
+ * @param pname a string that holds the name of the parameter node
+ * @param shape a struct that defines the shape of the parameter tensor,
+ * e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
+ * @param init a pointer to a NodeInitializer object, e.g., inits::zeros()
+ * @param fixed a bool that specifies whether the parameter object is fixed (not trainable) or not.
+ * The default is false, which means the parameter is trainable.
+ * @return a pointer to the parameter node
+ */
Expr param(const std::string& pname,
const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
@@ -362,28 +483,59 @@ public:
return param(pname, shape, init, defaultElementType_, fixed, /*typeSpecified=*/false);
}
+ /**
+ * Construct a constant node in the graph with a specified element type.
+ * @param shape a struct that defines the shape of the constant tensor,
+ * e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
+ * @param init a pointer to a NodeInitializer object, e.g., inits::zeros()
+ * @param elementType a scoped enumerator (enum class) that defines the element type, e.g., Type::float16
+ * @return a pointer to the constant node
+ */
Expr constant(const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
Type elementType) {
return Expression<ConstantNode>(shared_from_this(), shape, init, elementType);
}
+ /**
+ * Construct a constant node in the graph without a specified type;
+ * the type is set to defaultElementType_.
+ * @param shape a struct that defines the shape of the constant tensor,
+ * e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
+ * @param init a pointer to a NodeInitializer object, e.g., inits::zeros()
+ * @return a pointer to the constant node
+ */
Expr constant(const Shape& shape,
const Ptr<inits::NodeInitializer>& init) {
return Expression<ConstantNode>(shared_from_this(), shape, init, defaultElementType_);
}
// @TODO: add version with iterators
- // shortcut to turn vector of indices to integer tensor, to be used with operators
- // like rows or select
+ /**
+ * Turn a vector of indices into an integer tensor.
+ * A shortcut version to turn a vector of indices into an integer tensor, to be used with operators
+ * like rows() or index_select().
+ * @param indicesVector a vector of IndexType (uint32_t) that specifies the indices
+ */
Expr indices(const std::vector<IndexType>& indicesVector) {
return constant({(int)indicesVector.size()},
inits::fromVector(indicesVector),
Type::uint32);
}
- // this version sets up the shape such that the indices are in a given axis
- // Use this if you want to pass these indices to gather().
- // indexee shape = (3, 2, 5, 2); axis = 1 -> resulting shape = (1, size of indicesVector, 1, 1)
+
+ /**
+ * Specify the indices of elements to be taken from a tensor.
+ * This version sets up the shape such that the indices are in a given axis.
+ * Use this if you want to pass these indices to gather().
+ * E.g., indexee shape = (3, 2, 5, 2); axis = 1 -> resulting shape = (1, size of indicesVector, 1, 1):
+ * - The size of the resulting shape is the same as that of the indexee; here it is 4.
+ * - The size of the specified axis is equal to the size of the given indicesVector.
+ * - The remaining axes are set to 1.
+ * @param indicesVector a vector of IndexType (uint32_t) that specifies the indices
+ * @param indexee the source tensor that we want to select elements from
+ * @param axis specifies the axis that we want to collect along
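+ *
+ * A minimal sketch (shapes follow the example above; indexee is assumed to be an existing
+ * node of shape (3, 2, 5, 2), and gather() is one of the graph operators declared elsewhere
+ * in Marian, assumed here as gather(a, axis, indices)):
+ * @code
+ * std::vector<IndexType> pick = {0, 1};                 // take two entries along axis 1
+ * auto idx = graph->indices(pick, indexee, /*axis=*/1); // resulting shape (1, 2, 1, 1)
+ * auto selected = gather(indexee, /*axis=*/1, idx);
+ * @endcode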
+ */
Expr indices(const std::vector<IndexType>& indicesVector, Expr indexee, int axis = -1) {
Shape shape;
shape.resize(indexee->shape().size());
@@ -393,24 +545,70 @@ public:
Type::uint32);
}
+ /**
+ * Construct a constant node filled with `1`.
+ * @param shape a struct that defines the shape of the constant tensor,
+ * e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
+ * @param elementType a scoped enumerator (enum class) that defines the element type, e.g., Type::float16
+ */
Expr ones(const Shape& shape, Type elementType) {
return constant(shape, inits::ones(), elementType);
}
+
+ /**
+ * Construct a constant node filled with `1` without a specified type,
+ * and the type is set to defaultElementType_.
+ * @param shape a struct that defines the shape of the constant tensor,
+ * e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
+ */
Expr ones(const Shape& shape) {
return constant(shape, inits::ones(), defaultElementType_);
}
+ /**
+ * Construct a constant node filled with `0`.
+ * @param shape a struct that defines the shape of the constant tensor,
+ * e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
+ * @param elementType a scoped enumerator (enum class) that defines the element type, e.g., Type::float16
+ */
Expr zeros(const Shape& shape, Type elementType) {
return constant(shape, inits::zeros(), elementType);
}
+
+ /**
+ * Construct a constant node filled with `0` without a specified type,
+ * and the type is set to defaultElementType_.
+ * @param shape a struct that defines the shape of the constant tensor,
+ * e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
+ */
Expr zeros(const Shape& shape) {
return constant(shape, inits::zeros(), defaultElementType_);
}
- // prob = dropProb, e.g. 0.1 means 90% of values are kept
+ /**
+ * Construct a dropout mask (a tensor of 0 and 1).
+ * @param dropProb a float that specifies the dropout probability.
+ * E.g., dropProb=0.1 means 90% of values are kept.
+ * @param shape a struct that defines the shape of the mask tensor,
+ * e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
+ * @param elementType a scoped enumerator (enum class) that defines the element type, e.g., Type::float16
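+ *
+ * E.g. (the shape is illustrative):
+ * @code
+ * auto mask = graph->dropoutMask(0.1f, {10, 512}, Type::float32); // ~90% of values kept
+ * @endcode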
+ */
Expr dropoutMask(float dropProb, const Shape& shape, Type elementType);
+
+ /**
+ * Construct a dropout mask (a tensor of 0 and 1) without a specified type,
+ * and the type is set to defaultElementType_.
+ * @param dropProb a float that specifies the dropout probability.
+ * E.g., dropProb=0.1 means 90% of values are kept.
+ * @param shape a struct that defines the shape of the mask tensor,
+ * e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
+ */
Expr dropoutMask(float dropProb, const Shape& shape);
+ /**
+ * Get the parameter object by name.
+ * @param name a string that specifies the name of the parameter object
+ */
Expr get(std::string name) {
if(!namespace_.empty())
name = namespace_ + "::" + name;
@@ -419,6 +617,11 @@ public:
return p;
}
+ /**
+ * Get the parameter object by name and type.
+ * @param name a string that specifies the name of the parameter object
+ * @param specifiedElementType a scoped enumerator (enum class) that defines the element type, e.g., Type::float16
+ */
Expr get(std::string name, Type specifiedElementType) {
if(!namespace_.empty())
name = namespace_ + "::" + name;
@@ -427,6 +630,10 @@ public:
return p;
}
+ /**
+ * Return the Parameters object related to the graph.
+ * The Parameters object holds the whole set of the parameter nodes.
+ */
Ptr<Parameters>& params() {
// There are no parameter objects, that's weird.
ABORT_IF(paramsByElementType_.empty(), "No parameter object has been created");
@@ -441,6 +648,10 @@ public:
return it->second;
}
+ /**
+ * Set default element type for the graph.
+ * The default type is used when a node's element type is not specified explicitly.
+ */
void setDefaultElementType(Type defaultElementType) {
ABORT_IF(!paramsByElementType_.empty() && defaultElementType != defaultElementType_,
"Parameter objects already exist, cannot change default type from {} to {}",
@@ -448,31 +659,53 @@ public:
defaultElementType_ = defaultElementType;
}
+ /**
+ * Add an expression node to the graph.
+ * @param node a pointer to an expression node
+ */
Expr add(Expr node);
+ /**
+ * Allocate memory for the forward pass of the given node.
+ * @param node a pointer to an expression node
+ */
void allocateForward(Expr node) {
if(tensors_)
tensors_->allocateForward(node);
}
+ /**
+ * Allocate memory for the backward pass of the given node.
+ * @param node a pointer to an expression node
+ */
void allocateBackward(Expr node) {
if(tensors_)
tensors_->allocateBackward(node);
}
+ /**
+ * Free the memory for a tensor object.
+ * @param tensor a reference to the tensor object
+ */
void free(const Tensor& tensor) {
if(tensors_)
tensors_->free(tensor);
}
- // Returns the memory allocator of the graph workspace, allocates row unstructured memory (but 256-byte aligned)
+ /**
+ * Returns the memory allocator of the graph workspace.
+ * Allocates raw unstructured memory (but 256-byte aligned).
+ */
Ptr<Allocator> allocator() { return tensors_->getAllocator(); } // @TODO: rename this to getAllocator();
- // Returns the tensor allocator of the graph workspace, different from above as proper tensor objects are allocated
+ /**
+ * Returns the tensor allocator of the graph workspace.
+ * Different from allocator() as proper tensor objects are allocated.
+ */
Ptr<TensorAllocator> getTensorAllocator() { return tensors_->getTensorAllocator(); }
+ /** Clear everything apart from parameters and memoized nodes */
void clear() {
- // clear everything apart from parameters and memoized nodes
count_ = 0;
nodesForward_.clear();
nodesBackward_.clear();
@@ -482,13 +715,17 @@ public:
tensors_->clear();
}
+ /** Set the flag indicating whether the graph is reloaded (true) or not */
void setReloaded(bool reloaded) { reloaded_ = reloaded; }
+ /** Set the flag indicating whether the graph throws a NaN exception (true) or not */
void setThrowNaN(bool throwNaN) { throwNaN_ = throwNaN; }
+
+ /** Get the flag indicating whether the graph throws a NaN exception (true) or not */
bool getThrowNaN() { return throwNaN_; }
public:
- // loading from array of io::Items
+ /** Load model (mainly parameter objects) from array of io::Items */
void load(std::vector<io::Item>& ioItems, bool markReloaded = true) {
setReloaded(false);
for(auto& item : ioItems) {
@@ -507,18 +744,24 @@ public:
setReloaded(true);
}
+ /** Load model by filename */
void load(const std::string& name, bool markReloaded = true) {
LOG(info, "Loading model from {}", name);
auto items = io::loadItems(name);
load(items, markReloaded);
}
+ /** Load model from a memory buffer (given a pointer to it) */
void load(const void* ptr, bool markReloaded = true) {
LOG(info, "Loading model from buffer at {}", ptr);
auto items = io::loadItems(ptr);
load(items, markReloaded);
}
+ /**
+ * Turn the model (given a pointer to a memory-mapped buffer) into a memory-mapped version
+ * by converting all the parameter objects to their memory-mapped counterpart, i.e., MappedParameters.
+ */
void mmap(const void* ptr, bool markReloaded = true) {
ABORT_IF(backend_->getDeviceId().type != DeviceType::cpu || !inferenceOnly_,
"Memory mapping only supported for CPU inference mode");
@@ -541,7 +784,6 @@ public:
}
}
-
// pre-populate parameters by type
for(auto& item : items) {
auto it1 = paramsByElementType_.find(item.type);
@@ -556,9 +798,19 @@ public:
}
public:
- // convert all parameters into an array of io::Item elements, for saving
+ /**
+ * Convert all parameters into an array of io::Item elements, for saving.
+ * @param ioItems an array of io::Item elements
+ * @param saveElementType the element type for saving
+ */
void save(std::vector<io::Item>& ioItems, Type saveElementType = Type::float32);
+ /**
+ * Save all parameters into a file (.npz or .bin).
+ * @param name a string that specifies the filename
+ * @param meta a string that specifies the name of the io::Item elements. If not specified, the parameter names are preserved.
+ * @param saveElementType the element type for saving
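+ *
+ * A minimal sketch of a save/load round trip (the filename is illustrative):
+ * @code
+ * graph->save("model.npz");   // write all parameters to disk
+ * graph->load("model.npz");   // read them back (typically into a freshly created graph)
+ * @endcode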
+ */
void save(const std::string& name, const std::string& meta = "", Type saveElementType = Type::float32) {
std::vector<io::Item> ioItems;
save(ioItems, saveElementType);
diff --git a/src/graph/node.cpp b/src/graph/node.cpp
index 256f7623..257a639f 100755
--- a/src/graph/node.cpp
+++ b/src/graph/node.cpp
@@ -27,11 +27,6 @@ void Node::free() {
}
}
-/**
- * Initialization for backward step of top node
- * in computation graph. Allocates memory and sets gradient
- * to 1 (df/df == 1).
- */
void Node::init_dependent() {
if(!adj_) {
graph()->allocateBackward(this);
@@ -39,12 +34,6 @@ void Node::init_dependent() {
}
}
-/**
- * Initialization for backward step of any non-top node
- * in computation graph. Allocates memory and sets gradient
- * to 0 for further accumulation of gradients from all
- * parents.
- */
void Node::set_zero_adjoint() {
if(!adj_) {
graph()->allocateBackward(this);
diff --git a/src/graph/node.h b/src/graph/node.h
index 9c5382d4..d7b328d6 100644
--- a/src/graph/node.h
+++ b/src/graph/node.h
@@ -28,13 +28,13 @@ protected:
std::vector<Expr> children_;
Weak<ExpressionGraph> graph_;
- Shape shape_{1, 1, 1, 1};
- Type valueType_{Type::float32};
+ Shape shape_{1, 1, 1, 1}; // defines the dimensionality of the node (for tensors)
+ Type valueType_{Type::float32}; // defines the element type of the node (for tensors)
std::string name_{"none"};
- Tensor val_{nullptr};
- Tensor adj_{nullptr};
+ Tensor val_{nullptr}; // the resulting new tensor in forward pass
+ Tensor adj_{nullptr}; // the accumulated gradients (a tensor) in backward pass
bool markedForDebug_{false};
std::string debugMessage_;
@@ -105,9 +105,19 @@ public:
virtual void free() override;
virtual void init() override {};
-
+ /**
+ * Initialization for backward step of top node
+ * in computation graph. Allocates memory and sets gradient
+ * to 1 (df/df == 1).
+ */
virtual void init_dependent() override;
+ /**
+ * Initialization for backward step of any non-top node
+ * in computation graph. Allocates memory and sets gradient
+ * to 0 for further accumulation of gradients from all
+ * parents.
+ */
virtual void set_zero_adjoint() override;
virtual Tensor& val() override { return val_; };
diff --git a/src/graph/node_initializers.cpp b/src/graph/node_initializers.cpp
index 531cfaad..c6fb622f 100755
--- a/src/graph/node_initializers.cpp
+++ b/src/graph/node_initializers.cpp
@@ -98,9 +98,10 @@ Ptr<NodeInitializer> glorotUniform(bool fanIn, bool fanOut, float scalingFactor)
return fromLambda([fanIn, fanOut, scalingFactor](Tensor t) {
float scale = sqrtf(6.0f / (t->shape()[-2] + t->shape()[-1]));
if(fanIn && !fanOut)
- scale = sqrtf(3.0f / t->shape()[-2]); // results in columns of matrix to be ~unit length
+ scale = sqrtf(3.0f / t->shape()[-2]); // fanIn mode: the scale of the tensor is adapted to the input variance
+ // results in columns of the matrix being in ~unit range
if(!fanIn && fanOut)
- scale = sqrtf(3.0f / t->shape()[-1]);
+ scale = sqrtf(3.0f / t->shape()[-1]); // fanOut mode: the scale of the tensor is adapted to the output variance
scale *= scalingFactor;
@@ -112,9 +113,9 @@ Ptr<NodeInitializer> glorotNormal(bool fanIn, bool fanOut, float scalingFactor)
return fromLambda([fanIn, fanOut, scalingFactor](Tensor t) {
float scale = sqrtf(2.0f / (t->shape()[-2] + t->shape()[-1]));
if(fanIn && !fanOut)
- scale = sqrtf(1.0f / t->shape()[-2]);
+ scale = sqrtf(1.0f / t->shape()[-2]); // fanIn mode: the scale of the tensor is adapted to the input variance
if(!fanIn && fanOut)
- scale = sqrtf(1.0f / t->shape()[-1]);
+ scale = sqrtf(1.0f / t->shape()[-1]); // fanOut mode: the scale of the tensor is adapted to the output variance
scale *= scalingFactor;
@@ -170,7 +171,7 @@ Ptr<NodeInitializer> fromWord2vec(const std::string& file,
bool normalize /*= false*/) {
return fromLambda([file, dimVoc, dimEmb, normalize](Tensor t) {
auto embs = Word2VecReader().read(file, dimVoc, dimEmb);
- if(normalize) {
+ if(normalize) { // scaling to unit length:
float norm = 0;
for(auto e : embs)
norm += e * e;
diff --git a/src/graph/node_initializers.h b/src/graph/node_initializers.h
index 9dc01a0d..7cdb4183 100755
--- a/src/graph/node_initializers.h
+++ b/src/graph/node_initializers.h
@@ -11,17 +11,18 @@
namespace marian {
class ExpressionGraph; // Forward declaration
-
+/**
+ * The inits namespace.
+ * Declares the NodeInitializer class and all the available functions to initialise a node.
+*/
namespace inits {
/**
* Base class for specialized NodeInitializers.
- *
* A NodeInitializer is a functor that is associated with parameters
- * and constants, and is invoked on a tensor during node intialization.
- * You need to override NodeIntializer::apply(Tensor) with your own
- * functionality or use a fromLambda intializer.
- *
+ * and constants, and is invoked on a tensor during node initialization.
+ * You need to override NodeInitializer::apply(Tensor) with your own
+ * functionality or use a fromLambda initializer.
* See node_initializers.cpp for examples.
*/
class NodeInitializer {
@@ -35,155 +36,242 @@ public:
};
/**
- * Use a lambda function of form [](Tensor t) { do something with t } to initalize tensor
+ * Use a lambda function of the form [](Tensor t) { do something with t } to initialize a tensor.
+ * @param func the functor applied to the tensor
*/
Ptr<NodeInitializer> fromLambda(std::function<void(Tensor)>&& func);
/**
- * Use a lambda function of form [](Tensor t) { do something with t } to initalize tensor
- * Create temporary tensor of Type intermediateType first, initialize and then copy/convert to actual Tensor
- * Useful for functions that can only operate on a specific type of tensor
+ * Use a lambda function of the form [](Tensor t) { do something with t } to initialize a tensor.
+ * Create a temporary tensor of Type intermediateType first, initialize it and then copy/convert to the actual Tensor.
+ * Useful for functions that can only operate on a specific type of tensor.
*/
Ptr<NodeInitializer> fromLambda(std::function<void(Tensor)>&& func, Type intermediateType);
/**
- * Initialize tensor with given value
- *
- * Creates a NodeInitializer that will intialize the given tensor
+ * Initialize tensor with given value.
+ * Creates a NodeInitializer that will initialize the given tensor
* with `value`. Works with any underlying numeric tensor type.
- *
* @return A NodeInitializer
*/
Ptr<NodeInitializer> fromValue(float value);
/**
- * Fill tensor with `0`
- *
- * Creates a NodeInitializer that will intialize the given tensor
+ * Fill tensor with `0`.
+ * Creates a NodeInitializer that will initialize the given tensor
* with `0`. Works with any underlying numeric tensor type.
- *
* @return A NodeInitializer
*/
static Ptr<NodeInitializer> zeros() { return fromValue(0.0f); }
/**
- * Fill tensor with `1`
- *
- * Creates a NodeInitializer that will intialize the given tensor
+ * Fill tensor with `1`.
+ * Creates a NodeInitializer that will initialize the given tensor
* with `1`. Works with any underlying numeric tensor type.
- *
* @return A NodeInitializer
*/
static Ptr<NodeInitializer> ones() { return fromValue(1.0f); }
/**
* Set diagonal of two dimensional quadratic matrix to `value`.
- *
- * Sets all values of the tensor to 0 and intializes the diagonal with
+ * Sets all values of the tensor to 0 and initializes the diagonal with
* the given `value`. If no value is specified `1` is used by default.
- *
* @return A NodeInitializer
*/
Ptr<NodeInitializer> eye(float value = 1.f);
/**
- * Intialize tensor with normally distributed random numbers
- *
- * Be default this generates floating point numbers from the
+ * Initialize tensor with normally distributed random numbers.
+ * By default this generates floating point numbers from the
* normal distribution Normal(0, 1) unless specified differently.
- *
* If compiled with `CUDA`, `marian` will use the `cuRand` library
* for both, GPU and CPU computation. The random sequences generated
* are the same on both devices.
- *
* If `marian` is compiled without `CUDA`, a random generator
* from the C++ standard library is used. These random generators
* do not have the same random sequences.
- *
* @return A NodeInitializer
*/
Ptr<NodeInitializer> normal(float mean = 0.f, float stddev = 1.f);
/**
- * Intialize tensor with uniformly distributed random numbers
- *
- * Be default this generates floating point numbers from the
+ * Initialize tensor with uniformly distributed random numbers.
+ * By default this generates floating point numbers from the
* uniform distribution Uniform(0, 1) unless specified differently.
- *
* If compiled with `CUDA`, `marian` will use the `cuRand` library
* for both, GPU and CPU computation. The random sequences generated
* are the same on both devices.
- *
* If `marian` is compiled without `CUDA`, a random generator
* from the C++ standard library is used. These random generators
* do not have the same random sequences.
- *
+ * @param a the lower bound of the interval
+ * @param b the upper bound of the interval
* @return A NodeInitializer
*/
Ptr<NodeInitializer> uniform(float a = 0.f, float b = 1.f);
-// @TODO: add documentation
+/**
+ * Initialize tensor with random numbers from Bernoulli Distribution.
+ * The Bernoulli distribution is the discrete probability distribution of
+ * a random variable which takes value `1` with probability p, and
+ * value `0` with probability (1-p).
+ * By default this function generates a tensor of 0 and 1 with probability p
+ * if bernoulli(p) is called. We offer `scale` and `shift` parameters which
+ * can map {0,1} to {0,1}*`scale`+`shift`.
+ * E.g., bernoulli(0.5f, 2.f, -1.f) where p=0.5f, scale=2.f, shift=-1.f.
+ * {0,1} is mapped to {0,1}*2+(-1)= {-1,1}. It generates a tensor composed of
+ * 50% of 1 and 50% of -1.
+ * @return A NodeInitializer
+ */
Ptr<NodeInitializer> bernoulli(float p, float scale = 1.f, float shift = 0.f);
-// @TODO: add documentation
+/**
+ * Initialize tensor with random numbers from Glorot uniform distribution.
+ * The <a href=http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf>Glorot uniform</a>,
+ * also called Xavier uniform, is designed to keep the scale of
+ * the gradients roughly the same in all layers.
+ * This function offers three variants (modes).
+ * The values of the tensor are sampled from Uniform(-x*scale, x*scale):
+ * - when fanIn=false and fanOut=false (by default):
+ * x = sqrt(6 / (in + out))
+ * - when fanIn=true and fanOut=false (fanIn mode):
+ * x = sqrt(3 / in)
+ * - when fanIn=false and fanOut=true (fanOut mode):
+ * x = sqrt(3 / out)
+ * where `in` is the number of input units in the tensor, `out` is the number of output units.
+ * `scale` is used to change the range of Uniform distribution.
+ * @return A NodeInitializer
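+ *
+ * A minimal usage sketch (the name and dimensions are illustrative):
+ * @code
+ * // fanIn mode: the scale is derived from the input dimension only
+ * auto W = graph->param("W", {512, 1024}, inits::glorotUniform(/*fanIn=*/true, /*fanOut=*/false));
+ * @endcode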
+ */
Ptr<NodeInitializer> glorotUniform(bool fanIn = false, bool fanOut = false, float scale = 1.f);
-// @TODO: add documentation
+/**
+ * Initialize tensor with random numbers from Glorot Normal distribution.
+ * Similar to function glorotUniform(), this function adopts Normal distribution instead of
+ * uniform distribution.
+ * This function offers three variants (modes).
+ * The values of the tensor are sampled from Normal(0, x*scale), i.e. with standard deviation x*scale:
+ * - when fanIn=false and fanOut=false (by default):
+ * x = sqrt(2 / (in + out))
+ * - when fanIn=true and fanOut=false (fanIn mode):
+ * x = sqrt(1 / in)
+ * - when fanIn=false and fanOut=true (fanOut mode):
+ * x = sqrt(1 / out)
+ * where `in` is the number of input units in the tensor, `out` is the number of output units.
+ * `scale` is used to change the range of Normal distribution.
+ * @return A NodeInitializer
+ */
Ptr<NodeInitializer> glorotNormal(bool fanIn = false, bool fanOut = false, float scale = 1.f);
-// @TODO: add documentation
-Ptr<NodeInitializer> dropout(float dropoutProbabilty);
+/**
+ * Initialize a dropout mask (a tensor of 0 and 1) with given dropout probability.
+ * <a href=https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf>Dropout</a>
+ * is proposed as a technique to prevent Neural Networks from overfitting.
+ * @param dropoutProbability a float that defines the dropout probability.
+ * E.g., dropoutProbability=0.1 means 90% of values are kept.
+ * @return A NodeInitializer
+ */
+Ptr<NodeInitializer> dropout(float dropoutProbability);
/**
- * Intialize with gumbel noise, i.e. -log(-log(u)) where u ~ Uniform(0 + eps, 1 - eps)
- *
+ * Initialize with gumbel noise, i.e. -log(-log(u)) where u ~ Uniform(0 + eps, 1 - eps).
+ * @param eps a small epsilon value that protects from log(0)
* @return A NodeInitializer
*/
Ptr<NodeInitializer> gumbel(float eps = 1e-5f);
-// @TODO: add documentation
+/**
+ * Initialize tensor by *copying* from the given vector.
+ * Creates a NodeInitializer that will initialize the tensor
+ * by *copying* the values from the given vector
+ * @param v vector
+ * @return A NodeInitializer
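+ *
+ * A minimal usage sketch (values and shape are illustrative):
+ * @code
+ * std::vector<float> v = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
+ * auto c = graph->constant({2, 3}, inits::fromVector(v)); // copies v into a 2x3 constant
+ * @endcode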
+ */
template <typename T>
Ptr<NodeInitializer> fromVector(const std::vector<T>& v);
+
+/**
+ * Initialize tensor by *moving* from the given vector.
+ * Creates a NodeInitializer that will initialize the tensor by *moving* the values
+ * from the given vector into this tensor, and the given vector may be emptied.
+ * This version is the <a href=https://en.cppreference.com/w/cpp/language/reference>
+ * rvalue reference</a> overloading.
+ * @param v vector
+ * @return A NodeInitializer
+ */
template <typename T>
Ptr<NodeInitializer> fromVector(std::vector<T>&& v);
-// @TODO: add documentation
+/**
+ * Initialize tensor from a given sparse vector.
+ * Creates a NodeInitializer that will initialize the tensor from a given
+ * sparse vector (stored in a std::pair). The resulting tensor is first filled
+ * with `1e-6` (a small placeholder for the elements not listed in the sparse vector),
+ * then the values of the given sparse vector are set.
+ * @param v the sparse vector is stored in `std::pair`:
+ * - the first object (v.first) holds the indexes (in a vector)
+ * - the second object (v.second) holds the corresponding values (in a vector).
+ * This means the value of the resulting tensor at index v.first[i] is v.second[i].
+ * @return A NodeInitializer
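+ *
+ * A minimal usage sketch (indices, values and shape are illustrative):
+ * @code
+ * std::pair<std::vector<size_t>, std::vector<float>> sv({0, 4}, {2.5f, -1.0f});
+ * auto c = graph->constant({1, 6}, inits::fromSparseVector(sv)); // c[0]=2.5, c[4]=-1.0, the rest ~1e-6
+ * @endcode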
+ */
Ptr<NodeInitializer> fromSparseVector(std::pair<std::vector<size_t>, std::vector<float>>& v);
-// @TODO: add documentation
+/**
+ * Initialize tensor by copying from the given io::Item.
+ * Creates a NodeInitializer that will initialize the tensor by copying the values
+ * from the given io::Item. If this io::Item is a memory-mapped item, then the
+ * function will set the memory region pointing to this item. If this io::Item is
+ * a regular item, then the function will copy the values from this item.
+ * @return A NodeInitializer
+ */
Ptr<NodeInitializer> fromItem(const io::Item& item);
-// @TODO: add documentation
+/**
+ * Initialize tensor by copying from the given tensor.
+ * Creates a NodeInitializer that will initialize the tensor
+ * by copying the values from the given tensor.
+ * @return A NodeInitializer
+ */
Ptr<NodeInitializer> fromTensor(Tensor tensor);
-// @TODO: add documentation
+/**
+ * Initialize tensor from a file.
+ * Creates a NodeInitializer that will initialize the tensor
+ * by copying the values from the given file. This function is
+ * mainly used for loading embedding vectors from a file.
+ * @param file filename
+ * @param dimVoc the number of words in the vocabulary
+ * @param dimEmb the length of embedding vectors
+ * @param normalize a flag indicating whether the values are normalized.
+ * Here we adopt the method of <a
+ * href=https://en.wikipedia.org/wiki/Feature_scaling#Scaling_to_unit_length>
+ * scaling to unit length</a>, i.e., dividing each element by the Euclidean length of the vector.
+ * @return A NodeInitializer
+ */
Ptr<NodeInitializer> fromWord2vec(const std::string& file,
int dimVoc,
int dimEmb,
bool normalize = false);
/**
+ * Computes Google's sinusoidal position embeddings.
* Computes Google's Transformer-style sinusoidal position embeddings
* starting from position 'start' taking into account batch and time
- * dimensions of the tensor.
- *
- * Expected tensor layout {-2: time, -1: model}
- *
- * Usually gets later reshaped to {time, 1, model} and
- * added with a broadcast to learned embeddings. Positional
- * embeddings are the same for each batch entry and change
- * over time steps.
+ * dimensions of the tensor. Expected tensor layout {-2: time, -1: model}.
+ * Usually gets later reshaped to {time, 1, model} and added with a broadcast
+ * to learned embeddings. Positional embeddings are the same for each batch
+ * entry and change over time steps.
*/
Ptr<NodeInitializer> sinusoidalPositionEmbeddings(int start);
/**
- * Computes a random rotation matrix for LSH hashing. This is part
- * of a hash function. The values are orthonormal and computed via
+ * Computes a random rotation matrix for LSH hashing.
+ * This is part of a hash function. The values are orthonormal and computed via
* QR decomposition. Same seed results in same random rotation.
*/
Ptr<NodeInitializer> randomRotation(size_t seed = Config::seed);
/**
+ * Computes the equivalent of Python's range().
* Computes a range from begin to end-1, like Python's range().
* The constant being initialized must have one dimension that matches
* the number of elements being generated, while any other dimension must be 1.
diff --git a/src/graph/node_operators.h b/src/graph/node_operators.h
index c3dde73c..4f9582db 100644
--- a/src/graph/node_operators.h
+++ b/src/graph/node_operators.h
@@ -5,7 +5,13 @@
#include "tensors/tensor.h"
namespace marian {
-
+/**
+ * A constant node for the graph.
+ * A constant node is actually a constant tensor whose value is
+ * immutable during training. A ConstantNode instance is usually
+ * used as an input. To construct a constant node in the
+ * graph, we use the constant() function of the ExpressionGraph class.
+ */
struct ConstantNode : public Node {
ConstantNode(Ptr<ExpressionGraph> graph,
const Shape& shape,
@@ -35,7 +41,13 @@ private:
Ptr<inits::NodeInitializer> init_;
bool initialized_;
};
-
+/**
+ * A parameter node for the graph.
+ * A parameter node is used to store model parameters whose value can be
+ * changed during training, such as weights and biases. To construct
+ * a parameter node in the graph, we use the param() function of the
+ * ExpressionGraph class.
+ */
struct ParamNode : public Node {
ParamNode(Ptr<ExpressionGraph> graph,
const Shape& shape,
diff --git a/src/tensors/tensor.h b/src/tensors/tensor.h
index 4c214b20..c959619c 100755
--- a/src/tensors/tensor.h
+++ b/src/tensors/tensor.h
@@ -21,6 +21,12 @@ namespace io {
struct Item;
}
+/**
+ * Main implementation of a <a href="https://en.wikipedia.org/wiki/Tensor">tensor</a>,
+ * a multi-dimensional matrix containing elements of a single data type.
+ * TensorBase contains the data, data type, pointer to
+ * memory region, shape, backend info and other attributes.
+ */
class TensorBase {
MemoryPiece::PtrType memory_;
Shape shape_;