github.com/marian-nmt/marian.git
author    Qianqian Zhu <qianqian.zhu@hotmail.com>    2021-02-28 11:07:19 +0300
committer GitHub <noreply@github.com>                2021-02-28 11:07:19 +0300
commit    2a9c0bb3773c32c20fce74a8b0f7149478ebd8cb (patch)
tree      cf0af697c565ae9a1fb6ff09fcc2ce466ef6dcc2 /src
parent    f88ded2ba8fc9c452717674542fcaef231b3f3e9 (diff)
Add graph documentations (#788)
* add API docs for expression_graph.h
* change API docs to doxygen-readable format
* add API docs for node_initializers
* update doxygen configure file
* add hyperlinks and remove layers section from graph documentation
* fixing typos and links on graph doc
Diffstat (limited to 'src')
-rwxr-xr-x  src/common/definitions.h        |   1
-rw-r--r--  src/common/shape.h              |   8
-rw-r--r--  src/common/types.h              |  61
-rw-r--r--  src/graph/expression_graph.cpp  |  16
-rw-r--r--  src/graph/expression_graph.h    | 336
-rwxr-xr-x  src/graph/node.cpp              |  11
-rw-r--r--  src/graph/node.h                |  20
-rwxr-xr-x  src/graph/node_initializers.cpp |  11
-rwxr-xr-x  src/graph/node_initializers.h   | 206
-rw-r--r--  src/graph/node_operators.h      |  16
-rwxr-xr-x  src/tensors/tensor.h            |   6
11 files changed, 535 insertions, 157 deletions
diff --git a/src/common/definitions.h b/src/common/definitions.h
index 0bc14ba1..d2cf8aa4 100755
--- a/src/common/definitions.h
+++ b/src/common/definitions.h
@@ -127,6 +127,7 @@ IPtr<T> INew(Ptr<T> p) {
return IPtr<T>(p);
}
+/// enum class DeviceType: defines which device is used for computation
enum class DeviceType : size_t { gpu = 0, cpu = 1 };
struct DeviceId {
diff --git a/src/common/shape.h b/src/common/shape.h
index 0b633e4d..7b1841e5 100644
--- a/src/common/shape.h
+++ b/src/common/shape.h
@@ -28,6 +28,14 @@ struct Slice // Python-like slice/index descriptor
};
typedef std::vector<Slice> Slices;
+/**
+ * Shape class mainly defines the shape or dimensionality of the node.
+ * Basically, Shape is a wrapper around a std::vector. Its size is the number
+ * of dimensions. E.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3.
+ * When an index is negative, the real index is size() + index.
+ * It implements the most common functions demanded by operations, e.g., resize(),
+ * slice(), and broadcast().
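+ *
+ * A minimal usage sketch (the dimensions are illustrative):
+ * @code
+ * Shape shape = {2, 3};   // 2D matrix with dim[0]=2 and dim[1]=3
+ * int rows = shape[0];    // 2
+ * int cols = shape[-1];   // 3; a negative index is resolved as size() + index
+ * @endcode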
+ */
struct Shape {
private:
std::vector<int> shape_;
diff --git a/src/common/types.h b/src/common/types.h
index 0f70bb22..9ce57527 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -131,7 +131,7 @@ do { \
default: ABORT("Unknown type {}", type); \
} \
} while(0)
-
+/// namespace marian
namespace marian {
// small struct to enable templating based on types use for packing
@@ -247,36 +247,37 @@ constexpr inline size_t operator+(size_t val, TypeClass typeClass) {
}
// @TODO: rename to ElementType when things become stable, so it's easier to review
+/// enum class Type: stores all supported data types in Marian
enum class Type : size_t {
- int8 = TypeClass::signed_type + 1u,
- int16 = TypeClass::signed_type + 2u,
- int32 = TypeClass::signed_type + 4u,
- int64 = TypeClass::signed_type + 8u,
-
- uint8 = TypeClass::unsigned_type + 1u,
- uint16 = TypeClass::unsigned_type + 2u,
- uint32 = TypeClass::unsigned_type + 4u,
- uint64 = TypeClass::unsigned_type + 8u,
-
- float16 = TypeClass::float_type + 2u,
- float32 = TypeClass::float_type + 4u,
- float64 = TypeClass::float_type + 8u,
-
- packed16 = TypeClass::packed_type + 2u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint16) is meaningless.
- packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, // special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless.
- packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, // special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless.
-
- intgemm8 = TypeClass::intgemm_type + 1u, // Int8 quantized (not packed) matrices for intgemm
- intgemm16 = TypeClass::intgemm_type + 2u, // Int16 quantized (not packed) matrices for intgemm
-
- intgemm8ssse3 = TypeClass::intgemm_type + 1u + TypeClass::ssse3_type, // Int8 quantized and packed (ssse3) matrices for intgemm
- intgemm8avx2 = TypeClass::intgemm_type + 1u + TypeClass::avx2_type, // Int8 quantized and packed (avx2) matrices for intgemm
- intgemm8avx512 = TypeClass::intgemm_type + 1u + TypeClass::avx512_type, // Int8 quantized and packed (avx512) matrices for intgemm
- intgemm8avx512vnni = TypeClass::intgemm_type + 1u + TypeClass::avx512_type + 4096u, // Int8 quantized and packed (avx512) matrices for intgemm. VNNI algorithm
-
- intgemm16sse2 = TypeClass::intgemm_type + 2u + TypeClass::sse2_type, // Int16 quantized and packed (sse2) matrices for intgemm
- intgemm16avx2 = TypeClass::intgemm_type + 2u + TypeClass::avx2_type, // Int16 quantized and packed (avx2) matrices for intgemm
- intgemm16avx512 = TypeClass::intgemm_type + 2u + TypeClass::avx512_type, // Int16 quantized and packed (avx512) matrices for intgemm
+ int8 = TypeClass::signed_type + 1u, ///< int8 type
+ int16 = TypeClass::signed_type + 2u, ///< int16 type
+ int32 = TypeClass::signed_type + 4u, ///< int32 type
+ int64 = TypeClass::signed_type + 8u, ///< int64 type
+
+ uint8 = TypeClass::unsigned_type + 1u, ///< uint8 type
+ uint16 = TypeClass::unsigned_type + 2u, ///< uint16 type
+ uint32 = TypeClass::unsigned_type + 4u, ///< uint32 type
+ uint64 = TypeClass::unsigned_type + 8u, ///< uint64 type
+
+ float16 = TypeClass::float_type + 2u, ///< float16 type
+ float32 = TypeClass::float_type + 4u, ///< float32 type
+ float64 = TypeClass::float_type + 8u, ///< float64 type
+
+ packed16 = TypeClass::packed_type + 2u, ///< special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint16) is meaningless.
+ packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, ///< special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
+ packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, ///< special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
+
+ intgemm8 = TypeClass::intgemm_type + 1u, ///< Int8 quantized (not packed) matrices for intgemm
+ intgemm16 = TypeClass::intgemm_type + 2u, ///< Int16 quantized (not packed) matrices for intgemm
+
+ intgemm8ssse3 = TypeClass::intgemm_type + 1u + TypeClass::ssse3_type, ///< Int8 quantized and packed (ssse3) matrices for intgemm
+ intgemm8avx2 = TypeClass::intgemm_type + 1u + TypeClass::avx2_type, ///< Int8 quantized and packed (avx2) matrices for intgemm
+ intgemm8avx512 = TypeClass::intgemm_type + 1u + TypeClass::avx512_type, ///< Int8 quantized and packed (avx512) matrices for intgemm
+ intgemm8avx512vnni = TypeClass::intgemm_type + 1u + TypeClass::avx512_type + 4096u, ///< Int8 quantized and packed (avx512) matrices for intgemm. VNNI algorithm
+
+ intgemm16sse2 = TypeClass::intgemm_type + 2u + TypeClass::sse2_type, ///< Int16 quantized and packed (sse2) matrices for intgemm
+ intgemm16avx2 = TypeClass::intgemm_type + 2u + TypeClass::avx2_type, ///< Int16 quantized and packed (avx2) matrices for intgemm
+ intgemm16avx512 = TypeClass::intgemm_type + 2u + TypeClass::avx512_type, ///< Int16 quantized and packed (avx512) matrices for intgemm
};
static inline size_t operator&(TypeClass typeClass, Type type) {
diff --git a/src/graph/expression_graph.cpp b/src/graph/expression_graph.cpp
index 7d42e811..827fb3ed 100644
--- a/src/graph/expression_graph.cpp
+++ b/src/graph/expression_graph.cpp
@@ -30,7 +30,7 @@ Expr ExpressionGraph::add(Expr node) {
} else {
node->setId(count_++);
- // record in foward graph
+ // record in forward graph
nodesForward_.push_back(node);
// record in backward graph if training, and keep track of roots
@@ -143,6 +143,11 @@ void ExpressionGraph::forward(std::list<Expr>& forwardTape, bool finalPass) {
if(inferenceOnly_)
v->children().clear();
+ // If checkpointing is disabled, keep the memory for forward signals for all nodes.
+ // If checkpointing is enabled:
+ // (a) In the forward pass before the backward pass, free the memory for the nodes in the subtape to save memory.
+ // (b) In the forward calls during the backward pass, keep the memory in the current subtape to accelerate
+ // gradient computation.
if(checkpointing_ && !finalPass) {
auto subtape = v->getSubtape();
if(subtape) {
@@ -171,12 +176,14 @@ void ExpressionGraph::backward(bool reset, float clipValue) {
ABORT("Aborting");
}
+ // allocates memory and initialises gradients for parameters
for(auto kvParams : paramsByElementType_) {
kvParams.second->allocateBackward();
if(reset)
kvParams.second->set_zero_adjoint();
}
+ // for top nodes: allocates memory and initialises gradients to 1
for(auto&& v : topNodes_)
v->init_dependent();
@@ -186,13 +193,16 @@ void ExpressionGraph::backward(bool reset, float clipValue) {
bool firstNaN = true;
while(!nodesBackward_.empty()) {
- auto v = nodesBackward_.back();
- nodesBackward_.pop_back();
+ auto v = nodesBackward_.back(); // return the last element
+ nodesBackward_.pop_back(); // remove the last element
+ // for non-top nodes: allocates memory and initialises gradients to 0
for(auto&& child : v->children())
if(child->trainable() && child->type() != "param")
child->set_zero_adjoint();
+ // if using gradient checkpointing,
+ // recompute the forward pass from checkpoint to the root
if(checkpointing_ && v->getSubtape()) {
forward(*v->getSubtape(), /*finalPass=*/true);
}
diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h
index b4f0c1e2..f03f189d 100644
--- a/src/graph/expression_graph.h
+++ b/src/graph/expression_graph.h
@@ -16,9 +16,18 @@
namespace marian {
+/**
+ * Create an expression node of any type, and pass all
+ * arguments to any available constructor.
+ * E.g., to create a ConstantNode, use `Expression<ConstantNode>(...)`.
+ */
template <class T, typename... Args>
Expr Expression(Args&&... args);
+/**
+ * The whole tensor set in the graph.
+ * Holds all tensor objects (memory and nodes) for a graph.
+ */
class Tensors {
private:
Ptr<TensorAllocator> tensors_;
@@ -27,8 +36,8 @@ private:
typedef std::unordered_map<size_t, std::vector<WExpr>> WeakMemory;
typedef std::unordered_map<size_t, std::vector<Expr>> Memory;
- Ptr<WeakMemory> shortterm_;
- Ptr<Memory> longterm_;
+ Ptr<WeakMemory> shortterm_; // holds all nodes for a graph
+ Ptr<Memory> longterm_; // holds memoized nodes
public:
Tensors(Ptr<Backend> backend)
@@ -112,97 +121,145 @@ public:
typedef std::map<Type, Ptr<Parameters>> ElementTypeParamsMap; // keep it sorted, hence map not unordered map
+/**
+ * Main implementation of a computation graph.
+ * Keeps a record of data (tensors) and all operations. Each operation in a computation graph is a Node.
+ * Each Node defines its forward and backward steps.
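+ *
+ * A minimal usage sketch (illustrative only; dimensions, names and values are made up,
+ * and further operators are declared elsewhere in Marian):
+ * @code
+ * auto graph = New<ExpressionGraph>();
+ * graph->setDevice({0, DeviceType::cpu});   // device no. 0 on the CPU backend
+ * graph->reserveWorkspaceMB(128);           // workspace for forward/backward tensors
+ *
+ * auto W = graph->param("W", {2, 3}, inits::glorotUniform());
+ * auto x = graph->constant({3, 1}, inits::fromVector(std::vector<float>({1.f, 2.f, 3.f})));
+ * // ... connect W and x with operators to form the actual model ...
+ *
+ * graph->forward();    // run the forward pass
+ * graph->backward();   // run the backward pass (training only)
+ * @endcode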
+ */
class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
- size_t count_{0};
+ size_t count_{0}; // counter for nodes in the graph; holds the current node index
- std::unordered_set<Expr> topNodes_; // current set of roots. In the end, all but one must have been consumed.
+ std::unordered_set<Expr> topNodes_; // current set of roots. In the end, all but one must have been consumed
protected: // (these are protected, not private, for ONNX exporting)
- std::list<Expr> nodesForward_;
- std::list<Expr> nodesBackward_;
+ std::list<Expr> nodesForward_; ///< contains all nodes used for forward()
+ std::list<Expr> nodesBackward_; ///< contains trainable nodes used for backward()
- // Holds memory and expressions that correspond to temporary expressions.
- // This gets cleared before a new graph is built.
+ /**
+ * A shared pointer to the tensor objects in the graph.
+ * Holds memory and nodes that correspond to tensors in a graph.
+ * Since operations will result in new tensors, this attribute is used
+ * to allocate memory for new tensors during forward() and backward().
+ * This gets cleared before a new graph is built.
+ */
Ptr<Tensors> tensors_;
private:
std::unordered_map<size_t, std::vector<Expr>> memoized_;
- Type defaultElementType_{Type::float32}; // Type used for storing parameters, currently all parameters have to have the same type
+ Type defaultElementType_{Type::float32}; // Type used for storing parameters, currently all parameters have to have the same type
- bool inferenceOnly_{false};
+ bool inferenceOnly_{false}; // a flag indicating whether the graph is used for inference only
- bool checkpointing_{false}; // use gradient checkpointing if true
+ bool checkpointing_{false}; // use gradient checkpointing if true
- bool reloaded_{false};
+ bool reloaded_{false}; // a flag indicating whether the graph is reloaded: true if the graph has loaded parameters via the load() function
- bool throwNaN_{false};
+ bool throwNaN_{false}; // a flag indicating whether the graph throws a NaN exception
protected:
// Delete, copy and move constructors
ExpressionGraph(const ExpressionGraph&) = delete;
ExpressionGraph(ExpressionGraph&&) = delete;
- // Holds memory and expressions that correspond to graph parameters
- // Now we can have multiple types of parameters in a separate parameters object per value type.
- // This is currently only accessible through private functions during loading, will abort during training
- // when params() is called (e.g. optimizer) and there is more or other types than the default parameter type.
- // Currently the only usecase is inference. Trying to access params() for non-default parameter type is going
- // to abort. Inference does not need to access a whole set of parameters.
+ /**
+ * A map that holds memory and nodes corresponding to graph parameters.
+ * The key is the element Type and the mapped value is the set of parameter objects of that type.
+ * Now we can have multiple types of parameters in a separate parameters object per value type.
+ * This is currently only accessible through private functions during loading, and will abort during training
+ * when params() is called (e.g. by the optimizer) and there are more or other types than the default parameter type.
+ * Currently the only use case is inference. Trying to access params() for a non-default parameter type is going
+ * to abort. Inference does not need to access a whole set of parameters.
+ */
ElementTypeParamsMap paramsByElementType_;
- Ptr<Backend> backend_;
-
- std::string namespace_;
+ Ptr<Backend> backend_; ///< a shared pointer to the backend for the graph
+ std::string namespace_; ///< the namespace of the graph. Each graph has its own unique namespace.
public:
- /** @brief Constructs a new expression graph
- *
- * Constructor should be used as New<ExpressionGraph>()
- */
+ /** Constructs a new expression graph. Constructor should be used as New<ExpressionGraph>(). */
ExpressionGraph(bool inference = false);
+ /** Destructor. Clear everything related to the graph except memoized nodes. */
virtual ~ExpressionGraph() {
clear();
for(auto kvParams : paramsByElementType_)
kvParams.second->clear();
}
+ /**
+ * Set device options used to run the graph.
+ * @param deviceId a struct type which stores device no. (size_t)
+ * and device type (DeviceType::cpu or DeviceType::gpu)
+ * @param device a pointer to the device
+ */
virtual void setDevice(DeviceId deviceId = {0, DeviceType::gpu},
Ptr<Device> device = nullptr);
+ /**
+ * Get device info for the graph.
+ * @return deviceId a struct type which stores device no. (size_t)
+ * and device type (DeviceType::cpu or DeviceType::gpu)
+ */
DeviceId getDeviceId() { return backend_->getDeviceId(); }
+ /**
+ * Get backend pointer for the graph.
+ * @return Ptr<Backend> pointer to backend
+ */
Ptr<Backend> getBackend() { return backend_; }
+ /** Set whether the graph is used for inference only */
void setInference(bool inference) { inferenceOnly_ = inference; }
+
+ /** Check whether the graph is used for inference only (true) or not */
bool isInference() { return inferenceOnly_; }
+ /**
+ * Set whether the graph uses gradient checkpointing.
+ * <a href="https://github.com/cybertronai/gradient-checkpointing">Gradient Checkpointing</a>
+ * works by trading compute for memory: it reruns a forward-pass segment for each checkpointed segment during the backward pass.
+ */
void setCheckpointing(bool checkpointing) { checkpointing_ = checkpointing; }
+
+ /** Check whether the graph uses gradient checkpointing or not */
bool isCheckpointing() { return checkpointing_; }
+ /**
+ * Set namespace (std::string) for the graph.
+ * Each graph has its own unique namespace, which is used to form the name of a parameter object.
+ */
void switchParams(const std::string& newNamespace) {
namespace_ = newNamespace;
}
+ /**
+ * Copy all parameter objects from one graph to the current graph.
+ * @param graph a pointer to a graph object
+ */
virtual void copyParams(Ptr<ExpressionGraph> graph) {
for(auto p : *graph->params())
param(p->name(), p->shape(), inits::fromTensor(p->val()), p->value_type());
- forward(); // this will allocate parameters, execute the intializers and therefore copy parameter values
+ forward(); // this will allocate parameters, execute the initializers and therefore copy parameter values
}
+ /**
+ * Preallocate workspace memory (MB) for the graph.
+ * Sets the size of the memory available for the forward and backward step of the training procedure.
+ * This does not include model size and optimizer parameters that are allocated outside the workspace.
+ */
void reserveWorkspaceMB(size_t num) {
size_t bytes = num * 1024 * 1024 - 1;
tensors_->reserve(bytes);
}
+ /** Copy tensor objects from one graph to the current graph */
void reuseWorkspace(Ptr<ExpressionGraph> graph) {
tensors_ = graph->tensors_;
}
/**
- * @brief Performs backpropogation on this expression graph.
- *
- * Backpropogation is implemented by performing first the forward pass and
+ * Performs backpropagation on this expression graph.
+ * Backpropagation is implemented by performing first the forward pass and
* then the backward pass of algorithmic differentiation (AD) on the nodes of
* the graph.
*/
@@ -211,6 +268,12 @@ public:
backward();
}
+ /**
+ * Perform one backpropagation process on the graph to test
+ * whether the graph workspace fits into a given workspace memory.
+ * This function is used for searching for the maximum batch size
+ * that fits into the given workspace memory.
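+ *
+ * A minimal sketch of a batch-size search (buildGraphForBatch() is a hypothetical
+ * helper that clears the graph and rebuilds it for the candidate batch size):
+ * @code
+ * size_t batchSize = 512;
+ * buildGraphForBatch(graph, batchSize);
+ * while(batchSize > 1 && !graph->fits()) {
+ *   batchSize /= 2;                        // shrink the batch until the workspace suffices
+ *   buildGraphForBatch(graph, batchSize);
+ * }
+ * @endcode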
+ */
bool fits() {
try {
tensors_->throwAtReallocation(true);
@@ -223,19 +286,50 @@ public:
return true;
}
+ /**
+ * Check whether the memory allocated for a tensor object contains a NaN or infinite value.
+ * @param t a Tensor object
+ * @param isNaN a bool that holds whether the tensor contains a NaN value (passed by reference)
+ * @param isInf a bool that holds whether the tensor contains an infinite value (passed by reference)
+ */
void checkNaN(Tensor t, bool& isNaN, bool& isInf);
+ /**
+ * Perform the forward pass on the nodes of the graph.
+ * The forward pass refers to the calculation process.
+ * It traverses through all nodes from input layer to output layer.
+ */
void forward() {
for(auto kvParams : paramsByElementType_)
kvParams.second->allocateForward();
forwardNext();
}
+ /**
+ * Perform the forward pass without memory allocation for parameters.
+ * Helper function for forward().
+ */
void forwardNext();
+
+ /**
+ * Perform the forward pass on the given nodes with a finalPass flag.
+ * Helper function for forward() and backward().
+ * @param forwardTape the list of nodes used for the forward pass (passed by reference)
+ * @param finalPass a bool which controls whether nodes should be freed with gradient-checkpointing
+ */
void forward(std::list<Expr>& forwardTape, bool finalPass);
+ /**
+ * Perform the backward pass on the trainable nodes of the graph.
+ * The backward pass refers to the process of propagating the error gradients back through the graph.
+ * It traverses through all nodes from output layer to input layer.
+ */
void backward(bool reset = true, float clipValue = 0.f);
+ /**
+ * Generate graph layout in Graphviz format for visualisation.
+ * @return a string representing the graph layout in Graphviz (dot) format
+ */
std::string graphviz() {
std::stringstream ss;
ss << "digraph ExpressionGraph {" << std::endl;
@@ -253,6 +347,10 @@ public:
return ss.str();
}
+ /**
+ * Write graph layout in Graphviz format to a file.
+ * @param filename a string specifying the file to which the graph layout is written
+ */
void graphviz(const std::string& filename) {
std::ofstream dot(filename);
dot << graphviz();
@@ -345,6 +443,18 @@ private:
}
public:
+
+ /**
+ * Construct a parameter node in the graph.
+ * @param pname a string that holds the name of the parameter node
+ * @param shape a struct that defines the shape of the parameter tensor,
+ * e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
+ * @param init a pointer to a NodeInitializer object, e.g., inits::zeros()
+ * @param elementType a scoped enumerator (enum class) that defines the element type, e.g., Type::float16
+ * @param fixed a bool that specifies whether the parameter object is fixed (not trainable) or not.
+ * The default is false, which means the parameter is trainable.
+ * @return a pointer to the parameter node
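+ *
+ * A minimal usage sketch (the name and dimensions are illustrative):
+ * @code
+ * auto W = graph->param("W", {512, 512}, inits::glorotUniform(), Type::float32);
+ * auto b = graph->param("b", {1, 512}, inits::zeros());
+ * @endcode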
+ */
Expr param(const std::string& pname,
const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
@@ -354,6 +464,17 @@ public:
return param(pname, shape, init, elementType, fixed, /*typeSpecified=*/true);
}
+ /**
+ * Construct a parameter node in the graph without a specified type;
+ * the type is set to defaultElementType_.
+ * @param pname a string that holds the name of the parameter node
+ * @param shape a struct that defines the shape of the parameter tensor,
+ * e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
+ * @param init a pointer to a NodeInitializer object, e.g., inits::zeros()
+ * @param fixed a bool that specifies whether the parameter object is fixed (not trainable) or not.
+ * The default is false, which means the parameter is trainable.
+ * @return a pointer to the parameter node
+ */
Expr param(const std::string& pname,
const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
@@ -362,28 +483,59 @@ public:
return param(pname, shape, init, defaultElementType_, fixed, /*typeSpecified=*/false);
}
+ /**
+ * Construct a constant node in the graph with a specified element type.
+ * @param shape a struct that defines the shape of the constant tensor,
+ * e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
+ * @param init a pointer to a NodeInitializer object, e.g., inits::zeros()
+ * @param elementType a scoped enumerator (enum class) that defines the element type, e.g., Type::float16
+ * @return a pointer to the constant node
+ */
Expr constant(const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
Type elementType) {
return Expression<ConstantNode>(shared_from_this(), shape, init, elementType);
}
+ /**
+ * Construct a constant node in the graph without a specified type;
+ * the type is set to defaultElementType_.
+ * @param shape a struct that defines the shape of the constant tensor,
+ * e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
+ * @param init a pointer to a NodeInitializer object, e.g., inits::zeros()
+ * @return a pointer to the constant node
+ */
Expr constant(const Shape& shape,
const Ptr<inits::NodeInitializer>& init) {
return Expression<ConstantNode>(shared_from_this(), shape, init, defaultElementType_);
}
// @TODO: add version with iterators
- // shortcut to turn vector of indices to integer tensor, to be used with operators
- // like rows or select
+ /**
+ * Turn a vector of indices into an integer tensor.
+ * A shortcut version to turn a vector of indices into an integer tensor, to be used with operators
+ * like rows() or index_select().
+ * @param indicesVector a vector of IndexType (uint32_t) that specifies the indices
+ */
Expr indices(const std::vector<IndexType>& indicesVector) {
return constant({(int)indicesVector.size()},
inits::fromVector(indicesVector),
Type::uint32);
}
- // this version sets up the shape such that the indices are in a given axis
- // Use this if you want to pass these indices to gather().
- // indexee shape = (3, 2, 5, 2); axis = 1 -> resulting shape = (1, size of indicesVector, 1, 1)
+
+ /**
+ * Specify the indices of elements to be taken from a tensor.
+ * This version sets up the shape such that the indices are in a given axis.
+ * Use this if you want to pass these indices to gather().
+ * E.g., indexee shape = (3, 2, 5, 2); axis = 1 -> resulting shape = (1, size of indicesVector, 1, 1):
+ * - The size of the resulting shape is the same as that of the indexee; here it is 4.
+ * - The size of the specified axis is equal to the size of the given indicesVector.
+ * - The remaining axes are set to 1.
+ * @param indicesVector a vector of IndexType (uint32_t) that specifies the indices
+ * @param indexee the source tensor that we want to select elements from
+ * @param axis specifies the axis that we want to collect along
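+ *
+ * A minimal sketch (shapes follow the example above; indexee is assumed to be an existing
+ * node of shape (3, 2, 5, 2), and gather() is one of the graph operators declared elsewhere
+ * in Marian, assumed here as gather(a, axis, indices)):
+ * @code
+ * std::vector<IndexType> pick = {0, 1};                 // take two entries along axis 1
+ * auto idx = graph->indices(pick, indexee, /*axis=*/1); // resulting shape (1, 2, 1, 1)
+ * auto selected = gather(indexee, /*axis=*/1, idx);
+ * @endcode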
+ */
Expr indices(const std::vector<IndexType>& indicesVector, Expr indexee, int axis = -1) {
Shape shape;
shape.resize(indexee->shape().size());
@@ -393,24 +545,70 @@ public:
Type::uint32);
}
+ /**
+ * Construct a constant node filled with `1`.
+ * @param shape a struct that defines the shape of the constant tensor,
+ * e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
+ * @param elementType a scoped enumerator (enum class) that defines the element type, e.g., Type::float16
+ */
Expr ones(const Shape& shape, Type elementType) {
return constant(shape, inits::ones(), elementType);
}
+
+ /**
+ * Construct a constant node filled with `1` without a specified type,
+ * and the type is set to defaultElementType_.
+ * @param shape a struct that defines the shape of the constant tensor,
+ * e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
+ */
Expr ones(const Shape& shape) {
return constant(shape, inits::ones(), defaultElementType_);
}
+ /**
+ * Construct a constant node filled with `0`.
+ * @param shape a struct that defines the shape of the constant tensor,
+ * e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
+ * @param elementType a scoped enumerator (enum class) that defines the element type, e.g., Type::float16
+ */
Expr zeros(const Shape& shape, Type elementType) {
return constant(shape, inits::zeros(), elementType);
}
+
+ /**
+ * Construct a constant node filled with `0` without a specified type,
+ * and the type is set to defaultElementType_.
+ * @param shape a struct that defines the shape of the constant tensor,
+ * e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
+ */
Expr zeros(const Shape& shape) {
return constant(shape, inits::zeros(), defaultElementType_);
}
- // prob = dropProb, e.g. 0.1 means 90% of values are kept
+ /**
+ * Construct a dropout mask (a tensor of 0 and 1).
+ * @param dropProb a float that specifies the dropout probability.
+ * E.g., dropProb=0.1 means 90% of values are kept.
+ * @param shape a struct that defines the shape of the mask tensor,
+ * e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
+ * @param elementType a scoped enumerator (enum class) that defines the element type, e.g., Type::float16
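+ *
+ * E.g. (the shape is illustrative):
+ * @code
+ * auto mask = graph->dropoutMask(0.1f, {10, 512}, Type::float32); // ~90% of values kept
+ * @endcode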
+ */
Expr dropoutMask(float dropProb, const Shape& shape, Type elementType);
+
+ /**
+ * Construct a dropout mask (a tensor of 0 and 1) without a specified type,
+ * and the type is set to defaultElementType_.
+ * @param dropProb a float that specifies the dropout probability.
+ * E.g., dropProb=0.1 means 90% of values are kept.
+ * @param shape a struct that defines the shape of the mask tensor,
+ * e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
+ */
Expr dropoutMask(float dropProb, const Shape& shape);
+ /**
+ * Get the parameter object by name.
+ * @param name a string that specifies the name of the parameter object
+ */
Expr get(std::string name) {
if(!namespace_.empty())
name = namespace_ + "::" + name;
@@ -419,6 +617,11 @@ public:
return p;
}
+ /**
+ * Get the parameter object by name and type.
+ * @param name a string that specifies the name of the parameter object
+ * @param specifiedElementType a scoped enumerator (enum class) that defines the element type, e.g., Type::float16
+ */
Expr get(std::string name, Type specifiedElementType) {
if(!namespace_.empty())
name = namespace_ + "::" + name;
@@ -427,6 +630,10 @@ public:
return p;
}
+ /**
+ * Return the Parameters object related to the graph.
+ * The Parameters object holds the whole set of the parameter nodes.
+ */
Ptr<Parameters>& params() {
// There are no parameter objects, that's weird.
ABORT_IF(paramsByElementType_.empty(), "No parameter object has been created");
@@ -441,6 +648,10 @@ public:
return it->second;
}
+ /**
+ * Set default element type for the graph.
+ * The default type is used when a node's element type is not specified explicitly.
+ */
void setDefaultElementType(Type defaultElementType) {
ABORT_IF(!paramsByElementType_.empty() && defaultElementType != defaultElementType_,
"Parameter objects already exist, cannot change default type from {} to {}",
@@ -448,31 +659,53 @@ public:
defaultElementType_ = defaultElementType;
}
+ /**
+ * Add an expression node to the graph.
+ * @param node a pointer to an expression node
+ */
Expr add(Expr node);
+ /**
+ * Allocate memory for the forward pass of the given node.
+ * @param node a pointer to an expression node
+ */
void allocateForward(Expr node) {
if(tensors_)
tensors_->allocateForward(node);
}
+ /**
+ * Allocate memory for the backward pass of the given node.
+ * @param node a pointer to an expression node
+ */
void allocateBackward(Expr node) {
if(tensors_)
tensors_->allocateBackward(node);
}
+ /**
+ * Free the memory for a tensor object.
+ * @param tensor a reference to the tensor object
+ */
void free(const Tensor& tensor) {
if(tensors_)
tensors_->free(tensor);
}
- // Returns the memory allocator of the graph workspace, allocates row unstructured memory (but 256-byte aligned)
+ /**
+ * Returns the memory allocator of the graph workspace.
+ * Allocates raw unstructured memory (but 256-byte aligned).
+ */
Ptr<Allocator> allocator() { return tensors_->getAllocator(); } // @TODO: rename this to getAllocator();
- // Returns the tensor allocator of the graph workspace, different from above as proper tensor objects are allocated
+ /**
+ * Returns the tensor allocator of the graph workspace.
+ * Different from allocator() as proper tensor objects are allocated.
+ */
Ptr<TensorAllocator> getTensorAllocator() { return tensors_->getTensorAllocator(); }
+ /** Clear everything apart from parameters and memoized nodes */
void clear() {
- // clear everything apart from parameters and memoized nodes
count_ = 0;
nodesForward_.clear();
nodesBackward_.clear();
@@ -482,13 +715,17 @@ public:
tensors_->clear();
}
+ /** Set the flag indicating whether the graph is reloaded (true) or not */
void setReloaded(bool reloaded) { reloaded_ = reloaded; }
+ /** Set the flag indicating whether the graph throws a NaN exception (true) or not */
void setThrowNaN(bool throwNaN) { throwNaN_ = throwNaN; }
+
+ /** Get the flag indicating whether the graph throws a NaN exception (true) or not */
bool getThrowNaN() { return throwNaN_; }
public:
- // loading from array of io::Items
+ /** Load model (mainly parameter objects) from array of io::Items */
void load(std::vector<io::Item>& ioItems, bool markReloaded = true) {
setReloaded(false);
for(auto& item : ioItems) {
@@ -507,18 +744,24 @@ public:
setReloaded(true);
}
+ /** Load model by filename */
void load(const std::string& name, bool markReloaded = true) {
LOG(info, "Loading model from {}", name);
auto items = io::loadItems(name);
load(items, markReloaded);
}
+ /** Load model from a memory buffer (given a pointer to it) */
void load(const void* ptr, bool markReloaded = true) {
LOG(info, "Loading model from buffer at {}", ptr);
auto items = io::loadItems(ptr);
load(items, markReloaded);
}
+ /**
+ * Turn the model (given a pointer to a memory-mapped buffer) into a memory-mapped version
+ * by converting all the parameter objects to their memory-mapped counterpart, i.e., MappedParameters.
+ */
void mmap(const void* ptr, bool markReloaded = true) {
ABORT_IF(backend_->getDeviceId().type != DeviceType::cpu || !inferenceOnly_,
"Memory mapping only supported for CPU inference mode");
@@ -541,7 +784,6 @@ public:
}
}
-
// pre-populate parameters by type
for(auto& item : items) {
auto it1 = paramsByElementType_.find(item.type);
@@ -556,9 +798,19 @@ public:
}
public:
- // convert all parameters into an array of io::Item elements, for saving
+ /**
+ * Convert all parameters into an array of io::Item elements, for saving.
+ * @param ioItems an array of io::Item elements
+ * @param saveElementType the element type for saving
+ */
void save(std::vector<io::Item>& ioItems, Type saveElementType = Type::float32);
+ /**
+ * Save all parameters into a file (.npz or .bin).
+ * @param name a string that specifies the filename
+ * @param meta a string that specifies the name of the io::Item elements. If not specified, the parameter names are preserved.
+ * @param saveElementType the element type for saving
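+ *
+ * A minimal sketch of a save/load round trip (the filename is illustrative):
+ * @code
+ * graph->save("model.npz");   // write all parameters to disk
+ * graph->load("model.npz");   // read them back (typically into a freshly created graph)
+ * @endcode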
+ */
void save(const std::string& name, const std::string& meta = "", Type saveElementType = Type::float32) {
std::vector<io::Item> ioItems;
save(ioItems, saveElementType);
diff --git a/src/graph/node.cpp b/src/graph/node.cpp
index 256f7623..257a639f 100755
--- a/src/graph/node.cpp
+++ b/src/graph/node.cpp
@@ -27,11 +27,6 @@ void Node::free() {
}
}
-/**
- * Initialization for backward step of top node
- * in computation graph. Allocates memory and sets gradient
- * to 1 (df/df == 1).
- */
void Node::init_dependent() {
if(!adj_) {
graph()->allocateBackward(this);
@@ -39,12 +34,6 @@ void Node::init_dependent() {
}
}
-/**
- * Initialization for backward step of any non-top node
- * in computation graph. Allocates memory and sets gradient
- * to 0 for further accumulation of gradients from all
- * parents.
- */
void Node::set_zero_adjoint() {
if(!adj_) {
graph()->allocateBackward(this);
diff --git a/src/graph/node.h b/src/graph/node.h
index 9c5382d4..d7b328d6 100644
--- a/src/graph/node.h
+++ b/src/graph/node.h
@@ -28,13 +28,13 @@ protected:
std::vector<Expr> children_;
Weak<ExpressionGraph> graph_;
- Shape shape_{1, 1, 1, 1};
- Type valueType_{Type::float32};
+ Shape shape_{1, 1, 1, 1}; // defines the dimensionality of the node (for tensors)
+ Type valueType_{Type::float32}; // defines the element type of the node (for tensors)
std::string name_{"none"};
- Tensor val_{nullptr};
- Tensor adj_{nullptr};
+ Tensor val_{nullptr}; // the resulting new tensor in forward pass
+ Tensor adj_{nullptr}; // the accumulated gradients (a tensor) in backward pass
bool markedForDebug_{false};
std::string debugMessage_;
@@ -105,9 +105,19 @@ public:
virtual void free() override;
virtual void init() override {};
-
+ /**
+ * Initialization for backward step of top node
+ * in computation graph. Allocates memory and sets gradient
+ * to 1 (df/df == 1).
+ */
virtual void init_dependent() override;
+ /**
+ * Initialization for backward step of any non-top node
+ * in computation graph. Allocates memory and sets gradient
+ * to 0 for further accumulation of gradients from all
+ * parents.
+ */
virtual void set_zero_adjoint() override;
virtual Tensor& val() override { return val_; };
diff --git a/src/graph/node_initializers.cpp b/src/graph/node_initializers.cpp
index 531cfaad..c6fb622f 100755
--- a/src/graph/node_initializers.cpp
+++ b/src/graph/node_initializers.cpp
@@ -98,9 +98,10 @@ Ptr<NodeInitializer> glorotUniform(bool fanIn, bool fanOut, float scalingFactor)
return fromLambda([fanIn, fanOut, scalingFactor](Tensor t) {
float scale = sqrtf(6.0f / (t->shape()[-2] + t->shape()[-1]));
if(fanIn && !fanOut)
- scale = sqrtf(3.0f / t->shape()[-2]); // results in columns of matrix to be ~unit length
+ scale = sqrtf(3.0f / t->shape()[-2]); // fanIn mode: the scale of the tensor is adapted to the input variance
+ // results in columns of the matrix being in ~unit range
if(!fanIn && fanOut)
- scale = sqrtf(3.0f / t->shape()[-1]);
+ scale = sqrtf(3.0f / t->shape()[-1]); // fanOut mode: the scale of the tensor is adapted to the output variance
scale *= scalingFactor;
@@ -112,9 +113,9 @@ Ptr<NodeInitializer> glorotNormal(bool fanIn, bool fanOut, float scalingFactor)
return fromLambda([fanIn, fanOut, scalingFactor](Tensor t) {
float scale = sqrtf(2.0f / (t->shape()[-2] + t->shape()[-1]));
if(fanIn && !fanOut)
- scale = sqrtf(1.0f / t->shape()[-2]);
+ scale = sqrtf(1.0f / t->shape()[-2]); // fanIn mode: the scale of the tensor is adapted to the input variance
if(!fanIn && fanOut)
- scale = sqrtf(1.0f / t->shape()[-1]);
+ scale = sqrtf(1.0f / t->shape()[-1]); // fanOut mode: the scale of the tensor is adapted to the output variance
scale *= scalingFactor;
@@ -170,7 +171,7 @@ Ptr<NodeInitializer> fromWord2vec(const std::string& file,
bool normalize /*= false*/) {
return fromLambda([file, dimVoc, dimEmb, normalize](Tensor t) {
auto embs = Word2VecReader().read(file, dimVoc, dimEmb);
- if(normalize) {
+ if(normalize) { // scaling to unit length:
float norm = 0;
for(auto e : embs)
norm += e * e;
diff --git a/src/graph/node_initializers.h b/src/graph/node_initializers.h
index 9dc01a0d..7cdb4183 100755
--- a/src/graph/node_initializers.h
+++ b/src/graph/node_initializers.h
@@ -11,17 +11,18 @@
namespace marian {
class ExpressionGraph; // Forward declaration
-
+/**
+ * The inits namespace.
+ * Declares the NodeInitializer class and all the available functions to initialise a node.
+*/
namespace inits {
/**
* Base class for specialized NodeInitializers.
- *
* A NodeInitializer is a functor that is associated with parameters
- * and constants, and is invoked on a tensor during node intialization.
- * You need to override NodeIntializer::apply(Tensor) with your own
- * functionality or use a fromLambda intializer.
- *
+ * and constants, and is invoked on a tensor during node initialization.
+ * You need to override NodeInitializer::apply(Tensor) with your own
+ * functionality or use a fromLambda initializer.
* See node_initializers.cpp for examples.
*/
class NodeInitializer {
@@ -35,155 +36,242 @@ public:
};
/**
- * Use a lambda function of form [](Tensor t) { do something with t } to initalize tensor
+ * Use a lambda function of the form [](Tensor t) { do something with t } to initialize a tensor.
+ * @param func the functor applied to the tensor
*/
Ptr<NodeInitializer> fromLambda(std::function<void(Tensor)>&& func);
/**
- * Use a lambda function of form [](Tensor t) { do something with t } to initalize tensor
- * Create temporary tensor of Type intermediateType first, initialize and then copy/convert to actual Tensor
- * Useful for functions that can only operate on a specific type of tensor
+ * Use a lambda function of the form [](Tensor t) { do something with t } to initialize a tensor.
+ * Create a temporary tensor of Type intermediateType first, initialize it and then copy/convert to the actual Tensor.
+ * Useful for functions that can only operate on a specific type of tensor.
*/
Ptr<NodeInitializer> fromLambda(std::function<void(Tensor)>&& func, Type intermediateType);
/**
- * Initialize tensor with given value
- *
- * Creates a NodeInitializer that will intialize the given tensor
+ * Initialize tensor with given value.
+ * Creates a NodeInitializer that will initialize the given tensor
* with `value`. Works with any underlying numeric tensor type.
- *
* @return A NodeInitializer
*/
Ptr<NodeInitializer> fromValue(float value);
/**
- * Fill tensor with `0`
- *
- * Creates a NodeInitializer that will intialize the given tensor
+ * Fill tensor with `0`.
+ * Creates a NodeInitializer that will initialize the given tensor
* with `0`. Works with any underlying numeric tensor type.
- *
* @return A NodeInitializer
*/
static Ptr<NodeInitializer> zeros() { return fromValue(0.0f); }
/**
- * Fill tensor with `1`
- *
- * Creates a NodeInitializer that will intialize the given tensor
+ * Fill tensor with `1`.
+ * Creates a NodeInitializer that will initialize the given tensor
* with `1`. Works with any underlying numeric tensor type.
- *
* @return A NodeInitializer
*/
static Ptr<NodeInitializer> ones() { return fromValue(1.0f); }
/**
* Set diagonal of two dimensional quadratic matrix to `value`.
- *
- * Sets all values of the tensor to 0 and intializes the diagonal with
+ * Sets all values of the tensor to 0 and initializes the diagonal with
* the given `value`. If no value is specified `1` is used by default.
- *
* @return A NodeInitializer
*/
Ptr<NodeInitializer> eye(float value = 1.f);
/**
- * Intialize tensor with normally distributed random numbers
- *
- * Be default this generates floating point numbers from the
+ * Initialize tensor with normally distributed random numbers.
+ * By default this generates floating point numbers from the
* normal distribution Normal(0, 1) unless specified differently.
- *
* If compiled with `CUDA`, `marian` will use the `cuRand` library
* for both, GPU and CPU computation. The random sequences generated
* are the same on both devices.
- *
* If `marian` is compiled without `CUDA`, a random generator
* from the C++ standard library is used. These random generators
* do not have the same random sequences.
- *
* @return A NodeInitializer
*/
Ptr<NodeInitializer> normal(float mean = 0.f, float stddev = 1.f);
/**
- * Intialize tensor with uniformly distributed random numbers
- *
- * Be default this generates floating point numbers from the
+ * Initialize tensor with uniformly distributed random numbers.
+ * By default this generates floating point numbers from the
* uniform distribution Uniform(0, 1) unless specified differently.
- *
* If compiled with `CUDA`, `marian` will use the `cuRand` library
* for both, GPU and CPU computation. The random sequences generated
* are the same on both devices.
- *
* If `marian` is compiled without `CUDA`, a random generator
* from the C++ standard library is used. These random generators
* do not have the same random sequences.
- *
+ * @param a the lower bound of the interval
+ * @param b the upper bound of the interval
* @return A NodeInitializer
*/
Ptr<NodeInitializer> uniform(float a = 0.f, float b = 1.f);
-// @TODO: add documentation
+/**
+ * Initialize tensor with random numbers from Bernoulli Distribution.
+ * The Bernoulli distribution is the discrete probability distribution of
+ * a random variable which takes value `1` with probability p, and
+ * value `0` with probability (1-p).
+ * By default this function generates a tensor of 0 and 1 with probability p
+ * if bernoulli(p) is called. We offer `scale` and `shift` parameters which
+ * can map {0,1} to {0,1}*`scale`+`shift`.
+ * E.g., bernoulli(0.5f, 2.f, -1.f) where p=0.5f, scale=2.f, shift=-1.f.
+ * {0,1} is mapped to {0,1}*2+(-1)= {-1,1}. It generates a tensor composed of
+ * 50% of 1 and 50% of -1.
+ * @return A NodeInitializer
+ */
Ptr<NodeInitializer> bernoulli(float p, float scale = 1.f, float shift = 0.f);
-// @TODO: add documentation
+/**
+ * Initialize tensor with random numbers from Glorot uniform distribution.
+ * The <a href=http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf>Glorot uniform</a>,
+ * also called Xavier uniform, is designed to keep the scale of
+ * the gradients roughly the same in all layers.
+ * This function offers three variants (modes).
+ * The values of the tensor are sampled from Uniform(-x*scale, x*scale):
+ * - when fanIn=false and fanOut=false (by default):
+ * x = sqrt(6 / (in + out))
+ * - when fanIn=true and fanOut=false (fanIn mode):
+ * x = sqrt(3 / in)
+ * - when fanIn=false and fanOut=true (fanOut mode):
+ * x = sqrt(3 / out)
+ * where `in` is the number of input units in the tensor, `out` is the number of output units.
+ * `scale` is used to change the range of Uniform distribution.
+ * @return A NodeInitializer
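+ *
+ * A minimal usage sketch (the name and dimensions are illustrative):
+ * @code
+ * // fanIn mode: the scale is derived from the input dimension only
+ * auto W = graph->param("W", {512, 1024}, inits::glorotUniform(/*fanIn=*/true, /*fanOut=*/false));
+ * @endcode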
+ */
Ptr<NodeInitializer> glorotUniform(bool fanIn = false, bool fanOut = false, float scale = 1.f);
-// @TODO: add documentation
+/**
+ * Initialize tensor with random numbers from Glorot Normal distribution.
+ * Similar to function glorotUniform(), this function adopts Normal distribution instead of
+ * uniform distribution.
+ * This function offers three variants (modes).
+ * The values of the tensor are sampled from Normal(0, x*scale), i.e. with standard deviation x*scale:
+ * - when fanIn=false and fanOut=false (by default):
+ * x = sqrt(2 / (in + out))
+ * - when fanIn=true and fanOut=false (fanIn mode):
+ * x = sqrt(1 / in)
+ * - when fanIn=false and fanOut=true (fanOut mode):
+ * x = sqrt(1 / out)
+ * where `in` is the number of input units in the tensor, `out` is the number of output units.
+ * `scale` is used to change the range of Normal distribution.
+ * @return A NodeInitializer
+ */
Ptr<NodeInitializer> glorotNormal(bool fanIn = false, bool fanOut = false, float scale = 1.f);
-// @TODO: add documentation
-Ptr<NodeInitializer> dropout(float dropoutProbabilty);
+/**
+ * Initialize a dropout mask (a tensor of 0 and 1) with given dropout probability.
+ * <a href=https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf>Dropout</a>
+ * is proposed as a technique to prevent Neural Networks from overfitting.
+ * @param dropoutProbability a float that defines the dropout probability.
+ * E.g., dropoutProbability=0.1 means 90% of values are kept.
+ * @return A NodeInitializer
+ */
+Ptr<NodeInitializer> dropout(float dropoutProbability);
/**
- * Intialize with gumbel noise, i.e. -log(-log(u)) where u ~ Uniform(0 + eps, 1 - eps)
- *
+ * Initialize with gumbel noise, i.e. -log(-log(u)) where u ~ Uniform(0 + eps, 1 - eps).
+ * @param eps a small epsilon value that protects from log(0)
* @return A NodeInitializer
*/
Ptr<NodeInitializer> gumbel(float eps = 1e-5f);
-// @TODO: add documentation
+/**
+ * Initialize tensor by *copying* from the given vector.
+ * Creates a NodeInitializer that will initialize the tensor
+ * by *copying* the values from the given vector
+ * @param v vector
+ * @return A NodeInitializer
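+ *
+ * A minimal usage sketch (values and shape are illustrative):
+ * @code
+ * std::vector<float> v = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
+ * auto c = graph->constant({2, 3}, inits::fromVector(v)); // copies v into a 2x3 constant
+ * @endcode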
+ */
template <typename T>
Ptr<NodeInitializer> fromVector(const std::vector<T>& v);
+
+/**
+ * Initialize tensor by *moving* from the given vector.
+ * Creates a NodeInitializer that will initialize the tensor by *moving* the values
+ * from the given vector into this tensor, and the given vector may be emptied.
+ * This version is the <a href=https://en.cppreference.com/w/cpp/language/reference>
+ * rvalue reference</a> overloading.
+ * @param v vector
+ * @return A NodeInitializer
+ */
template <typename T>
Ptr<NodeInitializer> fromVector(std::vector<T>&& v);
-// @TODO: add documentation
+/**
+ * Initialize tensor from a given sparse vector.
+ * Creates a NodeInitializer that will initialize the tensor from a given
+ * sparse vector (stored in a std::pair). The resulting tensor is first filled
+ * with `1e-6` (a small placeholder for the elements not listed in the sparse vector),
+ * then the values of the given sparse vector are set.
+ * @param v the sparse vector is stored in `std::pair`:
+ * - the first object (v.first) holds the indexes (in a vector)
+ * - the second object (v.second) holds the corresponding values (in a vector).
+ * This means the value of the resulting tensor at index v.first[i] is v.second[i].
+ * @return A NodeInitializer
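+ *
+ * A minimal usage sketch (indices, values and shape are illustrative):
+ * @code
+ * std::pair<std::vector<size_t>, std::vector<float>> sv({0, 4}, {2.5f, -1.0f});
+ * auto c = graph->constant({1, 6}, inits::fromSparseVector(sv)); // c[0]=2.5, c[4]=-1.0, the rest ~1e-6
+ * @endcode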
+ */
Ptr<NodeInitializer> fromSparseVector(std::pair<std::vector<size_t>, std::vector<float>>& v);
-// @TODO: add documentation
+/**
+ * Initialize tensor by copying from the given io::Item.
+ * Creates a NodeInitializer that will initialize the tensor by copying the values
+ * from the given io::Item. If this io::Item is a memory-mapped item, then the
+ * function will set the memory region pointing to this item. If this io::Item is
+ * a regular item, then the function will copy the values from this item.
+ * @return A NodeInitializer
+ */
Ptr<NodeInitializer> fromItem(const io::Item& item);
-// @TODO: add documentation
+/**
+ * Initialize tensor by copying from the given tensor.
+ * Creates a NodeInitializer that will initialize the tensor
+ * by copying the values from the given tensor.
+ * @return A NodeInitializer
+ */
Ptr<NodeInitializer> fromTensor(Tensor tensor);
-// @TODO: add documentation
+/**
+ * Initialize tensor from a file.
+ * Creates a NodeInitializer that will initialize the tensor
+ * by copying the values from the given file. This function is
+ * mainly used for loading embedding vectors from a file.
+ * @param file filename
+ * @param dimVoc the number of words in the vocabulary
+ * @param dimEmb the length of embedding vectors
+ * @param normalize a flag indicating whether the values are normalized.
+ * Here we adopt the method of <a
+ * href=https://en.wikipedia.org/wiki/Feature_scaling#Scaling_to_unit_length>
+ * scaling to unit length</a>, i.e., dividing each element by the Euclidean length of the vector.
+ * @return A NodeInitializer
+ */
Ptr<NodeInitializer> fromWord2vec(const std::string& file,
int dimVoc,
int dimEmb,
bool normalize = false);
/**
+ * Computes Google's sinusoidal position embeddings.
* Computes Google's Transformer-style sinusoidal position embeddings
* starting from position 'start' taking into account batch and time
- * dimensions of the tensor.
- *
- * Expected tensor layout {-2: time, -1: model}
- *
- * Usually gets later reshaped to {time, 1, model} and
- * added with a broadcast to learned embeddings. Positional
- * embeddings are the same for each batch entry and change
- * over time steps.
+ * dimensions of the tensor. Expected tensor layout {-2: time, -1: model}.
+ * Usually gets later reshaped to {time, 1, model} and added with a broadcast
+ * to learned embeddings. Positional embeddings are the same for each batch
+ * entry and change over time steps.
*/
Ptr<NodeInitializer> sinusoidalPositionEmbeddings(int start);
/**
- * Computes a random rotation matrix for LSH hashing. This is part
- * of a hash function. The values are orthonormal and computed via
+ * Computes a random rotation matrix for LSH hashing.
+ * This is part of a hash function. The values are orthonormal and computed via
* QR decomposition. Same seed results in same random rotation.
*/
Ptr<NodeInitializer> randomRotation(size_t seed = Config::seed);
/**
+ * Computes the equivalent of Python's range().
* Computes a range from begin to end-1, like Python's range().
* The constant being initialized must have one dimension that matches
* the number of elements being generated, while any other dimension must be 1.
diff --git a/src/graph/node_operators.h b/src/graph/node_operators.h
index c3dde73c..4f9582db 100644
--- a/src/graph/node_operators.h
+++ b/src/graph/node_operators.h
@@ -5,7 +5,13 @@
#include "tensors/tensor.h"
namespace marian {
-
+/**
+ * A constant node for the graph.
+ * A constant node is actually a constant tensor whose value is
+ * immutable during training. A ConstantNode instance is usually
+ * used as an input. To construct a constant node in the
+ * graph, we use the constant() function of the ExpressionGraph class.
+ */
struct ConstantNode : public Node {
ConstantNode(Ptr<ExpressionGraph> graph,
const Shape& shape,
@@ -35,7 +41,13 @@ private:
Ptr<inits::NodeInitializer> init_;
bool initialized_;
};
-
+/**
+ * A parameter node for the graph.
+ * A parameter node is used to store model parameters whose value can be
+ * changed during training, such as weights and biases. To construct
+ * a parameter node in the graph, we use the param() function of the
+ * ExpressionGraph class.
+ */
struct ParamNode : public Node {
ParamNode(Ptr<ExpressionGraph> graph,
const Shape& shape,
diff --git a/src/tensors/tensor.h b/src/tensors/tensor.h
index 4c214b20..c959619c 100755
--- a/src/tensors/tensor.h
+++ b/src/tensors/tensor.h
@@ -21,6 +21,12 @@ namespace io {
struct Item;
}
+/**
+ * Main implementation of a <a href="https://en.wikipedia.org/wiki/Tensor">tensor</a>,
+ * a multi-dimensional matrix containing elements of a single data type.
+ * TensorBase contains the data, data type, pointer to
+ * memory region, shape, backend info and other attributes.
+ */
class TensorBase {
MemoryPiece::PtrType memory_;
Shape shape_;