diff options
author | Sergey Zagoruyko <zagoruyko2@gmail.com> | 2016-04-13 16:01:55 +0300 |
---|---|---|
committer | Sergey Zagoruyko <zagoruyko2@gmail.com> | 2016-04-13 19:01:48 +0300 |
commit | 70adafcae259f67129c2de6e1048594aa0283e59 (patch) | |
tree | 432358dc7274ec86132e2c3bb40b655b0f4a4f61 /ffi.lua | |
parent | 0a040cf2bb3d0f0cdf217c8e92efaddf29ed2efc (diff) |
R5 rebase
Diffstat (limited to 'ffi.lua')
-rw-r--r-- | ffi.lua | 1226 |
1 files changed, 770 insertions, 456 deletions
@@ -1,7 +1,15 @@ local ffi = require 'ffi' ffi.cdef[[ -size_t cudnnGetVersion(); + + +typedef enum { + CUDNN_MAJOR = 5, + CUDNN_MINOR = 0, + CUDNN_PATCHLEVEL = 4, + CUDNN_VERSION = (CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL) +} cudnnVerFakeEnum; + struct cudnnContext; typedef struct cudnnContext *cudnnHandle_t; @@ -25,22 +33,24 @@ typedef enum CUDNN_STATUS_LICENSE_ERROR = 10 } cudnnStatus_t; +/* human-readable error messages*/ const char * cudnnGetErrorString(cudnnStatus_t status); -typedef struct CUstream_st *cudaStream_t; -cudnnStatus_t cudnnCreate(cudnnHandle_t *handle); -cudnnStatus_t cudnnDestroy(cudnnHandle_t handle); -cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId); -cudnnStatus_t cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId); +cudnnStatus_t cudnnCreate (cudnnHandle_t *handle); +cudnnStatus_t cudnnDestroy (cudnnHandle_t handle); +cudnnStatus_t cudnnSetStream (cudnnHandle_t handle, cudaStream_t streamId); +cudnnStatus_t cudnnGetStream (cudnnHandle_t handle, cudaStream_t *streamId); /* Data structures to represent Image/Filter and the Neural Network Layer */ -typedef struct cudnnTensorStruct* cudnnTensorDescriptor_t; -typedef struct cudnnConvolutionStruct* cudnnConvolutionDescriptor_t; -typedef struct cudnnPoolingStruct* cudnnPoolingDescriptor_t; -typedef struct cudnnFilterStruct* cudnnFilterDescriptor_t; -typedef struct cudnnLRNStruct* cudnnLRNDescriptor_t; -typedef struct cudnnActivationStruct* cudnnActivationDescriptor_t; +typedef struct cudnnTensorStruct* cudnnTensorDescriptor_t; +typedef struct cudnnConvolutionStruct* cudnnConvolutionDescriptor_t; +typedef struct cudnnPoolingStruct* cudnnPoolingDescriptor_t; +typedef struct cudnnFilterStruct* cudnnFilterDescriptor_t; +typedef struct cudnnLRNStruct* cudnnLRNDescriptor_t; +typedef struct cudnnActivationStruct* cudnnActivationDescriptor_t; +typedef struct cudnnSpatialTransformerStruct* cudnnSpatialTransformerDescriptor_t; +typedef struct cudnnOpTensorStruct* cudnnOpTensorDescriptor_t; /* * CUDNN data type */ @@ -75,20 +85,20 @@ typedef enum cudnnStatus_t cudnnSetTensor4dDescriptor( cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, - cudnnDataType_t dataType, /* image data type */ - int n, /* number of inputs (batch size) */ - int c, /* number of input feature maps */ - int h, /* height of input section */ - int w ); /* width of input section */ + cudnnDataType_t dataType, /* image data type*/ + int n, /* number of inputs (batch size)*/ + int c, /* number of input feature maps*/ + int h, /* height of input section*/ + int w ); /* width of input section*/ cudnnStatus_t cudnnSetTensor4dDescriptorEx( cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t dataType, /* image data type */ - int n, /* number of inputs (batch size) */ - int c, /* number of input feature maps */ - int h, /* height of input section */ - int w, /* width of input section */ + cudnnDataType_t dataType, /* image data type*/ + int n, /* number of inputs (batch size)*/ + int c, /* number of input feature maps*/ + int h, /* height of input section*/ + int w, /* width of input section*/ int nStride, int cStride, int hStride, @@ -96,11 +106,11 @@ cudnnStatus_t cudnnSetTensor4dDescriptorEx( cudnnStatus_t cudnnGetTensor4dDescriptor( const cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t *dataType, /* image data type */ - int *n, /* number of inputs (batch size) */ - int *c, /* number of input feature maps */ - int *h, /* height of input section */ - int *w, /* width of input section */ + cudnnDataType_t *dataType, /* image data type*/ + int *n, /* number of inputs (batch size)*/ + int *c, /* number of input feature maps*/ + int *h, /* height of input section*/ + int *w, /* width of input section*/ int *nStride, int *cStride, int *hStride, @@ -159,55 +169,69 @@ cudnnStatus_t cudnnTransformTensor( const cudnnTensorDescriptor_t yDesc, void *y ); -typedef enum -{ - /* add one image to every feature maps of each input */ - CUDNN_ADD_IMAGE = 0, - CUDNN_ADD_SAME_HW = 0, - - /* add a set of feature maps to a batch of inputs : tensorBias has n=1 , same number of features as x and y */ - CUDNN_ADD_FEATURE_MAP = 1, - CUDNN_ADD_SAME_CHW = 1, - /* add a tensor of size 1,c,1,1 to every corresponding point of n,c,h,w input */ - CUDNN_ADD_SAME_C = 2, - - /* add 2 tensors with same n,c,h,w */ - CUDNN_ADD_FULL_TENSOR = 3 -} cudnnAddMode_t; - -/* Tensor Bias addition : y = alpha * b + beta * y */ +/* Tensor Bias addition : C = alpha * A + beta * C */ cudnnStatus_t cudnnAddTensor( cudnnHandle_t handle, const void *alpha, - const cudnnTensorDescriptor_t bDesc, - const void *b, + const cudnnTensorDescriptor_t aDesc, + const void *A, const void *beta, - cudnnTensorDescriptor_t yDesc, - void *y ); + const cudnnTensorDescriptor_t cDesc, + void *C ); -/* cudnnAddTensor_v3 is now mapped to cudnnAddTensor - and will be removed at the same time as cudnnAddTensor_v2 - Use cudnnAddTensor instead - */ -cudnnStatus_t cudnnAddTensor_v3( +/* +* CUDNN OpTensor op type +*/ +typedef enum +{ + CUDNN_OP_TENSOR_ADD = 0, + CUDNN_OP_TENSOR_MUL = 1, + CUDNN_OP_TENSOR_MIN = 2, + CUDNN_OP_TENSOR_MAX = 3, +} cudnnOpTensorOp_t; + +cudnnStatus_t cudnnCreateOpTensorDescriptor( + cudnnOpTensorDescriptor_t *opTensorDesc ); + +cudnnStatus_t cudnnSetOpTensorDescriptor( + cudnnOpTensorDescriptor_t opTensorDesc, + cudnnOpTensorOp_t opTensorOp, + cudnnDataType_t opTensorCompType, + cudnnNanPropagation_t opTensorNanOpt ); + +cudnnStatus_t cudnnGetOpTensorDescriptor( + const cudnnOpTensorDescriptor_t opTensorDesc, + cudnnOpTensorOp_t *opTensorOp, + cudnnDataType_t *opTensorCompType, + cudnnNanPropagation_t *opTensorNanOpt ); + +cudnnStatus_t cudnnDestroyOpTensorDescriptor( + cudnnOpTensorDescriptor_t opTensorDesc ); + +/* Tensor Bias operation : C = op( alpha1 * A, alpha2 * B ) + beta * C */ +cudnnStatus_t cudnnOpTensor( cudnnHandle_t handle, - const void *alpha, + const cudnnOpTensorDescriptor_t opTensorDesc, + const void *alpha1, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *alpha2, const cudnnTensorDescriptor_t bDesc, - const void *b, + const void *B, const void *beta, - cudnnTensorDescriptor_t yDesc, - void *y ); + const cudnnTensorDescriptor_t cDesc, + void *C ); /* Set all values of a tensor to a given value : y[i] = value[0] */ -cudnnStatus_t cudnnSetTensor( +cudnnStatus_t cudnnSetTensor( cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr ); /* Scale all values of a tensor by a given factor : y[i] = alpha * y[i] */ -cudnnStatus_t cudnnScaleTensor( +cudnnStatus_t cudnnScaleTensor( cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, @@ -224,53 +248,33 @@ typedef enum /* Create an instance of FilterStruct */ -cudnnStatus_t cudnnCreateFilterDescriptor( +cudnnStatus_t cudnnCreateFilterDescriptor( cudnnFilterDescriptor_t *filterDesc ); -cudnnStatus_t cudnnSetFilter4dDescriptor( - cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t dataType, /* image data type */ - int k, /* number of output feature maps */ - int c, /* number of input feature maps */ - int h, /* height of each input filter */ - int w ); /* width of each input fitler */ -cudnnStatus_t cudnnSetFilter4dDescriptor_v4( +cudnnStatus_t cudnnSetFilter4dDescriptor( cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t dataType, /* image data type */ + cudnnDataType_t dataType, /* image data type*/ cudnnTensorFormat_t format, - int k, /* number of output feature maps */ - int c, /* number of input feature maps */ - int h, /* height of each input filter */ - int w ); /* width of each input fitler */ + int k, /* number of output feature maps*/ + int c, /* number of input feature maps*/ + int h, /* height of each input filter*/ + int w ); /* width of each input filter*/ -cudnnStatus_t cudnnGetFilter4dDescriptor( - const cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t *dataType, /* image data type */ - int *k, /* number of output feature maps */ - int *c, /* number of input feature maps */ - int *h, /* height of each input filter */ - int *w ); /* width of each input fitler */ -cudnnStatus_t cudnnGetFilter4dDescriptor_v4( +cudnnStatus_t cudnnGetFilter4dDescriptor( const cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t *dataType, /* image data type */ + cudnnDataType_t *dataType, /* image data type*/ cudnnTensorFormat_t *format, - int *k, /* number of output feature maps */ - int *c, /* number of input feature maps */ - int *h, /* height of each input filter */ - int *w ); /* width of each input fitler */ + int *k, /* number of output feature maps*/ + int *c, /* number of input feature maps*/ + int *h, /* height of each input filter*/ + int *w ); /* width of each input filter*/ -cudnnStatus_t cudnnSetFilterNdDescriptor( - cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t dataType, /* image data type */ - int nbDims, - const int filterDimA[] ); - -cudnnStatus_t cudnnSetFilterNdDescriptor_v4( +cudnnStatus_t cudnnSetFilterNdDescriptor( cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t dataType, /* image data type */ + cudnnDataType_t dataType, /* image data type*/ cudnnTensorFormat_t format, int nbDims, const int filterDimA[] ); @@ -278,47 +282,63 @@ cudnnStatus_t cudnnSetFilterNdDescriptor_v4( cudnnStatus_t cudnnGetFilterNdDescriptor( const cudnnFilterDescriptor_t filterDesc, int nbDimsRequested, - cudnnDataType_t *dataType, - int *nbDims, - int filterDimA[] ); - -cudnnStatus_t cudnnGetFilterNdDescriptor_v4( - const cudnnFilterDescriptor_t filterDesc, - int nbDimsRequested, - cudnnDataType_t *dataType, + cudnnDataType_t *dataType, /* image data type*/ cudnnTensorFormat_t *format, int *nbDims, int filterDimA[] ); -cudnnStatus_t cudnnDestroyFilterDescriptor( cudnnFilterDescriptor_t filterDesc); + +cudnnStatus_t cudnnDestroyFilterDescriptor( + cudnnFilterDescriptor_t filterDesc ); /* Create an instance of convolution descriptor */ -cudnnStatus_t cudnnCreateConvolutionDescriptor( +cudnnStatus_t cudnnCreateConvolutionDescriptor( cudnnConvolutionDescriptor_t *convDesc ); -cudnnStatus_t cudnnSetConvolution2dDescriptor( +cudnnStatus_t cudnnSetConvolution2dDescriptor( cudnnConvolutionDescriptor_t convDesc, - int pad_h, /* zero-padding height */ - int pad_w, /* zero-padding width */ - int u, /* vertical filter stride */ - int v, /* horizontal filter stride */ - int upscalex, /* upscale the input in x-direction */ - int upscaley, /* upscale the input in y-direction */ + int pad_h, /* zero-padding height*/ + int pad_w, /* zero-padding width*/ + int u, /* vertical filter stride*/ + int v, /* horizontal filter stride*/ + int upscalex, /* upscale the input in x-direction*/ + int upscaley, /* upscale the input in y-direction*/ cudnnConvolutionMode_t mode ); - -cudnnStatus_t cudnnGetConvolution2dDescriptor( +cudnnStatus_t cudnnSetConvolution2dDescriptor_v5( cudnnConvolutionDescriptor_t convDesc, + int pad_h, /* zero-padding height*/ + int pad_w, /* zero-padding width*/ + int u, /* vertical filter stride*/ + int v, /* horizontal filter stride*/ + int upscalex, /* upscale the input in x-direction*/ + int upscaley, /* upscale the input in y-direction*/ + cudnnConvolutionMode_t mode, + cudnnDataType_t dataType + ); + +cudnnStatus_t cudnnGetConvolution2dDescriptor( const cudnnConvolutionDescriptor_t convDesc, - int *pad_h, /* zero-padding height */ - int *pad_w, /* zero-padding width */ - int *u, /* vertical filter stride */ - int *v, /* horizontal filter stride */ - int *upscalex, /* upscale the input in x-direction */ - int *upscaley, /* upscale the input in y-direction */ + int *pad_h, /* zero-padding height*/ + int *pad_w, /* zero-padding width*/ + int *u, /* vertical filter stride*/ + int *v, /* horizontal filter stride*/ + int *upscalex, /* upscale the input in x-direction*/ + int *upscaley, /* upscale the input in y-direction*/ cudnnConvolutionMode_t *mode ); +cudnnStatus_t cudnnGetConvolution2dDescriptor_v5( const cudnnConvolutionDescriptor_t convDesc, + int* pad_h, /* zero-padding height*/ + int* pad_w, /* zero-padding width*/ + int* u, /* vertical filter stride*/ + int* v, /* horizontal filter stride*/ + int* upscalex, /* upscale the input in x-direction*/ + int* upscaley, /* upscale the input in y-direction*/ + cudnnConvolutionMode_t* mode, + cudnnDataType_t *dataType + ); + /* Helper function to return the dimensions of the output tensor given a convolution descriptor */ -cudnnStatus_t cudnnGetConvolution2dForwardOutputDim( +cudnnStatus_t cudnnGetConvolution2dForwardOutputDim( const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t inputTensorDesc, const cudnnFilterDescriptor_t filterDesc, @@ -328,16 +348,16 @@ cudnnStatus_t cudnnGetConvolution2dForwardOutputDim( int *w ); -cudnnStatus_t cudnnSetConvolutionNdDescriptor( +cudnnStatus_t cudnnSetConvolutionNdDescriptor( cudnnConvolutionDescriptor_t convDesc, int arrayLength, /* nbDims-2 size */ const int padA[], const int filterStrideA[], const int upscaleA[], cudnnConvolutionMode_t mode, - cudnnDataType_t dataType ); /* convolution data type */ + cudnnDataType_t dataType ); /* convolution data type*/ -cudnnStatus_t cudnnGetConvolutionNdDescriptor( +cudnnStatus_t cudnnGetConvolutionNdDescriptor( const cudnnConvolutionDescriptor_t convDesc, int arrayLengthRequested, int *arrayLength, @@ -345,36 +365,11 @@ cudnnStatus_t cudnnGetConvolutionNdDescriptor( int strideA[], int upscaleA[], cudnnConvolutionMode_t *mode, - cudnnDataType_t *dataType ); /* convolution data type */ + cudnnDataType_t *dataType ); /* convolution data type*/ -/* cudnnSetConvolutionNdDescriptor_v3 is now mapped to cudnnSetConvolutionNdDescriptor - and will be removed at the same time than cudnnSetConvolutionNdDescriptor_v2 - Use cudnnSetConvolutionNdDescriptor instead */ -cudnnStatus_t cudnnSetConvolutionNdDescriptor_v3( - cudnnConvolutionDescriptor_t convDesc, - int arrayLength, /* nbDims-2 size */ - const int padA[], - const int filterStrideA[], - const int upscaleA[], - cudnnConvolutionMode_t mode, - cudnnDataType_t dataType ); /* convolution data type */ - -/* cudnnGetConvolutionNdDescriptor_v3 is now mapped to cudnnGetConvolutionNdDescriptor - and will be removed at the same time thancudnnGetConvolutionNdDescriptor_v2 - Use cudnnGetConvolutionNdDescriptor instead - */ -cudnnStatus_t cudnnGetConvolutionNdDescriptor_v3( - const cudnnConvolutionDescriptor_t convDesc, - int arrayLengthRequested, - int *arrayLength, - int padA[], - int strideA[], - int upscaleA[], - cudnnConvolutionMode_t *mode, - cudnnDataType_t *dataType ); /* convolution data type */ /* Helper function to return the dimensions of the output tensor given a convolution descriptor */ -cudnnStatus_t cudnnGetConvolutionNdForwardOutputDim( +cudnnStatus_t cudnnGetConvolutionNdForwardOutputDim( const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t inputTensorDesc, const cudnnFilterDescriptor_t filterDesc, @@ -402,8 +397,8 @@ typedef enum CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2, CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3, CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4, - /* CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_BATCHED_GEMM = 100, */ - CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5 + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6 } cudnnConvolutionFwdAlgo_t; typedef struct { @@ -423,14 +418,30 @@ cudnnStatus_t cudnnFindConvolutionForwardAlgorithm( int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults ); +cudnnStatus_t cudnnFindConvolutionForwardAlgorithmEx( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + void *y, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes ); + + cudnnStatus_t cudnnGetConvolutionForwardAlgorithm( cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t filterDesc, + const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdPreference_t preference, - size_t memoryLimitInbytes, + size_t memoryLimitInBytes, cudnnConvolutionFwdAlgo_t *algo ); /* @@ -441,7 +452,7 @@ cudnnStatus_t cudnnGetConvolutionForwardAlgorithm( cudnnStatus_t cudnnGetConvolutionForwardWorkspaceSize( cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t filterDesc, + const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo, @@ -487,10 +498,10 @@ typedef enum typedef enum { - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0, /* non-deterministic*/ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2, - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3 /* non-deterministic, algo0 with workspace */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3 /* non-deterministic, algo0 with workspace*/ } cudnnConvolutionBwdFilterAlgo_t; @@ -506,20 +517,35 @@ cudnnStatus_t cudnnFindConvolutionBackwardFilterAlgorithm( const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t dyDesc, const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t wDesc, + const cudnnFilterDescriptor_t dwDesc, const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t*perfResults ); + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults ); + +cudnnStatus_t cudnnFindConvolutionBackwardFilterAlgorithmEx( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t dyDesc, + const void *y, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + void *dw, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes ); cudnnStatus_t cudnnGetConvolutionBackwardFilterAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t wDesc, + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, cudnnConvolutionBwdFilterPreference_t preference, - size_t memoryLimitInbytes, - cudnnConvolutionBwdFilterAlgo_t *algo ); + size_t memoryLimitInBytes, + cudnnConvolutionBwdFilterAlgo_t *algo ); /* * convolution algorithm (which requires potentially some workspace) @@ -550,24 +576,6 @@ cudnnStatus_t cudnnConvolutionBackwardFilter( const cudnnFilterDescriptor_t dwDesc, void *dw ); -/* cudnnConvolutionBackwardFilter_v3 is now mapped to cudnnConvolutionBackwardFilter - and will be removed at the same time thancudnnConvolutionBackwardFilter_v2 - Use cudnnConvolutionBackwardFilter instead */ -cudnnStatus_t cudnnConvolutionBackwardFilter_v3( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdFilterAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnFilterDescriptor_t dwDesc, - void *dw ); - /*********************************************************/ /* helper function to provide the convolution algo that fit best the requirement */ typedef enum @@ -579,10 +587,11 @@ typedef enum typedef enum { - CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0, /* non-deterministic*/ CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1, CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2, - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4 } cudnnConvolutionBwdDataAlgo_t; typedef struct { @@ -603,6 +612,21 @@ cudnnStatus_t cudnnFindConvolutionBackwardDataAlgorithm( int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults ); +cudnnStatus_t cudnnFindConvolutionBackwardDataAlgorithmEx( + cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes ); + cudnnStatus_t cudnnGetConvolutionBackwardDataAlgorithm( cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, @@ -610,7 +634,7 @@ cudnnStatus_t cudnnGetConvolutionBackwardDataAlgorithm( const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t dxDesc, cudnnConvolutionBwdDataPreference_t preference, - size_t memoryLimitInbytes, + size_t memoryLimitInBytes, cudnnConvolutionBwdDataAlgo_t *algo ); /* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ @@ -639,23 +663,6 @@ cudnnStatus_t cudnnConvolutionBackwardData( const cudnnTensorDescriptor_t dxDesc, void *dx ); -/* cudnnConvolutionBackwardData_v3 is now mapped to cudnnConvolutionBackwardData - and will be removed at the same time thancudnnConvolutionBackwardData_v2 - Use cudnnConvolutionBackwardData instead */ -cudnnStatus_t cudnnConvolutionBackwardData_v3( - cudnnHandle_t handle, - const void *alpha, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdDataAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx ); cudnnStatus_t cudnnIm2Col( cudnnHandle_t handle, @@ -687,7 +694,7 @@ typedef enum /* Function to perform forward softmax */ cudnnStatus_t cudnnSoftmaxForward( cudnnHandle_t handle, - cudnnSoftmaxAlgorithm_t algorithm, + cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode, const void *alpha, const cudnnTensorDescriptor_t xDesc, @@ -699,7 +706,7 @@ cudnnStatus_t cudnnSoftmaxForward( /* Function to perform backward softmax */ cudnnStatus_t cudnnSoftmaxBackward( cudnnHandle_t handle, - cudnnSoftmaxAlgorithm_t algorithm, + cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode, const void *alpha, const cudnnTensorDescriptor_t yDesc, @@ -716,8 +723,8 @@ cudnnStatus_t cudnnSoftmaxBackward( typedef enum { CUDNN_POOLING_MAX = 0, - CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values */ - CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values */ + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values*/ + CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values*/ CUDNN_POOLING_AVERAGE = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING // for backward compatibility } cudnnPoolingMode_t; @@ -728,16 +735,6 @@ cudnnStatus_t cudnnCreatePoolingDescriptor( cudnnStatus_t cudnnSetPooling2dDescriptor( cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t mode, - int windowHeight, - int windowWidth, - int verticalPadding, - int horizontalPadding, - int verticalStride, - int horizontalStride ); - -cudnnStatus_t cudnnSetPooling2dDescriptor_v4( - cudnnPoolingDescriptor_t poolingDesc, - cudnnPoolingMode_t mode, cudnnNanPropagation_t maxpoolingNanOpt, int windowHeight, int windowWidth, @@ -749,16 +746,6 @@ cudnnStatus_t cudnnSetPooling2dDescriptor_v4( cudnnStatus_t cudnnGetPooling2dDescriptor( const cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t *mode, - int *windowHeight, - int *windowWidth, - int *verticalPadding, - int *horizontalPadding, - int *verticalStride, - int *horizontalStride ); - -cudnnStatus_t cudnnGetPooling2dDescriptor_v4( - const cudnnPoolingDescriptor_t poolingDesc, - cudnnPoolingMode_t *mode, cudnnNanPropagation_t *maxpoolingNanOpt, int *windowHeight, int *windowWidth, @@ -770,14 +757,6 @@ cudnnStatus_t cudnnGetPooling2dDescriptor_v4( cudnnStatus_t cudnnSetPoolingNdDescriptor( cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode, - int nbDims, - const int windowDimA[], - const int paddingA[], - const int strideA[] ); - -cudnnStatus_t cudnnSetPoolingNdDescriptor_v4( - cudnnPoolingDescriptor_t poolingDesc, - const cudnnPoolingMode_t mode, const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims, const int windowDimA[], @@ -786,15 +765,6 @@ cudnnStatus_t cudnnSetPoolingNdDescriptor_v4( cudnnStatus_t cudnnGetPoolingNdDescriptor( const cudnnPoolingDescriptor_t poolingDesc, - const int nbDimsRequested, - cudnnPoolingMode_t *mode, - int *nbDims, - int windowDimA[], - int paddingA[], - int strideA[] ); - -cudnnStatus_t cudnnGetPoolingNdDescriptor_v4( - const cudnnPoolingDescriptor_t poolingDesc, int nbDimsRequested, cudnnPoolingMode_t *mode, cudnnNanPropagation_t *maxpoolingNanOpt, @@ -812,10 +782,10 @@ cudnnStatus_t cudnnGetPoolingNdForwardOutputDim( cudnnStatus_t cudnnGetPooling2dForwardOutputDim( const cudnnPoolingDescriptor_t poolingDesc, const cudnnTensorDescriptor_t inputTensorDesc, - int *outN, - int *outC, - int *outH, - int *outW ); + int *n, + int *c, + int *h, + int *w ); /* Destroy an instance of pooling descriptor */ @@ -826,7 +796,7 @@ cudnnStatus_t cudnnDestroyPoolingDescriptor( /* Function to perform forward pooling */ cudnnStatus_t cudnnPoolingForward( - cudnnHandle_t handle, + cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, const void *alpha, const cudnnTensorDescriptor_t xDesc, @@ -883,16 +853,6 @@ cudnnStatus_t cudnnDestroyActivationDescriptor( /* Function to perform forward activation */ cudnnStatus_t cudnnActivationForward( cudnnHandle_t handle, - cudnnActivationMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ); - -cudnnStatus_t cudnnActivationForward_v4( - cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc, const void *alpha, const cudnnTensorDescriptor_t xDesc, @@ -904,20 +864,6 @@ cudnnStatus_t cudnnActivationForward_v4( /* Function to perform backward activation */ cudnnStatus_t cudnnActivationBackward( cudnnHandle_t handle, - cudnnActivationMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx ); - -cudnnStatus_t cudnnActivationBackward_v4( - cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc, const void *alpha, const cudnnTensorDescriptor_t yDesc, @@ -930,37 +876,41 @@ cudnnStatus_t cudnnActivationBackward_v4( const cudnnTensorDescriptor_t dxDesc, void *dx ); -/* Create an instance of LRN (Local Response Normalization) descriptor */ -/* This function will set lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper */ +/* +* Create an instance of LRN (Local Response Normalization) descriptor +* Uses lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper +*/ cudnnStatus_t cudnnCreateLRNDescriptor( cudnnLRNDescriptor_t *normDesc ); -typedef enum { CUDNN_LRN_MIN_N = 1, /* minimum allowed lrnN */ +typedef enum { CUDNN_LRN_MIN_N = 1, /* minimum allowed lrnN */ CUDNN_LRN_MAX_N = 16 } /* maximum allowed lrnN */ - LRN_MinMaxFakeEnum; + LRN_MinMaxFakeEnum; -/* define CUDNN_LRN_MIN_K 1e-5 -- minimum allowed lrnK */ -/* define CUDNN_LRN_MIN_BETA 0.01 -- minimum allowed lrnBeta */ +/* static const float CUDNN_LRN_MIN_K = 1e-5; */ /* minimum allowed lrnK*/ +/* static const float CUDNN_LRN_MIN_BETA = 0.01; */ /* minimum allowed lrnBeta*/ -/* LRN layer mode, currently only cross-channel is supported (across the tensor's dimA[1] dimension) */ +/* LRN layer mode */ typedef enum { - CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0, + CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0,/* Normalize across tensor's dimA[1] dimension*/ } cudnnLRNMode_t; -/* LRN uses a window [center-lookBehind, center+lookAhead], where */ -/* lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1. */ -/* So for n=10, the window is [k-4...k...k+5] with a total of 10 samples. */ -/* Values of double parameters will be cast down to tensor data type. */ +/* +* Uses a window [center-lookBehind, center+lookAhead], where +* lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1. +* Values of double parameters cast to tensor data type. +*/ cudnnStatus_t cudnnSetLRNDescriptor( cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK ); - -/* Retrieve the settings currently stored in an LRN layer descriptor */ -/* Any of the provided pointers can be NULL (no corresponding value will be returned) */ +/* +* Retrieve the settings currently stored in an LRN layer descriptor +* Any of the provided pointers can be NULL (no corresponding value will be returned) +*/ cudnnStatus_t cudnnGetLRNDescriptor( cudnnLRNDescriptor_t normDesc, unsigned* lrnN, @@ -968,13 +918,12 @@ cudnnStatus_t cudnnGetLRNDescriptor( double* lrnBeta, double* lrnK ); -/* Destroy an instance of LRN descriptor */ +/* Destroy an instance of LRN descriptor */ cudnnStatus_t cudnnDestroyLRNDescriptor( cudnnLRNDescriptor_t lrnDesc ); -/* LRN functions: of the form "output = alpha * normalize(x) + beta * old_y" */ +/* LRN functions: output = alpha * normalize(x) + beta * old_y */ -/* Function to perform LRN forward cross-channel computation */ -/* Values of double parameters will be cast down to tensor data type */ +/* LRN cross-channel forward computation. Double parameters cast to tensor data type */ cudnnStatus_t cudnnLRNCrossChannelForward( cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, @@ -986,8 +935,7 @@ cudnnStatus_t cudnnLRNCrossChannelForward( const cudnnTensorDescriptor_t yDesc, void *y ); -/* Function to perform LRN cross-channel backpropagation */ -/* values of double parameters will be cast down to tensor data type */ +/* LRN cross-channel backward computation. Double parameters cast to tensor data type */ cudnnStatus_t cudnnLRNCrossChannelBackward( cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, @@ -1008,16 +956,15 @@ typedef enum CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0, } cudnnDivNormMode_t; -/* LCN/divisive normalization functions: of the form "y = alpha * normalize(x) + beta * y" */ -/* means can be NULL to reproduce Caffe's LRN within-channel behavior */ +/* LCN/divisive normalization functions: y = alpha * normalize(x) + beta * y */ cudnnStatus_t cudnnDivisiveNormalizationForward( cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnDivNormMode_t mode, const void *alpha, - const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */ + const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2*/ const void *x, - const void *means, /* if NULL, means are assumed to be zero */ + const void *means, /* if NULL, means are assumed to be zero*/ void *temp, void *temp2, const void *beta, @@ -1029,157 +976,114 @@ cudnnStatus_t cudnnDivisiveNormalizationBackward( cudnnLRNDescriptor_t normDesc, cudnnDivNormMode_t mode, const void *alpha, - const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */ + const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2*/ const void *x, - const void *means, /* if NULL, means are assumed to be zero */ + const void *means, /* if NULL, means are assumed to be zero*/ const void *dy, void *temp, void *temp2, const void *beta, - const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */ - void *dx, /* output x differential */ - void *dMeans ); /* output means differential, can be NULL */ + const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans*/ + void *dx, /* output x differential*/ + void *dMeans ); /* output means differential, can be NULL*/ typedef enum { - /* Use for non-convolution layers. */ - /* bnScale, bnBias tensors dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */ + /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice)*/ CUDNN_BATCHNORM_PER_ACTIVATION = 0, - /* Use after convolution layers. bnScale, bnBias tensors dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */ + /*bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors)*/ CUDNN_BATCHNORM_SPATIAL = 1, } cudnnBatchNormMode_t; -/* CUDNN_BN_MIN_EPSILON 1e-5 -- Minimum epsilon allowed to be used in the Batch Normalization formula */ +/* static const float CUDNN_BN_MIN_EPSILON = 1e-5; */ /* Minimum epsilon allowed to be used in the Batch Normalization formula*/ -/* Derives a tensor descriptor from layer data descriptor for BatchNormalization scale, invVariance, bnBias, bnScale subtensors */ -/* Use the tensor desc produced by these functions as the bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc parameters in */ -/* Spatial and Per-activation Batch Normalization forward and backward functions. */ -/* Note - derivedBnDesc has to be first created using cudnnCreateTensorDescriptor */ -/* Note - dataDesc is the descriptor for the layer data and has to be setup with proper dimensions prior to calling these functions. */ +/* +* Derives a tensor descriptor from layer data descriptor for BatchNormalization +* scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for +* bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc in Batch Normalization forward and backward functions. +*/ cudnnStatus_t cudnnDeriveBNTensorDescriptor( cudnnTensorDescriptor_t derivedBnDesc, const cudnnTensorDescriptor_t xDesc, cudnnBatchNormMode_t mode ); -/* This function performs a forward pass for Batch Normalization layer. */ -/* In addition to computing y = BN(x) it accumulates the moving averages of the mean and inverse variances */ +/* Computes y = BN(x). Also accumulates moving averages of mean and inverse variances */ cudnnStatus_t cudnnBatchNormalizationForwardTraining( cudnnHandle_t handle, cudnnBatchNormMode_t mode, - const void *alpha, /* alpha[0] = result blend factor */ - const void *beta, /* beta[0] = dest layer blend factor */ + const void *alpha, /* alpha[0] = result blend factor*/ + const void *beta, /* beta[0] = dest layer blend factor*/ const cudnnTensorDescriptor_t xDesc, - const void *x, /* NxCxHxW */ - const cudnnTensorDescriptor_t yDesc, - void *y, /* NxCxHxW */ - - /* Same shared desc for all the 6 tensors below in the argument list. */ - /* Note that the data type for this descriptor has to be set as follows: */ - /* type = (typeOf(x) == half) ? float : typeof(x) */ - /* The dimensions for this tensor descriptor are dependent on the normalization mode */ - /* For spatial normalization the tensors are expected to be 1D (of size C) */ - /* (in this case normalization is performed across NxHxW) */ - /* In per-activation mode the normalization is performed across N dimension only */ - /* So the tensors are expected to have dimensions of CxHxW */ + const void *x, /* NxCxHxW*/ + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW*/ + + /* Shared desc for the next 6 tensors in the argument list. + Data type to be set as follows: + type = (typeOf(x) == double) ? double : float + Dimensions for this descriptor depend on normalization mode + - Spatial Normalization : tensors are expected to have dims 1xCx1x1 + (normalization is performed across NxHxW) + - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW + (normalization is performed across N) */ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - /* Note - bnScale is 'gamma' in paper's notation */ - const void *bnScale, /* Mode-dependent dims */ - /* Note - this bias parameter can effectively replace the bias in Conv and FCN layers */ - /* (Which can be set to zero for efficiency) */ - /* Note - bnBias is 'beta' in paper's notation */ - const void *bnBias, /* Mode-dependent dims */ - - /* It is required that factor=1 is used for the very first call of a complete training cycle. */ - /* This is necessary to properly initialize the moving average. */ - /* Use a factor=1/(1+n) at N-th call to the function to get */ - /* Cumulative Moving Average (CMA) behavior */ - /* CMA[n] = (x[1]+...+x[n])/n */ - /* Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) = */ - /* ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) = */ - /* CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */ + /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation*/ + const void *bnScale, + const void *bnBias, + + /* MUST use factor=1 in the very first call of a complete training cycle. + Use a factor=1/(1+n) at N-th call to the function to get + Cumulative Moving Average (CMA) behavior + CMA[n] = (x[1]+...+x[n])/n + Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) = + ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) = + CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */ double exponentialAverageFactor, - /* runningMean = newMean*factor + runningMean*(1-factor) */ - /* if isTrainingPhase == false, these tensors will remain const */ - /* and exponentialAverageFactor parameter is not used. */ - - /* Both of these pointers (running mean, inv variance) can be NULL but only at the same time. */ + /* Used in Training phase only. + runningMean = newMean*factor + runningMean*(1-factor) */ void *resultRunningMean, - /* The value stored here (or passed as an input in inference mode) is the moving average */ - /* of the expression 1 / sqrt( epsilon + variance[x] ) */ - void *resultRunningInvVariance, + /* Output in training mode, input in inference. Is the moving average + of variance[x] (factor is applied in the same way as for runningMean) */ + void *resultRunningVariance, - /* Constant used to prevent divides by zero variance. Has to be >= CUDNN_BN_MIN_EPSILON. */ - /* Same epsilon value should be used in forward and backward functions. */ + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ double epsilon, - /* Optional cache to save intermediate results computed during the forward pass */ - /* - these can then be reused to speed up backward pass. For this to work correctly, */ - /* the x data has to remain unchanged until the backward function is called. */ - /* Note that both of these parameters can be NULL but only at the same time. */ - /* It is recommended to use this cache since memory overhead is relatively small. */ + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. NULL if unused */ void *resultSaveMean, void *resultSaveInvVariance ); -/* This function will compute a linear transform of the inputs as follows: */ -/* y[i] = bnScale[k]*(x[i]-estimatedMean[k])*estimatedInvVariance[k] + bnBias[k] */ -/* with bnScale, bnBias, runningMean, runningInvVariance tensors indexed */ -/* according to spatial or per-activation mode (please refer to the paper for details). */ -/* During inference estimatedMean and estimatedVariance are treated */ -/* as const inputs (accumulated and saved during the training phase) */ +/* +* Performs Batch Normalization during Inference: +* y[i] = bnScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + bnBias[k] +* with bnScale, bnBias, runningMean, runningInvVariance tensors indexed +* according to spatial or per-activation mode. Refer to cudnnBatchNormalizationForwardTraining +* above for notes on function arguments. +*/ cudnnStatus_t cudnnBatchNormalizationForwardInference( cudnnHandle_t handle, cudnnBatchNormMode_t mode, - - const void *alpha, /* alpha[0] = result blend factor */ - const void *beta, /* beta[0] = dest layer blend factor */ - + const void *alpha, /* alpha[0] = result blend factor*/ + const void *beta, /* beta[0] = dest layer blend factor*/ const cudnnTensorDescriptor_t xDesc, - const void *x, /* NxCxHxW */ - const cudnnTensorDescriptor_t yDesc, - void *y, /* NxCxHxW */ - - /* Same desc for all 4 tensors below */ - /* Note that the data type for this descriptor has to be set as follows: */ - /* type = (typeOf(x) == half) ? float : typeof(x) */ - /* The dimensions for this tensor descriptor are dependent on the normalization mode */ - /* For spatial normalization the tensors are expected to be 1D (of size C) */ - /* (in this case normalization is performed across NxHxW) */ - /* In per-activation mode the normalization is performed across N dimension only */ - /* So the tensors are expected to have dimensions of CxHxW */ + const void *x, /* NxCxHxW*/ + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW*/ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - - /* Note - bnScale is 'gamma' in paper's notation */ - const void *bnScale, /* Mode-dependent dims */ - /* Note - this bias parameter can effectively replace the bias in Conv and FCN layers */ - /* (Which can be set to zero for efficiency) */ - /* Note - bnBias is 'beta' in paper's notation */ - const void *bnBias, /* Mode-dependent dims */ - - /* runningMean = newMean*factor + runningMean*(1-factor) */ - /* if isTrainingPhase == false, these tensors will remain const */ - /* and exponentialAverageFactor parameter is not used. */ - - /* An estimate of the batch mean, can be accumulated over multiple calls to */ - /* batchNormalizationForwardTraining */ + const void *bnScale, + const void *bnBias, const void *estimatedMean, - /* An estimate of the expression 1 / sqrt( epsilon + variance[x] ), */ - /* Can also be accumulated over multiple calls to batchNormalizationForwardTraining. */ - const void *estimatedInvVariance, - - /* Constant used to prevent divides by zero variance. Has to be >= CUDNN_BN_MIN_EPSILON. */ - /* Same epsilon value should be used in forward and backward functions. */ + const void *estimatedVariance, double epsilon ); -/* This function performs a backward pass for Batch Normalization layer. */ -/* The results are */ -/* 1. x gradient */ -/* 2. bnScale gradient */ -/* 3. bnBias gradient */ +/* Performs backward pass of Batch Normalization layer. Returns x gradient, +* bnScale gradient and bnBias gradient */ cudnnStatus_t cudnnBatchNormalizationBackward( cudnnHandle_t handle, cudnnBatchNormMode_t mode, @@ -1187,87 +1091,496 @@ cudnnStatus_t cudnnBatchNormalizationBackward( const void *betaDataDiff, const void *alphaParamDiff, const void *betaParamDiff, - - const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */ + const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy*/ const void *x, - const cudnnTensorDescriptor_t dyDesc, + const cudnnTensorDescriptor_t dyDesc, const void *dy, - const cudnnTensorDescriptor_t dxDesc, + const cudnnTensorDescriptor_t dxDesc, void *dx, - - /* this tensor desc is used for all the 4 tensors below */ + /* Shared tensor desc for the 4 tensors below */ const cudnnTensorDescriptor_t dBnScaleBiasDesc, - const void *bnScale, /* bnBias doesn't affect backpropagation */ - - /* scale and bias diff are not backpropagated below this layer (dead-end computation DAG nodes) */ + const void *bnScale, /* bnBias doesn't affect backpropagation*/ + /* scale and bias diff are not backpropagated below this layer */ void *dBnScaleResult, void *dBnBiasResult, - /* Constant used to prevent divides by zero variance. Has to be >= CUDNN_BN_MIN_EPSILON. */ - /* Same epsilon value should be used in forward and backward functions. */ + /* Same epsilon as forward pass */ double epsilon, - /* Optional cache parameters containing saved intermediate results computed during the forward pass */ - /* For this to work correctly, the x data has to remain unchanged until the backward function is called. */ - /* Note that both of these parameters can be NULL but only at the same time. */ - /* It is recommended to use this cache since memory overhead is relatively small. */ + /* Optionally cached intermediate results from + forward pass */ const void *savedMean, const void *savedInvVariance ); -/* DEPRECATED API THAT WILL BE REMOVED SOON */ -cudnnStatus_t cudnnSetConvolutionNdDescriptor_v2( - cudnnConvolutionDescriptor_t convDesc, - int arrayLength, /* nbDims-2 size */ - const int padA[], - const int filterStrideA[], - const int upscaleA[], - cudnnConvolutionMode_t mode ); -cudnnStatus_t cudnnGetConvolutionNdDescriptor_v2( - const cudnnConvolutionDescriptor_t convDesc, - int arrayLengthRequested, - int *arrayLength, - int padA[], - int strideA[], - int upscaleA[], - cudnnConvolutionMode_t *mode ); +/* APIs for spatial transformer network*/ +typedef enum { + CUDNN_SAMPLER_BILINEAR=0, +} cudnnSamplerType_t; + +cudnnStatus_t cudnnCreateSpatialTransformerDescriptor( + + cudnnSpatialTransformerDescriptor_t *stDesc); + +cudnnStatus_t cudnnSetSpatialTransformerNdDescriptor( + cudnnSpatialTransformerDescriptor_t stDesc, + cudnnSamplerType_t samplerType, + cudnnDataType_t dataType, + const int nbDims, + const int dimA[]); + +cudnnStatus_t cudnnDestroySpatialTransformerDescriptor( + cudnnSpatialTransformerDescriptor_t stDesc); + +cudnnStatus_t cudnnSpatialTfGridGeneratorForward( + cudnnHandle_t handle, + const cudnnSpatialTransformerDescriptor_t stDesc, + const void *theta, + void *grid); + +cudnnStatus_t cudnnSpatialTfGridGeneratorBackward( + cudnnHandle_t handle, + const cudnnSpatialTransformerDescriptor_t stDesc, + const void *dgrid, + void *dtheta); + +cudnnStatus_t cudnnSpatialTfGridGeneratorForward( + cudnnHandle_t handle, + const cudnnSpatialTransformerDescriptor_t stDesc, + const void *theta, + void *grid); + +cudnnStatus_t cudnnSpatialTfSamplerForward( + cudnnHandle_t handle, + cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *grid, + const void *beta, + cudnnTensorDescriptor_t yDesc, + void *y); + +cudnnStatus_t cudnnSpatialTfSamplerBackward( + cudnnHandle_t handle, + cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + const void *alphaDgrid, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *grid, + const void *betaDgrid, + void *dgrid); + +typedef struct cudnnDropoutStruct * cudnnDropoutDescriptor_t; + +cudnnStatus_t cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t * dropoutDesc); + +cudnnStatus_t cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc); + +/*helper function to determine size of the states to be passed to cudnnSetDropoutDescriptor */ +cudnnStatus_t cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t * sizeInBytes); + +/*helper function to determine size of the reserve space to be passed to dropout forward/backward calls */ +cudnnStatus_t cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t * sizeInBytes); + +cudnnStatus_t cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float dropout, + void * states, + size_t stateSizeInBytes, + unsigned long long seed); + +cudnnStatus_t cudnnDropoutForward(cudnnHandle_t handle, + const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t xdesc, + const void * x, + const cudnnTensorDescriptor_t ydesc, + void * y, + void * reserveSpace, + size_t reserveSpaceSizeInBytes); + +cudnnStatus_t cudnnDropoutBackward(cudnnHandle_t handle, + const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t dydesc, + const void * dy, + const cudnnTensorDescriptor_t dxdesc, + void * dx, + void * reserveSpace, + size_t reserveSpaceSizeInBytes); + +/* RNN API */ +typedef enum + { + CUDNN_RNN_RELU = 0, /* Stock RNN with ReLu activation*/ + CUDNN_RNN_TANH = 1, /* Stock RNN with tanh activation*/ + CUDNN_LSTM = 2, /* LSTM with no peephole connections*/ + CUDNN_GRU = 3 /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1);*/ + } cudnnRNNMode_t; + +typedef enum + { + CUDNN_UNIDIRECTIONAL = 0, + CUDNN_BIDIRECTIONAL = 1 /* Using output concatination at each step. Do we also want to support output sum?*/ + } cudnnDirectionMode_t; -cudnnStatus_t cudnnAddTensor_v2( +typedef enum + { + CUDNN_LINEAR_INPUT = 0, + CUDNN_SKIP_INPUT = 1 + } cudnnRNNInputMode_t; + + +struct cudnnRNNStruct; +typedef struct cudnnRNNStruct* cudnnRNNDescriptor_t; + +cudnnStatus_t cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t * rnnDesc); +cudnnStatus_t cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc); + +cudnnStatus_t cudnnSetRNNDescriptor(cudnnRNNDescriptor_t rnnDesc, + int hiddenSize, + int seqLength, + int numLayers, + cudnnDropoutDescriptor_t dropoutDesc, /* Between layers, not between recurrent steps.*/ + cudnnRNNInputMode_t inputMode, + cudnnDirectionMode_t direction, + cudnnRNNMode_t mode, + cudnnDataType_t dataType); + + +/* dataType in the RNN descriptor is used to determine math precision*/ +/* dataType in weight descriptors and input descriptors is used to describe storage*/ + +cudnnStatus_t cudnnGetRNNWorkspaceSize( cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const cudnnTensorDescriptor_t *xDesc, + size_t *sizeInBytes + ); + +cudnnStatus_t cudnnGetRNNTrainingReserveSize( cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const cudnnTensorDescriptor_t *xDesc, + size_t *sizeInBytes + ); + + +cudnnStatus_t cudnnGetRNNParamsSize( cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const cudnnTensorDescriptor_t *xDesc, + size_t *sizeInBytes + ); + +cudnnStatus_t cudnnGetRNNLinLayerMatrixParams( cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int layer, + const cudnnTensorDescriptor_t * xDesc, + const cudnnFilterDescriptor_t wDesc, + const void * w, + const int linLayerID, + cudnnFilterDescriptor_t linLayerMatDesc, + void ** linLayerMat + ); + +cudnnStatus_t cudnnGetRNNLinLayerBiasParams( cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int layer, + const cudnnTensorDescriptor_t * xDesc, + const cudnnFilterDescriptor_t wDesc, + const void * w, + const int linLayerID, + cudnnFilterDescriptor_t linLayerBiasDesc, + void ** linLayerBias + ); + + +cudnnStatus_t cudnnRNNForwardInference( cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const cudnnTensorDescriptor_t * xDesc, + const void * x, + const cudnnTensorDescriptor_t hxDesc, + const void * hx, + const cudnnTensorDescriptor_t cxDesc, + const void * cx, + const cudnnFilterDescriptor_t wDesc, + const void * w, + const cudnnTensorDescriptor_t *yDesc, + void * y, + const cudnnTensorDescriptor_t hyDesc, + void * hy, + const cudnnTensorDescriptor_t cyDesc, + void * cy, + void * workspace, + size_t workSpaceSizeInBytes); + + + +cudnnStatus_t cudnnRNNForwardTraining( cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const cudnnTensorDescriptor_t *xDesc, + const void * x, + const cudnnTensorDescriptor_t hxDesc, + const void * hx, + const cudnnTensorDescriptor_t cxDesc, + const void * cx, + const cudnnFilterDescriptor_t wDesc, + const void * w, + const cudnnTensorDescriptor_t *yDesc, + void * y, + const cudnnTensorDescriptor_t hyDesc, + void * hy, + const cudnnTensorDescriptor_t cyDesc, + void * cy, + void * workspace, + size_t workSpaceSizeInBytes, + void * reserveSpace, + size_t reserveSpaceSizeInBytes); + +cudnnStatus_t cudnnRNNBackwardData( cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const cudnnTensorDescriptor_t * yDesc, + const void * y, + const cudnnTensorDescriptor_t * dyDesc, + const void * dy, + const cudnnTensorDescriptor_t dhyDesc, + const void * dhy, + const cudnnTensorDescriptor_t dcyDesc, + const void * dcy, + const cudnnFilterDescriptor_t wDesc, + const void * w, + const cudnnTensorDescriptor_t hxDesc, + const void * hx, + const cudnnTensorDescriptor_t cxDesc, + const void * cx, + const cudnnTensorDescriptor_t * dxDesc, + void * dx, + const cudnnTensorDescriptor_t dhxDesc, + void * dhx, + const cudnnTensorDescriptor_t dcxDesc, + void * dcx, + void * workspace, + size_t workSpaceSizeInBytes, + const void * reserveSpace, + size_t reserveSpaceSizeInBytes ); + + +cudnnStatus_t cudnnRNNBackwardWeights( cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const cudnnTensorDescriptor_t * xDesc, + const void * x, + const cudnnTensorDescriptor_t hxDesc, + const void * hx, + const cudnnTensorDescriptor_t * yDesc, + const void * y, + const void * workspace, + size_t workSpaceSizeInBytes, + const cudnnFilterDescriptor_t dwDesc, + void * dw, + const void * reserveSpace, + size_t reserveSpaceSizeInBytes ); + + + +/* DEPRECATED routines to be removed next release : + User should use the non-suffixed version (which has the API and functionality of _v4 version) + Routines with _v3 suffix has the functionality of the non-suffixed routines in the CUDNN V4 + */ + +cudnnStatus_t cudnnSetFilter4dDescriptor_v3( + cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type*/ + int k, /* number of output feature maps*/ + int c, /* number of input feature maps*/ + int h, /* height of each input filter*/ + int w ); /* width of each input filter*/ + +cudnnStatus_t cudnnSetFilter4dDescriptor_v4( + cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type*/ + cudnnTensorFormat_t format, + int k, /* number of output feature maps*/ + int c, /* number of input feature maps*/ + int h, /* height of each input filter*/ + int w ); /* width of each input filter*/ + +cudnnStatus_t cudnnGetFilter4dDescriptor_v3( + const cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t *dataType, /* image data type*/ + int *k, /* number of output feature maps*/ + int *c, /* number of input feature maps*/ + int *h, /* height of each input filter*/ + int *w ); /* width of each input filter*/ + +cudnnStatus_t cudnnGetFilter4dDescriptor_v4( + const cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t *dataType, /* image data type*/ + cudnnTensorFormat_t *format, + int *k, /* number of output feature maps*/ + int *c, /* number of input feature maps*/ + int *h, /* height of each input filter*/ + int *w ); /* width of each input filter */ + +cudnnStatus_t cudnnSetFilterNdDescriptor_v3( + cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type*/ + int nbDims, + const int filterDimA[] ); + + +cudnnStatus_t cudnnSetFilterNdDescriptor_v4( + cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type*/ + cudnnTensorFormat_t format, + int nbDims, + const int filterDimA[] ); + +cudnnStatus_t cudnnGetFilterNdDescriptor_v3( + const cudnnFilterDescriptor_t filterDesc, + int nbDimsRequested, + cudnnDataType_t *dataType, /* image data type*/ + int *nbDims, + int filterDimA[] ); + +cudnnStatus_t cudnnGetFilterNdDescriptor_v4( + const cudnnFilterDescriptor_t filterDesc, + int nbDimsRequested, + cudnnDataType_t *dataType, /* image data type*/ + cudnnTensorFormat_t *format, + int *nbDims, + int filterDimA[] ); + +cudnnStatus_t cudnnSetPooling2dDescriptor_v3( + cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t mode, + int windowHeight, + int windowWidth, + int verticalPadding, + int horizontalPadding, + int verticalStride, + int horizontalStride ); + +cudnnStatus_t cudnnSetPooling2dDescriptor_v4( + cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t mode, + cudnnNanPropagation_t maxpoolingNanOpt, + int windowHeight, + int windowWidth, + int verticalPadding, + int horizontalPadding, + int verticalStride, + int horizontalStride ); +cudnnStatus_t cudnnGetPooling2dDescriptor_v3( + const cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t *mode, + int *windowHeight, + int *windowWidth, + int *verticalPadding, + int *horizontalPadding, + int *verticalStride, + int *horizontalStride ); + +cudnnStatus_t cudnnGetPooling2dDescriptor_v4( + const cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, + int *windowHeight, + int *windowWidth, + int *verticalPadding, + int *horizontalPadding, + int *verticalStride, + int *horizontalStride ); + +cudnnStatus_t cudnnSetPoolingNdDescriptor_v3( + cudnnPoolingDescriptor_t poolingDesc, + const cudnnPoolingMode_t mode, + int nbDims, + const int windowDimA[], + const int paddingA[], + const int strideA[] ); + +cudnnStatus_t cudnnSetPoolingNdDescriptor_v4( + cudnnPoolingDescriptor_t poolingDesc, + const cudnnPoolingMode_t mode, + const cudnnNanPropagation_t maxpoolingNanOpt, + int nbDims, + const int windowDimA[], + const int paddingA[], + const int strideA[] ); + +cudnnStatus_t cudnnGetPoolingNdDescriptor_v3( + const cudnnPoolingDescriptor_t poolingDesc, + const int nbDimsRequested, + cudnnPoolingMode_t *mode, + int *nbDims, + int windowDimA[], + int paddingA[], + int strideA[] ); + +cudnnStatus_t cudnnGetPoolingNdDescriptor_v4( + const cudnnPoolingDescriptor_t poolingDesc, + int nbDimsRequested, + cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, + int *nbDims, + int windowDimA[], + int paddingA[], + int strideA[] ); + +cudnnStatus_t cudnnActivationForward_v3( cudnnHandle_t handle, - cudnnAddMode_t mode, + cudnnActivationMode_t mode, const void *alpha, - const cudnnTensorDescriptor_t bDesc, - const void *b, + const cudnnTensorDescriptor_t xDesc, + const void *x, const void *beta, - cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t yDesc, void *y ); -cudnnStatus_t cudnnConvolutionBackwardFilter_v2( +cudnnStatus_t cudnnActivationForward_v4( cudnnHandle_t handle, + cudnnActivationDescriptor_t activationDesc, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y ); + +cudnnStatus_t cudnnActivationBackward_v3( + cudnnHandle_t handle, + cudnnActivationMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, const cudnnTensorDescriptor_t dyDesc, const void *dy, - const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t xDesc, + const void *x, const void *beta, - const cudnnFilterDescriptor_t dxDesc, + const cudnnTensorDescriptor_t dxDesc, void *dx ); -cudnnStatus_t cudnnConvolutionBackwardData_v2( +cudnnStatus_t cudnnActivationBackward_v4( cudnnHandle_t handle, + cudnnActivationDescriptor_t activationDesc, const void *alpha, - const cudnnFilterDescriptor_t xDesc, - const void *x, + const cudnnTensorDescriptor_t yDesc, + const void *y, const cudnnTensorDescriptor_t dyDesc, const void *dy, - const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t xDesc, + const void *x, const void *beta, const cudnnTensorDescriptor_t dxDesc, void *dx ); -]] -local libnames = {'libcudnn.so.4', 'libcudnn.4.dylib'} +]] + +local libnames = {'libcudnn.so.5', 'libcudnn.5.dylib'} local ok = false for i=1,#libnames do ok = pcall(function () cudnn.C = ffi.load(libnames[i]) end) @@ -1275,15 +1588,16 @@ for i=1,#libnames do end if not ok then - error([['libcudnn (R4) not found in library path. + print(err) + error([['libcudnn (R5) not found in library path. Please install CuDNN from https://developer.nvidia.com/cuDNN -Then make sure files named as libcudnn.so.4 or libcudnn.4.dylib are placed in your library load path (for example /usr/local/lib , or manually add a path to LD_LIBRARY_PATH) +Then make sure files named as libcudnn.so.5 or libcudnn.5.dylib are placed in your library load path (for example /usr/local/lib , or manually add a path to LD_LIBRARY_PATH) ]]) end cudnn.version = tonumber(cudnn.C.cudnnGetVersion()) -if cudnn.version < 4005 then - error('These bindings are for version 4005 or above, ' +if cudnn.version < 5002 then + error('These bindings are for version 5002 or above, ' .. 'while the loaded CuDNN is version: ' .. cudnn.version .. ' \nAre you using an older version of CuDNN?') end |