github.com/soumith/cudnn.torch.git

author    soumith <soumith@fb.com>  2015-09-15 20:32:36 +0300
committer soumith <soumith@fb.com>  2015-09-15 20:32:36 +0300
commit    4d5c3db15efc87fe4220fc06486a8d7be759dcc2 (patch)
tree      87548ebc6a6c6113d952569d1ab72ccf6052ebb0
parent    97f41c48602a345344bb5f76e73e4b2fbf7eb679 (diff)
whitespace cleanups, fixing logsoftmax test
-rw-r--r--  SpatialConvolution.lua     610
-rw-r--r--  VolumetricConvolution.lua   86
-rw-r--r--  ffi.lua                    308
-rw-r--r--  functional.lua               2
-rw-r--r--  test/test.lua               14
5 files changed, 506 insertions, 514 deletions
diff --git a/SpatialConvolution.lua b/SpatialConvolution.lua
index fe7f027..04c9319 100644
--- a/SpatialConvolution.lua
+++ b/SpatialConvolution.lua
@@ -1,146 +1,144 @@
local SpatialConvolution, parent =
- torch.class('cudnn.SpatialConvolution', 'nn.SpatialConvolution')
+ torch.class('cudnn.SpatialConvolution', 'nn.SpatialConvolution')
local ffi = require 'ffi'
local errcheck = cudnn.errcheck
function SpatialConvolution:__init(nInputPlane, nOutputPlane,
kW, kH, dW, dH, padW, padH, groups)
- local delayedReset = self.reset
- self.reset = function() end
- parent.__init(self, nInputPlane, nOutputPlane, kW, kH, dW, dH)
- self.reset = delayedReset
- self.padW = padW or 0
- self.padH = padH or 0
- self.groups = groups or 1
- assert(nInputPlane % self.groups == 0,
- 'nInputPlane should be divisible by nGroups')
- assert(nOutputPlane % self.groups == 0,
- 'nOutputPlane should be divisible by nGroups')
- self.weight = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)
- self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)
- self:reset()
- self.iSize = torch.LongStorage(4):fill(0)
- self.fastest_mode = true
+ local delayedReset = self.reset
+ self.reset = function() end
+ parent.__init(self, nInputPlane, nOutputPlane, kW, kH, dW, dH)
+ self.reset = delayedReset
+ self.padW = padW or 0
+ self.padH = padH or 0
+ self.groups = groups or 1
+ assert(nInputPlane % self.groups == 0,
+ 'nInputPlane should be divisible by nGroups')
+ assert(nOutputPlane % self.groups == 0,
+ 'nOutputPlane should be divisible by nGroups')
+ self.weight = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)
+ self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)
+ self:reset()
+ self.iSize = torch.LongStorage(4):fill(0)
+ self.fastest_mode = true
end
-- if you change the configuration of the module manually, call this
function SpatialConvolution:resetWeightDescriptors()
- assert(torch.typename(self.weight) == 'torch.CudaTensor',
- 'Only Cuda supported duh!')
- assert(torch.typename(self.bias) == 'torch.CudaTensor',
- 'Only Cuda supported duh!')
- -- for compatibility
- self.groups = self.groups or 1
- -- create filterDescriptor for weight
- self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]')
- errcheck('cudnnCreateFilterDescriptor', self.weightDesc)
- local desc = torch.IntTensor({self.nOutputPlane/self.groups,
- self.nInputPlane/self.groups,
- self.kH, self.kW})
- errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0],
- 'CUDNN_DATA_FLOAT', 4,
- desc:data());
- local function destroyWDesc(d)
- errcheck('cudnnDestroyFilterDescriptor', d[0]);
- end
- ffi.gc(self.weightDesc, destroyWDesc)
-
- -- create descriptor for bias
- self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,1,1))
+ assert(torch.typename(self.weight) == 'torch.CudaTensor',
+ 'Only Cuda supported duh!')
+ assert(torch.typename(self.bias) == 'torch.CudaTensor',
+ 'Only Cuda supported duh!')
+ -- for compatibility
+ self.groups = self.groups or 1
+ -- create filterDescriptor for weight
+ self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]')
+ errcheck('cudnnCreateFilterDescriptor', self.weightDesc)
+ local desc = torch.IntTensor({self.nOutputPlane/self.groups,
+ self.nInputPlane/self.groups,
+ self.kH, self.kW})
+ errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0],
+ 'CUDNN_DATA_FLOAT', 4,
+ desc:data());
+ local function destroyWDesc(d)
+ errcheck('cudnnDestroyFilterDescriptor', d[0]);
+ end
+ ffi.gc(self.weightDesc, destroyWDesc)
+
+ -- create descriptor for bias
+ self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,1,1))
end
function SpatialConvolution:fastest(mode)
- if mode == nil then mode = true end
- self.fastest_mode = mode
- self.iSize:fill(0)
- return self
+ if mode == nil then mode = true end
+ self.fastest_mode = mode
+ return self
end
function SpatialConvolution:setMode(fmode, bdmode, bwmode)
- if fmode ~= nil then
- self.fmode = fmode
- end
- if bdmode ~= nil then
- self.bdmode = bdmode
- end
- if bwmode ~= nil then
- self.bwmode = bwmode
- end
- self.iSize:fill(0)
- return self
+ if fmode ~= nil then
+ self.fmode = fmode
+ end
+ if bdmode ~= nil then
+ self.bdmode = bdmode
+ end
+ if bwmode ~= nil then
+ self.bwmode = bwmode
+ end
+ return self
end
function SpatialConvolution:resetMode()
- self.fmode = nil
- self.bdmode = nil
- self.bwmode = nil
- return self
+ self.fmode = nil
+ self.bdmode = nil
+ self.bwmode = nil
+ return self
end
function SpatialConvolution:createIODescriptors(input)
- local batch = true
- if input:dim() == 3 then
- input = input:view(1, input:size(1), input:size(2), input:size(3))
- batch = false
- end
- assert(input:dim() == 4 and input:isContiguous());
- if not self.iDesc or not self.oDesc or
- input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2]
- or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] then
- self.iSize = input:size()
-
- -- resize gradInput
- if self.gradInput then self.gradInput:resizeAs(input); end
- assert(self.nInputPlane == input:size(2), 'input has to contain: '
+ local batch = true
+ if input:dim() == 3 then
+ input = input:view(1, input:size(1), input:size(2), input:size(3))
+ batch = false
+ end
+ assert(input:dim() == 4 and input:isContiguous());
+ if not self.iDesc or not self.oDesc or
+ input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2]
+ or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] then
+ self.iSize = input:size()
+
+ -- resize gradInput
+ if self.gradInput then self.gradInput:resizeAs(input); end
+ assert(self.nInputPlane == input:size(2), 'input has to contain: '
.. self.nInputPlane
.. ' feature maps, but received input of size: '
.. input:size(1) .. ' x ' .. input:size(2) ..
' x ' .. input:size(3) .. ' x ' .. input:size(4))
- -- create input descriptor
- local input_slice = {{},{1,self.nInputPlane/self.groups},{},{}}
- self.iDesc = cudnn.toDescriptor(input[input_slice])
-
- -- create conv descriptor
- self.convDesc = ffi.new('struct cudnnConvolutionStruct*[1]')
- errcheck('cudnnCreateConvolutionDescriptor', self.convDesc)
- local pad = torch.IntTensor({self.padH, self.padW})
- local stride = torch.IntTensor({self.dH, self.dW})
- local upscale = torch.IntTensor({1,1})
- errcheck('cudnnSetConvolutionNdDescriptor_v3', self.convDesc[0],
- 2, pad:data(),
- stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION',
- 'CUDNN_DATA_FLOAT');
- local function destroyConvDesc(d)
+ -- create input descriptor
+ local input_slice = {{},{1,self.nInputPlane/self.groups},{},{}}
+ self.iDesc = cudnn.toDescriptor(input[input_slice])
+
+ -- create conv descriptor
+ self.convDesc = ffi.new('struct cudnnConvolutionStruct*[1]')
+ errcheck('cudnnCreateConvolutionDescriptor', self.convDesc)
+ local pad = torch.IntTensor({self.padH, self.padW})
+ local stride = torch.IntTensor({self.dH, self.dW})
+ local upscale = torch.IntTensor({1,1})
+ errcheck('cudnnSetConvolutionNdDescriptor_v3', self.convDesc[0],
+ 2, pad:data(),
+ stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION',
+ 'CUDNN_DATA_FLOAT');
+ local function destroyConvDesc(d)
errcheck('cudnnDestroyConvolutionDescriptor', d[0]);
- end
- ffi.gc(self.convDesc, destroyConvDesc)
-
- -- get output shape, resize output
- local oSize = torch.IntTensor(4)
- local oSizeD = oSize:data()
- errcheck('cudnnGetConvolutionNdForwardOutputDim',
- self.convDesc[0], self.iDesc[0],
- self.weightDesc[0], 4, oSizeD)
- oSize[2] = oSize[2] * self.groups
- self.output:resize(oSize:long():storage())
-
- -- create descriptor for output
- local output_slice = {{},{1,self.nOutputPlane/self.groups},{},{}}
- self.oDesc = cudnn.toDescriptor(self.output[output_slice])
- self.oDescForBias = cudnn.toDescriptor(self.output)
-
- -----------------------------------------------------------------------
- local maxBufSize = 0
-
- -- create forwardAlgorithm descriptors
- local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1)
- local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
- local algWorkspaceLimit = self.workspace_limit
- or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
-
- if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST' end
- if cudnn.benchmark then -- the manual auto-tuner is run
+ end
+ ffi.gc(self.convDesc, destroyConvDesc)
+
+ -- get output shape, resize output
+ local oSize = torch.IntTensor(4)
+ local oSizeD = oSize:data()
+ errcheck('cudnnGetConvolutionNdForwardOutputDim',
+ self.convDesc[0], self.iDesc[0],
+ self.weightDesc[0], 4, oSizeD)
+ oSize[2] = oSize[2] * self.groups
+ self.output:resize(oSize:long():storage())
+
+ -- create descriptor for output
+ local output_slice = {{},{1,self.nOutputPlane/self.groups},{},{}}
+ self.oDesc = cudnn.toDescriptor(self.output[output_slice])
+ self.oDescForBias = cudnn.toDescriptor(self.output)
+
+ -----------------------------------------------------------------------
+ local maxBufSize = 0
+
+ -- create forwardAlgorithm descriptors
+ local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1)
+ local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
+ local algWorkspaceLimit = self.workspace_limit
+ or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+
+ if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST' end
+ if cudnn.benchmark then -- the manual auto-tuner is run
local perfResults = ffi.new("cudnnConvolutionFwdAlgoPerf_t[?]", 1)
local intt = torch.IntTensor(1);
errcheck('cudnnFindConvolutionForwardAlgorithm',
@@ -150,36 +148,36 @@ function SpatialConvolution:createIODescriptors(input)
1, intt:data(), perfResults)
algType[0] = perfResults[0].algo
if cudnn.verbose then
- print('AutoTuning:', perfResults[0].time,
- tonumber(perfResults[0].memory), tonumber(perfResults[0].algo))
+ print('AutoTuning:', perfResults[0].time,
+ tonumber(perfResults[0].memory), tonumber(perfResults[0].algo))
end
- else
+ else
errcheck('cudnnGetConvolutionForwardAlgorithm',
cudnn.getHandle(),
self.iDesc[0], self.weightDesc[0],
self.convDesc[0], self.oDesc[0],
algSearchMode, algWorkspaceLimit, algType)
- end
- algType[0] = self.fmode or algType[0]
- self.fwdAlgType = algType
- local bufSize = torch.LongTensor(1)
- errcheck('cudnnGetConvolutionForwardWorkspaceSize',
- cudnn.getHandle(),
- self.iDesc[0], self.weightDesc[0],
- self.convDesc[0], self.oDesc[0],
- algType[0], bufSize:data())
- maxBufSize = math.max(maxBufSize, bufSize[1])
-
- -- create backwardFilterAlgorithm descriptors
- local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1)
- local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE'
- local algWorkspaceLimit = self.workspace_limit
- or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
- if self.fastest_mode then
+ end
+ algType[0] = self.fmode or algType[0]
+ self.fwdAlgType = algType
+ local bufSize = torch.LongTensor(1)
+ errcheck('cudnnGetConvolutionForwardWorkspaceSize',
+ cudnn.getHandle(),
+ self.iDesc[0], self.weightDesc[0],
+ self.convDesc[0], self.oDesc[0],
+ algType[0], bufSize:data())
+ maxBufSize = math.max(maxBufSize, bufSize[1])
+
+ -- create backwardFilterAlgorithm descriptors
+ local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1)
+ local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE'
+ local algWorkspaceLimit = self.workspace_limit
+ or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+ if self.fastest_mode then
algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST'
- end
+ end
- if cudnn.benchmark then -- the manual auto-tuner is run
+ if cudnn.benchmark then -- the manual auto-tuner is run
local perfResults = ffi.new("cudnnConvolutionBwdFilterAlgoPerf_t[?]", 1)
local intt = torch.IntTensor(1);
errcheck('cudnnFindConvolutionBackwardFilterAlgorithm',
@@ -189,35 +187,35 @@ function SpatialConvolution:createIODescriptors(input)
1, intt:data(), perfResults)
algType[0] = perfResults[0].algo
if cudnn.verbose then
- print('AutoTuning:', perfResults[0].time,
- tonumber(perfResults[0].memory), tonumber(perfResults[0].algo))
+ print('AutoTuning:', perfResults[0].time,
+ tonumber(perfResults[0].memory), tonumber(perfResults[0].algo))
end
- else
+ else
errcheck('cudnnGetConvolutionBackwardFilterAlgorithm',
cudnn.getHandle(),
self.iDesc[0], self.oDesc[0],
self.convDesc[0], self.weightDesc[0],
algSearchMode, algWorkspaceLimit, algType)
- end
- algType[0] = self.bwmode or algType[0]
- self.bwdFilterAlgType = algType
- local bufSize = torch.LongTensor(1)
- errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize',
- cudnn.getHandle(),
- self.iDesc[0], self.oDesc[0],
- self.convDesc[0], self.weightDesc[0],
- algType[0], bufSize:data())
- maxBufSize = math.max(maxBufSize, bufSize[1])
-
- -- create backwardDataAlgorithm descriptors
- local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1)
- local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE'
- local algWorkspaceLimit = self.workspace_limit
- or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
- if self.fastest_mode then
+ end
+ algType[0] = self.bwmode or algType[0]
+ self.bwdFilterAlgType = algType
+ local bufSize = torch.LongTensor(1)
+ errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize',
+ cudnn.getHandle(),
+ self.iDesc[0], self.oDesc[0],
+ self.convDesc[0], self.weightDesc[0],
+ algType[0], bufSize:data())
+ maxBufSize = math.max(maxBufSize, bufSize[1])
+
+ -- create backwardDataAlgorithm descriptors
+ local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1)
+ local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE'
+ local algWorkspaceLimit = self.workspace_limit
+ or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+ if self.fastest_mode then
algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST'
- end
- if cudnn.benchmark then -- the manual auto-tuner is run
+ end
+ if cudnn.benchmark then -- the manual auto-tuner is run
local perfResults = ffi.new("cudnnConvolutionBwdDataAlgoPerf_t[?]", 1)
local intt = torch.IntTensor(1);
errcheck('cudnnFindConvolutionBackwardDataAlgorithm',
@@ -227,183 +225,183 @@ function SpatialConvolution:createIODescriptors(input)
1, intt:data(), perfResults)
algType[0] = perfResults[0].algo
if cudnn.verbose then
- print('AutoTuning:', perfResults[0].time,
- tonumber(perfResults[0].memory), tonumber(perfResults[0].algo))
+ print('AutoTuning:', perfResults[0].time,
+ tonumber(perfResults[0].memory), tonumber(perfResults[0].algo))
end
- else
+ else
errcheck('cudnnGetConvolutionBackwardDataAlgorithm',
cudnn.getHandle(),
self.weightDesc[0], self.oDesc[0],
self.convDesc[0], self.iDesc[0],
algSearchMode, algWorkspaceLimit, algType)
- end
- algType[0] = self.bdmode or algType[0]
- self.bwdDataAlgType = algType
- local bufSize = torch.LongTensor(1)
- errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize',
- cudnn.getHandle(),
- self.weightDesc[0], self.oDesc[0],
- self.convDesc[0], self.iDesc[0],
- algType[0], bufSize:data())
- maxBufSize = math.max(maxBufSize, bufSize[1])
-
- self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace()
- self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float
- if maxBufSize > self.extraBufferSizeInBytes then
- self.extraBuffer:resize(math.ceil(maxBufSize/4))
- self.extraBufferSizeInBytes = maxBufSize
- end
-
- -----------------------------------------------------------------------
- -- create offsets for groups
- local iH, iW = input:size(3), input:size(4)
- local kH, kW = self.kH, self.kW
- local oH, oW = oSize[3], oSize[4]
- self.input_offset = self.nInputPlane / self.groups * iH * iW
- self.output_offset = self.nOutputPlane / self.groups * oH * oW
- self.weight_offset = self.nInputPlane / self.groups * self.nOutputPlane / self.groups * kH * kW
-
- if not batch then
+ end
+ algType[0] = self.bdmode or algType[0]
+ self.bwdDataAlgType = algType
+ local bufSize = torch.LongTensor(1)
+ errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize',
+ cudnn.getHandle(),
+ self.weightDesc[0], self.oDesc[0],
+ self.convDesc[0], self.iDesc[0],
+ algType[0], bufSize:data())
+ maxBufSize = math.max(maxBufSize, bufSize[1])
+
+ self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace()
+ self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float
+ if maxBufSize > self.extraBufferSizeInBytes then
+ self.extraBuffer:resize(math.ceil(maxBufSize/4))
+ self.extraBufferSizeInBytes = maxBufSize
+ end
+
+ -----------------------------------------------------------------------
+ -- create offsets for groups
+ local iH, iW = input:size(3), input:size(4)
+ local kH, kW = self.kH, self.kW
+ local oH, oW = oSize[3], oSize[4]
+ self.input_offset = self.nInputPlane / self.groups * iH * iW
+ self.output_offset = self.nOutputPlane / self.groups * oH * oW
+ self.weight_offset = self.nInputPlane / self.groups * self.nOutputPlane / self.groups * kH * kW
+
+ if not batch then
self.gradInput = self.gradInput:view(self.gradInput:size(2),
self.gradInput:size(3),
self.gradInput:size(4))
self.output = self.output:view(self.output:size(2),
self.output:size(3),
self.output:size(4))
- end
- end
+ end
+ end
end
local one = torch.FloatTensor({1});
local zero = torch.FloatTensor({0});
function SpatialConvolution:updateOutput(input)
- if not self.weightDesc then self:resetWeightDescriptors() end
- self:createIODescriptors(input)
-
- local prevStream
- local streamQueue = {}
- if self.groups > 1 then -- try to do stream parallelization
- prevStream = cutorch.getStream()
-
- --[[
- Only if prevStream is 0, then do parallelization.
- The justification for this is that this is a hard problem; there is no
- way to know if one is doing other kinds of stream-parallelization
- (like GPUConcat), and if that's the case, streams are already
- being ideally exploited.
- --]]
-
- if prevStream == 0 then
- cutorch.reserveStreams(self.groups)
- for i=1,self.groups do
- cutorch.streamWaitFor(i, {prevStream})
- end
- end
- end
-
- for g = 0, self.groups - 1 do
- -- stream-parallelize if appropriate
- if self.groups > 1 and prevStream == 0 then
- cutorch.setStream(g + 1)
- table.insert(streamQueue, g + 1)
- end
-
- errcheck('cudnnConvolutionForward', cudnn.getHandle(),
- one:data(),
- self.iDesc[0], input:data() + g*self.input_offset,
- self.weightDesc[0], self.weight:data() + g*self.weight_offset,
- self.convDesc[0], self.fwdAlgType[0],
- self.extraBuffer:data(), self.extraBufferSizeInBytes,
- zero:data(),
- self.oDesc[0], self.output:data() + g*self.output_offset);
- end
-
- if prevStream == 0 then
- cutorch.setStream(prevStream)
- cutorch.streamWaitFor(prevStream, streamQueue)
- end
-
- -- add bias
- errcheck('cudnnAddTensor', cudnn.getHandle(),
- 'CUDNN_ADD_SAME_C',
- one:data(), self.biasDesc[0], self.bias:data(),
- one:data(), self.oDescForBias[0], self.output:data())
-
- return self.output
+ if not self.weightDesc then self:resetWeightDescriptors() end
+ self:createIODescriptors(input)
+
+ local prevStream
+ local streamQueue = {}
+ if self.groups > 1 then -- try to do stream parallelization
+ prevStream = cutorch.getStream()
+
+ --[[
+ Only if prevStream is 0, then do parallelization.
+ The justification for this is that this is a hard problem; there is no
+ way to know if one is doing other kinds of stream-parallelization
+ (like GPUConcat), and if that's the case, streams are already
+ being ideally exploited.
+ --]]
+
+ if prevStream == 0 then
+ cutorch.reserveStreams(self.groups)
+ for i=1,self.groups do
+ cutorch.streamWaitFor(i, {prevStream})
+ end
+ end
+ end
+
+ for g = 0, self.groups - 1 do
+ -- stream-parallelize if appropriate
+ if self.groups > 1 and prevStream == 0 then
+ cutorch.setStream(g + 1)
+ table.insert(streamQueue, g + 1)
+ end
+
+ errcheck('cudnnConvolutionForward', cudnn.getHandle(),
+ one:data(),
+ self.iDesc[0], input:data() + g*self.input_offset,
+ self.weightDesc[0], self.weight:data() + g*self.weight_offset,
+ self.convDesc[0], self.fwdAlgType[0],
+ self.extraBuffer:data(), self.extraBufferSizeInBytes,
+ zero:data(),
+ self.oDesc[0], self.output:data() + g*self.output_offset);
+ end
+
+ if prevStream == 0 then
+ cutorch.setStream(prevStream)
+ cutorch.streamWaitFor(prevStream, streamQueue)
+ end
+
+ -- add bias
+ errcheck('cudnnAddTensor', cudnn.getHandle(),
+ 'CUDNN_ADD_SAME_C',
+ one:data(), self.biasDesc[0], self.bias:data(),
+ one:data(), self.oDescForBias[0], self.output:data())
+
+ return self.output
end
function SpatialConvolution:updateGradInput(input, gradOutput)
- if not self.gradInput then return end
-
- assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D');
- assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous')
- if not self.weightDesc then self:resetWeightDescriptors() end
- self:createIODescriptors(input)
-
- for g = 0,self.groups - 1 do
- errcheck('cudnnConvolutionBackwardData_v3', cudnn.getHandle(),
- one:data(),
- self.weightDesc[0], self.weight:data() + g*self.weight_offset,
- self.oDesc[0], gradOutput:data() + g*self.output_offset,
- self.convDesc[0],
- self.bwdDataAlgType[0],
- self.extraBuffer:data(), self.extraBufferSizeInBytes,
- zero:data(),
- self.iDesc[0], self.gradInput:data() + g*self.input_offset);
- end
- return self.gradInput
+ if not self.gradInput then return end
+
+ assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D');
+ assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous')
+ if not self.weightDesc then self:resetWeightDescriptors() end
+ self:createIODescriptors(input)
+
+ for g = 0,self.groups - 1 do
+ errcheck('cudnnConvolutionBackwardData_v3', cudnn.getHandle(),
+ one:data(),
+ self.weightDesc[0], self.weight:data() + g*self.weight_offset,
+ self.oDesc[0], gradOutput:data() + g*self.output_offset,
+ self.convDesc[0],
+ self.bwdDataAlgType[0],
+ self.extraBuffer:data(), self.extraBufferSizeInBytes,
+ zero:data(),
+ self.iDesc[0], self.gradInput:data() + g*self.input_offset);
+ end
+ return self.gradInput
end
function SpatialConvolution:accGradParameters(input, gradOutput, scale)
- self.scaleT = self.scaleT or torch.FloatTensor(1):fill(1.0)
- -- this line forces this member to always be on CPU (needed for cudnn)
- self.scaleT = self.scaleT:float()
- scale = scale or 1.0
- self.scaleT[1] = scale
-
- assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D');
- assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous')
- if not self.weightDesc then self:resetWeightDescriptors() end
- self:createIODescriptors(input)
-
- -- gradBias
- errcheck('cudnnConvolutionBackwardBias', cudnn.getHandle(),
+ self.scaleT = self.scaleT or torch.FloatTensor(1):fill(1.0)
+ -- this line forces this member to always be on CPU (needed for cudnn)
+ self.scaleT = self.scaleT:float()
+ scale = scale or 1.0
+ self.scaleT[1] = scale
+
+ assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D');
+ assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous')
+ if not self.weightDesc then self:resetWeightDescriptors() end
+ self:createIODescriptors(input)
+
+ -- gradBias
+ errcheck('cudnnConvolutionBackwardBias', cudnn.getHandle(),
self.scaleT:data(),
self.oDescForBias[0], gradOutput:data(),
one:data(),
self.biasDesc[0], self.gradBias:data())
- for g = 0, self.groups - 1 do
- -- gradWeight
- errcheck('cudnnConvolutionBackwardFilter_v3', cudnn.getHandle(),
- self.scaleT:data(),
- self.iDesc[0], input:data() + g*self.input_offset,
- self.oDesc[0], gradOutput:data() + g*self.output_offset,
- self.convDesc[0],
- self.bwdFilterAlgType[0],
- self.extraBuffer:data(), self.extraBufferSizeInBytes,
- one:data(),
- self.weightDesc[0], self.gradWeight:data() + g*self.weight_offset);
- end
+ for g = 0, self.groups - 1 do
+ -- gradWeight
+ errcheck('cudnnConvolutionBackwardFilter_v3', cudnn.getHandle(),
+ self.scaleT:data(),
+ self.iDesc[0], input:data() + g*self.input_offset,
+ self.oDesc[0], gradOutput:data() + g*self.output_offset,
+ self.convDesc[0],
+ self.bwdFilterAlgType[0],
+ self.extraBuffer:data(), self.extraBufferSizeInBytes,
+ one:data(),
+ self.weightDesc[0], self.gradWeight:data() + g*self.weight_offset);
+ end
end
function SpatialConvolution:write(f)
- self.weightDesc = nil
- self.biasDesc = nil
- self.convDesc = nil
- self.iDesc = nil
- self.oDesc = nil
- self.oDescForBias = nil
- self.algType = nil
- self.fwdAlgType = nil
- self.bwdDataAlgType = nil
- self.bwdFilterAlgType = nil
- self.extraBuffer = nil
- self.extraBufferSizeInBytes = nil
- local var = {}
- for k,v in pairs(self) do
- var[k] = v
- end
- f:writeObject(var)
+ self.weightDesc = nil
+ self.biasDesc = nil
+ self.convDesc = nil
+ self.iDesc = nil
+ self.oDesc = nil
+ self.oDescForBias = nil
+ self.algType = nil
+ self.fwdAlgType = nil
+ self.bwdDataAlgType = nil
+ self.bwdFilterAlgType = nil
+ self.extraBuffer = nil
+ self.extraBufferSizeInBytes = nil
+ local var = {}
+ for k,v in pairs(self) do
+ var[k] = v
+ end
+ f:writeObject(var)
end
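
A minimal usage sketch of the algorithm-selection API exercised above (editorial illustration, not part of the commit): fastest() trades workspace for speed, cudnn.benchmark runs the exhaustive auto-tuner once per input geometry, and setMode() pins explicit algorithms. The enum names come from the cuDNN v3 headers declared in ffi.lua below; the module and input sizes are arbitrary.

require 'cudnn'

-- 16 -> 32 planes, 3x3 kernel, stride 1, pad 1, 2 groups
local conv = cudnn.SpatialConvolution(16, 32, 3, 3, 1, 1, 1, 1, 2):cuda()

-- option 1: prefer the fastest algorithms regardless of workspace size
conv:fastest()

-- option 2: run the exhaustive auto-tuner instead of the heuristic
cudnn.benchmark = true
cudnn.verbose = true   -- prints the 'AutoTuning: ...' lines seen above

-- option 3: pin explicit algorithms (forward, backward-data, backward-filter)
conv:setMode('CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM',
             'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1',
             'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1')

local output = conv:forward(torch.CudaTensor(8, 16, 64, 64))
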
diff --git a/VolumetricConvolution.lua b/VolumetricConvolution.lua
index fd2447d..4c73ce7 100644
--- a/VolumetricConvolution.lua
+++ b/VolumetricConvolution.lua
@@ -41,7 +41,6 @@ end
function VolumetricConvolution:fastest(mode)
if mode == nil then mode = true end
self.fastest_mode = mode
- self.iSize:fill(0)
return self
end
@@ -55,7 +54,6 @@ function VolumetricConvolution:setMode(fmode, bdmode, bwmode)
if bwmode ~= nil then
self.bwmode = bwmode
end
- self.iSize:fill(0)
return self
end
@@ -92,7 +90,7 @@ function VolumetricConvolution:createIODescriptors(input)
errcheck('cudnnSetConvolutionNdDescriptor_v3', self.convDesc[0],
3, pad:data(),
stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION',
- 'CUDNN_DATA_FLOAT');
+ 'CUDNN_DATA_FLOAT');
local function destroyConvDesc(d)
errcheck('cudnnDestroyConvolutionDescriptor', d[0]);
end
@@ -112,20 +110,20 @@ function VolumetricConvolution:createIODescriptors(input)
self.output:size(2),
self.output:size(3)*self.output:size(4),
self.output:size(5)))
- -----------------------------------------------------------------
- local maxBufSize = 0
+ -----------------------------------------------------------------
+ local maxBufSize = 0
-- create forwardAlgorithm descriptors for
local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1)
- local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
- local algWorkspaceLimit = self.workspace_limit
- or (self.nInputPlane * self.kT * self.kH * self.kW * 4) -- 4 = sizeof int/float.
- if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST' end
+ local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
+ local algWorkspaceLimit = self.workspace_limit
+ or (self.nInputPlane * self.kT * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+ if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST' end
errcheck('cudnnGetConvolutionForwardAlgorithm',
cudnn.getHandle(),
self.iDesc[0], self.weightDesc[0],
self.convDesc[0], self.oDesc[0],
algSearchMode, algWorkspaceLimit, algType)
- algType[0] = self.fmode or algType[0]
+ algType[0] = self.fmode or algType[0]
self.fwdAlgType = algType
local bufSize = torch.LongTensor(1)
errcheck('cudnnGetConvolutionForwardWorkspaceSize',
@@ -133,20 +131,20 @@ function VolumetricConvolution:createIODescriptors(input)
self.iDesc[0], self.weightDesc[0],
self.convDesc[0], self.oDesc[0],
algType[0], bufSize:data())
- maxBufSize = math.max(maxBufSize, bufSize[1])
+ maxBufSize = math.max(maxBufSize, bufSize[1])
- -- create backwardFilterAlgorithm descriptors for
+ -- create backwardFilterAlgorithm descriptors for
local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1)
- local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE'
- local algWorkspaceLimit = self.workspace_limit
- or (self.nInputPlane * self.kT * self.kH * self.kW * 4) -- 4 = sizeof int/float.
- if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST' end
+ local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE'
+ local algWorkspaceLimit = self.workspace_limit
+ or (self.nInputPlane * self.kT * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+ if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST' end
errcheck('cudnnGetConvolutionBackwardFilterAlgorithm',
cudnn.getHandle(),
self.iDesc[0], self.oDesc[0],
self.convDesc[0], self.weightDesc[0],
algSearchMode, algWorkspaceLimit, algType)
- algType[0] = self.bwmode or algType[0]
+ algType[0] = self.bwmode or algType[0]
self.bwdFilterAlgType = algType
local bufSize = torch.LongTensor(1)
errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize',
@@ -154,20 +152,20 @@ function VolumetricConvolution:createIODescriptors(input)
self.iDesc[0], self.oDesc[0],
self.convDesc[0], self.weightDesc[0],
algType[0], bufSize:data())
- maxBufSize = math.max(maxBufSize, bufSize[1])
+ maxBufSize = math.max(maxBufSize, bufSize[1])
- -- create backwardDataAlgorithm descriptors for
+ -- create backwardDataAlgorithm descriptors for
local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1)
- local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE'
- local algWorkspaceLimit = self.workspace_limit
- or (self.nInputPlane * self.kT * self.kH * self.kW * 4) -- 4 = sizeof int/float.
- if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST' end
+ local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE'
+ local algWorkspaceLimit = self.workspace_limit
+ or (self.nInputPlane * self.kT * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+ if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST' end
errcheck('cudnnGetConvolutionBackwardDataAlgorithm',
cudnn.getHandle(),
self.weightDesc[0], self.oDesc[0],
self.convDesc[0], self.iDesc[0],
algSearchMode, algWorkspaceLimit, algType)
- algType[0] = self.bdmode or algType[0]
+ algType[0] = self.bdmode or algType[0]
self.bwdDataAlgType = algType
local bufSize = torch.LongTensor(1)
errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize',
@@ -175,16 +173,16 @@ function VolumetricConvolution:createIODescriptors(input)
self.weightDesc[0], self.oDesc[0],
self.convDesc[0], self.iDesc[0],
algType[0], bufSize:data())
- maxBufSize = math.max(maxBufSize, bufSize[1])
+ maxBufSize = math.max(maxBufSize, bufSize[1])
- self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace()
- self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float
+ self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace()
+ self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float
if maxBufSize > self.extraBufferSizeInBytes then
self.extraBuffer:resize(math.ceil(maxBufSize/4))
self.extraBufferSizeInBytes = maxBufSize
end
- -----------------------------------------------------------------
+ -----------------------------------------------------------------
if not batch then
self.gradInput = self.gradInput:view(self.gradInput:size(2),
self.gradInput:size(3),
@@ -226,14 +224,14 @@ function VolumetricConvolution:updateGradInput(input, gradOutput)
if not self.weightDesc then self:resetWeightDescriptors() end
self:createIODescriptors(input)
errcheck('cudnnConvolutionBackwardData_v3', cudnn.getHandle(),
- one:data(),
- self.weightDesc[0], self.weight:data(),
- self.oDesc[0], gradOutput:data(),
- self.convDesc[0],
- self.bwdDataAlgType[0],
- self.extraBuffer:data(), self.extraBufferSizeInBytes,
- zero:data(),
- self.iDesc[0], self.gradInput:data());
+ one:data(),
+ self.weightDesc[0], self.weight:data(),
+ self.oDesc[0], gradOutput:data(),
+ self.convDesc[0],
+ self.bwdDataAlgType[0],
+ self.extraBuffer:data(), self.extraBufferSizeInBytes,
+ zero:data(),
+ self.iDesc[0], self.gradInput:data());
return self.gradInput
end
@@ -256,14 +254,14 @@ function VolumetricConvolution:accGradParameters(input, gradOutput, scale)
self.biasDesc[0], self.gradBias:data());
-- gradWeight
errcheck('cudnnConvolutionBackwardFilter_v3', cudnn.getHandle(),
- self.scaleT:data(),
- self.iDesc[0], input:data(),
- self.oDesc[0], gradOutput:data(),
- self.convDesc[0],
- self.bwdFilterAlgType[0],
- self.extraBuffer:data(), self.extraBufferSizeInBytes,
- one:data(),
- self.weightDesc[0], self.gradWeight:data());
+ self.scaleT:data(),
+ self.iDesc[0], input:data(),
+ self.oDesc[0], gradOutput:data(),
+ self.convDesc[0],
+ self.bwdFilterAlgType[0],
+ self.extraBuffer:data(), self.extraBufferSizeInBytes,
+ one:data(),
+ self.weightDesc[0], self.gradWeight:data());
end
function VolumetricConvolution:write(f)
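
For reference, the default workspace cap used by both convolution modules when self.workspace_limit is unset is just the byte size of one filter's input taps; a small sketch of the arithmetic, with hypothetical sizes:

-- default cap: nInputPlane * kT * kH * kW floats at 4 bytes each
-- (the spatial module uses the same formula without the kT factor)
local nInputPlane, kT, kH, kW = 16, 3, 3, 3
print(nInputPlane * kT * kH * kW * 4)   --> 1728 bytes

-- a module can raise the cap before its descriptors are (re)built, e.g.
-- conv.workspace_limit = 64 * 1024 * 1024   -- 64 MiB
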
diff --git a/ffi.lua b/ffi.lua
index d749744..2bb68b7 100644
--- a/ffi.lua
+++ b/ffi.lua
@@ -111,13 +111,13 @@ cudnnStatus_t
cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc );
cudnnStatus_t
cudnnSetConvolutionNdDescriptor_v3( cudnnConvolutionDescriptor_t convDesc,
- int arrayLength,
- const int padA[],
- const int filterStrideA[],
- const int upscaleA[],
- cudnnConvolutionMode_t mode,
- cudnnDataType_t dataType
- );
+ int arrayLength,
+ const int padA[],
+ const int filterStrideA[],
+ const int upscaleA[],
+ cudnnConvolutionMode_t mode,
+ cudnnDataType_t dataType
+ );
cudnnStatus_t
cudnnGetConvolutionNdForwardOutputDim(
@@ -156,14 +156,14 @@ typedef struct {
cudnnStatus_t
cudnnFindConvolutionForwardAlgorithm(cudnnHandle_t handle,
- const cudnnTensorDescriptor_t srcDesc,
- const cudnnFilterDescriptor_t filterDesc,
- const cudnnConvolutionDescriptor_t convDesc,
- const cudnnTensorDescriptor_t destDesc,
- const int requestedCount,
- int *returnedCount,
- cudnnConvolutionFwdAlgoPerf_t *perfResults
- );
+ const cudnnTensorDescriptor_t srcDesc,
+ const cudnnFilterDescriptor_t filterDesc,
+ const cudnnConvolutionDescriptor_t convDesc,
+ const cudnnTensorDescriptor_t destDesc,
+ const int requestedCount,
+ int *returnedCount,
+ cudnnConvolutionFwdAlgoPerf_t *perfResults
+ );
cudnnStatus_t cudnnGetConvolutionForwardAlgorithm( cudnnHandle_t handle,
@@ -243,42 +243,42 @@ cudnnStatus_t cudnnFindConvolutionBackwardFilterAlgorithm( cudnnHandle_t handle,
cudnnStatus_t
cudnnGetConvolutionBackwardFilterAlgorithm(
- cudnnHandle_t handle,
- const cudnnTensorDescriptor_t srcDesc,
- const cudnnTensorDescriptor_t diffDesc,
- const cudnnConvolutionDescriptor_t convDesc,
- const cudnnFilterDescriptor_t gradDesc,
- cudnnConvolutionBwdFilterPreference_t preference,
- size_t memoryLimitInbytes,
- cudnnConvolutionBwdFilterAlgo_t *algo
- );
+ cudnnHandle_t handle,
+ const cudnnTensorDescriptor_t srcDesc,
+ const cudnnTensorDescriptor_t diffDesc,
+ const cudnnConvolutionDescriptor_t convDesc,
+ const cudnnFilterDescriptor_t gradDesc,
+ cudnnConvolutionBwdFilterPreference_t preference,
+ size_t memoryLimitInbytes,
+ cudnnConvolutionBwdFilterAlgo_t *algo
+ );
cudnnStatus_t
cudnnGetConvolutionBackwardFilterWorkspaceSize(
- cudnnHandle_t handle,
- const cudnnTensorDescriptor_t srcDesc,
- const cudnnTensorDescriptor_t diffDesc,
- const cudnnConvolutionDescriptor_t convDesc,
- const cudnnFilterDescriptor_t gradDesc,
- cudnnConvolutionBwdFilterAlgo_t algo,
- size_t *sizeInBytes
- );
+ cudnnHandle_t handle,
+ const cudnnTensorDescriptor_t srcDesc,
+ const cudnnTensorDescriptor_t diffDesc,
+ const cudnnConvolutionDescriptor_t convDesc,
+ const cudnnFilterDescriptor_t gradDesc,
+ cudnnConvolutionBwdFilterAlgo_t algo,
+ size_t *sizeInBytes
+ );
cudnnStatus_t cudnnConvolutionBackwardFilter_v3(
- cudnnHandle_t handle,
- const void *alpha,
- const cudnnTensorDescriptor_t srcDesc,
- const void *srcData,
- const cudnnTensorDescriptor_t diffDesc,
- const void *diffData,
- const cudnnConvolutionDescriptor_t convDesc,
- cudnnConvolutionBwdFilterAlgo_t algo,
- void *workSpace,
- size_t workSpaceSizeInBytes,
- const void *beta,
- const cudnnFilterDescriptor_t gradDesc,
- void *gradData
- );
+ cudnnHandle_t handle,
+ const void *alpha,
+ const cudnnTensorDescriptor_t srcDesc,
+ const void *srcData,
+ const cudnnTensorDescriptor_t diffDesc,
+ const void *diffData,
+ const cudnnConvolutionDescriptor_t convDesc,
+ cudnnConvolutionBwdFilterAlgo_t algo,
+ void *workSpace,
+ size_t workSpaceSizeInBytes,
+ const void *beta,
+ const cudnnFilterDescriptor_t gradDesc,
+ void *gradData
+ );
typedef enum
{
@@ -312,42 +312,42 @@ cudnnStatus_t cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle_t handle,
);
cudnnStatus_t cudnnGetConvolutionBackwardDataAlgorithm(
- cudnnHandle_t handle,
- const cudnnFilterDescriptor_t filterDesc,
- const cudnnTensorDescriptor_t diffDesc,
- const cudnnConvolutionDescriptor_t convDesc,
- const cudnnTensorDescriptor_t gradDesc,
- cudnnConvolutionBwdDataPreference_t preference,
- size_t memoryLimitInbytes,
- cudnnConvolutionBwdDataAlgo_t *algo
- );
+ cudnnHandle_t handle,
+ const cudnnFilterDescriptor_t filterDesc,
+ const cudnnTensorDescriptor_t diffDesc,
+ const cudnnConvolutionDescriptor_t convDesc,
+ const cudnnTensorDescriptor_t gradDesc,
+ cudnnConvolutionBwdDataPreference_t preference,
+ size_t memoryLimitInbytes,
+ cudnnConvolutionBwdDataAlgo_t *algo
+ );
cudnnStatus_t cudnnGetConvolutionBackwardDataWorkspaceSize(
- cudnnHandle_t handle,
- const cudnnFilterDescriptor_t filterDesc,
- const cudnnTensorDescriptor_t diffDesc,
- const cudnnConvolutionDescriptor_t convDesc,
- const cudnnTensorDescriptor_t gradDesc,
- cudnnConvolutionBwdDataAlgo_t algo,
- size_t *sizeInBytes
- );
+ cudnnHandle_t handle,
+ const cudnnFilterDescriptor_t filterDesc,
+ const cudnnTensorDescriptor_t diffDesc,
+ const cudnnConvolutionDescriptor_t convDesc,
+ const cudnnTensorDescriptor_t gradDesc,
+ cudnnConvolutionBwdDataAlgo_t algo,
+ size_t *sizeInBytes
+ );
cudnnStatus_t cudnnConvolutionBackwardData_v3(
- cudnnHandle_t handle,
- const void *alpha,
- const cudnnFilterDescriptor_t filterDesc,
- const void *filterData,
- const cudnnTensorDescriptor_t diffDesc,
- const void *diffData,
- const cudnnConvolutionDescriptor_t convDesc,
- cudnnConvolutionBwdDataAlgo_t algo,
- void *workSpace,
- size_t workSpaceSizeInBytes,
- const void *beta,
- const cudnnTensorDescriptor_t gradDesc,
- void *gradData
- );
+ cudnnHandle_t handle,
+ const void *alpha,
+ const cudnnFilterDescriptor_t filterDesc,
+ const void *filterData,
+ const cudnnTensorDescriptor_t diffDesc,
+ const void *diffData,
+ const cudnnConvolutionDescriptor_t convDesc,
+ cudnnConvolutionBwdDataAlgo_t algo,
+ void *workSpace,
+ size_t workSpaceSizeInBytes,
+ const void *beta,
+ const cudnnTensorDescriptor_t gradDesc,
+ void *gradData
+ );
typedef enum
@@ -365,19 +365,19 @@ typedef enum
/* Function to perform forward softmax */
cudnnStatus_t cudnnSoftmaxForward( cudnnHandle_t handle,
- cudnnSoftmaxAlgorithm_t algorithm,
- cudnnSoftmaxMode_t mode,
- const void *alpha,
- const cudnnTensorDescriptor_t srcDesc,
- const void *srcData,
- const void *beta,
- const cudnnTensorDescriptor_t destDesc,
- void *destData
- );
+ cudnnSoftmaxAlgorithm_t algorithm,
+ cudnnSoftmaxMode_t mode,
+ const void *alpha,
+ const cudnnTensorDescriptor_t srcDesc,
+ const void *srcData,
+ const void *beta,
+ const cudnnTensorDescriptor_t destDesc,
+ void *destData
+ );
/* Function to perform backward softmax */
cudnnStatus_t cudnnSoftmaxBackward(
- cudnnHandle_t handle,
+ cudnnHandle_t handle,
cudnnSoftmaxAlgorithm_t algorithm,
cudnnSoftmaxMode_t mode,
const void *alpha,
@@ -399,15 +399,15 @@ typedef enum
} cudnnPoolingMode_t;
cudnnStatus_t cudnnCreatePoolingDescriptor(
- cudnnPoolingDescriptor_t *poolingDesc);
+ cudnnPoolingDescriptor_t *poolingDesc);
cudnnStatus_t cudnnSetPoolingNdDescriptor(
- cudnnPoolingDescriptor_t poolingDesc,
+ cudnnPoolingDescriptor_t poolingDesc,
const cudnnPoolingMode_t mode,
int nbDims,
const int windowDimA[],
const int paddingA[],
const int strideA[]
- );
+ );
cudnnStatus_t cudnnGetPoolingNdDescriptor(
const cudnnPoolingDescriptor_t poolingDesc,
@@ -421,12 +421,12 @@ cudnnStatus_t cudnnGetPoolingNdDescriptor(
cudnnStatus_t cudnnGetPoolingNdForwardOutputDim(
const cudnnPoolingDescriptor_t poolingDesc,
- const cudnnTensorDescriptor_t inputTensorDesc,
- int nbDims,
- int outputTensorDimA[]);
+ const cudnnTensorDescriptor_t inputTensorDesc,
+ int nbDims,
+ int outputTensorDimA[]);
cudnnStatus_t cudnnDestroyPoolingDescriptor(
- cudnnPoolingDescriptor_t poolingDesc );
+ cudnnPoolingDescriptor_t poolingDesc );
cudnnStatus_t cudnnPoolingForward( cudnnHandle_t handle,
const cudnnPoolingDescriptor_t poolingDesc,
@@ -491,46 +491,46 @@ typedef enum
} cudnnLRNMode_t;
cudnnStatus_t cudnnSetLRNDescriptor(
- cudnnLRNDescriptor_t normDesc,
- unsigned lrnN,
- double lrnAlpha,
- double lrnBeta,
- double lrnK);
+ cudnnLRNDescriptor_t normDesc,
+ unsigned lrnN,
+ double lrnAlpha,
+ double lrnBeta,
+ double lrnK);
cudnnStatus_t cudnnGetLRNDescriptor(
- cudnnLRNDescriptor_t normDesc,
- unsigned* lrnN,
- double* lrnAlpha,
- double* lrnBeta,
- double* lrnK);
+ cudnnLRNDescriptor_t normDesc,
+ unsigned* lrnN,
+ double* lrnAlpha,
+ double* lrnBeta,
+ double* lrnK);
cudnnStatus_t cudnnDestroyLRNDescriptor( cudnnLRNDescriptor_t lrnDesc );
cudnnStatus_t cudnnLRNCrossChannelForward(
- cudnnHandle_t handle,
- cudnnLRNDescriptor_t normDesc,
- cudnnLRNMode_t lrnMode,
- const void* alpha,
- const cudnnTensorDescriptor_t srcDesc,
- const void *srcData,
- const void *beta,
- const cudnnTensorDescriptor_t destDesc,
- void *destData);
+ cudnnHandle_t handle,
+ cudnnLRNDescriptor_t normDesc,
+ cudnnLRNMode_t lrnMode,
+ const void* alpha,
+ const cudnnTensorDescriptor_t srcDesc,
+ const void *srcData,
+ const void *beta,
+ const cudnnTensorDescriptor_t destDesc,
+ void *destData);
cudnnStatus_t cudnnLRNCrossChannelBackward(
- cudnnHandle_t handle,
- cudnnLRNDescriptor_t normDesc,
- cudnnLRNMode_t lrnMode,
- const void* alpha,
- const cudnnTensorDescriptor_t srcDesc,
- const void *srcData,
- const cudnnTensorDescriptor_t srcDiffDesc,
- const void *srcDiffData,
- const cudnnTensorDescriptor_t destDesc,
- const void *destData,
- const void *beta,
- const cudnnTensorDescriptor_t destDiffDesc,
- void *destDiffData);
+ cudnnHandle_t handle,
+ cudnnLRNDescriptor_t normDesc,
+ cudnnLRNMode_t lrnMode,
+ const void* alpha,
+ const cudnnTensorDescriptor_t srcDesc,
+ const void *srcData,
+ const cudnnTensorDescriptor_t srcDiffDesc,
+ const void *srcDiffData,
+ const cudnnTensorDescriptor_t destDesc,
+ const void *destData,
+ const void *beta,
+ const cudnnTensorDescriptor_t destDiffDesc,
+ void *destDiffData);
typedef enum
{
@@ -538,36 +538,36 @@ typedef enum
} cudnnDivNormMode_t;
cudnnStatus_t cudnnDivisiveNormalizationForward(
- cudnnHandle_t handle,
- cudnnLRNDescriptor_t normDesc,
- cudnnDivNormMode_t mode,
- const void *alpha,
- const cudnnTensorDescriptor_t srcDesc,
- const void *srcData,
- const void *srcMeansData,
- void *tempData,
- void *tempData2,
- const void *beta,
- const cudnnTensorDescriptor_t destDesc,
- void *destData
- );
+ cudnnHandle_t handle,
+ cudnnLRNDescriptor_t normDesc,
+ cudnnDivNormMode_t mode,
+ const void *alpha,
+ const cudnnTensorDescriptor_t srcDesc,
+ const void *srcData,
+ const void *srcMeansData,
+ void *tempData,
+ void *tempData2,
+ const void *beta,
+ const cudnnTensorDescriptor_t destDesc,
+ void *destData
+ );
cudnnStatus_t cudnnDivisiveNormalizationBackward(
- cudnnHandle_t handle,
- cudnnLRNDescriptor_t normDesc,
- cudnnDivNormMode_t mode,
- const void *alpha,
- const cudnnTensorDescriptor_t srcDesc,
- const void *srcData,
- const void *srcMeansData,
- const void *srcDiffData,
- void *tempData,
- void *tempData2,
- const void *betaData,
- const cudnnTensorDescriptor_t destDataDesc,
- void *destDataDiff,
- void *destMeansDiff
- );
+ cudnnHandle_t handle,
+ cudnnLRNDescriptor_t normDesc,
+ cudnnDivNormMode_t mode,
+ const void *alpha,
+ const cudnnTensorDescriptor_t srcDesc,
+ const void *srcData,
+ const void *srcMeansData,
+ const void *srcDiffData,
+ void *tempData,
+ void *tempData2,
+ const void *betaData,
+ const cudnnTensorDescriptor_t destDataDesc,
+ void *destDataDiff,
+ void *destMeansDiff
+ );
]]
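
The declarations above feed LuaJIT's FFI; below is a simplified sketch of the errcheck pattern used throughout the modules (the real wrapper in cudnn.torch's init.lua also resolves the library path from the environment). It assumes the cdef block above has already been registered:

local ffi = require 'ffi'
local C = ffi.load('cudnn')

-- look up the cdef'd symbol by name, call it, and raise on failure;
-- LuaJIT allows comparing an enum value against its constant's name
local function errcheck(f, ...)
   local status = C[f](...)
   if status ~= 'CUDNN_STATUS_SUCCESS' then
      error('Error in cuDNN: ' .. ffi.string(C.cudnnGetErrorString(status)))
   end
end

-- e.g., creating a convolution descriptor as in SpatialConvolution.lua:
local convDesc = ffi.new('struct cudnnConvolutionStruct*[1]')
errcheck('cudnnCreateConvolutionDescriptor', convDesc)
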
diff --git a/functional.lua b/functional.lua
index 5a343c5..66bb4d6 100644
--- a/functional.lua
+++ b/functional.lua
@@ -181,7 +181,7 @@ cudnn.functional.Convolution2D_updateGradInput = function(handle, input, weight,
weightDesc[0], weight:data(),
oDesc[0], gradOutput:data(),
convDesc[0],
- algType[0],
+ algType[0],
NULL, 0,
zero:data(),
iDesc[0], gradInput:data());
diff --git a/test/test.lua b/test/test.lua
index 4062425..8c22ece 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -692,15 +692,11 @@ end
function cudnntest.LogSoftMax_batch()
local bs = math.random(1,32)
local from = math.random(1,32)
- local outi = math.random(1,64)
- local outj = math.random(1,64)
- local ini = outi
- local inj = outj
- local input = torch.randn(bs,from,inj,ini):cuda()
- local gradOutput = torch.randn(bs,from,outj,outi):cuda()
+ local input = torch.randn(bs,from):cuda()
+ local gradOutput = torch.randn(bs,from):cuda()
local sconv = nn.LogSoftMax():cuda()
- local groundtruth = sconv:forward(input:view(bs,-1))
+ local groundtruth = sconv:forward(input)
local groundgrad = sconv:backward(input, gradOutput)
cutorch.synchronize()
local gconv = cudnn.LogSoftMax():cuda()
@@ -713,8 +709,8 @@ function cudnntest.LogSoftMax_batch()
local rescuda = gconv:forward(input)
local resgrad = gconv:backward(input, gradOutput)
cutorch.synchronize()
- mytester:asserteq(rescuda:dim(), 4, 'error in dimension')
- mytester:asserteq(resgrad:dim(), 4, 'error in dimension')
+ mytester:asserteq(rescuda:dim(), 2, 'error in dimension')
+ mytester:asserteq(resgrad:dim(), 2, 'error in dimension')
local error = rescuda:float() - groundtruth:float()
mytester:assertlt(error:abs():max(),