diff options
author | soumith <soumith@fb.com> | 2015-09-15 20:32:36 +0300 |
---|---|---|
committer | soumith <soumith@fb.com> | 2015-09-15 20:32:36 +0300 |
commit | 4d5c3db15efc87fe4220fc06486a8d7be759dcc2 (patch) | |
tree | 87548ebc6a6c6113d952569d1ab72ccf6052ebb0 | |
parent | 97f41c48602a345344bb5f76e73e4b2fbf7eb679 (diff) |
whitespace cleanups, fixing logsoftmax test
-rw-r--r-- | SpatialConvolution.lua | 610 | ||||
-rw-r--r-- | VolumetricConvolution.lua | 86 | ||||
-rw-r--r-- | ffi.lua | 308 | ||||
-rw-r--r-- | functional.lua | 2 | ||||
-rw-r--r-- | test/test.lua | 14 |
5 files changed, 506 insertions, 514 deletions
diff --git a/SpatialConvolution.lua b/SpatialConvolution.lua index fe7f027..04c9319 100644 --- a/SpatialConvolution.lua +++ b/SpatialConvolution.lua @@ -1,146 +1,144 @@ local SpatialConvolution, parent = - torch.class('cudnn.SpatialConvolution', 'nn.SpatialConvolution') + torch.class('cudnn.SpatialConvolution', 'nn.SpatialConvolution') local ffi = require 'ffi' local errcheck = cudnn.errcheck function SpatialConvolution:__init(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH, groups) - local delayedReset = self.reset - self.reset = function() end - parent.__init(self, nInputPlane, nOutputPlane, kW, kH, dW, dH) - self.reset = delayedReset - self.padW = padW or 0 - self.padH = padH or 0 - self.groups = groups or 1 - assert(nInputPlane % self.groups == 0, - 'nInputPlane should be divisible by nGroups') - assert(nOutputPlane % self.groups == 0, - 'nOutputPlane should be divisible by nGroups') - self.weight = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH) - self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH) - self:reset() - self.iSize = torch.LongStorage(4):fill(0) - self.fastest_mode = true + local delayedReset = self.reset + self.reset = function() end + parent.__init(self, nInputPlane, nOutputPlane, kW, kH, dW, dH) + self.reset = delayedReset + self.padW = padW or 0 + self.padH = padH or 0 + self.groups = groups or 1 + assert(nInputPlane % self.groups == 0, + 'nInputPlane should be divisible by nGroups') + assert(nOutputPlane % self.groups == 0, + 'nOutputPlane should be divisible by nGroups') + self.weight = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH) + self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH) + self:reset() + self.iSize = torch.LongStorage(4):fill(0) + self.fastest_mode = true end -- if you change the configuration of the module manually, call this function SpatialConvolution:resetWeightDescriptors() - assert(torch.typename(self.weight) == 'torch.CudaTensor', - 'Only Cuda supported duh!') - assert(torch.typename(self.bias) == 'torch.CudaTensor', - 'Only Cuda supported duh!') - -- for compatibility - self.groups = self.groups or 1 - -- create filterDescriptor for weight - self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]') - errcheck('cudnnCreateFilterDescriptor', self.weightDesc) - local desc = torch.IntTensor({self.nOutputPlane/self.groups, - self.nInputPlane/self.groups, - self.kH, self.kW}) - errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0], - 'CUDNN_DATA_FLOAT', 4, - desc:data()); - local function destroyWDesc(d) - errcheck('cudnnDestroyFilterDescriptor', d[0]); - end - ffi.gc(self.weightDesc, destroyWDesc) - - -- create descriptor for bias - self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,1,1)) + assert(torch.typename(self.weight) == 'torch.CudaTensor', + 'Only Cuda supported duh!') + assert(torch.typename(self.bias) == 'torch.CudaTensor', + 'Only Cuda supported duh!') + -- for compatibility + self.groups = self.groups or 1 + -- create filterDescriptor for weight + self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]') + errcheck('cudnnCreateFilterDescriptor', self.weightDesc) + local desc = torch.IntTensor({self.nOutputPlane/self.groups, + self.nInputPlane/self.groups, + self.kH, self.kW}) + errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0], + 'CUDNN_DATA_FLOAT', 4, + desc:data()); + local function destroyWDesc(d) + errcheck('cudnnDestroyFilterDescriptor', d[0]); + end + ffi.gc(self.weightDesc, destroyWDesc) + + -- create descriptor for bias + self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,1,1)) end function SpatialConvolution:fastest(mode) - if mode == nil then mode = true end - self.fastest_mode = mode - self.iSize:fill(0) - return self + if mode == nil then mode = true end + self.fastest_mode = mode + return self end function SpatialConvolution:setMode(fmode, bdmode, bwmode) - if fmode ~= nil then - self.fmode = fmode - end - if bdmode ~= nil then - self.bdmode = bdmode - end - if bwmode ~= nil then - self.bwmode = bwmode - end - self.iSize:fill(0) - return self + if fmode ~= nil then + self.fmode = fmode + end + if bdmode ~= nil then + self.bdmode = bdmode + end + if bwmode ~= nil then + self.bwmode = bwmode + end + return self end function SpatialConvolution:resetMode() - self.fmode = nil - self.bdmode = nil - self.bwmode = nil - return self + self.fmode = nil + self.bdmode = nil + self.bwmode = nil + return self end function SpatialConvolution:createIODescriptors(input) - local batch = true - if input:dim() == 3 then - input = input:view(1, input:size(1), input:size(2), input:size(3)) - batch = false - end - assert(input:dim() == 4 and input:isContiguous()); - if not self.iDesc or not self.oDesc or - input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2] - or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] then - self.iSize = input:size() - - -- resize gradInput - if self.gradInput then self.gradInput:resizeAs(input); end - assert(self.nInputPlane == input:size(2), 'input has to contain: ' + local batch = true + if input:dim() == 3 then + input = input:view(1, input:size(1), input:size(2), input:size(3)) + batch = false + end + assert(input:dim() == 4 and input:isContiguous()); + if not self.iDesc or not self.oDesc or + input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2] + or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] then + self.iSize = input:size() + + -- resize gradInput + if self.gradInput then self.gradInput:resizeAs(input); end + assert(self.nInputPlane == input:size(2), 'input has to contain: ' .. self.nInputPlane .. ' feature maps, but received input of size: ' .. input:size(1) .. ' x ' .. input:size(2) .. ' x ' .. input:size(3) .. ' x ' .. input:size(4)) - -- create input descriptor - local input_slice = {{},{1,self.nInputPlane/self.groups},{},{}} - self.iDesc = cudnn.toDescriptor(input[input_slice]) - - -- create conv descriptor - self.convDesc = ffi.new('struct cudnnConvolutionStruct*[1]') - errcheck('cudnnCreateConvolutionDescriptor', self.convDesc) - local pad = torch.IntTensor({self.padH, self.padW}) - local stride = torch.IntTensor({self.dH, self.dW}) - local upscale = torch.IntTensor({1,1}) - errcheck('cudnnSetConvolutionNdDescriptor_v3', self.convDesc[0], - 2, pad:data(), - stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION', - 'CUDNN_DATA_FLOAT'); - local function destroyConvDesc(d) + -- create input descriptor + local input_slice = {{},{1,self.nInputPlane/self.groups},{},{}} + self.iDesc = cudnn.toDescriptor(input[input_slice]) + + -- create conv descriptor + self.convDesc = ffi.new('struct cudnnConvolutionStruct*[1]') + errcheck('cudnnCreateConvolutionDescriptor', self.convDesc) + local pad = torch.IntTensor({self.padH, self.padW}) + local stride = torch.IntTensor({self.dH, self.dW}) + local upscale = torch.IntTensor({1,1}) + errcheck('cudnnSetConvolutionNdDescriptor_v3', self.convDesc[0], + 2, pad:data(), + stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION', + 'CUDNN_DATA_FLOAT'); + local function destroyConvDesc(d) errcheck('cudnnDestroyConvolutionDescriptor', d[0]); - end - ffi.gc(self.convDesc, destroyConvDesc) - - -- get output shape, resize output - local oSize = torch.IntTensor(4) - local oSizeD = oSize:data() - errcheck('cudnnGetConvolutionNdForwardOutputDim', - self.convDesc[0], self.iDesc[0], - self.weightDesc[0], 4, oSizeD) - oSize[2] = oSize[2] * self.groups - self.output:resize(oSize:long():storage()) - - -- create descriptor for output - local output_slice = {{},{1,self.nOutputPlane/self.groups},{},{}} - self.oDesc = cudnn.toDescriptor(self.output[output_slice]) - self.oDescForBias = cudnn.toDescriptor(self.output) - - ----------------------------------------------------------------------- - local maxBufSize = 0 - - -- create forwardAlgorithm descriptors - local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1) - local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT' - local algWorkspaceLimit = self.workspace_limit - or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. - - if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST' end - if cudnn.benchmark then -- the manual auto-tuner is run + end + ffi.gc(self.convDesc, destroyConvDesc) + + -- get output shape, resize output + local oSize = torch.IntTensor(4) + local oSizeD = oSize:data() + errcheck('cudnnGetConvolutionNdForwardOutputDim', + self.convDesc[0], self.iDesc[0], + self.weightDesc[0], 4, oSizeD) + oSize[2] = oSize[2] * self.groups + self.output:resize(oSize:long():storage()) + + -- create descriptor for output + local output_slice = {{},{1,self.nOutputPlane/self.groups},{},{}} + self.oDesc = cudnn.toDescriptor(self.output[output_slice]) + self.oDescForBias = cudnn.toDescriptor(self.output) + + ----------------------------------------------------------------------- + local maxBufSize = 0 + + -- create forwardAlgorithm descriptors + local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1) + local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT' + local algWorkspaceLimit = self.workspace_limit + or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. + + if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST' end + if cudnn.benchmark then -- the manual auto-tuner is run local perfResults = ffi.new("cudnnConvolutionFwdAlgoPerf_t[?]", 1) local intt = torch.IntTensor(1); errcheck('cudnnFindConvolutionForwardAlgorithm', @@ -150,36 +148,36 @@ function SpatialConvolution:createIODescriptors(input) 1, intt:data(), perfResults) algType[0] = perfResults[0].algo if cudnn.verbose then - print('AutoTuning:', perfResults[0].time, - tonumber(perfResults[0].memory), tonumber(perfResults[0].algo)) + print('AutoTuning:', perfResults[0].time, + tonumber(perfResults[0].memory), tonumber(perfResults[0].algo)) end - else + else errcheck('cudnnGetConvolutionForwardAlgorithm', cudnn.getHandle(), self.iDesc[0], self.weightDesc[0], self.convDesc[0], self.oDesc[0], algSearchMode, algWorkspaceLimit, algType) - end - algType[0] = self.fmode or algType[0] - self.fwdAlgType = algType - local bufSize = torch.LongTensor(1) - errcheck('cudnnGetConvolutionForwardWorkspaceSize', - cudnn.getHandle(), - self.iDesc[0], self.weightDesc[0], - self.convDesc[0], self.oDesc[0], - algType[0], bufSize:data()) - maxBufSize = math.max(maxBufSize, bufSize[1]) - - -- create backwardFilterAlgorithm descriptors - local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1) - local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE' - local algWorkspaceLimit = self.workspace_limit - or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. - if self.fastest_mode then + end + algType[0] = self.fmode or algType[0] + self.fwdAlgType = algType + local bufSize = torch.LongTensor(1) + errcheck('cudnnGetConvolutionForwardWorkspaceSize', + cudnn.getHandle(), + self.iDesc[0], self.weightDesc[0], + self.convDesc[0], self.oDesc[0], + algType[0], bufSize:data()) + maxBufSize = math.max(maxBufSize, bufSize[1]) + + -- create backwardFilterAlgorithm descriptors + local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1) + local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE' + local algWorkspaceLimit = self.workspace_limit + or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. + if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST' - end + end - if cudnn.benchmark then -- the manual auto-tuner is run + if cudnn.benchmark then -- the manual auto-tuner is run local perfResults = ffi.new("cudnnConvolutionBwdFilterAlgoPerf_t[?]", 1) local intt = torch.IntTensor(1); errcheck('cudnnFindConvolutionBackwardFilterAlgorithm', @@ -189,35 +187,35 @@ function SpatialConvolution:createIODescriptors(input) 1, intt:data(), perfResults) algType[0] = perfResults[0].algo if cudnn.verbose then - print('AutoTuning:', perfResults[0].time, - tonumber(perfResults[0].memory), tonumber(perfResults[0].algo)) + print('AutoTuning:', perfResults[0].time, + tonumber(perfResults[0].memory), tonumber(perfResults[0].algo)) end - else + else errcheck('cudnnGetConvolutionBackwardFilterAlgorithm', cudnn.getHandle(), self.iDesc[0], self.oDesc[0], self.convDesc[0], self.weightDesc[0], algSearchMode, algWorkspaceLimit, algType) - end - algType[0] = self.bwmode or algType[0] - self.bwdFilterAlgType = algType - local bufSize = torch.LongTensor(1) - errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize', - cudnn.getHandle(), - self.iDesc[0], self.oDesc[0], - self.convDesc[0], self.weightDesc[0], - algType[0], bufSize:data()) - maxBufSize = math.max(maxBufSize, bufSize[1]) - - -- create backwardDataAlgorithm descriptors - local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1) - local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE' - local algWorkspaceLimit = self.workspace_limit - or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. - if self.fastest_mode then + end + algType[0] = self.bwmode or algType[0] + self.bwdFilterAlgType = algType + local bufSize = torch.LongTensor(1) + errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize', + cudnn.getHandle(), + self.iDesc[0], self.oDesc[0], + self.convDesc[0], self.weightDesc[0], + algType[0], bufSize:data()) + maxBufSize = math.max(maxBufSize, bufSize[1]) + + -- create backwardDataAlgorithm descriptors + local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1) + local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE' + local algWorkspaceLimit = self.workspace_limit + or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. + if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST' - end - if cudnn.benchmark then -- the manual auto-tuner is run + end + if cudnn.benchmark then -- the manual auto-tuner is run local perfResults = ffi.new("cudnnConvolutionBwdDataAlgoPerf_t[?]", 1) local intt = torch.IntTensor(1); errcheck('cudnnFindConvolutionBackwardDataAlgorithm', @@ -227,183 +225,183 @@ function SpatialConvolution:createIODescriptors(input) 1, intt:data(), perfResults) algType[0] = perfResults[0].algo if cudnn.verbose then - print('AutoTuning:', perfResults[0].time, - tonumber(perfResults[0].memory), tonumber(perfResults[0].algo)) + print('AutoTuning:', perfResults[0].time, + tonumber(perfResults[0].memory), tonumber(perfResults[0].algo)) end - else + else errcheck('cudnnGetConvolutionBackwardDataAlgorithm', cudnn.getHandle(), self.weightDesc[0], self.oDesc[0], self.convDesc[0], self.iDesc[0], algSearchMode, algWorkspaceLimit, algType) - end - algType[0] = self.bdmode or algType[0] - self.bwdDataAlgType = algType - local bufSize = torch.LongTensor(1) - errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize', - cudnn.getHandle(), - self.weightDesc[0], self.oDesc[0], - self.convDesc[0], self.iDesc[0], - algType[0], bufSize:data()) - maxBufSize = math.max(maxBufSize, bufSize[1]) - - self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace() - self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float - if maxBufSize > self.extraBufferSizeInBytes then - self.extraBuffer:resize(math.ceil(maxBufSize/4)) - self.extraBufferSizeInBytes = maxBufSize - end - - ----------------------------------------------------------------------- - -- create offsets for groups - local iH, iW = input:size(3), input:size(4) - local kH, kW = self.kH, self.kW - local oH, oW = oSize[3], oSize[4] - self.input_offset = self.nInputPlane / self.groups * iH * iW - self.output_offset = self.nOutputPlane / self.groups * oH, oW - self.weight_offset = self.nInputPlane / self.groups * self.nOutputPlane / self.groups * kH * kW - - if not batch then + end + algType[0] = self.bdmode or algType[0] + self.bwdDataAlgType = algType + local bufSize = torch.LongTensor(1) + errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize', + cudnn.getHandle(), + self.weightDesc[0], self.oDesc[0], + self.convDesc[0], self.iDesc[0], + algType[0], bufSize:data()) + maxBufSize = math.max(maxBufSize, bufSize[1]) + + self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace() + self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float + if maxBufSize > self.extraBufferSizeInBytes then + self.extraBuffer:resize(math.ceil(maxBufSize/4)) + self.extraBufferSizeInBytes = maxBufSize + end + + ----------------------------------------------------------------------- + -- create offsets for groups + local iH, iW = input:size(3), input:size(4) + local kH, kW = self.kH, self.kW + local oH, oW = oSize[3], oSize[4] + self.input_offset = self.nInputPlane / self.groups * iH * iW + self.output_offset = self.nOutputPlane / self.groups * oH, oW + self.weight_offset = self.nInputPlane / self.groups * self.nOutputPlane / self.groups * kH * kW + + if not batch then self.gradInput = self.gradInput:view(self.gradInput:size(2), self.gradInput:size(3), self.gradInput:size(4)) self.output = self.output:view(self.output:size(2), self.output:size(3), self.output:size(4)) - end - end + end + end end local one = torch.FloatTensor({1}); local zero = torch.FloatTensor({0}); function SpatialConvolution:updateOutput(input) - if not self.weightDesc then self:resetWeightDescriptors() end - self:createIODescriptors(input) - - local prevStream - local streamQueue = {} - if self.groups > 1 then -- try to do stream parallelization - prevStream = cutorch.getStream() - - --[[ - Only if prevStream is 0, then do parallelization. - the justification for this is that this is a hard problem, there is no - way to know if one is doing other kinds of stream-parallelization - (like GPUConcat), and if thats the case, streams are already - being ideally exploited. - --]] - - if prevStream == 0 then - cutorch.reserveStreams(self.groups) - for i=1,self.groups do - cutorch.streamWaitFor(i, {prevStream}) - end - end - end - - for g = 0, self.groups - 1 do - -- stream-parallelize if appropriate - if self.groups > 1 and prevStream == 0 then - cutorch.setStream(g + 1) - table.insert(streamQueue, g + 1) - end - - errcheck('cudnnConvolutionForward', cudnn.getHandle(), - one:data(), - self.iDesc[0], input:data() + g*self.input_offset, - self.weightDesc[0], self.weight:data() + g*self.weight_offset, - self.convDesc[0], self.fwdAlgType[0], - self.extraBuffer:data(), self.extraBufferSizeInBytes, - zero:data(), - self.oDesc[0], self.output:data() + g*self.output_offset); - end - - if prevStream == 0 then - cutorch.setStream(prevStream) - cutorch.streamWaitFor(prevStream, streamQueue) - end - - -- add bias - errcheck('cudnnAddTensor', cudnn.getHandle(), - 'CUDNN_ADD_SAME_C', - one:data(), self.biasDesc[0], self.bias:data(), - one:data(), self.oDescForBias[0], self.output:data()) - - return self.output + if not self.weightDesc then self:resetWeightDescriptors() end + self:createIODescriptors(input) + + local prevStream + local streamQueue = {} + if self.groups > 1 then -- try to do stream parallelization + prevStream = cutorch.getStream() + + --[[ + Only if prevStream is 0, then do parallelization. + the justification for this is that this is a hard problem, there is no + way to know if one is doing other kinds of stream-parallelization + (like GPUConcat), and if thats the case, streams are already + being ideally exploited. + --]] + + if prevStream == 0 then + cutorch.reserveStreams(self.groups) + for i=1,self.groups do + cutorch.streamWaitFor(i, {prevStream}) + end + end + end + + for g = 0, self.groups - 1 do + -- stream-parallelize if appropriate + if self.groups > 1 and prevStream == 0 then + cutorch.setStream(g + 1) + table.insert(streamQueue, g + 1) + end + + errcheck('cudnnConvolutionForward', cudnn.getHandle(), + one:data(), + self.iDesc[0], input:data() + g*self.input_offset, + self.weightDesc[0], self.weight:data() + g*self.weight_offset, + self.convDesc[0], self.fwdAlgType[0], + self.extraBuffer:data(), self.extraBufferSizeInBytes, + zero:data(), + self.oDesc[0], self.output:data() + g*self.output_offset); + end + + if prevStream == 0 then + cutorch.setStream(prevStream) + cutorch.streamWaitFor(prevStream, streamQueue) + end + + -- add bias + errcheck('cudnnAddTensor', cudnn.getHandle(), + 'CUDNN_ADD_SAME_C', + one:data(), self.biasDesc[0], self.bias:data(), + one:data(), self.oDescForBias[0], self.output:data()) + + return self.output end function SpatialConvolution:updateGradInput(input, gradOutput) - if not self.gradInput then return end - - assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D'); - assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous') - if not self.weightDesc then self:resetWeightDescriptors() end - self:createIODescriptors(input) - - for g = 0,self.groups - 1 do - errcheck('cudnnConvolutionBackwardData_v3', cudnn.getHandle(), - one:data(), - self.weightDesc[0], self.weight:data() + g*self.weight_offset, - self.oDesc[0], gradOutput:data() + g*self.output_offset, - self.convDesc[0], - self.bwdDataAlgType[0], - self.extraBuffer:data(), self.extraBufferSizeInBytes, - zero:data(), - self.iDesc[0], self.gradInput:data() + g*self.input_offset); - end - return self.gradInput + if not self.gradInput then return end + + assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D'); + assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous') + if not self.weightDesc then self:resetWeightDescriptors() end + self:createIODescriptors(input) + + for g = 0,self.groups - 1 do + errcheck('cudnnConvolutionBackwardData_v3', cudnn.getHandle(), + one:data(), + self.weightDesc[0], self.weight:data() + g*self.weight_offset, + self.oDesc[0], gradOutput:data() + g*self.output_offset, + self.convDesc[0], + self.bwdDataAlgType[0], + self.extraBuffer:data(), self.extraBufferSizeInBytes, + zero:data(), + self.iDesc[0], self.gradInput:data() + g*self.input_offset); + end + return self.gradInput end function SpatialConvolution:accGradParameters(input, gradOutput, scale) - self.scaleT = self.scaleT or torch.FloatTensor(1):fill(1.0) - -- this line forces this member to always be on CPU (needed for cudnn) - self.scaleT = self.scaleT:float() - scale = scale or 1.0 - self.scaleT[1] = scale - - assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D'); - assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous') - if not self.weightDesc then self:resetWeightDescriptors() end - self:createIODescriptors(input) - - -- gradBias - errcheck('cudnnConvolutionBackwardBias', cudnn.getHandle(), + self.scaleT = self.scaleT or torch.FloatTensor(1):fill(1.0) + -- this line forces this member to always be on CPU (needed for cudnn) + self.scaleT = self.scaleT:float() + scale = scale or 1.0 + self.scaleT[1] = scale + + assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D'); + assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous') + if not self.weightDesc then self:resetWeightDescriptors() end + self:createIODescriptors(input) + + -- gradBias + errcheck('cudnnConvolutionBackwardBias', cudnn.getHandle(), self.scaleT:data(), self.oDescForBias[0], gradOutput:data(), one:data(), self.biasDesc[0], self.gradBias:data()) - for g = 0, self.groups - 1 do - -- gradWeight - errcheck('cudnnConvolutionBackwardFilter_v3', cudnn.getHandle(), - self.scaleT:data(), - self.iDesc[0], input:data() + g*self.input_offset, - self.oDesc[0], gradOutput:data() + g*self.output_offset, - self.convDesc[0], - self.bwdFilterAlgType[0], - self.extraBuffer:data(), self.extraBufferSizeInBytes, - one:data(), - self.weightDesc[0], self.gradWeight:data() + g*self.weight_offset); - end + for g = 0, self.groups - 1 do + -- gradWeight + errcheck('cudnnConvolutionBackwardFilter_v3', cudnn.getHandle(), + self.scaleT:data(), + self.iDesc[0], input:data() + g*self.input_offset, + self.oDesc[0], gradOutput:data() + g*self.output_offset, + self.convDesc[0], + self.bwdFilterAlgType[0], + self.extraBuffer:data(), self.extraBufferSizeInBytes, + one:data(), + self.weightDesc[0], self.gradWeight:data() + g*self.weight_offset); + end end function SpatialConvolution:write(f) - self.weightDesc = nil - self.biasDesc = nil - self.convDesc = nil - self.iDesc = nil - self.oDesc = nil - self.oDescForBias = nil - self.algType = nil - self.fwdAlgType = nil - self.bwdDataAlgType = nil - self.bwdFilterAlgType = nil - self.extraBuffer = nil - self.extraBufferSizeInBytes = nil - local var = {} - for k,v in pairs(self) do - var[k] = v - end - f:writeObject(var) + self.weightDesc = nil + self.biasDesc = nil + self.convDesc = nil + self.iDesc = nil + self.oDesc = nil + self.oDescForBias = nil + self.algType = nil + self.fwdAlgType = nil + self.bwdDataAlgType = nil + self.bwdFilterAlgType = nil + self.extraBuffer = nil + self.extraBufferSizeInBytes = nil + local var = {} + for k,v in pairs(self) do + var[k] = v + end + f:writeObject(var) end diff --git a/VolumetricConvolution.lua b/VolumetricConvolution.lua index fd2447d..4c73ce7 100644 --- a/VolumetricConvolution.lua +++ b/VolumetricConvolution.lua @@ -41,7 +41,6 @@ end function VolumetricConvolution:fastest(mode) if mode == nil then mode = true end self.fastest_mode = mode - self.iSize:fill(0) return self end @@ -55,7 +54,6 @@ function VolumetricConvolution:setMode(fmode, bdmode, bwmode) if bwmode ~= nil then self.bwmode = bwmode end - self.iSize:fill(0) return self end @@ -92,7 +90,7 @@ function VolumetricConvolution:createIODescriptors(input) errcheck('cudnnSetConvolutionNdDescriptor_v3', self.convDesc[0], 3, pad:data(), stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION', - 'CUDNN_DATA_FLOAT'); + 'CUDNN_DATA_FLOAT'); local function destroyConvDesc(d) errcheck('cudnnDestroyConvolutionDescriptor', d[0]); end @@ -112,20 +110,20 @@ function VolumetricConvolution:createIODescriptors(input) self.output:size(2), self.output:size(3)*self.output:size(4), self.output:size(5))) - ----------------------------------------------------------------- - local maxBufSize = 0 + ----------------------------------------------------------------- + local maxBufSize = 0 -- create forwardAlgorithm descriptors for local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1) - local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT' - local algWorkspaceLimit = self.workspace_limit - or (self.nInputPlane * self.kT * self.kH * self.kW * 4) -- 4 = sizeof int/float. - if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST' end + local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT' + local algWorkspaceLimit = self.workspace_limit + or (self.nInputPlane * self.kT * self.kH * self.kW * 4) -- 4 = sizeof int/float. + if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST' end errcheck('cudnnGetConvolutionForwardAlgorithm', cudnn.getHandle(), self.iDesc[0], self.weightDesc[0], self.convDesc[0], self.oDesc[0], algSearchMode, algWorkspaceLimit, algType) - algType[0] = self.fmode or algType[0] + algType[0] = self.fmode or algType[0] self.fwdAlgType = algType local bufSize = torch.LongTensor(1) errcheck('cudnnGetConvolutionForwardWorkspaceSize', @@ -133,20 +131,20 @@ function VolumetricConvolution:createIODescriptors(input) self.iDesc[0], self.weightDesc[0], self.convDesc[0], self.oDesc[0], algType[0], bufSize:data()) - maxBufSize = math.max(maxBufSize, bufSize[1]) + maxBufSize = math.max(maxBufSize, bufSize[1]) - -- create backwardFilterAlgorithm descriptors for + -- create backwardFilterAlgorithm descriptors for local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1) - local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE' - local algWorkspaceLimit = self.workspace_limit - or (self.nInputPlane * self.kT * self.kH * self.kW * 4) -- 4 = sizeof int/float. - if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST' end + local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE' + local algWorkspaceLimit = self.workspace_limit + or (self.nInputPlane * self.kT * self.kH * self.kW * 4) -- 4 = sizeof int/float. + if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST' end errcheck('cudnnGetConvolutionBackwardFilterAlgorithm', cudnn.getHandle(), self.iDesc[0], self.oDesc[0], self.convDesc[0], self.weightDesc[0], algSearchMode, algWorkspaceLimit, algType) - algType[0] = self.bwmode or algType[0] + algType[0] = self.bwmode or algType[0] self.bwdFilterAlgType = algType local bufSize = torch.LongTensor(1) errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize', @@ -154,20 +152,20 @@ function VolumetricConvolution:createIODescriptors(input) self.iDesc[0], self.oDesc[0], self.convDesc[0], self.weightDesc[0], algType[0], bufSize:data()) - maxBufSize = math.max(maxBufSize, bufSize[1]) + maxBufSize = math.max(maxBufSize, bufSize[1]) - -- create backwardDataAlgorithm descriptors for + -- create backwardDataAlgorithm descriptors for local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1) - local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE' - local algWorkspaceLimit = self.workspace_limit - or (self.nInputPlane * self.kT * self.kH * self.kW * 4) -- 4 = sizeof int/float. - if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST' end + local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE' + local algWorkspaceLimit = self.workspace_limit + or (self.nInputPlane * self.kT * self.kH * self.kW * 4) -- 4 = sizeof int/float. + if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST' end errcheck('cudnnGetConvolutionBackwardDataAlgorithm', cudnn.getHandle(), self.weightDesc[0], self.oDesc[0], self.convDesc[0], self.iDesc[0], algSearchMode, algWorkspaceLimit, algType) - algType[0] = self.bdmode or algType[0] + algType[0] = self.bdmode or algType[0] self.bwdDataAlgType = algType local bufSize = torch.LongTensor(1) errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize', @@ -175,16 +173,16 @@ function VolumetricConvolution:createIODescriptors(input) self.weightDesc[0], self.oDesc[0], self.convDesc[0], self.iDesc[0], algType[0], bufSize:data()) - maxBufSize = math.max(maxBufSize, bufSize[1]) + maxBufSize = math.max(maxBufSize, bufSize[1]) - self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace() - self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float + self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace() + self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float if maxBufSize > self.extraBufferSizeInBytes then self.extraBuffer:resize(math.ceil(maxBufSize/4)) self.extraBufferSizeInBytes = maxBufSize end - ----------------------------------------------------------------- + ----------------------------------------------------------------- if not batch then self.gradInput = self.gradInput:view(self.gradInput:size(2), self.gradInput:size(3), @@ -226,14 +224,14 @@ function VolumetricConvolution:updateGradInput(input, gradOutput) if not self.weightDesc then self:resetWeightDescriptors() end self:createIODescriptors(input) errcheck('cudnnConvolutionBackwardData_v3', cudnn.getHandle(), - one:data(), - self.weightDesc[0], self.weight:data(), - self.oDesc[0], gradOutput:data(), - self.convDesc[0], - self.bwdDataAlgType[0], - self.extraBuffer:data(), self.extraBufferSizeInBytes, - zero:data(), - self.iDesc[0], self.gradInput:data()); + one:data(), + self.weightDesc[0], self.weight:data(), + self.oDesc[0], gradOutput:data(), + self.convDesc[0], + self.bwdDataAlgType[0], + self.extraBuffer:data(), self.extraBufferSizeInBytes, + zero:data(), + self.iDesc[0], self.gradInput:data()); return self.gradInput end @@ -256,14 +254,14 @@ function VolumetricConvolution:accGradParameters(input, gradOutput, scale) self.biasDesc[0], self.gradBias:data()); -- gradWeight errcheck('cudnnConvolutionBackwardFilter_v3', cudnn.getHandle(), - self.scaleT:data(), - self.iDesc[0], input:data(), - self.oDesc[0], gradOutput:data(), - self.convDesc[0], - self.bwdFilterAlgType[0], - self.extraBuffer:data(), self.extraBufferSizeInBytes, - one:data(), - self.weightDesc[0], self.gradWeight:data()); + self.scaleT:data(), + self.iDesc[0], input:data(), + self.oDesc[0], gradOutput:data(), + self.convDesc[0], + self.bwdFilterAlgType[0], + self.extraBuffer:data(), self.extraBufferSizeInBytes, + one:data(), + self.weightDesc[0], self.gradWeight:data()); end function VolumetricConvolution:write(f) @@ -111,13 +111,13 @@ cudnnStatus_t cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc ); cudnnStatus_t cudnnSetConvolutionNdDescriptor_v3( cudnnConvolutionDescriptor_t convDesc, - int arrayLength, - const int padA[], - const int filterStrideA[], - const int upscaleA[], - cudnnConvolutionMode_t mode, - cudnnDataType_t dataType - ); + int arrayLength, + const int padA[], + const int filterStrideA[], + const int upscaleA[], + cudnnConvolutionMode_t mode, + cudnnDataType_t dataType + ); cudnnStatus_t cudnnGetConvolutionNdForwardOutputDim( @@ -156,14 +156,14 @@ typedef struct { cudnnStatus_t cudnnFindConvolutionForwardAlgorithm(cudnnHandle_t handle, - const cudnnTensorDescriptor_t srcDesc, - const cudnnFilterDescriptor_t filterDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t destDesc, - const int requestedCount, - int *returnedCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults - ); + const cudnnTensorDescriptor_t srcDesc, + const cudnnFilterDescriptor_t filterDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t destDesc, + const int requestedCount, + int *returnedCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults + ); cudnnStatus_t cudnnGetConvolutionForwardAlgorithm( cudnnHandle_t handle, @@ -243,42 +243,42 @@ cudnnStatus_t cudnnFindConvolutionBackwardFilterAlgorithm( cudnnHandle_t handle, cudnnStatus_t cudnnGetConvolutionBackwardFilterAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t srcDesc, - const cudnnTensorDescriptor_t diffDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t gradDesc, - cudnnConvolutionBwdFilterPreference_t preference, - size_t memoryLimitInbytes, - cudnnConvolutionBwdFilterAlgo_t *algo - ); + cudnnHandle_t handle, + const cudnnTensorDescriptor_t srcDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, + cudnnConvolutionBwdFilterPreference_t preference, + size_t memoryLimitInbytes, + cudnnConvolutionBwdFilterAlgo_t *algo + ); cudnnStatus_t cudnnGetConvolutionBackwardFilterWorkspaceSize( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t srcDesc, - const cudnnTensorDescriptor_t diffDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t gradDesc, - cudnnConvolutionBwdFilterAlgo_t algo, - size_t *sizeInBytes - ); + cudnnHandle_t handle, + const cudnnTensorDescriptor_t srcDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, + cudnnConvolutionBwdFilterAlgo_t algo, + size_t *sizeInBytes + ); cudnnStatus_t cudnnConvolutionBackwardFilter_v3( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t srcDesc, - const void *srcData, - const cudnnTensorDescriptor_t diffDesc, - const void *diffData, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdFilterAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnFilterDescriptor_t gradDesc, - void *gradData - ); + cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t srcDesc, + const void *srcData, + const cudnnTensorDescriptor_t diffDesc, + const void *diffData, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdFilterAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnFilterDescriptor_t gradDesc, + void *gradData + ); typedef enum { @@ -312,42 +312,42 @@ cudnnStatus_t cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle_t handle, ); cudnnStatus_t cudnnGetConvolutionBackwardDataAlgorithm( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t filterDesc, - const cudnnTensorDescriptor_t diffDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t gradDesc, - cudnnConvolutionBwdDataPreference_t preference, - size_t memoryLimitInbytes, - cudnnConvolutionBwdDataAlgo_t *algo - ); + cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, + cudnnConvolutionBwdDataPreference_t preference, + size_t memoryLimitInbytes, + cudnnConvolutionBwdDataAlgo_t *algo + ); cudnnStatus_t cudnnGetConvolutionBackwardDataWorkspaceSize( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t filterDesc, - const cudnnTensorDescriptor_t diffDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t gradDesc, - cudnnConvolutionBwdDataAlgo_t algo, - size_t *sizeInBytes - ); + cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, + cudnnConvolutionBwdDataAlgo_t algo, + size_t *sizeInBytes + ); cudnnStatus_t cudnnConvolutionBackwardData_v3( - cudnnHandle_t handle, - const void *alpha, - const cudnnFilterDescriptor_t filterDesc, - const void *filterData, - const cudnnTensorDescriptor_t diffDesc, - const void *diffData, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdDataAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnTensorDescriptor_t gradDesc, - void *gradData - ); + cudnnHandle_t handle, + const void *alpha, + const cudnnFilterDescriptor_t filterDesc, + const void *filterData, + const cudnnTensorDescriptor_t diffDesc, + const void *diffData, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdDataAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnTensorDescriptor_t gradDesc, + void *gradData + ); typedef enum @@ -365,19 +365,19 @@ typedef enum /* Function to perform forward softmax */ cudnnStatus_t cudnnSoftmaxForward( cudnnHandle_t handle, - cudnnSoftmaxAlgorithm_t algorithm, - cudnnSoftmaxMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t srcDesc, - const void *srcData, - const void *beta, - const cudnnTensorDescriptor_t destDesc, - void *destData - ); + cudnnSoftmaxAlgorithm_t algorithm, + cudnnSoftmaxMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t srcDesc, + const void *srcData, + const void *beta, + const cudnnTensorDescriptor_t destDesc, + void *destData + ); /* Function to perform backward softmax */ cudnnStatus_t cudnnSoftmaxBackward( - cudnnHandle_t handle, + cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algorithm, cudnnSoftmaxMode_t mode, const void *alpha, @@ -399,15 +399,15 @@ typedef enum } cudnnPoolingMode_t; cudnnStatus_t cudnnCreatePoolingDescriptor( - cudnnPoolingDescriptor_t *poolingDesc); + cudnnPoolingDescriptor_t *poolingDesc); cudnnStatus_t cudnnSetPoolingNdDescriptor( - cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode, int nbDims, const int windowDimA[], const int paddingA[], const int strideA[] - ); + ); cudnnStatus_t cudnnGetPoolingNdDescriptor( const cudnnPoolingDescriptor_t poolingDesc, @@ -421,12 +421,12 @@ cudnnStatus_t cudnnGetPoolingNdDescriptor( cudnnStatus_t cudnnGetPoolingNdForwardOutputDim( const cudnnPoolingDescriptor_t poolingDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - int nbDims, - int outputTensorDimA[]); + const cudnnTensorDescriptor_t inputTensorDesc, + int nbDims, + int outputTensorDimA[]); cudnnStatus_t cudnnDestroyPoolingDescriptor( - cudnnPoolingDescriptor_t poolingDesc ); + cudnnPoolingDescriptor_t poolingDesc ); cudnnStatus_t cudnnPoolingForward( cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, @@ -491,46 +491,46 @@ typedef enum } cudnnLRNMode_t; cudnnStatus_t cudnnSetLRNDescriptor( - cudnnLRNDescriptor_t normDesc, - unsigned lrnN, - double lrnAlpha, - double lrnBeta, - double lrnK); + cudnnLRNDescriptor_t normDesc, + unsigned lrnN, + double lrnAlpha, + double lrnBeta, + double lrnK); cudnnStatus_t cudnnGetLRNDescriptor( - cudnnLRNDescriptor_t normDesc, - unsigned* lrnN, - double* lrnAlpha, - double* lrnBeta, - double* lrnK); + cudnnLRNDescriptor_t normDesc, + unsigned* lrnN, + double* lrnAlpha, + double* lrnBeta, + double* lrnK); cudnnStatus_t cudnnDestroyLRNDescriptor( cudnnLRNDescriptor_t lrnDesc ); cudnnStatus_t cudnnLRNCrossChannelForward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnLRNMode_t lrnMode, - const void* alpha, - const cudnnTensorDescriptor_t srcDesc, - const void *srcData, - const void *beta, - const cudnnTensorDescriptor_t destDesc, - void *destData); + cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnLRNMode_t lrnMode, + const void* alpha, + const cudnnTensorDescriptor_t srcDesc, + const void *srcData, + const void *beta, + const cudnnTensorDescriptor_t destDesc, + void *destData); cudnnStatus_t cudnnLRNCrossChannelBackward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnLRNMode_t lrnMode, - const void* alpha, - const cudnnTensorDescriptor_t srcDesc, - const void *srcData, - const cudnnTensorDescriptor_t srcDiffDesc, - const void *srcDiffData, - const cudnnTensorDescriptor_t destDesc, - const void *destData, - const void *beta, - const cudnnTensorDescriptor_t destDiffDesc, - void *destDiffData); + cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnLRNMode_t lrnMode, + const void* alpha, + const cudnnTensorDescriptor_t srcDesc, + const void *srcData, + const cudnnTensorDescriptor_t srcDiffDesc, + const void *srcDiffData, + const cudnnTensorDescriptor_t destDesc, + const void *destData, + const void *beta, + const cudnnTensorDescriptor_t destDiffDesc, + void *destDiffData); typedef enum { @@ -538,36 +538,36 @@ typedef enum } cudnnDivNormMode_t; cudnnStatus_t cudnnDivisiveNormalizationForward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnDivNormMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t srcDesc, - const void *srcData, - const void *srcMeansData, - void *tempData, - void *tempData2, - const void *beta, - const cudnnTensorDescriptor_t destDesc, - void *destData - ); + cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t srcDesc, + const void *srcData, + const void *srcMeansData, + void *tempData, + void *tempData2, + const void *beta, + const cudnnTensorDescriptor_t destDesc, + void *destData + ); cudnnStatus_t cudnnDivisiveNormalizationBackward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnDivNormMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t srcDesc, - const void *srcData, - const void *srcMeansData, - const void *srcDiffData, - void *tempData, - void *tempData2, - const void *betaData, - const cudnnTensorDescriptor_t destDataDesc, - void *destDataDiff, - void *destMeansDiff - ); + cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t srcDesc, + const void *srcData, + const void *srcMeansData, + const void *srcDiffData, + void *tempData, + void *tempData2, + const void *betaData, + const cudnnTensorDescriptor_t destDataDesc, + void *destDataDiff, + void *destMeansDiff + ); ]] diff --git a/functional.lua b/functional.lua index 5a343c5..66bb4d6 100644 --- a/functional.lua +++ b/functional.lua @@ -181,7 +181,7 @@ cudnn.functional.Convolution2D_updateGradInput = function(handle, input, weight, weightDesc[0], weight:data(), oDesc[0], gradOutput:data(), convDesc[0], - algType[0], + algType[0], NULL, 0, zero:data(), iDesc[0], gradInput:data()); diff --git a/test/test.lua b/test/test.lua index 4062425..8c22ece 100644 --- a/test/test.lua +++ b/test/test.lua @@ -692,15 +692,11 @@ end function cudnntest.LogSoftMax_batch() local bs = math.random(1,32) local from = math.random(1,32) - local outi = math.random(1,64) - local outj = math.random(1,64) - local ini = outi - local inj = outj - local input = torch.randn(bs,from,inj,ini):cuda() - local gradOutput = torch.randn(bs,from,outj,outi):cuda() + local input = torch.randn(bs,from):cuda() + local gradOutput = torch.randn(bs,from):cuda() local sconv = nn.LogSoftMax():cuda() - local groundtruth = sconv:forward(input:view(bs,-1)) + local groundtruth = sconv:forward(input) local groundgrad = sconv:backward(input, gradOutput) cutorch.synchronize() local gconv = cudnn.LogSoftMax():cuda() @@ -713,8 +709,8 @@ function cudnntest.LogSoftMax_batch() local rescuda = gconv:forward(input) local resgrad = gconv:backward(input, gradOutput) cutorch.synchronize() - mytester:asserteq(rescuda:dim(), 4, 'error in dimension') - mytester:asserteq(resgrad:dim(), 4, 'error in dimension') + mytester:asserteq(rescuda:dim(), 2, 'error in dimension') + mytester:asserteq(resgrad:dim(), 2, 'error in dimension') local error = rescuda:float() - groundtruth:float() mytester:assertlt(error:abs():max(), |