diff options
-rw-r--r-- | RNN.lua | 24 | ||||
-rw-r--r-- | SpatialConvolution.lua | 341 | ||||
-rw-r--r-- | SpatialFullConvolution.lua | 290 | ||||
-rw-r--r-- | TemporalConvolution.lua | 21 | ||||
-rw-r--r-- | VolumetricConvolution.lua | 356 | ||||
-rw-r--r-- | algo.lua | 129 | ||||
-rw-r--r-- | cudnn-scm-1.rockspec | 2 | ||||
-rw-r--r-- | functional.lua | 2 | ||||
-rw-r--r-- | init.lua | 17 | ||||
-rw-r--r-- | test/test.lua | 38 | ||||
-rw-r--r-- | test/test_groups.lua | 2 |
11 files changed, 889 insertions, 333 deletions
@@ -63,29 +63,43 @@ function RNN:reset(stdv) self.gradWeight:resizeAs(self.weight):zero() end +function RNN:createDescriptors(count, descs_type, create_func, destroy_func) + local ds = ffi.new(descs_type, count) + for i = 0, count - 1 do + errcheck(create_func, ds + i) + end + local function destroyDescriptors(ds) + for i = 0, count - 1 do + errcheck(destroy_func, ds[i]) + end + end + ffi.gc(ds, destroyDescriptors) + return ds +end + function RNN:createDropoutDescriptors(count) - return cudnn.createDescriptors(count, + return self:createDescriptors(count, 'cudnnDropoutDescriptor_t[?]', 'cudnnCreateDropoutDescriptor', 'cudnnDestroyDropoutDescriptor') end function RNN:createFilterDescriptors(count) - return cudnn.createDescriptors(count, + return self:createDescriptors(count, 'cudnnFilterDescriptor_t[?]', 'cudnnCreateFilterDescriptor', 'cudnnDestroyFilterDescriptor') end function RNN:createRNNDescriptors(count) - return cudnn.createDescriptors(count, + return self:createDescriptors(count, 'cudnnRNNDescriptor_t[?]', 'cudnnCreateRNNDescriptor', 'cudnnDestroyRNNDescriptor') end function RNN:createTensorDescriptors(count) - return cudnn.createDescriptors(count, + return self:createDescriptors(count, 'cudnnTensorDescriptor_t[?]', 'cudnnCreateTensorDescriptor', 'cudnnDestroyTensorDescriptor') @@ -369,7 +383,7 @@ function RNN:updateOutput(input) if self.cellOutput then self.cellInput = self.cellOutput:clone() end - end + end if (self.batchFirst) then self.output = self.output:transpose(1, 2) end diff --git a/SpatialConvolution.lua b/SpatialConvolution.lua index 1656154..58c78b2 100644 --- a/SpatialConvolution.lua +++ b/SpatialConvolution.lua @@ -1,9 +1,13 @@ local SpatialConvolution, parent = torch.class('cudnn.SpatialConvolution', 'nn.SpatialConvolution') local ffi = require 'ffi' -local algo = require 'cudnn.algo' local errcheck = cudnn.errcheck +local autotunerCache = {} +autotunerCache[1] = {} -- forward +autotunerCache[2] = {} -- backwardFilter +autotunerCache[3] = {} 
-- backwardData + function SpatialConvolution:__init(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH, groups) local delayedReset = self.reset @@ -22,38 +26,39 @@ function SpatialConvolution:__init(nInputPlane, nOutputPlane, self:reset() -- should nil for serialization, the reset will still work self.reset = nil - return self -end - -function SpatialConvolution:createWeightDescriptors() - assert(cudnn.typemap[torch.typename(self.weight)] or not self.weight, 'Only Cuda supported duh!') - assert(cudnn.typemap[torch.typename(self.bias)] or not self.bias, 'Only Cuda supported duh!') - -- create descriptor for bias - if self.bias then - self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,1,1)) - end - -- create filterDescriptor for weight - return cudnn.createDescriptors(1, 'struct cudnnFilterStruct*[?]', - 'cudnnCreateFilterDescriptor', 'cudnnDestroyFilterDescriptor') end -- if you change the configuration of the module manually, call this -function SpatialConvolution:resetWeightDescriptors(desc) +function SpatialConvolution:resetWeightDescriptors() + assert(cudnn.typemap[torch.typename(self.weight)], 'Only Cuda supported duh!') + assert(cudnn.typemap[torch.typename(self.bias)] or not self.bias, 'Only Cuda supported duh!') -- for compatibility self.groups = self.groups or 1 - self.weightDesc = SpatialConvolution.createWeightDescriptors(self) - desc = desc or torch.IntTensor({self.nOutputPlane/self.groups, - self.nInputPlane/self.groups, - self.kH, self.kW}) + -- create filterDescriptor for weight + self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]') + errcheck('cudnnCreateFilterDescriptor', self.weightDesc) + local desc = torch.IntTensor({self.nOutputPlane/self.groups, + self.nInputPlane/self.groups, + self.kH, self.kW}) errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0], - cudnn.typemap[torch.typename(self.weight)], 'CUDNN_TENSOR_NCHW', self.nDim, + cudnn.typemap[torch.typename(self.weight)], 'CUDNN_TENSOR_NCHW', 4, desc:data()); + 
local function destroyWDesc(d) + errcheck('cudnnDestroyFilterDescriptor', d[0]); + end + ffi.gc(self.weightDesc, destroyWDesc) + + -- create descriptor for bias + if self.bias then + self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,1,1)) + end end function SpatialConvolution:fastest(mode) if mode == nil then mode = true end self.fastest_mode = mode - self.iDesc = nil + self.iSize = self.iSize or torch.LongStorage(4) + self.iSize:fill(0) return self end @@ -67,7 +72,8 @@ function SpatialConvolution:setMode(fmode, bdmode, bwmode) if bwmode ~= nil then self.bwmode = bwmode end - self.iDesc = nil + self.iSize = self.iSize or torch.LongStorage(4) + self.iSize:fill(0) return self end @@ -84,41 +90,32 @@ function SpatialConvolution:noBias() return self end - -function SpatialConvolution:checkInputChanged(input) - self.nDim = self.nDim or 4 - assert(input:dim() == self.nDim) - assert(input:isContiguous()) - self.iSize = self.iSize or torch.LongStorage(self.nDim):fill(0) - self.groups = self.groups or 1 - if not self.weightDesc then self:resetWeightDescriptors() end - if not self.iDesc or not self.oDesc or input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2] - or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] or (self.nDim==5 and input:size(5) ~= self.iSize[5]) then - self.iSize = input:size() - - assert(self.nInputPlane == input:size(2), 'input has to contain: ' - .. self.nInputPlane - .. ' feature maps, but received input of size: ' - .. input:size(1) .. ' x ' .. input:size(2) .. - ' x ' .. input:size(3) .. ' x ' .. 
input:size(4)) - return true +function SpatialConvolution:createIODescriptors(input) + local batch = true + if input:dim() == 3 then + input = input:view(1, input:size(1), input:size(2), input:size(3)) + batch = false end - return false -end + assert(input:dim() == 4 and input:isContiguous()); + self.iSize = self.iSize or torch.LongStorage(4):fill(0) + if not self.iDesc or not self.oDesc or + input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2] + or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] then + self.iSize = input:size() + + assert(self.nInputPlane == input:size(2), 'input has to contain: ' + .. self.nInputPlane + .. ' feature maps, but received input of size: ' + .. input:size(1) .. ' x ' .. input:size(2) .. + ' x ' .. input:size(3) .. ' x ' .. input:size(4)) -function SpatialConvolution:createIODescriptors(input) - local batch = true - if input:dim() == 3 then - input = input:view(1, input:size(1), input:size(2), input:size(3)) - batch = false - end - if SpatialConvolution.checkInputChanged(self, input) then -- create input descriptor local input_slice = input:narrow(2,1,self.nInputPlane/self.groups) self.iDesc = cudnn.toDescriptor(input_slice) + -- create conv descriptor - self.convDesc = cudnn.createDescriptors(1, 'struct cudnnConvolutionStruct*[?]', - 'cudnnCreateConvolutionDescriptor', 'cudnnDestroyConvolutionDescriptor') + self.convDesc = ffi.new('struct cudnnConvolutionStruct*[1]') + errcheck('cudnnCreateConvolutionDescriptor', self.convDesc) self.padH, self.padW = self.padH or 0, self.padW or 0 local pad = torch.IntTensor({self.padH, self.padW}) local stride = torch.IntTensor({self.dH, self.dW}) @@ -127,24 +124,207 @@ function SpatialConvolution:createIODescriptors(input) 2, pad:data(), stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION', cudnn.configmap(torch.type(self.weight))); - + local function destroyConvDesc(d) + errcheck('cudnnDestroyConvolutionDescriptor', d[0]); + end + ffi.gc(self.convDesc, 
destroyConvDesc) -- get output shape, resize output - local oSize = torch.IntTensor(self.nDim) + local oSize = torch.IntTensor(4) local oSizeD = oSize:data() errcheck('cudnnGetConvolutionNdForwardOutputDim', self.convDesc[0], self.iDesc[0], - self.weightDesc[0], self.nDim, oSizeD) + self.weightDesc[0], 4, oSizeD) oSize[2] = oSize[2] * self.groups self.output:resize(oSize:long():storage()) - local output_slice = self.output:narrow(2,1,self.nOutputPlane/self.groups) -- create descriptor for output + local output_slice = self.output:narrow(2,1,self.nOutputPlane/self.groups) self.oDesc = cudnn.toDescriptor(output_slice) self.oDescForBias = cudnn.toDescriptor(self.output) - algo.prepareHash(self, input_slice, output_slice) + ----------------------------------------------------------------------- + local function shape(x) + local sz = x:size() + local str = '' + for i=1,sz:size() do + str = str .. sz[i] .. 'x' + end + if #str > 0 then + str = str:sub(1, #str-1) + end + return str + end + local autotunerHash = shape(self.weight) .. ';' + .. shape(input_slice) .. ';' + .. shape(output_slice) + + local maxBufSize = 0 + + -- create forwardAlgorithm descriptors + local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1) + local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT' + local algWorkspaceLimit = self.workspace_limit + or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. 
+ + if self.fastest_mode or cudnn.fastest == true then + algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST' + end + + if cudnn.benchmark then -- the manual auto-tuner is run + if autotunerCache[1][autotunerHash] then + algType[0] = autotunerCache[1][autotunerHash] + if cudnn.verbose then + print('Autotuning SC FW: using cached algo = ', algType[0], ' for: ', autotunerHash) + end + else + local perfResults = ffi.new("cudnnConvolutionFwdAlgoPerf_t[?]", 1) + local intt = torch.IntTensor(1); + errcheck('cudnnFindConvolutionForwardAlgorithm', + cudnn.getHandle(), + self.iDesc[0], self.weightDesc[0], + self.convDesc[0], self.oDesc[0], + 1, intt:data(), perfResults) + algType[0] = perfResults[0].algo + autotunerCache[1][autotunerHash] = perfResults[0].algo + if cudnn.verbose then + print(string.format( + "\nAutotuning SC Forward: Time: %3.5f Memory: %8d Algorithm: %d" + .. " Weight: %15s Input: %15s Output: %15s", + perfResults[0].time, tonumber(perfResults[0].memory), + tonumber(perfResults[0].algo), + shape(self.weight), shape(input_slice), + shape(output_slice))) + end + end + else + errcheck('cudnnGetConvolutionForwardAlgorithm', + cudnn.getHandle(), + self.iDesc[0], self.weightDesc[0], + self.convDesc[0], self.oDesc[0], + algSearchMode, algWorkspaceLimit, algType) + end + algType[0] = self.fmode or algType[0] + self.fwdAlgType = algType + local bufSize = torch.LongTensor(1) + errcheck('cudnnGetConvolutionForwardWorkspaceSize', + cudnn.getHandle(), + self.iDesc[0], self.weightDesc[0], + self.convDesc[0], self.oDesc[0], + algType[0], bufSize:data()) + maxBufSize = math.max(maxBufSize, bufSize[1]) + + -- create backwardFilterAlgorithm descriptors + local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1) + local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE' + local algWorkspaceLimit = self.workspace_limit + or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. 
+ if self.fastest_mode or cudnn.fastest == true then + algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST' + end + + if cudnn.benchmark then -- the manual auto-tuner is run + if autotunerCache[2][autotunerHash] then + algType[0] = autotunerCache[2][autotunerHash] + if cudnn.verbose then + print('Autotuning SC BW: using cached algo = ', algType[0], ' for: ', autotunerHash) + end + else + local perfResults = ffi.new("cudnnConvolutionBwdFilterAlgoPerf_t[?]", 1) + local intt = torch.IntTensor(1); + errcheck('cudnnFindConvolutionBackwardFilterAlgorithm', + cudnn.getHandle(), + self.iDesc[0], self.oDesc[0], + self.convDesc[0], self.weightDesc[0], + 1, intt:data(), perfResults) + algType[0] = perfResults[0].algo + autotunerCache[2][autotunerHash] = perfResults[0].algo + if cudnn.verbose then + print(string.format( + "Autotuning backwardFilter: Time: %3.5f Memory: %8d Algorithm: %d" + .. " Weight: %15s Input: %15s Output: %15s", + perfResults[0].time, tonumber(perfResults[0].memory), + tonumber(perfResults[0].algo), + shape(self.weight), shape(input_slice), + shape(output_slice))) + end + end + else + errcheck('cudnnGetConvolutionBackwardFilterAlgorithm', + cudnn.getHandle(), + self.iDesc[0], self.oDesc[0], + self.convDesc[0], self.weightDesc[0], + algSearchMode, algWorkspaceLimit, algType) + end + algType[0] = self.bwmode or algType[0] + self.bwdFilterAlgType = algType + local bufSize = torch.LongTensor(1) + errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize', + cudnn.getHandle(), + self.iDesc[0], self.oDesc[0], + self.convDesc[0], self.weightDesc[0], + algType[0], bufSize:data()) + maxBufSize = math.max(maxBufSize, bufSize[1]) + + -- create backwardDataAlgorithm descriptors + local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1) + local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE' + local algWorkspaceLimit = self.workspace_limit + or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. 
+ if self.fastest_mode or cudnn.fastest == true then + algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST' + end + if cudnn.benchmark then -- the manual auto-tuner is run + if autotunerCache[3][autotunerHash] then + algType[0] = autotunerCache[3][autotunerHash] + if cudnn.verbose then + print('Autotuning SC BWD: using cached algo = ', algType[0], ' for: ', autotunerHash) + end + else + local perfResults = ffi.new("cudnnConvolutionBwdDataAlgoPerf_t[?]", 1) + local intt = torch.IntTensor(1); + errcheck('cudnnFindConvolutionBackwardDataAlgorithm', + cudnn.getHandle(), + self.weightDesc[0], self.oDesc[0], + self.convDesc[0], self.iDesc[0], + 1, intt:data(), perfResults) + algType[0] = perfResults[0].algo + autotunerCache[3][autotunerHash] = perfResults[0].algo + if cudnn.verbose then + print(string.format( + "Autotuning backwardData: Time: %3.5f Memory: %8d Algorithm: %d" + .. " Weight: %15s Input: %15s Output: %15s\n", + perfResults[0].time, tonumber(perfResults[0].memory), + tonumber(perfResults[0].algo), + shape(self.weight), shape(input_slice), + shape(output_slice))) + end + end + else + errcheck('cudnnGetConvolutionBackwardDataAlgorithm', + cudnn.getHandle(), + self.weightDesc[0], self.oDesc[0], + self.convDesc[0], self.iDesc[0], + algSearchMode, algWorkspaceLimit, algType) + end + algType[0] = self.bdmode or algType[0] + self.bwdDataAlgType = algType + local bufSize = torch.LongTensor(1) + errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize', + cudnn.getHandle(), + self.weightDesc[0], self.oDesc[0], + self.convDesc[0], self.iDesc[0], + algType[0], bufSize:data()) + maxBufSize = math.max(maxBufSize, bufSize[1]) + + self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace() + self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float + if maxBufSize > self.extraBufferSizeInBytes then + self.extraBuffer:resize(math.ceil(maxBufSize/4)) + self.extraBufferSizeInBytes = maxBufSize + end + 
----------------------------------------------------------------------- -- create offsets for groups local iH, iW = input:size(3), input:size(4) local kH, kW = self.kH, self.kW @@ -158,14 +338,13 @@ function SpatialConvolution:createIODescriptors(input) self.output:size(3), self.output:size(4)) end - - end + end end local one = torch.FloatTensor({1}); local zero = torch.FloatTensor({0}); -function SpatialConvolution:makeContiguous(input, gradOutput) +local function makeContiguous(self, input, gradOutput) if not input:isContiguous() then self._input = self._input or input.new() self._input:typeAs(input):resizeAs(input):copy(input) @@ -180,18 +359,17 @@ function SpatialConvolution:makeContiguous(input, gradOutput) end function SpatialConvolution:updateOutput(input) - input = SpatialConvolution.makeContiguous(self, input) + if not self.weightDesc then self:resetWeightDescriptors() end + input = makeContiguous(self, input) self:createIODescriptors(input) - if not self.fwdAlgType then - algo.setupForwardAlgorithm(self) - end + for g = 0, self.groups - 1 do errcheck('cudnnConvolutionForward', cudnn.getHandle(), one:data(), self.iDesc[0], input:data() + g*self.input_offset, self.weightDesc[0], self.weight:data() + g*self.weight_offset, - self.convDesc[0], self.fwdAlgType, - self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(), + self.convDesc[0], self.fwdAlgType[0], + self.extraBuffer:data(), self.extraBufferSizeInBytes, zero:data(), self.oDesc[0], self.output:data() + g*self.output_offset); end @@ -209,12 +387,11 @@ end function SpatialConvolution:updateGradInput(input, gradOutput) if not self.gradInput then return end self.gradInput:resizeAs(input) - input, gradOutput = SpatialConvolution.makeContiguous(self, input, gradOutput) - assert(gradOutput:dim() == self.nDim-1 or gradOutput:dim() == self.nDim, 'gradOutput has to be nDim or nDim-1'); + + input, gradOutput = makeContiguous(self, input, gradOutput) + assert(gradOutput:dim() == 3 or 
gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D'); + if not self.weightDesc then self:resetWeightDescriptors() end self:createIODescriptors(input) - if not self.bwdDataAlgType then - algo.setupBackwardDataAlgorithm(self) - end for g = 0,self.groups - 1 do errcheck('cudnnConvolutionBackwardData', cudnn.getHandle(), @@ -222,8 +399,8 @@ function SpatialConvolution:updateGradInput(input, gradOutput) self.weightDesc[0], self.weight:data() + g*self.weight_offset, self.oDesc[0], gradOutput:data() + g*self.output_offset, self.convDesc[0], - self.bwdDataAlgType, - self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(), + self.bwdDataAlgType[0], + self.extraBuffer:data(), self.extraBufferSizeInBytes, zero:data(), self.iDesc[0], self.gradInput:data() + g*self.input_offset); end @@ -236,13 +413,12 @@ function SpatialConvolution:accGradParameters(input, gradOutput, scale) self.scaleT = self.scaleT:float() scale = scale or 1.0 self.scaleT[1] = scale - input, gradOutput = SpatialConvolution.makeContiguous(self, input, gradOutput) - assert(gradOutput:dim() == self.nDim-1 or gradOutput:dim() == self.nDim, 'gradOutput has to be nDim or nDim-1'); - self:createIODescriptors(input) - if not self.bwdFilterAlgType then - algo.setupBackwardFilterAlgorithm(self) - end + input, gradOutput = makeContiguous(self, input, gradOutput) + + assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D'); + if not self.weightDesc then self:resetWeightDescriptors() end + self:createIODescriptors(input) -- gradBias if self.bias then @@ -260,8 +436,8 @@ function SpatialConvolution:accGradParameters(input, gradOutput, scale) self.iDesc[0], input:data() + g*self.input_offset, self.oDesc[0], gradOutput:data() + g*self.output_offset, self.convDesc[0], - self.bwdFilterAlgType, - self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(), + self.bwdFilterAlgType[0], + self.extraBuffer:data(), self.extraBufferSizeInBytes, 
one:data(), self.weightDesc[0], self.gradWeight:data() + g*self.weight_offset); end @@ -279,6 +455,7 @@ function SpatialConvolution:clearDesc() self.bwdDataAlgType = nil self.bwdFilterAlgType = nil self.extraBuffer = nil + self.extraBufferSizeInBytes = nil self.scaleT = nil end @@ -293,8 +470,6 @@ end function SpatialConvolution:clearState() self:clearDesc() - nn.utils.clear(self, 'extraBuffer', '_input', '_gradOutput') + nn.utils.clear(self, '_input', '_gradOutput') return nn.Module.clearState(self) end - -return SpatialConvolution diff --git a/SpatialFullConvolution.lua b/SpatialFullConvolution.lua index 1cdfb33..ff91a13 100644 --- a/SpatialFullConvolution.lua +++ b/SpatialFullConvolution.lua @@ -2,52 +2,98 @@ local SpatialFullConvolution, parent = torch.class('cudnn.SpatialFullConvolution', 'nn.SpatialFullConvolution') local ffi = require 'ffi' local errcheck = cudnn.errcheck -local algo = require 'cudnn.algo' local autotunerCache = {} autotunerCache[1] = {} -- forward autotunerCache[2] = {} -- backwardFilter autotunerCache[3] = {} -- backwardData -local Convolution = cudnn.SpatialConvolution -SpatialFullConvolution.nDim = 4 - -- if you change the configuration of the module manually, call this function SpatialFullConvolution:resetWeightDescriptors() - self.weightDesc = Convolution.createWeightDescriptors(self) + assert(cudnn.typemap[torch.typename(self.weight)], 'Only Cuda supported duh!') + assert(cudnn.typemap[torch.typename(self.bias)] or not self.bias, 'Only Cuda supported duh!') + -- create filterDescriptor for weight + self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]') + errcheck('cudnnCreateFilterDescriptor', self.weightDesc) local desc = torch.IntTensor({self.nInputPlane, self.nOutputPlane, self.kH, self.kW}) errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0], cudnn.typemap[torch.typename(self.weight)], 'CUDNN_TENSOR_NCHW', 4, desc:data()); + local function destroyWDesc(d) + errcheck('cudnnDestroyFilterDescriptor', d[0]); + end + 
ffi.gc(self.weightDesc, destroyWDesc) + + -- create descriptor for bias + if self.bias then + self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,1,1)) + end end function SpatialFullConvolution:fastest(mode) - return Convolution.fastest(self) + if mode == nil then mode = true end + self.fastest_mode = mode + self.iSize = self.iSize or torch.LongStorage(4) + self.iSize:fill(0) + return self end function SpatialFullConvolution:setMode(fmode, bdmode, bwmode) - return Convolution.setMode(self, fmode, bdmode, bwmode) + if fmode ~= nil then + self.fmode = fmode + end + if bdmode ~= nil then + self.bdmode = bdmode + end + if bwmode ~= nil then + self.bwmode = bwmode + end + self.iSize = self.iSize or torch.LongStorage(4) + self.iSize:fill(0) + return self end function SpatialFullConvolution:resetMode() - return Convolution.resetMode(self) + self.fmode = nil + self.bdmode = nil + self.bwmode = nil + return self end function SpatialFullConvolution:noBias() - return Convolution.noBias(self) + self.bias = nil + self.gradBias = nil + return self end function SpatialFullConvolution:createIODescriptors(input) - if Convolution.checkInputChanged(self, input) then + local batch = true + if input:dim() == 3 then + input = input:view(1, input:size(1), input:size(2), input:size(3)) + batch = false + end + assert(input:dim() == 4 and input:isContiguous()); + self.iSize = self.iSize or torch.LongStorage(4):fill(0) + if not self.iDesc or not self.oDesc or + input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2] + or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] then + self.iSize = input:size() + + assert(self.nInputPlane == input:size(2), 'input has to contain: ' + .. self.nInputPlane + .. ' feature maps, but received input of size: ' + .. input:size(1) .. ' x ' .. input:size(2) .. + ' x ' .. input:size(3) .. ' x ' .. 
input:size(4)) + -- create input descriptor - local input_slice = input[{{},{1,self.nInputPlane},{},{}}] - self.iDesc = cudnn.toDescriptor(input_slice) + local input_slice = {{},{1,self.nInputPlane},{},{}} + self.iDesc = cudnn.toDescriptor(input[input_slice]) -- create conv descriptor - self.convDesc = cudnn.createDescriptors(1, 'struct cudnnConvolutionStruct*[?]', - 'cudnnCreateConvolutionDescriptor', 'cudnnDestroyConvolutionDescriptor') + self.convDesc = ffi.new('struct cudnnConvolutionStruct*[1]') + errcheck('cudnnCreateConvolutionDescriptor', self.convDesc) local pad = torch.IntTensor({self.padH, self.padW}) local stride = torch.IntTensor({self.dH, self.dW}) local upscale = torch.IntTensor({1,1}) @@ -55,6 +101,10 @@ function SpatialFullConvolution:createIODescriptors(input) 2, pad:data(), stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION', cudnn.configmap(torch.type(self.weight))); + local function destroyConvDesc(d) + errcheck('cudnnDestroyConvolutionDescriptor', d[0]); + end + ffi.gc(self.convDesc, destroyConvDesc) -- get output shape, resize output local iwidth = input:size(4) @@ -65,12 +115,190 @@ function SpatialFullConvolution:createIODescriptors(input) self.output:resize(oSize:long():storage()) -- create descriptor for output - local output_slice = self.output[{{},{1,self.nOutputPlane},{},{}}] - self.oDesc = cudnn.toDescriptor(output_slice) + local output_slice = {{},{1,self.nOutputPlane},{},{}} + self.oDesc = cudnn.toDescriptor(self.output[output_slice]) self.oDescForBias = cudnn.toDescriptor(self.output) - algo.prepareHash(self, input_slice, output_slice) + ----------------------------------------------------------------------- + local function shape(x) + local sz = x:size() + local str = '' + for i=1,sz:size() do + str = str .. sz[i] .. 'x' + end + if #str > 0 then + str = str:sub(1, #str-1) + end + return str + end + local autotunerHash = shape(self.weight) .. ';' + .. shape(input[input_slice]) .. ';' + .. 
shape(self.output[output_slice]) + local maxBufSize = 0 + + -- create forwardAlgorithm descriptors + local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1) + local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT' + local algWorkspaceLimit = self.workspace_limit + or (self.nOutputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. + + if self.fastest_mode or cudnn.fastest == true then + algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST' + end + + if cudnn.benchmark then -- the manual auto-tuner is run + if autotunerCache[1][autotunerHash] then + algType[0] = autotunerCache[1][autotunerHash] + if cudnn.verbose then + print('Autotuning SFC: using cached algo = ', algType[0], ' for: ', autotunerHash) + end + else + local perfResults = ffi.new("cudnnConvolutionFwdAlgoPerf_t[?]", 1) + local intt = torch.IntTensor(1); + errcheck('cudnnFindConvolutionForwardAlgorithm', + cudnn.getHandle(), + self.oDesc[0], self.weightDesc[0], + self.convDesc[0], self.iDesc[0], + 1, intt:data(), perfResults) + algType[0] = perfResults[0].algo + autotunerCache[1][autotunerHash] = perfResults[0].algo + if cudnn.verbose then + print(string.format( + "Autotuning Forward: Time: %3.5f Memory: %8d Algorithm: %d" + .. 
" Weight: %15s Input: %15s Output: %15s", + perfResults[0].time, tonumber(perfResults[0].memory), + tonumber(perfResults[0].algo), + shape(self.weight), shape(input[input_slice]), + shape(self.output[output_slice]))) + end + end + else + errcheck('cudnnGetConvolutionForwardAlgorithm', + cudnn.getHandle(), + self.oDesc[0], self.weightDesc[0], + self.convDesc[0], self.iDesc[0], + algSearchMode, algWorkspaceLimit, algType) + end + algType[0] = self.fmode or algType[0] + self.fwdAlgType = algType + local bufSize = torch.LongTensor(1) + errcheck('cudnnGetConvolutionForwardWorkspaceSize', + cudnn.getHandle(), + self.oDesc[0], self.weightDesc[0], + self.convDesc[0], self.iDesc[0], + algType[0], bufSize:data()) + maxBufSize = math.max(maxBufSize, bufSize[1]) + + -- create backwardFilterAlgorithm descriptors + local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1) + local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE' + local algWorkspaceLimit = self.workspace_limit + or (self.nOutputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. + if self.fastest_mode or cudnn.fastest == true then + algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST' + end + + if cudnn.benchmark then -- the manual auto-tuner is run + if autotunerCache[2][autotunerHash] then + algType[0] = autotunerCache[2][autotunerHash] + else + local perfResults = ffi.new("cudnnConvolutionBwdFilterAlgoPerf_t[?]", 1) + local intt = torch.IntTensor(1); + errcheck('cudnnFindConvolutionBackwardFilterAlgorithm', + cudnn.getHandle(), + self.oDesc[0], self.iDesc[0], + self.convDesc[0], self.weightDesc[0], + 1, intt:data(), perfResults) + algType[0] = perfResults[0].algo + autotunerCache[2][autotunerHash] = perfResults[0].algo + if cudnn.verbose then + print(string.format( + "Autotuning backwardFilter: Time: %3.5f Memory: %8d Algorithm: %d" + .. 
" Weight: %15s Input: %15s Output: %15s", + perfResults[0].time, tonumber(perfResults[0].memory), + tonumber(perfResults[0].algo), + shape(self.weight), shape(input[input_slice]), + shape(self.output[output_slice]))) + end + end + else + errcheck('cudnnGetConvolutionBackwardFilterAlgorithm', + cudnn.getHandle(), + self.oDesc[0], self.iDesc[0], + self.convDesc[0], self.weightDesc[0], + algSearchMode, algWorkspaceLimit, algType) + end + algType[0] = self.bwmode or algType[0] + self.bwdFilterAlgType = algType + local bufSize = torch.LongTensor(1) + errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize', + cudnn.getHandle(), + self.oDesc[0], self.iDesc[0], + self.convDesc[0], self.weightDesc[0], + algType[0], bufSize:data()) + maxBufSize = math.max(maxBufSize, bufSize[1]) + + -- create backwardDataAlgorithm descriptors + local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1) + local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE' + local algWorkspaceLimit = self.workspace_limit + or (self.nOutputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. + if self.fastest_mode or cudnn.fastest == true then + algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST' + end + if cudnn.benchmark then -- the manual auto-tuner is run + if autotunerCache[3][autotunerHash] then + algType[0] = autotunerCache[3][autotunerHash] + else + local perfResults = ffi.new("cudnnConvolutionBwdDataAlgoPerf_t[?]", 1) + local intt = torch.IntTensor(1); + errcheck('cudnnFindConvolutionBackwardDataAlgorithm', + cudnn.getHandle(), + self.weightDesc[0], self.iDesc[0], + self.convDesc[0], self.oDesc[0], + 1, intt:data(), perfResults) + algType[0] = perfResults[0].algo + autotunerCache[3][autotunerHash] = perfResults[0].algo + if cudnn.verbose then + print(string.format( + "Autotuning backwardData: Time: %3.5f Memory: %8d Algorithm: %d" + .. 
" Weight: %15s Input: %15s Output: %15s\n", + perfResults[0].time, tonumber(perfResults[0].memory), + tonumber(perfResults[0].algo), + shape(self.weight), shape(input[input_slice]), + shape(self.output[output_slice]))) + end + end + else + errcheck('cudnnGetConvolutionBackwardDataAlgorithm', + cudnn.getHandle(), + self.weightDesc[0], self.iDesc[0], + self.convDesc[0], self.oDesc[0], + algSearchMode, algWorkspaceLimit, algType) + end + algType[0] = self.bdmode or algType[0] + self.bwdDataAlgType = algType + local bufSize = torch.LongTensor(1) + errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize', + cudnn.getHandle(), + self.weightDesc[0], self.iDesc[0], + self.convDesc[0], self.oDesc[0], + algType[0], bufSize:data()) + maxBufSize = math.max(maxBufSize, bufSize[1]) + + self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace() + self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float + if maxBufSize > self.extraBufferSizeInBytes then + self.extraBuffer:resize(math.ceil(maxBufSize/4)) + self.extraBufferSizeInBytes = maxBufSize + end + + if not batch then + self.output = self.output:view(self.output:size(2), + self.output:size(3), + self.output:size(4)) + end end end @@ -78,11 +306,8 @@ local one = torch.FloatTensor({1}); local zero = torch.FloatTensor({0}); function SpatialFullConvolution:updateOutput(input) + if not self.weightDesc then self:resetWeightDescriptors() end self:createIODescriptors(input) - if not self.bwdDataAlgType then - algo.setupBackwardDataAlgorithm(self, {self.weightDesc[0], self.iDesc[0], - self.convDesc[0], self.oDesc[0]}) - end -- Because SpatialFullConvolution is performing the adjoint of the forward -- convolution operator, we need to swap the forward and backward passes. 
@@ -90,8 +315,8 @@ function SpatialFullConvolution:updateOutput(input) one:data(), self.weightDesc[0], self.weight:data(), self.iDesc[0], input:data(), - self.convDesc[0], self.bwdDataAlgType, - self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(), + self.convDesc[0], self.bwdDataAlgType[0], + self.extraBuffer:data(), self.extraBufferSizeInBytes, zero:data(), self.oDesc[0], self.output:data()) @@ -111,19 +336,16 @@ function SpatialFullConvolution:updateGradInput(input, gradOutput) assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D'); assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous') + if not self.weightDesc then self:resetWeightDescriptors() end self:createIODescriptors(input) - if not self.fwdDataAlgType then - algo.setupForwardAlgorithm(self, {self.oDesc[0], self.weightDesc[0], - self.convDesc[0], self.iDesc[0]}) - end errcheck('cudnnConvolutionForward', cudnn.getHandle(), one:data(), self.oDesc[0], gradOutput:data(), self.weightDesc[0], self.weight:data(), self.convDesc[0], - self.fwdAlgType, - self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(), + self.fwdAlgType[0], + self.extraBuffer:data(), self.extraBufferSizeInBytes, zero:data(), self.iDesc[0], self.gradInput:data()); return self.gradInput @@ -138,11 +360,8 @@ function SpatialFullConvolution:accGradParameters(input, gradOutput, scale) assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D'); assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous') + if not self.weightDesc then self:resetWeightDescriptors() end self:createIODescriptors(input) - if not self.bwdFilterAlgType then - algo.setupBackwardFilterAlgorithm(self, {self.oDesc[0], self.iDesc[0], - self.convDesc[0], self.weightDesc[0]}) - end -- gradBias if self.bias then @@ -159,8 +378,8 @@ function SpatialFullConvolution:accGradParameters(input, gradOutput, scale) self.oDesc[0], 
gradOutput:data(), self.iDesc[0], input:data(), self.convDesc[0], - self.bwdFilterAlgType, - self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(), + self.bwdFilterAlgType[0], + self.extraBuffer:data(), self.extraBufferSizeInBytes, one:data(), self.weightDesc[0], self.gradWeight:data()) end @@ -177,6 +396,7 @@ function SpatialFullConvolution:clearDesc() self.bwdDataAlgType = nil self.bwdFilterAlgType = nil self.extraBuffer = nil + self.extraBufferSizeInBytes = nil end function SpatialFullConvolution:write(f) diff --git a/TemporalConvolution.lua b/TemporalConvolution.lua index 947cc4f..4648ffd 100644 --- a/TemporalConvolution.lua +++ b/TemporalConvolution.lua @@ -6,8 +6,6 @@ local TemporalConvolution, parent = --it is recommended to pass padding parameter to this routine and use cudnn implicit padding facilities. --limitation is that padding will be equal on both sides. -local Convolution = cudnn.SpatialConvolution - function TemporalConvolution:__init(inputFrameSize, outputFrameSize, kH, dH, padH) local delayedReset = self.reset @@ -16,8 +14,7 @@ function TemporalConvolution:__init(inputFrameSize, outputFrameSize, local nOutputPlane = outputFrameSize self.inputFrameSize = inputFrameSize self.outputFrameSize = outputFrameSize - self.nDim = 4 - Convolution.__init(self, nInputPlane, nOutputPlane, kW, kH, 1, dH,0,padH) + cudnn.SpatialConvolution.__init(self, nInputPlane, nOutputPlane, kW, kH, 1, dH,0,padH) self.weight = self.weight:view(nOutputPlane,inputFrameSize*kH) self.gradWeight = self.gradWeight:view(outputFrameSize, inputFrameSize*kH) --self.dW and self.kW now have different meaning than in nn.TemporalConvolution, because @@ -31,24 +28,24 @@ function TemporalConvolution:createIODescriptors(input) or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] then sizeChanged = true end - Convolution.createIODescriptors(self,input) + cudnn.SpatialConvolution.createIODescriptors(self,input) if sizeChanged then self.oSize = 
self.output:size() end end function TemporalConvolution:fastest(mode) - self = Convolution.fastest(self,mode) + self = cudnn.SpatialConvolution.fastest(self,mode) return self end function TemporalConvolution:setMode(fmode, bdmode, bwmode) - self = Convolution.setMode(self,fmode, bdmode, bwmode) + self = cudnn.SpatialConvolution.setMode(self,fmode, bdmode, bwmode) return self end function TemporalConvolution:resetWeightDescriptors() - Convolution.resetWeightDescriptors(self) + cudnn.SpatialConvolution.resetWeightDescriptors(self) end local function inputview(input) @@ -66,7 +63,7 @@ function TemporalConvolution:updateOutput(input) self._output = self._output or input.new() if self.output:storage() then self._output:set(self.output:storage()) else self._output = self.output end if self.buffer:storage() then self.output:set(self.buffer:storage(), 1, self.output:size()) else self.output = self.buffer end - Convolution.updateOutput(self,_input) + cudnn.SpatialConvolution.updateOutput(self,_input) self.buffer = self.output:view(self.oSize):transpose(2,3) self.output = self._output:resize(self.buffer:size()):copy(self.buffer) -- self.output here is always 4D, use input dimensions to properly view output @@ -95,7 +92,7 @@ function TemporalConvolution:updateGradInput(input, gradOutput) if not self.gradInput then return end local _gradOutput = transposeGradOutput(gradOutput,self.buffer) local _input = inputview(input) - self.gradInput = Convolution.updateGradInput(self,_input, _gradOutput) + self.gradInput = cudnn.SpatialConvolution.updateGradInput(self,_input, _gradOutput) if input:dim()==3 then self.gradInput = self.gradInput:view(self.iSize[1],self.iSize[3],self.iSize[4]) else @@ -109,7 +106,7 @@ function TemporalConvolution:accGradParameters(input,gradOutput,scale) local _input = inputview(input) -- transpose gradOutput (it will likely be transposed twice, hopefully, no big deal local _gradOutput = transposeGradOutput(gradOutput,self.buffer) - 
Convolution.accGradParameters(self,_input,_gradOutput,scale) + cudnn.SpatialConvolution.accGradParameters(self,_input,_gradOutput,scale) end function TemporalConvolution:clearDesc() @@ -120,7 +117,7 @@ end function TemporalConvolution:write(f) self:clearDesc() - Convolution.clearDesc(self) + cudnn.SpatialConvolution.clearDesc(self) local var = {} for k,v in pairs(self) do var[k] = v diff --git a/VolumetricConvolution.lua b/VolumetricConvolution.lua index 73fd9ce..b255467 100644 --- a/VolumetricConvolution.lua +++ b/VolumetricConvolution.lua @@ -2,49 +2,83 @@ local VolumetricConvolution, parent = torch.class('cudnn.VolumetricConvolution', 'nn.VolumetricConvolution') local ffi = require 'ffi' local errcheck = cudnn.errcheck -local algo = require 'cudnn.algo' -local Convolution = cudnn.SpatialConvolution +local autotunerCache = {} +autotunerCache[1] = {} -- forward +autotunerCache[2] = {} -- backwardFilter +autotunerCache[3] = {} -- backwardData -function VolumetricConvolution:__init(nInputPlane, nOutputPlane, - kT, kW, kH, dW, dH, padW, padH) - self.nDim = 5 - self.kT = kT - Convolution.__init(self,nInputPlane, nOutputPlane, - kW, kH, dW, dH, padW, padH, 1) - return self -end -- if you change the configuration of the module manually, call this function VolumetricConvolution:resetWeightDescriptors() + assert(cudnn.typemap[torch.typename(self.weight)], 'Only Cuda supported duh!') + assert(cudnn.typemap[torch.typename(self.bias)] or not self.bias, 'Only Cuda supported duh!') + -- create filterDescriptor for weight + self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]') + errcheck('cudnnCreateFilterDescriptor', self.weightDesc) local desc = torch.IntTensor({self.nOutputPlane, self.nInputPlane, self.kT, self.kH, self.kW}) - Convolution.resetWeightDescriptors(self, desc) + errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0], + cudnn.typemap[torch.typename(self.weight)], 'CUDNN_TENSOR_NCHW', 5, + desc:data()); + local function destroyWDesc(d) + 
errcheck('cudnnDestroyFilterDescriptor', d[0]); + end + ffi.gc(self.weightDesc, destroyWDesc) + + -- create descriptor for bias + self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane, + 1, 1)) end function VolumetricConvolution:fastest(mode) - return Convolution.fastest(self) + if mode == nil then mode = true end + self.fastest_mode = mode + self.iSize = self.iSize or torch.LongStorage(4) + self.iSize:fill(0) + return self end function VolumetricConvolution:setMode(fmode, bdmode, bwmode) - return Convolution.setMode(self, fmode, bdmode, bwmode) + if fmode ~= nil then + self.fmode = fmode + end + if bdmode ~= nil then + self.bdmode = bdmode + end + if bwmode ~= nil then + self.bwmode = bwmode + end + self.iSize = self.iSize or torch.LongStorage(4) + self.iSize:fill(0) + return self end function VolumetricConvolution:resetMode() - return Convolution.resetMode(self) + self.fmode = nil + self.bdmode = nil + self.bwmode = nil + return self end function VolumetricConvolution:createIODescriptors(input) + local batch = true if input:dim() == 4 then input = input:view(1, input:size(1), input:size(2), input:size(3), input:size(4)) batch = false end - if Convolution.checkInputChanged(self, input) then + assert(input:dim() == 5 and input:isContiguous()); + self.iSize = self.iSize or torch.LongStorage(4):fill(0) + if not self.iDesc or not self.oDesc or + input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2] + or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] + or input:size(5) ~= self.iSize[5] then + self.iSize = input:size() -- create input descriptor self.iDesc = cudnn.toDescriptor(input) -- create conv descriptor - self.convDesc = cudnn.createDescriptors(1, 'struct cudnnConvolutionStruct*[?]', - 'cudnnCreateConvolutionDescriptor', 'cudnnDestroyConvolutionDescriptor') + self.convDesc = ffi.new('struct cudnnConvolutionStruct*[1]') + errcheck('cudnnCreateConvolutionDescriptor', self.convDesc) local pad = torch.IntTensor({self.padT, 
self.padH, self.padW}) local stride = torch.IntTensor({self.dT, self.dH, self.dW}) local upscale = torch.IntTensor({1,1,1}) @@ -52,6 +86,11 @@ function VolumetricConvolution:createIODescriptors(input) 3, pad:data(), stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION', cudnn.configmap(torch.type(self.weight))); + local function destroyConvDesc(d) + errcheck('cudnnDestroyConvolutionDescriptor', d[0]); + end + ffi.gc(self.convDesc, destroyConvDesc) + -- create output descriptor and resize output local oSize = torch.IntTensor(5) local oSizeD = oSize:data() @@ -67,7 +106,181 @@ function VolumetricConvolution:createIODescriptors(input) self.output:size(3)*self.output:size(4), self.output:size(5))) - algo.prepareHash(self, input, output) + + + ----------------------------------------------------------------------- + local function shape(x) + return table.concat(x:size():totable(),'x') + end + local autotunerHash = shape(self.weight) .. ';' + .. shape(input) .. ';' + .. shape(self.output) + + local maxBufSize = 0 + + -- create forwardAlgorithm descriptors + local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1) + local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT' + local algWorkspaceLimit = self.workspace_limit + or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. 
+ + if self.fastest_mode or cudnn.fastest == true then + algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST' + end + + if cudnn.benchmark then -- the manual auto-tuner is run + if autotunerCache[1][autotunerHash] then + algType[0] = autotunerCache[1][autotunerHash] + if cudnn.verbose then + print('Autotuning VMC FW: using cached algo = ', algType[0], ' for: ', autotunerHash) + end + else + local perfResults = ffi.new("cudnnConvolutionFwdAlgoPerf_t[?]", 1) + local intt = torch.IntTensor(1); + errcheck('cudnnFindConvolutionForwardAlgorithm', + cudnn.getHandle(), + self.iDesc[0], self.weightDesc[0], + self.convDesc[0], self.oDesc[0], + 1, intt:data(), perfResults) + algType[0] = perfResults[0].algo + autotunerCache[1][autotunerHash] = perfResults[0].algo + if cudnn.verbose then + print(string.format( + "\nAutotuning VMC Forward: Time: %3.5f Memory: %8d Algorithm: %d" + .. " Weight: %15s Input: %15s Output: %15s", + perfResults[0].time, tonumber(perfResults[0].memory), + tonumber(perfResults[0].algo), + shape(self.weight), shape(input), + shape(self.output))) + end + end + else + errcheck('cudnnGetConvolutionForwardAlgorithm', + cudnn.getHandle(), + self.iDesc[0], self.weightDesc[0], + self.convDesc[0], self.oDesc[0], + algSearchMode, algWorkspaceLimit, algType) + end + algType[0] = self.fmode or algType[0] + self.fwdAlgType = algType + local bufSize = torch.LongTensor(1) + errcheck('cudnnGetConvolutionForwardWorkspaceSize', + cudnn.getHandle(), + self.iDesc[0], self.weightDesc[0], + self.convDesc[0], self.oDesc[0], + algType[0], bufSize:data()) + maxBufSize = math.max(maxBufSize, bufSize[1]) + + -- create backwardFilterAlgorithm descriptors + local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1) + local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE' + local algWorkspaceLimit = self.workspace_limit + or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. 
+ if self.fastest_mode or cudnn.fastest == true then + algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST' + end + + if cudnn.benchmark then -- the manual auto-tuner is run + if autotunerCache[2][autotunerHash] then + algType[0] = autotunerCache[2][autotunerHash] + if cudnn.verbose then + print('Autotuning VMC BWF: using cached algo = ', algType[0], ' for: ', autotunerHash) + end + else + local perfResults = ffi.new("cudnnConvolutionBwdFilterAlgoPerf_t[?]", 1) + local intt = torch.IntTensor(1); + errcheck('cudnnFindConvolutionBackwardFilterAlgorithm', + cudnn.getHandle(), + self.iDesc[0], self.oDesc[0], + self.convDesc[0], self.weightDesc[0], + 1, intt:data(), perfResults) + algType[0] = perfResults[0].algo + autotunerCache[2][autotunerHash] = perfResults[0].algo + if cudnn.verbose then + print(string.format( + "Autotuning backwardFilter: Time: %3.5f Memory: %8d Algorithm: %d" + .. " Weight: %15s Input: %15s Output: %15s", + perfResults[0].time, tonumber(perfResults[0].memory), + tonumber(perfResults[0].algo), + shape(self.weight), shape(input), + shape(self.output))) + end + end + else + errcheck('cudnnGetConvolutionBackwardFilterAlgorithm', + cudnn.getHandle(), + self.iDesc[0], self.oDesc[0], + self.convDesc[0], self.weightDesc[0], + algSearchMode, algWorkspaceLimit, algType) + end + algType[0] = self.bwmode or algType[0] + self.bwdFilterAlgType = algType + local bufSize = torch.LongTensor(1) + errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize', + cudnn.getHandle(), + self.iDesc[0], self.oDesc[0], + self.convDesc[0], self.weightDesc[0], + algType[0], bufSize:data()) + maxBufSize = math.max(maxBufSize, bufSize[1]) + + -- create backwardDataAlgorithm descriptors + local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1) + local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE' + local algWorkspaceLimit = self.workspace_limit + or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. 
+ if self.fastest_mode or cudnn.fastest == true then + algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST' + end + if cudnn.benchmark then -- the manual auto-tuner is run + if autotunerCache[3][autotunerHash] then + algType[0] = autotunerCache[3][autotunerHash] + if cudnn.verbose then + print('Autotuning VMC BWD: using cached algo = ', algType[0], ' for: ', autotunerHash) + end + else + local perfResults = ffi.new("cudnnConvolutionBwdDataAlgoPerf_t[?]", 1) + local intt = torch.IntTensor(1); + errcheck('cudnnFindConvolutionBackwardDataAlgorithm', + cudnn.getHandle(), + self.weightDesc[0], self.oDesc[0], + self.convDesc[0], self.iDesc[0], + 1, intt:data(), perfResults) + algType[0] = perfResults[0].algo + autotunerCache[3][autotunerHash] = perfResults[0].algo + if cudnn.verbose then + print(string.format( + "Autotuning backwardData: Time: %3.5f Memory: %8d Algorithm: %d" + .. " Weight: %15s Input: %15s Output: %15s\n", + perfResults[0].time, tonumber(perfResults[0].memory), + tonumber(perfResults[0].algo), + shape(self.weight), shape(input), + shape(self.output))) + end + end + else + errcheck('cudnnGetConvolutionBackwardDataAlgorithm', + cudnn.getHandle(), + self.weightDesc[0], self.oDesc[0], + self.convDesc[0], self.iDesc[0], + algSearchMode, algWorkspaceLimit, algType) + end + algType[0] = self.bdmode or algType[0] + self.bwdDataAlgType = algType + local bufSize = torch.LongTensor(1) + errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize', + cudnn.getHandle(), + self.weightDesc[0], self.oDesc[0], + self.convDesc[0], self.iDesc[0], + algType[0], bufSize:data()) + maxBufSize = math.max(maxBufSize, bufSize[1]) + + self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace() + self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float + if maxBufSize > self.extraBufferSizeInBytes then + self.extraBuffer:resize(math.ceil(maxBufSize/4)) + self.extraBufferSizeInBytes = maxBufSize + end + 
----------------------------------------------------------------------- if not batch then self.output = self.output:view(self.output:size(2), @@ -78,28 +291,119 @@ function VolumetricConvolution:createIODescriptors(input) end end +local one = torch.FloatTensor({1}); +local zero = torch.FloatTensor({0}); + +local function makeContiguous(self, input, gradOutput) + if not input:isContiguous() then + self._input = self._input or input.new() + self._input:typeAs(input):resizeAs(input):copy(input) + input = self._input + end + if gradOutput and not gradOutput:isContiguous() then + self._gradOutput = self._gradOutput or gradOutput.new() + self._gradOutput:typeAs(gradOutput):resizeAs(gradOutput):copy(gradOutput) + gradOutput = self._gradOutput + end + return input, gradOutput +end + function VolumetricConvolution:updateOutput(input) - return Convolution:updateOutput(input) + if not self.weightDesc then self:resetWeightDescriptors() end + input = makeContiguous(self, input) + self:createIODescriptors(input) + errcheck('cudnnConvolutionForward', cudnn.getHandle(), + one:data(), + self.iDesc[0], input:data(), + self.weightDesc[0], self.weight:data(), + self.convDesc[0], self.fwdAlgType[0], + self.extraBuffer:data(), self.extraBufferSizeInBytes, + zero:data(), + self.oDesc[0], self.output:data()); + errcheck('cudnnAddTensor', cudnn.getHandle(), + one:data(), + self.biasDesc[0], self.bias:data(), one:data(), + self.oDescBias[0], self.output:data()); + return self.output end function VolumetricConvolution:updateGradInput(input, gradOutput) - return Convolution:updateGradInput(input) + if not self.gradInput then return end + self.gradInput:resizeAs(input) + + input, gradOutput = makeContiguous(self, input, gradOutput) + assert(gradOutput:dim() == 4 or gradOutput:dim() == 5, + 'gradOutput has to be a 4D or 5D tensor'); + if not self.weightDesc then self:resetWeightDescriptors() end + self:createIODescriptors(input) + errcheck('cudnnConvolutionBackwardData', cudnn.getHandle(), + 
one:data(), + self.weightDesc[0], self.weight:data(), + self.oDesc[0], gradOutput:data(), + self.convDesc[0], + self.bwdDataAlgType[0], + self.extraBuffer:data(), self.extraBufferSizeInBytes, + zero:data(), + self.iDesc[0], self.gradInput:data()); + return self.gradInput end function VolumetricConvolution:accGradParameters(input, gradOutput, scale) - return Convolution:accGradParameters(input, gradOutput, scale) + self.scaleT = self.scaleT or torch.FloatTensor(1):fill(1.0) + -- this line forces this member to always be on CPU (needed for cudnn) + self.scaleT = self.scaleT:float() + + scale = scale or 1.0 + self.scaleT[1] = scale + input, gradOutput = makeContiguous(self, input, gradOutput) + assert(gradOutput:dim() == 4 or gradOutput:dim() == 5, + 'gradOutput has to be a 4D or 5D tensor'); + self:createIODescriptors(input) + if not self.weightDesc then self:resetWeightDescriptors() end + -- gradBias + errcheck('cudnnConvolutionBackwardBias', cudnn.getHandle(), + self.scaleT:data(), + self.oDescBias[0], gradOutput:data(), + one:data(), + self.biasDesc[0], self.gradBias:data()); + -- gradWeight + errcheck('cudnnConvolutionBackwardFilter', cudnn.getHandle(), + self.scaleT:data(), + self.iDesc[0], input:data(), + self.oDesc[0], gradOutput:data(), + self.convDesc[0], + self.bwdFilterAlgType[0], + self.extraBuffer:data(), self.extraBufferSizeInBytes, + one:data(), + self.weightDesc[0], self.gradWeight:data()); end function VolumetricConvolution:clearDesc() - Convolution:clearDesc() + self.weightDesc = nil + self.biasDesc = nil + self.convDesc = nil + self.iDesc = nil + self.oDesc = nil + self.oDescBias = nil + self.fwdAlgType = nil + self.bwdDataAlgType = nil + self.bwdFilterAlgType = nil + self.extraBuffer = nil + self.extraBufferSizeInBytes = nil + self.scaleT = nil end function VolumetricConvolution:write(f) - Convolution:write(f) + self:clearDesc() + local var = {} + for k,v in pairs(self) do + var[k] = v + end + f:writeObject(var) end function 
VolumetricConvolution:clearState() - return Convolution:clearState() + self:clearDesc() + nn.utils.clear(self, 'extraBuffer', '_input', '_gradOutput') + return nn.Module.clearState(self) end - -return VolumetricConvolution diff --git a/algo.lua b/algo.lua deleted file mode 100644 index ba211ad..0000000 --- a/algo.lua +++ /dev/null @@ -1,129 +0,0 @@ -local ffi = require 'ffi' -local errcheck = cudnn.errcheck - -local algo = {} -local autotunerCache = {} -autotunerCache['cudnnFindConvolutionForwardAlgorithm'] = {} -autotunerCache['cudnnFindConvolutionBackwardFilterAlgorithm'] = {} -autotunerCache['cudnnFindConvolutionBackwardDataAlgorithm'] = {} - -local function setupAlgo(self, algo_t, perf_t, findAPI, getAPI, wsAPI, algSearchMode, params) - - local algType = ffi.new(algo_t, 1) - - if cudnn.benchmark or cudnn.fastest then -- the manual auto-tuner is run - local cachedAlgo = autotunerCache[findAPI][self.autotunerHash]; - if cachedAlgo then - algType[0] = cachedAlgo - if cudnn.verbose then - print('\n', findAPI, ' using cached algo = ' , algType[0] , ' for: ', self.autotunerHash) - end - else - local perfResults = ffi.new(perf_t, 1) - local intt = torch.IntTensor(1) - errcheck(findAPI, - cudnn.getHandle(), - params[1], params[2], params[3], params[4], - 1, intt:data(), perfResults) - algType[0] = perfResults[0].algo - autotunerCache[findAPI][self.autotunerHash] = perfResults[0].algo - if cudnn.verbose then - print(string.format( - "\n" .. findAPI .. " Time: %3.5f Memory: %8d Algorithm: %d" - .. " hash: %45s", - perfResults[0].time, tonumber(perfResults[0].memory), - tonumber(perfResults[0].algo), self.autotunerHash )) - - end - end - else - - local algWorkspaceLimit = self.workspace_limit - or (self.nInputPlane * self.kH * self.kW * self.weight.elementSize()) - - errcheck(getAPI, - cudnn.getHandle(), - params[1], params[2], params[3], params[4], - algSearchMode, algWorkspaceLimit, algType) - if cudnn.verbose then - print(string.format( - "\n" .. getAPI .. 
" Limit: %d Algorithm: %d", - tonumber(algWorkspaceLimit), - tonumber(algType[0]))) - end - end - local bufSize = torch.LongTensor(1) - errcheck(wsAPI, - cudnn.getHandle(), - params[1], params[2], params[3], params[4], - algType[0], bufSize:data()) - - self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace() - local extraBufferSizeInBytes = self.extraBuffer:nElement() * self.extraBuffer.elementSize() - - if cudnn.verbose then - print(string.format( - "\n" .. wsAPI .. " returned bufSize: %d, current extraBufferSizeInBytes: %d, %d elements", - tonumber(bufSize[1]), tonumber(extraBufferSizeInBytes), tonumber(self.extraBuffer:nElement()))) - end - - if extraBufferSizeInBytes < bufSize[1] then - self.extraBuffer:resize(math.ceil(bufSize[1]/self.extraBuffer.elementSize())) - end - return algType[0] -end - -function algo.prepareHash(self, input_slice, output_slice) - local function shape(x) - return table.concat(x:size():totable(),'x') - end - self.autotunerHash = shape(self.weight) .. ';' - .. shape(input_slice) .. ';' - .. 
shape(output_slice) - - self.fwdAlgType = nil - self.bwdDataAlgType = nil - self.bwdFilterAlgType = nil -end - -function algo.setupForwardAlgorithm(self, params) - local algSearchMode - if self.fastest_mode or cudnn.benchmark == true or cudnn.fastest == true then - algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST' - else - algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT' - end - - params = params or { self.iDesc[0], self.weightDesc[0], self.convDesc[0], self.oDesc[0] } - self.fwdAlgType = self.fmode or - setupAlgo(self,"cudnnConvolutionFwdAlgo_t[?]", "cudnnConvolutionFwdAlgoPerf_t[?]", - 'cudnnFindConvolutionForwardAlgorithm', 'cudnnGetConvolutionForwardAlgorithm', - 'cudnnGetConvolutionForwardWorkspaceSize', algSearchMode, params) -end - -function algo.setupBackwardFilterAlgorithm(self, params) - local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE' - if self.fastest_mode or cudnn.fastest == true then - algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST' - end - params = params or { self.iDesc[0], self.oDesc[0], self.convDesc[0], self.weightDesc[0] } - self.bwdFilterAlgType = self.bwmode or - setupAlgo(self,"cudnnConvolutionBwdFilterAlgo_t[?]", "cudnnConvolutionBwdFilterAlgoPerf_t[?]", - 'cudnnFindConvolutionBackwardFilterAlgorithm', 'cudnnGetConvolutionBackwardFilterAlgorithm', - 'cudnnGetConvolutionBackwardFilterWorkspaceSize', algSearchMode, - params) -end - -function algo.setupBackwardDataAlgorithm(self, params) - local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE' - if self.fastest_mode or cudnn.fastest == true then - algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST' - end - params = params or { self.weightDesc[0], self.oDesc[0], self.convDesc[0], self.iDesc[0] } - self.bwdDataAlgType = self.bdmode or - setupAlgo(self,"cudnnConvolutionBwdDataAlgo_t[?]", "cudnnConvolutionBwdDataAlgoPerf_t[?]", - 'cudnnFindConvolutionBackwardDataAlgorithm', 'cudnnGetConvolutionBackwardDataAlgorithm', - 
'cudnnGetConvolutionBackwardDataWorkspaceSize', algSearchMode, params) -end - -return algo diff --git a/cudnn-scm-1.rockspec b/cudnn-scm-1.rockspec index 3066d12..bd8072f 100644 --- a/cudnn-scm-1.rockspec +++ b/cudnn-scm-1.rockspec @@ -2,7 +2,7 @@ package = "cudnn" version = "scm-1" source = { - url = "git://github.com/soumith/cudnn.torch.git" + url = "git://github.com/soumith/cudnn.torch.git", } description = { diff --git a/functional.lua b/functional.lua index 8eee7c9..cea9df9 100644 --- a/functional.lua +++ b/functional.lua @@ -105,7 +105,7 @@ cudnn.functional.Convolution2D_updateOutput = function(handle, input, weight, ou local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT' local algWorkspaceLimit = 0 if workspace then - algWorkspaceLimit = workspace:nElement() * workspace:elementSize() + algWorkspaceLimit = workspace:nElement() * 4 -- 4 = sizeof float end errcheck('cudnnGetConvolutionForwardAlgorithm', handle, @@ -116,20 +116,6 @@ function cudnn.toDescriptor(t) return descriptor end -function cudnn.createDescriptors(count, descs_type, create_func, destroy_func) - local ds = ffi.new(descs_type, count) - for i = 0, count - 1 do - errcheck(create_func, ds + i) - end - local function destroyDescriptors(ds) - for i = 0, count - 1 do - errcheck(destroy_func, ds[i]) - end - end - ffi.gc(ds, destroyDescriptors) - return ds -end - local sharedBuffer = {} for i=1,numDevices do @@ -140,7 +126,7 @@ function cudnn.getSharedWorkspace() local device = cutorch.getDevice() local stream = cutorch.getStream() -- starts from 0 if not sharedBuffer[device][stream] then - sharedBuffer[device][stream] = torch.CudaDoubleTensor(256) + sharedBuffer[device][stream] = torch.CudaTensor(1) end return sharedBuffer[device][stream] end @@ -179,4 +165,5 @@ require('cudnn.GRU') require('cudnn.functional') require('cudnn.convert') + return cudnn diff --git a/test/test.lua b/test/test.lua index aa8ea7f..a4f9bcb 100644 --- a/test/test.lua +++ b/test/test.lua @@ -25,7 +25,7 @@ local 
testparams_float = { } -- TODO: find out why the errors are so huge -local testparams_double_err = { +local testparams_double = { test_type = 'torch.CudaDoubleTensor', precision_forward = 1e+2, precision_backward = 1e+3, -- 1e+4, @@ -33,14 +33,6 @@ local testparams_double_err = { precision_io = 1e-5, } -local testparams_double = { - test_type = 'torch.CudaDoubleTensor', - precision_forward = 1e-4, - precision_backward = 2e-2, - precision_jac = 1e-3, - precision_io = 1e-5, -} - local testparams = testparams_half local function cast(input) @@ -185,11 +177,8 @@ function cudnntest.SpatialConvolution_forward_single() cutorch.synchronize() mytester:asserteq(rescuda:dim(), 3, 'error in dimension') local error = rescuda:float() - groundtruth:float() - if cudnn.verbose and error:abs():max() > tonumber(testparams.precision_forward) then - print('\n==== rescuda:float():\n', rescuda:float(), '\n==== groundtruth:float():\n', groundtruth:float()) - end mytester:assertlt(error:abs():max(), testparams.precision_forward, - 'error on state (forward)') + 'error on state (forward) ') -- IO local ferr,berr = jac.testIO(gconv, cast(input)) @@ -1054,7 +1043,7 @@ function cudnntest.SpatialCrossMapLRN_batch() local size = math.random(1,3)*2+1 local nbfeatures = math.random(3,8) local alpha = math.random(1,100)/100 - local beta = math.random(1,100)/100 + local beta = math.random(0,100)/100 local k = math.random(1,3) local tm = {} @@ -1519,8 +1508,8 @@ mytester = torch.Tester() mytester:add(cudnntest) if torch.random(1,2) == 1 then - cudnn.benchmark = true -- run manual auto-tuner - cudnn.verbose = true + cudnn.benchmark = true -- run manual auto-tuner +-- cudnn.verbose = true end @@ -1530,21 +1519,20 @@ for i=1,cutorch.getDeviceCount() do print('Running test on device: #' .. i .. ' : ' .. 
prop.name) cutorch.setDevice(i) --- double tensor may be broken --- print'Testing torch.CudaDoubleTensor' --- torch.setdefaulttensortype('torch.DoubleTensor') --- testparams = testparams_double --- mytester:run() + + print'Testing torch.CudaHalfTensor' + testparams = testparams_half + mytester:run() print'Testing torch.CudaTensor' testparams = testparams_float mytester:run() +-- double tensor may be broken at some places, gets NaNs. +-- print'Testing torch.CudaDoubleTensor' +-- testparams = testparams_double +-- mytester:run() --- half tensor is broken on Pascal - print'Testing torch.CudaHalfTensor: note there may be errors on 6.x (Pascal) cards' - testparams = testparams_half - mytester:run() end os.execute('rm -f modelTemp.t7') diff --git a/test/test_groups.lua b/test/test_groups.lua index 1675fdd..8b386b9 100644 --- a/test/test_groups.lua +++ b/test/test_groups.lua @@ -34,6 +34,6 @@ ccn2_gradWeight = ccn2_conv.gradWeight:t() assert((cudnn_output - ccn2_output):abs():max() < 1e-4) assert((cudnn_gradInput - ccn2_gradInput):abs():max() < 1e-4) -assert((cudnn_gradWeight - ccn2_gradWeight):abs():max() < 1e-1) +assert((cudnn_gradWeight - ccn2_gradWeight):abs():max() < 5e-2) print 'no assertions' |