diff options
author | Boris Fomitchev <borisfom@users.noreply.github.com> | 2016-08-05 01:04:59 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2016-08-05 01:04:59 +0300 |
commit | 103e6ed51df3a909053c2a524d9e6fc6b35cbf60 (patch) | |
tree | 951b8203eeee55ded736943365300acae19771ae | |
parent | b812efdd1c3465547f0e5b0e24d2053f95ebfe2e (diff) | |
parent | fb1bec17939eb26f94da6a22f410ad316730b9e4 (diff) |
Merge pull request #231 from borisfom/algo
Refactoring CUDNN Find
-rw-r--r-- | RNN.lua | 24 | ||||
-rw-r--r-- | SpatialConvolution.lua | 341 | ||||
-rw-r--r-- | SpatialFullConvolution.lua | 290 | ||||
-rw-r--r-- | TemporalConvolution.lua | 21 | ||||
-rw-r--r-- | VolumetricConvolution.lua | 356 | ||||
-rw-r--r-- | algo.lua | 129 | ||||
-rw-r--r-- | cudnn-scm-1.rockspec | 2 | ||||
-rw-r--r-- | functional.lua | 2 | ||||
-rw-r--r-- | init.lua | 17 | ||||
-rw-r--r-- | test/test.lua | 38 | ||||
-rw-r--r-- | test/test_groups.lua | 2 |
11 files changed, 333 insertions, 889 deletions
@@ -63,43 +63,29 @@ function RNN:reset(stdv) self.gradWeight:resizeAs(self.weight):zero() end -function RNN:createDescriptors(count, descs_type, create_func, destroy_func) - local ds = ffi.new(descs_type, count) - for i = 0, count - 1 do - errcheck(create_func, ds + i) - end - local function destroyDescriptors(ds) - for i = 0, count - 1 do - errcheck(destroy_func, ds[i]) - end - end - ffi.gc(ds, destroyDescriptors) - return ds -end - function RNN:createDropoutDescriptors(count) - return self:createDescriptors(count, + return cudnn.createDescriptors(count, 'cudnnDropoutDescriptor_t[?]', 'cudnnCreateDropoutDescriptor', 'cudnnDestroyDropoutDescriptor') end function RNN:createFilterDescriptors(count) - return self:createDescriptors(count, + return cudnn.createDescriptors(count, 'cudnnFilterDescriptor_t[?]', 'cudnnCreateFilterDescriptor', 'cudnnDestroyFilterDescriptor') end function RNN:createRNNDescriptors(count) - return self:createDescriptors(count, + return cudnn.createDescriptors(count, 'cudnnRNNDescriptor_t[?]', 'cudnnCreateRNNDescriptor', 'cudnnDestroyRNNDescriptor') end function RNN:createTensorDescriptors(count) - return self:createDescriptors(count, + return cudnn.createDescriptors(count, 'cudnnTensorDescriptor_t[?]', 'cudnnCreateTensorDescriptor', 'cudnnDestroyTensorDescriptor') @@ -383,7 +369,7 @@ function RNN:updateOutput(input) if self.cellOutput then self.cellInput = self.cellOutput:clone() end - end + end if (self.batchFirst) then self.output = self.output:transpose(1, 2) end diff --git a/SpatialConvolution.lua b/SpatialConvolution.lua index 58c78b2..1656154 100644 --- a/SpatialConvolution.lua +++ b/SpatialConvolution.lua @@ -1,13 +1,9 @@ local SpatialConvolution, parent = torch.class('cudnn.SpatialConvolution', 'nn.SpatialConvolution') local ffi = require 'ffi' +local algo = require 'cudnn.algo' local errcheck = cudnn.errcheck -local autotunerCache = {} -autotunerCache[1] = {} -- forward -autotunerCache[2] = {} -- backwardFilter -autotunerCache[3] = {} -- backwardData - function SpatialConvolution:__init(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH, groups) local delayedReset = self.reset @@ -26,39 +22,38 @@ function SpatialConvolution:__init(nInputPlane, nOutputPlane, self:reset() -- should nil for serialization, the reset will still work self.reset = nil + return self end --- if you change the configuration of the module manually, call this -function SpatialConvolution:resetWeightDescriptors() - assert(cudnn.typemap[torch.typename(self.weight)], 'Only Cuda supported duh!') +function SpatialConvolution:createWeightDescriptors() + assert(cudnn.typemap[torch.typename(self.weight)] or not self.weight, 'Only Cuda supported duh!') assert(cudnn.typemap[torch.typename(self.bias)] or not self.bias, 'Only Cuda supported duh!') - -- for compatibility - self.groups = self.groups or 1 - -- create filterDescriptor for weight - self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]') - errcheck('cudnnCreateFilterDescriptor', self.weightDesc) - local desc = torch.IntTensor({self.nOutputPlane/self.groups, - self.nInputPlane/self.groups, - self.kH, self.kW}) - errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0], - cudnn.typemap[torch.typename(self.weight)], 'CUDNN_TENSOR_NCHW', 4, - desc:data()); - local function destroyWDesc(d) - errcheck('cudnnDestroyFilterDescriptor', d[0]); - end - ffi.gc(self.weightDesc, destroyWDesc) - -- create descriptor for bias if self.bias then self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,1,1)) end + -- create filterDescriptor for weight + return cudnn.createDescriptors(1, 'struct cudnnFilterStruct*[?]', + 'cudnnCreateFilterDescriptor', 'cudnnDestroyFilterDescriptor') +end + +-- if you change the configuration of the module manually, call this +function SpatialConvolution:resetWeightDescriptors(desc) + -- for compatibility + self.groups = self.groups or 1 + self.weightDesc = SpatialConvolution.createWeightDescriptors(self) + desc = desc or torch.IntTensor({self.nOutputPlane/self.groups, + self.nInputPlane/self.groups, + self.kH, self.kW}) + errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0], + cudnn.typemap[torch.typename(self.weight)], 'CUDNN_TENSOR_NCHW', self.nDim, + desc:data()); end function SpatialConvolution:fastest(mode) if mode == nil then mode = true end self.fastest_mode = mode - self.iSize = self.iSize or torch.LongStorage(4) - self.iSize:fill(0) + self.iDesc = nil return self end @@ -72,8 +67,7 @@ function SpatialConvolution:setMode(fmode, bdmode, bwmode) if bwmode ~= nil then self.bwmode = bwmode end - self.iSize = self.iSize or torch.LongStorage(4) - self.iSize:fill(0) + self.iDesc = nil return self end @@ -90,32 +84,41 @@ function SpatialConvolution:noBias() return self end -function SpatialConvolution:createIODescriptors(input) - local batch = true - if input:dim() == 3 then - input = input:view(1, input:size(1), input:size(2), input:size(3)) - batch = false + +function SpatialConvolution:checkInputChanged(input) + self.nDim = self.nDim or 4 + assert(input:dim() == self.nDim) + assert(input:isContiguous()) + self.iSize = self.iSize or torch.LongStorage(self.nDim):fill(0) + self.groups = self.groups or 1 + if not self.weightDesc then self:resetWeightDescriptors() end + if not self.iDesc or not self.oDesc or input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2] + or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] or (self.nDim==5 and input:size(5) ~= self.iSize[5]) then + self.iSize = input:size() + + assert(self.nInputPlane == input:size(2), 'input has to contain: ' + .. self.nInputPlane + .. ' feature maps, but received input of size: ' + .. input:size(1) .. ' x ' .. input:size(2) .. + ' x ' .. input:size(3) .. ' x ' .. input:size(4)) + return true end - assert(input:dim() == 4 and input:isContiguous()); - self.iSize = self.iSize or torch.LongStorage(4):fill(0) - if not self.iDesc or not self.oDesc or - input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2] - or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] then - self.iSize = input:size() - - assert(self.nInputPlane == input:size(2), 'input has to contain: ' - .. self.nInputPlane - .. ' feature maps, but received input of size: ' - .. input:size(1) .. ' x ' .. input:size(2) .. - ' x ' .. input:size(3) .. ' x ' .. input:size(4)) + return false +end +function SpatialConvolution:createIODescriptors(input) + local batch = true + if input:dim() == 3 then + input = input:view(1, input:size(1), input:size(2), input:size(3)) + batch = false + end + if SpatialConvolution.checkInputChanged(self, input) then -- create input descriptor local input_slice = input:narrow(2,1,self.nInputPlane/self.groups) self.iDesc = cudnn.toDescriptor(input_slice) - -- create conv descriptor - self.convDesc = ffi.new('struct cudnnConvolutionStruct*[1]') - errcheck('cudnnCreateConvolutionDescriptor', self.convDesc) + self.convDesc = cudnn.createDescriptors(1, 'struct cudnnConvolutionStruct*[?]', + 'cudnnCreateConvolutionDescriptor', 'cudnnDestroyConvolutionDescriptor') self.padH, self.padW = self.padH or 0, self.padW or 0 local pad = torch.IntTensor({self.padH, self.padW}) local stride = torch.IntTensor({self.dH, self.dW}) @@ -124,207 +127,24 @@ function SpatialConvolution:createIODescriptors(input) 2, pad:data(), stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION', cudnn.configmap(torch.type(self.weight))); - local function destroyConvDesc(d) - errcheck('cudnnDestroyConvolutionDescriptor', d[0]); - end - ffi.gc(self.convDesc, destroyConvDesc) + -- get output shape, resize output - local oSize = torch.IntTensor(4) + local oSize = torch.IntTensor(self.nDim) local oSizeD = oSize:data() errcheck('cudnnGetConvolutionNdForwardOutputDim', self.convDesc[0], self.iDesc[0], - self.weightDesc[0], 4, oSizeD) + self.weightDesc[0], self.nDim, oSizeD) oSize[2] = oSize[2] * self.groups self.output:resize(oSize:long():storage()) - -- create descriptor for output local output_slice = self.output:narrow(2,1,self.nOutputPlane/self.groups) + -- create descriptor for output self.oDesc = cudnn.toDescriptor(output_slice) self.oDescForBias = cudnn.toDescriptor(self.output) - ----------------------------------------------------------------------- - local function shape(x) - local sz = x:size() - local str = '' - for i=1,sz:size() do - str = str .. sz[i] .. 'x' - end - if #str > 0 then - str = str:sub(1, #str-1) - end - return str - end - local autotunerHash = shape(self.weight) .. ';' - .. shape(input_slice) .. ';' - .. shape(output_slice) - - local maxBufSize = 0 - - -- create forwardAlgorithm descriptors - local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1) - local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT' - local algWorkspaceLimit = self.workspace_limit - or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. - - if self.fastest_mode or cudnn.fastest == true then - algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST' - end - - if cudnn.benchmark then -- the manual auto-tuner is run - if autotunerCache[1][autotunerHash] then - algType[0] = autotunerCache[1][autotunerHash] - if cudnn.verbose then - print('Autotuning SC FW: using cached algo = ', algType[0], ' for: ', autotunerHash) - end - else - local perfResults = ffi.new("cudnnConvolutionFwdAlgoPerf_t[?]", 1) - local intt = torch.IntTensor(1); - errcheck('cudnnFindConvolutionForwardAlgorithm', - cudnn.getHandle(), - self.iDesc[0], self.weightDesc[0], - self.convDesc[0], self.oDesc[0], - 1, intt:data(), perfResults) - algType[0] = perfResults[0].algo - autotunerCache[1][autotunerHash] = perfResults[0].algo - if cudnn.verbose then - print(string.format( - "\nAutotuning SC Forward: Time: %3.5f Memory: %8d Algorithm: %d" - .. " Weight: %15s Input: %15s Output: %15s", - perfResults[0].time, tonumber(perfResults[0].memory), - tonumber(perfResults[0].algo), - shape(self.weight), shape(input_slice), - shape(output_slice))) - end - end - else - errcheck('cudnnGetConvolutionForwardAlgorithm', - cudnn.getHandle(), - self.iDesc[0], self.weightDesc[0], - self.convDesc[0], self.oDesc[0], - algSearchMode, algWorkspaceLimit, algType) - end - algType[0] = self.fmode or algType[0] - self.fwdAlgType = algType - local bufSize = torch.LongTensor(1) - errcheck('cudnnGetConvolutionForwardWorkspaceSize', - cudnn.getHandle(), - self.iDesc[0], self.weightDesc[0], - self.convDesc[0], self.oDesc[0], - algType[0], bufSize:data()) - maxBufSize = math.max(maxBufSize, bufSize[1]) - - -- create backwardFilterAlgorithm descriptors - local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1) - local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE' - local algWorkspaceLimit = self.workspace_limit - or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. - if self.fastest_mode or cudnn.fastest == true then - algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST' - end - - if cudnn.benchmark then -- the manual auto-tuner is run - if autotunerCache[2][autotunerHash] then - algType[0] = autotunerCache[2][autotunerHash] - if cudnn.verbose then - print('Autotuning SC BW: using cached algo = ', algType[0], ' for: ', autotunerHash) - end - else - local perfResults = ffi.new("cudnnConvolutionBwdFilterAlgoPerf_t[?]", 1) - local intt = torch.IntTensor(1); - errcheck('cudnnFindConvolutionBackwardFilterAlgorithm', - cudnn.getHandle(), - self.iDesc[0], self.oDesc[0], - self.convDesc[0], self.weightDesc[0], - 1, intt:data(), perfResults) - algType[0] = perfResults[0].algo - autotunerCache[2][autotunerHash] = perfResults[0].algo - if cudnn.verbose then - print(string.format( - "Autotuning backwardFilter: Time: %3.5f Memory: %8d Algorithm: %d" - .. " Weight: %15s Input: %15s Output: %15s", - perfResults[0].time, tonumber(perfResults[0].memory), - tonumber(perfResults[0].algo), - shape(self.weight), shape(input_slice), - shape(output_slice))) - end - end - else - errcheck('cudnnGetConvolutionBackwardFilterAlgorithm', - cudnn.getHandle(), - self.iDesc[0], self.oDesc[0], - self.convDesc[0], self.weightDesc[0], - algSearchMode, algWorkspaceLimit, algType) - end - algType[0] = self.bwmode or algType[0] - self.bwdFilterAlgType = algType - local bufSize = torch.LongTensor(1) - errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize', - cudnn.getHandle(), - self.iDesc[0], self.oDesc[0], - self.convDesc[0], self.weightDesc[0], - algType[0], bufSize:data()) - maxBufSize = math.max(maxBufSize, bufSize[1]) - - -- create backwardDataAlgorithm descriptors - local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1) - local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE' - local algWorkspaceLimit = self.workspace_limit - or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. - if self.fastest_mode or cudnn.fastest == true then - algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST' - end - if cudnn.benchmark then -- the manual auto-tuner is run - if autotunerCache[3][autotunerHash] then - algType[0] = autotunerCache[3][autotunerHash] - if cudnn.verbose then - print('Autotuning SC BWD: using cached algo = ', algType[0], ' for: ', autotunerHash) - end - else - local perfResults = ffi.new("cudnnConvolutionBwdDataAlgoPerf_t[?]", 1) - local intt = torch.IntTensor(1); - errcheck('cudnnFindConvolutionBackwardDataAlgorithm', - cudnn.getHandle(), - self.weightDesc[0], self.oDesc[0], - self.convDesc[0], self.iDesc[0], - 1, intt:data(), perfResults) - algType[0] = perfResults[0].algo - autotunerCache[3][autotunerHash] = perfResults[0].algo - if cudnn.verbose then - print(string.format( - "Autotuning backwardData: Time: %3.5f Memory: %8d Algorithm: %d" - .. " Weight: %15s Input: %15s Output: %15s\n", - perfResults[0].time, tonumber(perfResults[0].memory), - tonumber(perfResults[0].algo), - shape(self.weight), shape(input_slice), - shape(output_slice))) - end - end - else - errcheck('cudnnGetConvolutionBackwardDataAlgorithm', - cudnn.getHandle(), - self.weightDesc[0], self.oDesc[0], - self.convDesc[0], self.iDesc[0], - algSearchMode, algWorkspaceLimit, algType) - end - algType[0] = self.bdmode or algType[0] - self.bwdDataAlgType = algType - local bufSize = torch.LongTensor(1) - errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize', - cudnn.getHandle(), - self.weightDesc[0], self.oDesc[0], - self.convDesc[0], self.iDesc[0], - algType[0], bufSize:data()) - maxBufSize = math.max(maxBufSize, bufSize[1]) - - self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace() - self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float - if maxBufSize > self.extraBufferSizeInBytes then - self.extraBuffer:resize(math.ceil(maxBufSize/4)) - self.extraBufferSizeInBytes = maxBufSize - end + algo.prepareHash(self, input_slice, output_slice) - ----------------------------------------------------------------------- -- create offsets for groups local iH, iW = input:size(3), input:size(4) local kH, kW = self.kH, self.kW @@ -338,13 +158,14 @@ function SpatialConvolution:createIODescriptors(input) self.output:size(3), self.output:size(4)) end - end + + end end local one = torch.FloatTensor({1}); local zero = torch.FloatTensor({0}); -local function makeContiguous(self, input, gradOutput) +function SpatialConvolution:makeContiguous(input, gradOutput) if not input:isContiguous() then self._input = self._input or input.new() self._input:typeAs(input):resizeAs(input):copy(input) @@ -359,17 +180,18 @@ local function makeContiguous(self, input, gradOutput) end function SpatialConvolution:updateOutput(input) - if not self.weightDesc then self:resetWeightDescriptors() end - input = makeContiguous(self, input) + input = SpatialConvolution.makeContiguous(self, input) self:createIODescriptors(input) - + if not self.fwdAlgType then + algo.setupForwardAlgorithm(self) + end for g = 0, self.groups - 1 do errcheck('cudnnConvolutionForward', cudnn.getHandle(), one:data(), self.iDesc[0], input:data() + g*self.input_offset, self.weightDesc[0], self.weight:data() + g*self.weight_offset, - self.convDesc[0], self.fwdAlgType[0], - self.extraBuffer:data(), self.extraBufferSizeInBytes, + self.convDesc[0], self.fwdAlgType, + self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(), zero:data(), self.oDesc[0], self.output:data() + g*self.output_offset); end @@ -387,11 +209,12 @@ end function SpatialConvolution:updateGradInput(input, gradOutput) if not self.gradInput then return end self.gradInput:resizeAs(input) - - input, gradOutput = makeContiguous(self, input, gradOutput) - assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D'); - if not self.weightDesc then self:resetWeightDescriptors() end + input, gradOutput = SpatialConvolution.makeContiguous(self, input, gradOutput) + assert(gradOutput:dim() == self.nDim-1 or gradOutput:dim() == self.nDim, 'gradOutput has to be nDim or nDim-1'); self:createIODescriptors(input) + if not self.bwdDataAlgType then + algo.setupBackwardDataAlgorithm(self) + end for g = 0,self.groups - 1 do errcheck('cudnnConvolutionBackwardData', cudnn.getHandle(), @@ -399,8 +222,8 @@ function SpatialConvolution:updateGradInput(input, gradOutput) self.weightDesc[0], self.weight:data() + g*self.weight_offset, self.oDesc[0], gradOutput:data() + g*self.output_offset, self.convDesc[0], - self.bwdDataAlgType[0], - self.extraBuffer:data(), self.extraBufferSizeInBytes, + self.bwdDataAlgType, + self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(), zero:data(), self.iDesc[0], self.gradInput:data() + g*self.input_offset); end @@ -413,13 +236,14 @@ function SpatialConvolution:accGradParameters(input, gradOutput, scale) self.scaleT = self.scaleT:float() scale = scale or 1.0 self.scaleT[1] = scale - - input, gradOutput = makeContiguous(self, input, gradOutput) - - assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D'); - if not self.weightDesc then self:resetWeightDescriptors() end + input, gradOutput = SpatialConvolution.makeContiguous(self, input, gradOutput) + assert(gradOutput:dim() == self.nDim-1 or gradOutput:dim() == self.nDim, 'gradOutput has to be nDim or nDim-1'); self:createIODescriptors(input) + if not self.bwdFilterAlgType then + algo.setupBackwardFilterAlgorithm(self) + end + -- gradBias if self.bias then errcheck('cudnnConvolutionBackwardBias', cudnn.getHandle(), @@ -436,8 +260,8 @@ function SpatialConvolution:accGradParameters(input, gradOutput, scale) self.iDesc[0], input:data() + g*self.input_offset, self.oDesc[0], gradOutput:data() + g*self.output_offset, self.convDesc[0], - self.bwdFilterAlgType[0], - self.extraBuffer:data(), self.extraBufferSizeInBytes, + self.bwdFilterAlgType, + self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(), one:data(), self.weightDesc[0], self.gradWeight:data() + g*self.weight_offset); end @@ -455,7 +279,6 @@ function SpatialConvolution:clearDesc() self.bwdDataAlgType = nil self.bwdFilterAlgType = nil self.extraBuffer = nil - self.extraBufferSizeInBytes = nil self.scaleT = nil end @@ -470,6 +293,8 @@ end function SpatialConvolution:clearState() self:clearDesc() - nn.utils.clear(self, '_input', '_gradOutput') + nn.utils.clear(self, 'extraBuffer', '_input', '_gradOutput') return nn.Module.clearState(self) end + +return SpatialConvolution diff --git a/SpatialFullConvolution.lua b/SpatialFullConvolution.lua index ff91a13..1cdfb33 100644 --- a/SpatialFullConvolution.lua +++ b/SpatialFullConvolution.lua @@ -2,98 +2,52 @@ local SpatialFullConvolution, parent = torch.class('cudnn.SpatialFullConvolution', 'nn.SpatialFullConvolution') local ffi = require 'ffi' local errcheck = cudnn.errcheck +local algo = require 'cudnn.algo' local autotunerCache = {} autotunerCache[1] = {} -- forward autotunerCache[2] = {} -- backwardFilter autotunerCache[3] = {} -- backwardData +local Convolution = cudnn.SpatialConvolution +SpatialFullConvolution.nDim = 4 + -- if you change the configuration of the module manually, call this function SpatialFullConvolution:resetWeightDescriptors() - assert(cudnn.typemap[torch.typename(self.weight)], 'Only Cuda supported duh!') - assert(cudnn.typemap[torch.typename(self.bias)] or not self.bias, 'Only Cuda supported duh!') - -- create filterDescriptor for weight - self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]') - errcheck('cudnnCreateFilterDescriptor', self.weightDesc) + self.weightDesc = Convolution.createWeightDescriptors(self) local desc = torch.IntTensor({self.nInputPlane, self.nOutputPlane, self.kH, self.kW}) errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0], cudnn.typemap[torch.typename(self.weight)], 'CUDNN_TENSOR_NCHW', 4, desc:data()); - local function destroyWDesc(d) - errcheck('cudnnDestroyFilterDescriptor', d[0]); - end - ffi.gc(self.weightDesc, destroyWDesc) - - -- create descriptor for bias - if self.bias then - self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,1,1)) - end end function SpatialFullConvolution:fastest(mode) - if mode == nil then mode = true end - self.fastest_mode = mode - self.iSize = self.iSize or torch.LongStorage(4) - self.iSize:fill(0) - return self + return Convolution.fastest(self) end function SpatialFullConvolution:setMode(fmode, bdmode, bwmode) - if fmode ~= nil then - self.fmode = fmode - end - if bdmode ~= nil then - self.bdmode = bdmode - end - if bwmode ~= nil then - self.bwmode = bwmode - end - self.iSize = self.iSize or torch.LongStorage(4) - self.iSize:fill(0) - return self + return Convolution.setMode(self, fmode, bdmode, bwmode) end function SpatialFullConvolution:resetMode() - self.fmode = nil - self.bdmode = nil - self.bwmode = nil - return self + return Convolution.resetMode(self) end function SpatialFullConvolution:noBias() - self.bias = nil - self.gradBias = nil - return self + return Convolution.noBias(self) end function SpatialFullConvolution:createIODescriptors(input) - local batch = true - if input:dim() == 3 then - input = input:view(1, input:size(1), input:size(2), input:size(3)) - batch = false - end - assert(input:dim() == 4 and input:isContiguous()); - self.iSize = self.iSize or torch.LongStorage(4):fill(0) - if not self.iDesc or not self.oDesc or - input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2] - or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] then - self.iSize = input:size() - - assert(self.nInputPlane == input:size(2), 'input has to contain: ' - .. self.nInputPlane - .. ' feature maps, but received input of size: ' - .. input:size(1) .. ' x ' .. input:size(2) .. - ' x ' .. input:size(3) .. ' x ' .. input:size(4)) - + if Convolution.checkInputChanged(self, input) then -- create input descriptor - local input_slice = {{},{1,self.nInputPlane},{},{}} - self.iDesc = cudnn.toDescriptor(input[input_slice]) + local input_slice = input[{{},{1,self.nInputPlane},{},{}}] + self.iDesc = cudnn.toDescriptor(input_slice) -- create conv descriptor - self.convDesc = ffi.new('struct cudnnConvolutionStruct*[1]') - errcheck('cudnnCreateConvolutionDescriptor', self.convDesc) + self.convDesc = cudnn.createDescriptors(1, 'struct cudnnConvolutionStruct*[?]', + 'cudnnCreateConvolutionDescriptor', 'cudnnDestroyConvolutionDescriptor') local pad = torch.IntTensor({self.padH, self.padW}) local stride = torch.IntTensor({self.dH, self.dW}) local upscale = torch.IntTensor({1,1}) @@ -101,10 +55,6 @@ function SpatialFullConvolution:createIODescriptors(input) 2, pad:data(), stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION', cudnn.configmap(torch.type(self.weight))); - local function destroyConvDesc(d) - errcheck('cudnnDestroyConvolutionDescriptor', d[0]); - end - ffi.gc(self.convDesc, destroyConvDesc) -- get output shape, resize output local iwidth = input:size(4) @@ -115,190 +65,12 @@ function SpatialFullConvolution:createIODescriptors(input) self.output:resize(oSize:long():storage()) -- create descriptor for output - local output_slice = {{},{1,self.nOutputPlane},{},{}} - self.oDesc = cudnn.toDescriptor(self.output[output_slice]) + local output_slice = self.output[{{},{1,self.nOutputPlane},{},{}}] + self.oDesc = cudnn.toDescriptor(output_slice) self.oDescForBias = cudnn.toDescriptor(self.output) - ----------------------------------------------------------------------- - local function shape(x) - local sz = x:size() - local str = '' - for i=1,sz:size() do - str = str .. sz[i] .. 'x' - end - if #str > 0 then - str = str:sub(1, #str-1) - end - return str - end - local autotunerHash = shape(self.weight) .. ';' - .. shape(input[input_slice]) .. ';' - .. shape(self.output[output_slice]) + algo.prepareHash(self, input_slice, output_slice) - local maxBufSize = 0 - - -- create forwardAlgorithm descriptors - local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1) - local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT' - local algWorkspaceLimit = self.workspace_limit - or (self.nOutputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. - - if self.fastest_mode or cudnn.fastest == true then - algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST' - end - - if cudnn.benchmark then -- the manual auto-tuner is run - if autotunerCache[1][autotunerHash] then - algType[0] = autotunerCache[1][autotunerHash] - if cudnn.verbose then - print('Autotuning SFC: using cached algo = ', algType[0], ' for: ', autotunerHash) - end - else - local perfResults = ffi.new("cudnnConvolutionFwdAlgoPerf_t[?]", 1) - local intt = torch.IntTensor(1); - errcheck('cudnnFindConvolutionForwardAlgorithm', - cudnn.getHandle(), - self.oDesc[0], self.weightDesc[0], - self.convDesc[0], self.iDesc[0], - 1, intt:data(), perfResults) - algType[0] = perfResults[0].algo - autotunerCache[1][autotunerHash] = perfResults[0].algo - if cudnn.verbose then - print(string.format( - "Autotuning Forward: Time: %3.5f Memory: %8d Algorithm: %d" - .. " Weight: %15s Input: %15s Output: %15s", - perfResults[0].time, tonumber(perfResults[0].memory), - tonumber(perfResults[0].algo), - shape(self.weight), shape(input[input_slice]), - shape(self.output[output_slice]))) - end - end - else - errcheck('cudnnGetConvolutionForwardAlgorithm', - cudnn.getHandle(), - self.oDesc[0], self.weightDesc[0], - self.convDesc[0], self.iDesc[0], - algSearchMode, algWorkspaceLimit, algType) - end - algType[0] = self.fmode or algType[0] - self.fwdAlgType = algType - local bufSize = torch.LongTensor(1) - errcheck('cudnnGetConvolutionForwardWorkspaceSize', - cudnn.getHandle(), - self.oDesc[0], self.weightDesc[0], - self.convDesc[0], self.iDesc[0], - algType[0], bufSize:data()) - maxBufSize = math.max(maxBufSize, bufSize[1]) - - -- create backwardFilterAlgorithm descriptors - local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1) - local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE' - local algWorkspaceLimit = self.workspace_limit - or (self.nOutputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. - if self.fastest_mode or cudnn.fastest == true then - algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST' - end - - if cudnn.benchmark then -- the manual auto-tuner is run - if autotunerCache[2][autotunerHash] then - algType[0] = autotunerCache[2][autotunerHash] - else - local perfResults = ffi.new("cudnnConvolutionBwdFilterAlgoPerf_t[?]", 1) - local intt = torch.IntTensor(1); - errcheck('cudnnFindConvolutionBackwardFilterAlgorithm', - cudnn.getHandle(), - self.oDesc[0], self.iDesc[0], - self.convDesc[0], self.weightDesc[0], - 1, intt:data(), perfResults) - algType[0] = perfResults[0].algo - autotunerCache[2][autotunerHash] = perfResults[0].algo - if cudnn.verbose then - print(string.format( - "Autotuning backwardFilter: Time: %3.5f Memory: %8d Algorithm: %d" - .. " Weight: %15s Input: %15s Output: %15s", - perfResults[0].time, tonumber(perfResults[0].memory), - tonumber(perfResults[0].algo), - shape(self.weight), shape(input[input_slice]), - shape(self.output[output_slice]))) - end - end - else - errcheck('cudnnGetConvolutionBackwardFilterAlgorithm', - cudnn.getHandle(), - self.oDesc[0], self.iDesc[0], - self.convDesc[0], self.weightDesc[0], - algSearchMode, algWorkspaceLimit, algType) - end - algType[0] = self.bwmode or algType[0] - self.bwdFilterAlgType = algType - local bufSize = torch.LongTensor(1) - errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize', - cudnn.getHandle(), - self.oDesc[0], self.iDesc[0], - self.convDesc[0], self.weightDesc[0], - algType[0], bufSize:data()) - maxBufSize = math.max(maxBufSize, bufSize[1]) - - -- create backwardDataAlgorithm descriptors - local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1) - local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE' - local algWorkspaceLimit = self.workspace_limit - or (self.nOutputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. - if self.fastest_mode or cudnn.fastest == true then - algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST' - end - if cudnn.benchmark then -- the manual auto-tuner is run - if autotunerCache[3][autotunerHash] then - algType[0] = autotunerCache[3][autotunerHash] - else - local perfResults = ffi.new("cudnnConvolutionBwdDataAlgoPerf_t[?]", 1) - local intt = torch.IntTensor(1); - errcheck('cudnnFindConvolutionBackwardDataAlgorithm', - cudnn.getHandle(), - self.weightDesc[0], self.iDesc[0], - self.convDesc[0], self.oDesc[0], - 1, intt:data(), perfResults) - algType[0] = perfResults[0].algo - autotunerCache[3][autotunerHash] = perfResults[0].algo - if cudnn.verbose then - print(string.format( - "Autotuning backwardData: Time: %3.5f Memory: %8d Algorithm: %d" - .. " Weight: %15s Input: %15s Output: %15s\n", - perfResults[0].time, tonumber(perfResults[0].memory), - tonumber(perfResults[0].algo), - shape(self.weight), shape(input[input_slice]), - shape(self.output[output_slice]))) - end - end - else - errcheck('cudnnGetConvolutionBackwardDataAlgorithm', - cudnn.getHandle(), - self.weightDesc[0], self.iDesc[0], - self.convDesc[0], self.oDesc[0], - algSearchMode, algWorkspaceLimit, algType) - end - algType[0] = self.bdmode or algType[0] - self.bwdDataAlgType = algType - local bufSize = torch.LongTensor(1) - errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize', - cudnn.getHandle(), - self.weightDesc[0], self.iDesc[0], - self.convDesc[0], self.oDesc[0], - algType[0], bufSize:data()) - maxBufSize = math.max(maxBufSize, bufSize[1]) - - self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace() - self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float - if maxBufSize > self.extraBufferSizeInBytes then - self.extraBuffer:resize(math.ceil(maxBufSize/4)) - self.extraBufferSizeInBytes = maxBufSize - end - - if not batch then - self.output = self.output:view(self.output:size(2), - self.output:size(3), - self.output:size(4)) - end end end @@ -306,8 +78,11 @@ local one = torch.FloatTensor({1}); local zero = torch.FloatTensor({0}); function SpatialFullConvolution:updateOutput(input) - if not self.weightDesc then self:resetWeightDescriptors() end self:createIODescriptors(input) + if not self.bwdDataAlgType then + algo.setupBackwardDataAlgorithm(self, {self.weightDesc[0], self.iDesc[0], + self.convDesc[0], self.oDesc[0]}) + end -- Because SpatialFullConvolution is performing the adjoint of the forward -- convolution operator, we need to swap the forward and backward passes. @@ -315,8 +90,8 @@ function SpatialFullConvolution:updateOutput(input) one:data(), self.weightDesc[0], self.weight:data(), self.iDesc[0], input:data(), - self.convDesc[0], self.bwdDataAlgType[0], - self.extraBuffer:data(), self.extraBufferSizeInBytes, + self.convDesc[0], self.bwdDataAlgType, + self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(), zero:data(), self.oDesc[0], self.output:data()) @@ -336,16 +111,19 @@ function SpatialFullConvolution:updateGradInput(input, gradOutput) assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D'); assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous') - if not self.weightDesc then self:resetWeightDescriptors() end self:createIODescriptors(input) + if not self.fwdDataAlgType then + algo.setupForwardAlgorithm(self, {self.oDesc[0], self.weightDesc[0], + self.convDesc[0], self.iDesc[0]}) + end errcheck('cudnnConvolutionForward', cudnn.getHandle(), one:data(), self.oDesc[0], gradOutput:data(), self.weightDesc[0], self.weight:data(), self.convDesc[0], - self.fwdAlgType[0], - self.extraBuffer:data(), self.extraBufferSizeInBytes, + self.fwdAlgType, + self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(), zero:data(), self.iDesc[0], self.gradInput:data()); return self.gradInput @@ -360,8 +138,11 @@ function SpatialFullConvolution:accGradParameters(input, gradOutput, scale) assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D'); assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous') - if not self.weightDesc then self:resetWeightDescriptors() end self:createIODescriptors(input) + if not self.bwdFilterAlgType then + algo.setupBackwardFilterAlgorithm(self, {self.oDesc[0], self.iDesc[0], + self.convDesc[0], self.weightDesc[0]}) + end -- gradBias if self.bias then @@ -378,8 +159,8 @@ function SpatialFullConvolution:accGradParameters(input, gradOutput, scale) self.oDesc[0], gradOutput:data(), self.iDesc[0], input:data(), self.convDesc[0], - self.bwdFilterAlgType[0], - self.extraBuffer:data(), self.extraBufferSizeInBytes, + self.bwdFilterAlgType, + self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(), one:data(), self.weightDesc[0], self.gradWeight:data()) end @@ -396,7 +177,6 @@ function SpatialFullConvolution:clearDesc() self.bwdDataAlgType = nil self.bwdFilterAlgType = nil self.extraBuffer = nil - self.extraBufferSizeInBytes = nil end function SpatialFullConvolution:write(f) diff --git a/TemporalConvolution.lua b/TemporalConvolution.lua index 4648ffd..947cc4f 100644 --- a/TemporalConvolution.lua +++ b/TemporalConvolution.lua @@ -6,6 +6,8 @@ local TemporalConvolution, parent = --it is recommended to pass padding parameter to this routine and use cudnn implicit padding facilities. --limitation is that padding will be equal on both sides. +local Convolution = cudnn.SpatialConvolution + function TemporalConvolution:__init(inputFrameSize, outputFrameSize, kH, dH, padH) local delayedReset = self.reset @@ -14,7 +16,8 @@ function TemporalConvolution:__init(inputFrameSize, outputFrameSize, local nOutputPlane = outputFrameSize self.inputFrameSize = inputFrameSize self.outputFrameSize = outputFrameSize - cudnn.SpatialConvolution.__init(self, nInputPlane, nOutputPlane, kW, kH, 1, dH,0,padH) + self.nDim = 4 + Convolution.__init(self, nInputPlane, nOutputPlane, kW, kH, 1, dH,0,padH) self.weight = self.weight:view(nOutputPlane,inputFrameSize*kH) self.gradWeight = self.gradWeight:view(outputFrameSize, inputFrameSize*kH) --self.dW and self.kW now have different meaning than in nn.TemporalConvolution, because @@ -28,24 +31,24 @@ function TemporalConvolution:createIODescriptors(input) or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] then sizeChanged = true end - cudnn.SpatialConvolution.createIODescriptors(self,input) + Convolution.createIODescriptors(self,input) if sizeChanged then self.oSize = self.output:size() end end function TemporalConvolution:fastest(mode) - self = cudnn.SpatialConvolution.fastest(self,mode) + self = Convolution.fastest(self,mode) return self end function TemporalConvolution:setMode(fmode, bdmode, bwmode) - self = cudnn.SpatialConvolution.setMode(self,fmode, bdmode, bwmode) + self = Convolution.setMode(self,fmode, bdmode, bwmode) return self end function TemporalConvolution:resetWeightDescriptors() - cudnn.SpatialConvolution.resetWeightDescriptors(self) + Convolution.resetWeightDescriptors(self) end local function inputview(input) @@ -63,7 +66,7 @@ function TemporalConvolution:updateOutput(input) self._output = self._output or input.new() if self.output:storage() then self._output:set(self.output:storage()) else self._output = self.output end if self.buffer:storage() then self.output:set(self.buffer:storage(), 1, self.output:size()) else self.output = self.buffer end - cudnn.SpatialConvolution.updateOutput(self,_input) + Convolution.updateOutput(self,_input) self.buffer = self.output:view(self.oSize):transpose(2,3) self.output = self._output:resize(self.buffer:size()):copy(self.buffer) -- self.output here is always 4D, use input dimensions to properly view output @@ -92,7 +95,7 @@ function TemporalConvolution:updateGradInput(input, gradOutput) if not self.gradInput then return end local _gradOutput = transposeGradOutput(gradOutput,self.buffer) local _input = inputview(input) - self.gradInput = cudnn.SpatialConvolution.updateGradInput(self,_input, _gradOutput) + self.gradInput = Convolution.updateGradInput(self,_input, _gradOutput) if input:dim()==3 then self.gradInput = self.gradInput:view(self.iSize[1],self.iSize[3],self.iSize[4]) else @@ -106,7 +109,7 @@ function TemporalConvolution:accGradParameters(input,gradOutput,scale) local _input = inputview(input) -- transpose gradOutput (it will likely be transposed twice, hopefully, no big deal local _gradOutput = transposeGradOutput(gradOutput,self.buffer) - cudnn.SpatialConvolution.accGradParameters(self,_input,_gradOutput,scale) + Convolution.accGradParameters(self,_input,_gradOutput,scale) end function TemporalConvolution:clearDesc() @@ -117,7 +120,7 @@ end function TemporalConvolution:write(f) self:clearDesc() - cudnn.SpatialConvolution.clearDesc(self) + Convolution.clearDesc(self) local var = {} for k,v in pairs(self) do var[k] = v diff --git a/VolumetricConvolution.lua b/VolumetricConvolution.lua index b255467..73fd9ce 100644 --- a/VolumetricConvolution.lua +++ b/VolumetricConvolution.lua @@ -2,83 +2,49 @@ local VolumetricConvolution, parent = torch.class('cudnn.VolumetricConvolution', 'nn.VolumetricConvolution') local ffi = require 'ffi' local errcheck = cudnn.errcheck +local algo = require 'cudnn.algo' -local autotunerCache = {} -autotunerCache[1] = {} -- forward -autotunerCache[2] = {} -- backwardFilter -autotunerCache[3] = {} -- backwardData +local Convolution = cudnn.SpatialConvolution +function VolumetricConvolution:__init(nInputPlane, nOutputPlane, + kT, kW, kH, dW, dH, padW, padH) + self.nDim = 5 + self.kT = kT + Convolution.__init(self,nInputPlane, nOutputPlane, + kW, kH, dW, dH, padW, padH, 1) + return self +end -- if you change the configuration of the module manually, call this function VolumetricConvolution:resetWeightDescriptors() - assert(cudnn.typemap[torch.typename(self.weight)], 'Only Cuda supported duh!') - assert(cudnn.typemap[torch.typename(self.bias)] or not self.bias, 'Only Cuda supported duh!') - -- create filterDescriptor for weight - self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]') - errcheck('cudnnCreateFilterDescriptor', self.weightDesc) local desc = torch.IntTensor({self.nOutputPlane, self.nInputPlane, self.kT, self.kH, self.kW}) - errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0], - cudnn.typemap[torch.typename(self.weight)], 'CUDNN_TENSOR_NCHW', 5, - desc:data()); - local function destroyWDesc(d) - errcheck('cudnnDestroyFilterDescriptor', d[0]); - end - ffi.gc(self.weightDesc, destroyWDesc) - - -- create descriptor for bias - self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane, - 1, 1)) + Convolution.resetWeightDescriptors(self, desc) end function VolumetricConvolution:fastest(mode) - if mode == nil then mode = true end - self.fastest_mode = mode - self.iSize = self.iSize or torch.LongStorage(4) - self.iSize:fill(0) - return self + return Convolution.fastest(self) end function VolumetricConvolution:setMode(fmode, bdmode, bwmode) - if fmode ~= nil then - self.fmode = fmode - end - if bdmode ~= nil then - self.bdmode = bdmode - end - if bwmode ~= nil then - self.bwmode = bwmode - end - self.iSize = self.iSize or torch.LongStorage(4) - self.iSize:fill(0) - return self + return Convolution.setMode(self, fmode, bdmode, bwmode) end function VolumetricConvolution:resetMode() - self.fmode = nil - self.bdmode = nil - self.bwmode = nil - return self + return Convolution.resetMode(self) end function VolumetricConvolution:createIODescriptors(input) - local batch = true if input:dim() == 4 then input = input:view(1, input:size(1), input:size(2), input:size(3), input:size(4)) batch = false end - assert(input:dim() == 5 and input:isContiguous()); - self.iSize = self.iSize or torch.LongStorage(4):fill(0) - if not self.iDesc or not self.oDesc or - input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2] - or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] - or input:size(5) ~= self.iSize[5] then - self.iSize = input:size() + if Convolution.checkInputChanged(self, input) then -- create input descriptor self.iDesc = cudnn.toDescriptor(input) -- create conv descriptor - self.convDesc = ffi.new('struct cudnnConvolutionStruct*[1]') - errcheck('cudnnCreateConvolutionDescriptor', self.convDesc) + self.convDesc = cudnn.createDescriptors(1, 'struct cudnnConvolutionStruct*[?]', + 'cudnnCreateConvolutionDescriptor', 'cudnnDestroyConvolutionDescriptor') local pad = torch.IntTensor({self.padT, self.padH, self.padW}) local stride = torch.IntTensor({self.dT, self.dH, self.dW}) local upscale = torch.IntTensor({1,1,1}) @@ -86,11 +52,6 @@ function VolumetricConvolution:createIODescriptors(input) 3, pad:data(), stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION', cudnn.configmap(torch.type(self.weight))); - local function destroyConvDesc(d) - errcheck('cudnnDestroyConvolutionDescriptor', d[0]); - end - ffi.gc(self.convDesc, destroyConvDesc) - -- create output descriptor and resize output local oSize = torch.IntTensor(5) local oSizeD = oSize:data() @@ -106,181 +67,7 @@ function VolumetricConvolution:createIODescriptors(input) self.output:size(3)*self.output:size(4), self.output:size(5))) - - - ----------------------------------------------------------------------- - local function shape(x) - return table.concat(x:size():totable(),'x') - end - local autotunerHash = shape(self.weight) .. ';' - .. shape(input) .. ';' - .. shape(self.output) - - local maxBufSize = 0 - - -- create forwardAlgorithm descriptors - local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1) - local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT' - local algWorkspaceLimit = self.workspace_limit - or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. - - if self.fastest_mode or cudnn.fastest == true then - algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST' - end - - if cudnn.benchmark then -- the manual auto-tuner is run - if autotunerCache[1][autotunerHash] then - algType[0] = autotunerCache[1][autotunerHash] - if cudnn.verbose then - print('Autotuning VMC FW: using cached algo = ', algType[0], ' for: ', autotunerHash) - end - else - local perfResults = ffi.new("cudnnConvolutionFwdAlgoPerf_t[?]", 1) - local intt = torch.IntTensor(1); - errcheck('cudnnFindConvolutionForwardAlgorithm', - cudnn.getHandle(), - self.iDesc[0], self.weightDesc[0], - self.convDesc[0], self.oDesc[0], - 1, intt:data(), perfResults) - algType[0] = perfResults[0].algo - autotunerCache[1][autotunerHash] = perfResults[0].algo - if cudnn.verbose then - print(string.format( - "\nAutotuning VMC Forward: Time: %3.5f Memory: %8d Algorithm: %d" - .. " Weight: %15s Input: %15s Output: %15s", - perfResults[0].time, tonumber(perfResults[0].memory), - tonumber(perfResults[0].algo), - shape(self.weight), shape(input), - shape(self.output))) - end - end - else - errcheck('cudnnGetConvolutionForwardAlgorithm', - cudnn.getHandle(), - self.iDesc[0], self.weightDesc[0], - self.convDesc[0], self.oDesc[0], - algSearchMode, algWorkspaceLimit, algType) - end - algType[0] = self.fmode or algType[0] - self.fwdAlgType = algType - local bufSize = torch.LongTensor(1) - errcheck('cudnnGetConvolutionForwardWorkspaceSize', - cudnn.getHandle(), - self.iDesc[0], self.weightDesc[0], - self.convDesc[0], self.oDesc[0], - algType[0], bufSize:data()) - maxBufSize = math.max(maxBufSize, bufSize[1]) - - -- create backwardFilterAlgorithm descriptors - local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1) - local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE' - local algWorkspaceLimit = self.workspace_limit - or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. - if self.fastest_mode or cudnn.fastest == true then - algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST' - end - - if cudnn.benchmark then -- the manual auto-tuner is run - if autotunerCache[2][autotunerHash] then - algType[0] = autotunerCache[2][autotunerHash] - if cudnn.verbose then - print('Autotuning VMC BWF: using cached algo = ', algType[0], ' for: ', autotunerHash) - end - else - local perfResults = ffi.new("cudnnConvolutionBwdFilterAlgoPerf_t[?]", 1) - local intt = torch.IntTensor(1); - errcheck('cudnnFindConvolutionBackwardFilterAlgorithm', - cudnn.getHandle(), - self.iDesc[0], self.oDesc[0], - self.convDesc[0], self.weightDesc[0], - 1, intt:data(), perfResults) - algType[0] = perfResults[0].algo - autotunerCache[2][autotunerHash] = perfResults[0].algo - if cudnn.verbose then - print(string.format( - "Autotuning backwardFilter: Time: %3.5f Memory: %8d Algorithm: %d" - .. " Weight: %15s Input: %15s Output: %15s", - perfResults[0].time, tonumber(perfResults[0].memory), - tonumber(perfResults[0].algo), - shape(self.weight), shape(input), - shape(self.output))) - end - end - else - errcheck('cudnnGetConvolutionBackwardFilterAlgorithm', - cudnn.getHandle(), - self.iDesc[0], self.oDesc[0], - self.convDesc[0], self.weightDesc[0], - algSearchMode, algWorkspaceLimit, algType) - end - algType[0] = self.bwmode or algType[0] - self.bwdFilterAlgType = algType - local bufSize = torch.LongTensor(1) - errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize', - cudnn.getHandle(), - self.iDesc[0], self.oDesc[0], - self.convDesc[0], self.weightDesc[0], - algType[0], bufSize:data()) - maxBufSize = math.max(maxBufSize, bufSize[1]) - - -- create backwardDataAlgorithm descriptors - local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1) - local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE' - local algWorkspaceLimit = self.workspace_limit - or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float. - if self.fastest_mode or cudnn.fastest == true then - algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST' - end - if cudnn.benchmark then -- the manual auto-tuner is run - if autotunerCache[3][autotunerHash] then - algType[0] = autotunerCache[3][autotunerHash] - if cudnn.verbose then - print('Autotuning VMC BWD: using cached algo = ', algType[0], ' for: ', autotunerHash) - end - else - local perfResults = ffi.new("cudnnConvolutionBwdDataAlgoPerf_t[?]", 1) - local intt = torch.IntTensor(1); - errcheck('cudnnFindConvolutionBackwardDataAlgorithm', - cudnn.getHandle(), - self.weightDesc[0], self.oDesc[0], - self.convDesc[0], self.iDesc[0], - 1, intt:data(), perfResults) - algType[0] = perfResults[0].algo - autotunerCache[3][autotunerHash] = perfResults[0].algo - if cudnn.verbose then - print(string.format( - "Autotuning backwardData: Time: %3.5f Memory: %8d Algorithm: %d" - .. " Weight: %15s Input: %15s Output: %15s\n", - perfResults[0].time, tonumber(perfResults[0].memory), - tonumber(perfResults[0].algo), - shape(self.weight), shape(input), - shape(self.output))) - end - end - else - errcheck('cudnnGetConvolutionBackwardDataAlgorithm', - cudnn.getHandle(), - self.weightDesc[0], self.oDesc[0], - self.convDesc[0], self.iDesc[0], - algSearchMode, algWorkspaceLimit, algType) - end - algType[0] = self.bdmode or algType[0] - self.bwdDataAlgType = algType - local bufSize = torch.LongTensor(1) - errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize', - cudnn.getHandle(), - self.weightDesc[0], self.oDesc[0], - self.convDesc[0], self.iDesc[0], - algType[0], bufSize:data()) - maxBufSize = math.max(maxBufSize, bufSize[1]) - - self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace() - self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float - if maxBufSize > self.extraBufferSizeInBytes then - self.extraBuffer:resize(math.ceil(maxBufSize/4)) - self.extraBufferSizeInBytes = maxBufSize - end - ----------------------------------------------------------------------- + algo.prepareHash(self, input, output) if not batch then self.output = self.output:view(self.output:size(2), @@ -291,119 +78,28 @@ function VolumetricConvolution:createIODescriptors(input) end end -local one = torch.FloatTensor({1}); -local zero = torch.FloatTensor({0}); - -local function makeContiguous(self, input, gradOutput) - if not input:isContiguous() then - self._input = self._input or input.new() - self._input:typeAs(input):resizeAs(input):copy(input) - input = self._input - end - if gradOutput and not gradOutput:isContiguous() then - self._gradOutput = self._gradOutput or gradOutput.new() - self._gradOutput:typeAs(gradOutput):resizeAs(gradOutput):copy(gradOutput) - gradOutput = self._gradOutput - end - return input, gradOutput -end - function VolumetricConvolution:updateOutput(input) - if not self.weightDesc then self:resetWeightDescriptors() end - input = makeContiguous(self, input) - self:createIODescriptors(input) - errcheck('cudnnConvolutionForward', cudnn.getHandle(), - one:data(), - self.iDesc[0], input:data(), - self.weightDesc[0], self.weight:data(), - self.convDesc[0], self.fwdAlgType[0], - self.extraBuffer:data(), self.extraBufferSizeInBytes, - zero:data(), - self.oDesc[0], self.output:data()); - errcheck('cudnnAddTensor', cudnn.getHandle(), - one:data(), - self.biasDesc[0], self.bias:data(), one:data(), - self.oDescBias[0], self.output:data()); - return self.output + return Convolution:updateOutput(input) end function VolumetricConvolution:updateGradInput(input, gradOutput) - if not self.gradInput then return end - self.gradInput:resizeAs(input) - - input, gradOutput = makeContiguous(self, input, gradOutput) - assert(gradOutput:dim() == 4 or gradOutput:dim() == 5, - 'gradOutput has to be a 4D or 5D tensor'); - if not self.weightDesc then self:resetWeightDescriptors() end - self:createIODescriptors(input) - errcheck('cudnnConvolutionBackwardData', cudnn.getHandle(), - one:data(), - self.weightDesc[0], self.weight:data(), - self.oDesc[0], gradOutput:data(), - self.convDesc[0], - self.bwdDataAlgType[0], - self.extraBuffer:data(), self.extraBufferSizeInBytes, - zero:data(), - self.iDesc[0], self.gradInput:data()); - return self.gradInput + return Convolution:updateGradInput(input) end function VolumetricConvolution:accGradParameters(input, gradOutput, scale) - self.scaleT = self.scaleT or torch.FloatTensor(1):fill(1.0) - -- this line forces this member to always be on CPU (needed for cudnn) - self.scaleT = self.scaleT:float() - - scale = scale or 1.0 - self.scaleT[1] = scale - input, gradOutput = makeContiguous(self, input, gradOutput) - assert(gradOutput:dim() == 4 or gradOutput:dim() == 5, - 'gradOutput has to be a 4D or 5D tensor'); - self:createIODescriptors(input) - if not self.weightDesc then self:resetWeightDescriptors() end - -- gradBias - errcheck('cudnnConvolutionBackwardBias', cudnn.getHandle(), - self.scaleT:data(), - self.oDescBias[0], gradOutput:data(), - one:data(), - self.biasDesc[0], self.gradBias:data()); - -- gradWeight - errcheck('cudnnConvolutionBackwardFilter', cudnn.getHandle(), - self.scaleT:data(), - self.iDesc[0], input:data(), - self.oDesc[0], gradOutput:data(), - self.convDesc[0], - self.bwdFilterAlgType[0], - self.extraBuffer:data(), self.extraBufferSizeInBytes, - one:data(), - self.weightDesc[0], self.gradWeight:data()); + return Convolution:accGradParameters(input, gradOutput, scale) end function VolumetricConvolution:clearDesc() - self.weightDesc = nil - self.biasDesc = nil - self.convDesc = nil - self.iDesc = nil - self.oDesc = nil - self.oDescBias = nil - self.fwdAlgType = nil - self.bwdDataAlgType = nil - self.bwdFilterAlgType = nil - self.extraBuffer = nil - self.extraBufferInBytes = nil - self.scaleT = nil + Convolution:clearDesc() end function VolumetricConvolution:write(f) - self:clearDesc() - local var = {} - for k,v in pairs(self) do - var[k] = v - end - f:writeObject(var) + Convolution:write(f) end function VolumetricConvolution:clearState() - self:clearDesc() - nn.utils.clear(self, 'extraBuffer', '_input', '_gradOutput') - return nn.Module.clearState(self) + return Convolution:clearState() end + +return VolumetricConvolution diff --git a/algo.lua b/algo.lua new file mode 100644 index 0000000..ba211ad --- /dev/null +++ b/algo.lua @@ -0,0 +1,129 @@ +local ffi = require 'ffi' +local errcheck = cudnn.errcheck + +local algo = {} +local autotunerCache = {} +autotunerCache['cudnnFindConvolutionForwardAlgorithm'] = {} +autotunerCache['cudnnFindConvolutionBackwardFilterAlgorithm'] = {} +autotunerCache['cudnnFindConvolutionBackwardDataAlgorithm'] = {} + +local function setupAlgo(self, algo_t, perf_t, findAPI, getAPI, wsAPI, algSearchMode, params) + + local algType = ffi.new(algo_t, 1) + + if cudnn.benchmark or cudnn.fastest then -- the manual auto-tuner is run + local cachedAlgo = autotunerCache[findAPI][self.autotunerHash]; + if cachedAlgo then + algType[0] = cachedAlgo + if cudnn.verbose then + print('\n', findAPI, ' using cached algo = ' , algType[0] , ' for: ', self.autotunerHash) + end + else + local perfResults = ffi.new(perf_t, 1) + local intt = torch.IntTensor(1) + errcheck(findAPI, + cudnn.getHandle(), + params[1], params[2], params[3], params[4], + 1, intt:data(), perfResults) + algType[0] = perfResults[0].algo + autotunerCache[findAPI][self.autotunerHash] = perfResults[0].algo + if cudnn.verbose then + print(string.format( + "\n" .. findAPI .. " Time: %3.5f Memory: %8d Algorithm: %d" + .. " hash: %45s", + perfResults[0].time, tonumber(perfResults[0].memory), + tonumber(perfResults[0].algo), self.autotunerHash )) + + end + end + else + + local algWorkspaceLimit = self.workspace_limit + or (self.nInputPlane * self.kH * self.kW * self.weight.elementSize()) + + errcheck(getAPI, + cudnn.getHandle(), + params[1], params[2], params[3], params[4], + algSearchMode, algWorkspaceLimit, algType) + if cudnn.verbose then + print(string.format( + "\n" .. getAPI .. " Limit: %d Algorithm: %d", + tonumber(algWorkspaceLimit), + tonumber(algType[0]))) + end + end + local bufSize = torch.LongTensor(1) + errcheck(wsAPI, + cudnn.getHandle(), + params[1], params[2], params[3], params[4], + algType[0], bufSize:data()) + + self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace() + local extraBufferSizeInBytes = self.extraBuffer:nElement() * self.extraBuffer.elementSize() + + if cudnn.verbose then + print(string.format( + "\n" .. wsAPI .. " returned bufSize: %d, current extraBufferSizeInBytes: %d, %d elements", + tonumber(bufSize[1]), tonumber(extraBufferSizeInBytes), tonumber(self.extraBuffer:nElement()))) + end + + if extraBufferSizeInBytes < bufSize[1] then + self.extraBuffer:resize(math.ceil(bufSize[1]/self.extraBuffer.elementSize())) + end + return algType[0] +end + +function algo.prepareHash(self, input_slice, output_slice) + local function shape(x) + return table.concat(x:size():totable(),'x') + end + self.autotunerHash = shape(self.weight) .. ';' + .. shape(input_slice) .. ';' + .. shape(output_slice) + + self.fwdAlgType = nil + self.bwdDataAlgType = nil + self.bwdFilterAlgType = nil +end + +function algo.setupForwardAlgorithm(self, params) + local algSearchMode + if self.fastest_mode or cudnn.benchmark == true or cudnn.fastest == true then + algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST' + else + algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT' + end + + params = params or { self.iDesc[0], self.weightDesc[0], self.convDesc[0], self.oDesc[0] } + self.fwdAlgType = self.fmode or + setupAlgo(self,"cudnnConvolutionFwdAlgo_t[?]", "cudnnConvolutionFwdAlgoPerf_t[?]", + 'cudnnFindConvolutionForwardAlgorithm', 'cudnnGetConvolutionForwardAlgorithm', + 'cudnnGetConvolutionForwardWorkspaceSize', algSearchMode, params) +end + +function algo.setupBackwardFilterAlgorithm(self, params) + local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE' + if self.fastest_mode or cudnn.fastest == true then + algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST' + end + params = params or { self.iDesc[0], self.oDesc[0], self.convDesc[0], self.weightDesc[0] } + self.bwdFilterAlgType = self.bwmode or + setupAlgo(self,"cudnnConvolutionBwdFilterAlgo_t[?]", "cudnnConvolutionBwdFilterAlgoPerf_t[?]", + 'cudnnFindConvolutionBackwardFilterAlgorithm', 'cudnnGetConvolutionBackwardFilterAlgorithm', + 'cudnnGetConvolutionBackwardFilterWorkspaceSize', algSearchMode, + params) +end + +function algo.setupBackwardDataAlgorithm(self, params) + local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE' + if self.fastest_mode or cudnn.fastest == true then + algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST' + end + params = params or { self.weightDesc[0], self.oDesc[0], self.convDesc[0], self.iDesc[0] } + self.bwdDataAlgType = self.bdmode or + setupAlgo(self,"cudnnConvolutionBwdDataAlgo_t[?]", "cudnnConvolutionBwdDataAlgoPerf_t[?]", + 'cudnnFindConvolutionBackwardDataAlgorithm', 'cudnnGetConvolutionBackwardDataAlgorithm', + 'cudnnGetConvolutionBackwardDataWorkspaceSize', algSearchMode, params) +end + +return algo diff --git a/cudnn-scm-1.rockspec b/cudnn-scm-1.rockspec index bd8072f..3066d12 100644 --- a/cudnn-scm-1.rockspec +++ b/cudnn-scm-1.rockspec @@ -2,7 +2,7 @@ package = "cudnn" version = "scm-1" source = { - url = "git://github.com/soumith/cudnn.torch.git", + url = "git://github.com/soumith/cudnn.torch.git" } description = { diff --git a/functional.lua b/functional.lua index cea9df9..8eee7c9 100644 --- a/functional.lua +++ b/functional.lua @@ -105,7 +105,7 @@ cudnn.functional.Convolution2D_updateOutput = function(handle, input, weight, ou local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT' local algWorkspaceLimit = 0 if workspace then - algWorkspaceLimit = workspace:nElement() * 4 -- 4 = sizeof float + algWorkspaceLimit = workspace:nElement() * workspace:elementSize() end errcheck('cudnnGetConvolutionForwardAlgorithm', handle, @@ -116,6 +116,20 @@ function cudnn.toDescriptor(t) return descriptor end +function cudnn.createDescriptors(count, descs_type, create_func, destroy_func) + local ds = ffi.new(descs_type, count) + for i = 0, count - 1 do + errcheck(create_func, ds + i) + end + local function destroyDescriptors(ds) + for i = 0, count - 1 do + errcheck(destroy_func, ds[i]) + end + end + ffi.gc(ds, destroyDescriptors) + return ds +end + local sharedBuffer = {} for i=1,numDevices do @@ -126,7 +140,7 @@ function cudnn.getSharedWorkspace() local device = cutorch.getDevice() local stream = cutorch.getStream() -- starts from 0 if not sharedBuffer[device][stream] then - sharedBuffer[device][stream] = torch.CudaTensor(1) + sharedBuffer[device][stream] = torch.CudaDoubleTensor(256) end return sharedBuffer[device][stream] end @@ -165,5 +179,4 @@ require('cudnn.GRU') require('cudnn.functional') require('cudnn.convert') - return cudnn diff --git a/test/test.lua b/test/test.lua index a4f9bcb..aa8ea7f 100644 --- a/test/test.lua +++ b/test/test.lua @@ -25,7 +25,7 @@ local testparams_float = { } -- TODO: find out why the errors are so huge -local testparams_double = { +local testparams_double_err = { test_type = 'torch.CudaDoubleTensor', precision_forward = 1e+2, precision_backward = 1e+3, -- 1e+4, @@ -33,6 +33,14 @@ local testparams_double = { precision_io = 1e-5, } +local testparams_double = { + test_type = 'torch.CudaDoubleTensor', + precision_forward = 1e-4, + precision_backward = 2e-2, + precision_jac = 1e-3, + precision_io = 1e-5, +} + local testparams = testparams_half local function cast(input) @@ -177,8 +185,11 @@ function cudnntest.SpatialConvolution_forward_single() cutorch.synchronize() mytester:asserteq(rescuda:dim(), 3, 'error in dimension') local error = rescuda:float() - groundtruth:float() + if cudnn.verbose and error:abs():max() > tonumber(testparams.precision_forward) then + print('\n==== rescuda:float():\n', rescuda:float(), '\n==== groundtruth:float():\n', groundtruth:float()) + end mytester:assertlt(error:abs():max(), testparams.precision_forward, - 'error on state (forward) ') + 'error on state (forward)') -- IO local ferr,berr = jac.testIO(gconv, cast(input)) @@ -1043,7 +1054,7 @@ function cudnntest.SpatialCrossMapLRN_batch() local size = math.random(1,3)*2+1 local nbfeatures = math.random(3,8) local alpha = math.random(1,100)/100 - local beta = math.random(0,100)/100 + local beta = math.random(1,100)/100 local k = math.random(1,3) local tm = {} @@ -1508,8 +1519,8 @@ mytester = torch.Tester() mytester:add(cudnntest) if torch.random(1,2) == 1 then - cudnn.benchmark = true -- run manual auto-tuner --- cudnn.verbose = true + cudnn.benchmark = true -- run manual auto-tuner + cudnn.verbose = true end @@ -1519,20 +1530,21 @@ for i=1,cutorch.getDeviceCount() do print('Running test on device: #' .. i .. ' : ' .. prop.name) cutorch.setDevice(i) - - print'Testing torch.CudaHalfTensor' - testparams = testparams_half - mytester:run() +-- double tensor may be broken +-- print'Testing torch.CudaDoubleTensor' +-- torch.setdefaulttensortype('torch.DoubleTensor') +-- testparams = testparams_double +-- mytester:run() print'Testing torch.CudaTensor' testparams = testparams_float mytester:run() --- double tensor may be broken at some places, gets NaNs. --- print'Testing torch.CudaDoubleTensor' --- testparams = testparams_double --- mytester:run() +-- half tensor is broken on Pascal + print'Testing torch.CudaHalfTensor: note there may be errors on 6.x (Pascal) cards' + testparams = testparams_half + mytester:run() end os.execute('rm -f modelTemp.t7') diff --git a/test/test_groups.lua b/test/test_groups.lua index 8b386b9..1675fdd 100644 --- a/test/test_groups.lua +++ b/test/test_groups.lua @@ -34,6 +34,6 @@ ccn2_gradWeight = ccn2_conv.gradWeight:t() assert((cudnn_output - ccn2_output):abs():max() < 1e-4) assert((cudnn_gradInput - ccn2_gradInput):abs():max() < 1e-4) -assert((cudnn_gradWeight - ccn2_gradWeight):abs():max() < 5e-2) +assert((cudnn_gradWeight - ccn2_gradWeight):abs():max() < 1e-1) print 'no assertions' |