github.com/soumith/cudnn.torch.git
-rw-r--r--  RNN.lua                     |  24
-rw-r--r--  SpatialConvolution.lua      | 341
-rw-r--r--  SpatialFullConvolution.lua  | 290
-rw-r--r--  TemporalConvolution.lua     |  21
-rw-r--r--  VolumetricConvolution.lua   | 356
-rw-r--r--  algo.lua                    | 129
-rw-r--r--  cudnn-scm-1.rockspec        |   2
-rw-r--r--  functional.lua              |   2
-rw-r--r--  init.lua                    |  17
-rw-r--r--  test/test.lua               |  38
-rw-r--r--  test/test_groups.lua        |   2
11 files changed, 889 insertions(+), 333 deletions(-)
diff --git a/RNN.lua b/RNN.lua
index eab2ce4..7f89da9 100644
--- a/RNN.lua
+++ b/RNN.lua
@@ -63,29 +63,43 @@ function RNN:reset(stdv)
self.gradWeight:resizeAs(self.weight):zero()
end
+function RNN:createDescriptors(count, descs_type, create_func, destroy_func)
+ local ds = ffi.new(descs_type, count)
+ for i = 0, count - 1 do
+ errcheck(create_func, ds + i)
+ end
+ local function destroyDescriptors(ds)
+ for i = 0, count - 1 do
+ errcheck(destroy_func, ds[i])
+ end
+ end
+ ffi.gc(ds, destroyDescriptors)
+ return ds
+end
+
function RNN:createDropoutDescriptors(count)
- return cudnn.createDescriptors(count,
+ return self:createDescriptors(count,
'cudnnDropoutDescriptor_t[?]',
'cudnnCreateDropoutDescriptor',
'cudnnDestroyDropoutDescriptor')
end
function RNN:createFilterDescriptors(count)
- return cudnn.createDescriptors(count,
+ return self:createDescriptors(count,
'cudnnFilterDescriptor_t[?]',
'cudnnCreateFilterDescriptor',
'cudnnDestroyFilterDescriptor')
end
function RNN:createRNNDescriptors(count)
- return cudnn.createDescriptors(count,
+ return self:createDescriptors(count,
'cudnnRNNDescriptor_t[?]',
'cudnnCreateRNNDescriptor',
'cudnnDestroyRNNDescriptor')
end
function RNN:createTensorDescriptors(count)
- return cudnn.createDescriptors(count,
+ return self:createDescriptors(count,
'cudnnTensorDescriptor_t[?]',
'cudnnCreateTensorDescriptor',
'cudnnDestroyTensorDescriptor')
@@ -369,7 +383,7 @@ function RNN:updateOutput(input)
if self.cellOutput then
self.cellInput = self.cellOutput:clone()
end
- end
+ end
if (self.batchFirst) then
self.output = self.output:transpose(1, 2)
end
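
Note on the RNN.lua hunk above: RNN:createDescriptors inlines what used to be cudnn.createDescriptors in init.lua (removed later in this diff). The pattern is to allocate a C array of descriptor handles with LuaJIT FFI, create each handle, and register a finalizer with ffi.gc so every handle is destroyed when the array is garbage collected. Below is a minimal, self-contained sketch of that lifecycle pattern; makeHandles, createFn, destroyFn and check are illustrative stand-ins, not cudnn.torch API.

local ffi = require 'ffi'

-- Sketch of the create/destroy-with-finalizer pattern used by
-- RNN:createDescriptors. createFn/destroyFn stand in for cuDNN's
-- cudnnCreate*/cudnnDestroy* entry points, and check for cudnn.errcheck.
local function makeHandles(count, ctype, createFn, destroyFn, check)
   local handles = ffi.new(ctype, count)      -- e.g. 'cudnnTensorDescriptor_t[?]'
   for i = 0, count - 1 do
      check(createFn, handles + i)            -- create each handle in place
   end
   ffi.gc(handles, function(hs)               -- destroy every handle when the
      for i = 0, count - 1 do                 -- cdata array is collected
         check(destroyFn, hs[i])
      end
   end)
   return handles
end
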
diff --git a/SpatialConvolution.lua b/SpatialConvolution.lua
index 1656154..58c78b2 100644
--- a/SpatialConvolution.lua
+++ b/SpatialConvolution.lua
@@ -1,9 +1,13 @@
local SpatialConvolution, parent =
torch.class('cudnn.SpatialConvolution', 'nn.SpatialConvolution')
local ffi = require 'ffi'
-local algo = require 'cudnn.algo'
local errcheck = cudnn.errcheck
+local autotunerCache = {}
+autotunerCache[1] = {} -- forward
+autotunerCache[2] = {} -- backwardFilter
+autotunerCache[3] = {} -- backwardData
+
function SpatialConvolution:__init(nInputPlane, nOutputPlane,
kW, kH, dW, dH, padW, padH, groups)
local delayedReset = self.reset
@@ -22,38 +26,39 @@ function SpatialConvolution:__init(nInputPlane, nOutputPlane,
self:reset()
-- should nil for serialization, the reset will still work
self.reset = nil
- return self
-end
-
-function SpatialConvolution:createWeightDescriptors()
- assert(cudnn.typemap[torch.typename(self.weight)] or not self.weight, 'Only Cuda supported duh!')
- assert(cudnn.typemap[torch.typename(self.bias)] or not self.bias, 'Only Cuda supported duh!')
- -- create descriptor for bias
- if self.bias then
- self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,1,1))
- end
- -- create filterDescriptor for weight
- return cudnn.createDescriptors(1, 'struct cudnnFilterStruct*[?]',
- 'cudnnCreateFilterDescriptor', 'cudnnDestroyFilterDescriptor')
end
-- if you change the configuration of the module manually, call this
-function SpatialConvolution:resetWeightDescriptors(desc)
+function SpatialConvolution:resetWeightDescriptors()
+ assert(cudnn.typemap[torch.typename(self.weight)], 'Only Cuda supported duh!')
+ assert(cudnn.typemap[torch.typename(self.bias)] or not self.bias, 'Only Cuda supported duh!')
-- for compatibility
self.groups = self.groups or 1
- self.weightDesc = SpatialConvolution.createWeightDescriptors(self)
- desc = desc or torch.IntTensor({self.nOutputPlane/self.groups,
- self.nInputPlane/self.groups,
- self.kH, self.kW})
+ -- create filterDescriptor for weight
+ self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]')
+ errcheck('cudnnCreateFilterDescriptor', self.weightDesc)
+ local desc = torch.IntTensor({self.nOutputPlane/self.groups,
+ self.nInputPlane/self.groups,
+ self.kH, self.kW})
errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0],
- cudnn.typemap[torch.typename(self.weight)], 'CUDNN_TENSOR_NCHW', self.nDim,
+ cudnn.typemap[torch.typename(self.weight)], 'CUDNN_TENSOR_NCHW', 4,
desc:data());
+ local function destroyWDesc(d)
+ errcheck('cudnnDestroyFilterDescriptor', d[0]);
+ end
+ ffi.gc(self.weightDesc, destroyWDesc)
+
+ -- create descriptor for bias
+ if self.bias then
+ self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,1,1))
+ end
end
function SpatialConvolution:fastest(mode)
if mode == nil then mode = true end
self.fastest_mode = mode
- self.iDesc = nil
+ self.iSize = self.iSize or torch.LongStorage(4)
+ self.iSize:fill(0)
return self
end
@@ -67,7 +72,8 @@ function SpatialConvolution:setMode(fmode, bdmode, bwmode)
if bwmode ~= nil then
self.bwmode = bwmode
end
- self.iDesc = nil
+ self.iSize = self.iSize or torch.LongStorage(4)
+ self.iSize:fill(0)
return self
end
@@ -84,41 +90,32 @@ function SpatialConvolution:noBias()
return self
end
-
-function SpatialConvolution:checkInputChanged(input)
- self.nDim = self.nDim or 4
- assert(input:dim() == self.nDim)
- assert(input:isContiguous())
- self.iSize = self.iSize or torch.LongStorage(self.nDim):fill(0)
- self.groups = self.groups or 1
- if not self.weightDesc then self:resetWeightDescriptors() end
- if not self.iDesc or not self.oDesc or input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2]
- or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] or (self.nDim==5 and input:size(5) ~= self.iSize[5]) then
- self.iSize = input:size()
-
- assert(self.nInputPlane == input:size(2), 'input has to contain: '
- .. self.nInputPlane
- .. ' feature maps, but received input of size: '
- .. input:size(1) .. ' x ' .. input:size(2) ..
- ' x ' .. input:size(3) .. ' x ' .. input:size(4))
- return true
+function SpatialConvolution:createIODescriptors(input)
+ local batch = true
+ if input:dim() == 3 then
+ input = input:view(1, input:size(1), input:size(2), input:size(3))
+ batch = false
end
- return false
-end
+ assert(input:dim() == 4 and input:isContiguous());
+ self.iSize = self.iSize or torch.LongStorage(4):fill(0)
+ if not self.iDesc or not self.oDesc or
+ input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2]
+ or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] then
+ self.iSize = input:size()
+
+ assert(self.nInputPlane == input:size(2), 'input has to contain: '
+ .. self.nInputPlane
+ .. ' feature maps, but received input of size: '
+ .. input:size(1) .. ' x ' .. input:size(2) ..
+ ' x ' .. input:size(3) .. ' x ' .. input:size(4))
-function SpatialConvolution:createIODescriptors(input)
- local batch = true
- if input:dim() == 3 then
- input = input:view(1, input:size(1), input:size(2), input:size(3))
- batch = false
- end
- if SpatialConvolution.checkInputChanged(self, input) then
-- create input descriptor
local input_slice = input:narrow(2,1,self.nInputPlane/self.groups)
self.iDesc = cudnn.toDescriptor(input_slice)
+
-- create conv descriptor
- self.convDesc = cudnn.createDescriptors(1, 'struct cudnnConvolutionStruct*[?]',
- 'cudnnCreateConvolutionDescriptor', 'cudnnDestroyConvolutionDescriptor')
+ self.convDesc = ffi.new('struct cudnnConvolutionStruct*[1]')
+ errcheck('cudnnCreateConvolutionDescriptor', self.convDesc)
self.padH, self.padW = self.padH or 0, self.padW or 0
local pad = torch.IntTensor({self.padH, self.padW})
local stride = torch.IntTensor({self.dH, self.dW})
@@ -127,24 +124,207 @@ function SpatialConvolution:createIODescriptors(input)
2, pad:data(),
stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION',
cudnn.configmap(torch.type(self.weight)));
-
+ local function destroyConvDesc(d)
+ errcheck('cudnnDestroyConvolutionDescriptor', d[0]);
+ end
+ ffi.gc(self.convDesc, destroyConvDesc)
-- get output shape, resize output
- local oSize = torch.IntTensor(self.nDim)
+ local oSize = torch.IntTensor(4)
local oSizeD = oSize:data()
errcheck('cudnnGetConvolutionNdForwardOutputDim',
self.convDesc[0], self.iDesc[0],
- self.weightDesc[0], self.nDim, oSizeD)
+ self.weightDesc[0], 4, oSizeD)
oSize[2] = oSize[2] * self.groups
self.output:resize(oSize:long():storage())
- local output_slice = self.output:narrow(2,1,self.nOutputPlane/self.groups)
-- create descriptor for output
+ local output_slice = self.output:narrow(2,1,self.nOutputPlane/self.groups)
self.oDesc = cudnn.toDescriptor(output_slice)
self.oDescForBias = cudnn.toDescriptor(self.output)
- algo.prepareHash(self, input_slice, output_slice)
+ -----------------------------------------------------------------------
+ local function shape(x)
+ local sz = x:size()
+ local str = ''
+ for i=1,sz:size() do
+ str = str .. sz[i] .. 'x'
+ end
+ if #str > 0 then
+ str = str:sub(1, #str-1)
+ end
+ return str
+ end
+ local autotunerHash = shape(self.weight) .. ';'
+ .. shape(input_slice) .. ';'
+ .. shape(output_slice)
+
+ local maxBufSize = 0
+
+ -- create forwardAlgorithm descriptors
+ local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1)
+ local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
+ local algWorkspaceLimit = self.workspace_limit
+ or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+
+ if self.fastest_mode or cudnn.fastest == true then
+ algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST'
+ end
+
+ if cudnn.benchmark then -- the manual auto-tuner is run
+ if autotunerCache[1][autotunerHash] then
+ algType[0] = autotunerCache[1][autotunerHash]
+ if cudnn.verbose then
+ print('Autotuning SC FW: using cached algo = ', algType[0], ' for: ', autotunerHash)
+ end
+ else
+ local perfResults = ffi.new("cudnnConvolutionFwdAlgoPerf_t[?]", 1)
+ local intt = torch.IntTensor(1);
+ errcheck('cudnnFindConvolutionForwardAlgorithm',
+ cudnn.getHandle(),
+ self.iDesc[0], self.weightDesc[0],
+ self.convDesc[0], self.oDesc[0],
+ 1, intt:data(), perfResults)
+ algType[0] = perfResults[0].algo
+ autotunerCache[1][autotunerHash] = perfResults[0].algo
+ if cudnn.verbose then
+ print(string.format(
+ "\nAutotuning SC Forward: Time: %3.5f Memory: %8d Algorithm: %d"
+ .. " Weight: %15s Input: %15s Output: %15s",
+ perfResults[0].time, tonumber(perfResults[0].memory),
+ tonumber(perfResults[0].algo),
+ shape(self.weight), shape(input_slice),
+ shape(output_slice)))
+ end
+ end
+ else
+ errcheck('cudnnGetConvolutionForwardAlgorithm',
+ cudnn.getHandle(),
+ self.iDesc[0], self.weightDesc[0],
+ self.convDesc[0], self.oDesc[0],
+ algSearchMode, algWorkspaceLimit, algType)
+ end
+ algType[0] = self.fmode or algType[0]
+ self.fwdAlgType = algType
+ local bufSize = torch.LongTensor(1)
+ errcheck('cudnnGetConvolutionForwardWorkspaceSize',
+ cudnn.getHandle(),
+ self.iDesc[0], self.weightDesc[0],
+ self.convDesc[0], self.oDesc[0],
+ algType[0], bufSize:data())
+ maxBufSize = math.max(maxBufSize, bufSize[1])
+
+ -- create backwardFilterAlgorithm descriptors
+ local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1)
+ local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE'
+ local algWorkspaceLimit = self.workspace_limit
+ or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+ if self.fastest_mode or cudnn.fastest == true then
+ algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST'
+ end
+
+ if cudnn.benchmark then -- the manual auto-tuner is run
+ if autotunerCache[2][autotunerHash] then
+ algType[0] = autotunerCache[2][autotunerHash]
+ if cudnn.verbose then
+ print('Autotuning SC BW: using cached algo = ', algType[0], ' for: ', autotunerHash)
+ end
+ else
+ local perfResults = ffi.new("cudnnConvolutionBwdFilterAlgoPerf_t[?]", 1)
+ local intt = torch.IntTensor(1);
+ errcheck('cudnnFindConvolutionBackwardFilterAlgorithm',
+ cudnn.getHandle(),
+ self.iDesc[0], self.oDesc[0],
+ self.convDesc[0], self.weightDesc[0],
+ 1, intt:data(), perfResults)
+ algType[0] = perfResults[0].algo
+ autotunerCache[2][autotunerHash] = perfResults[0].algo
+ if cudnn.verbose then
+ print(string.format(
+ "Autotuning backwardFilter: Time: %3.5f Memory: %8d Algorithm: %d"
+ .. " Weight: %15s Input: %15s Output: %15s",
+ perfResults[0].time, tonumber(perfResults[0].memory),
+ tonumber(perfResults[0].algo),
+ shape(self.weight), shape(input_slice),
+ shape(output_slice)))
+ end
+ end
+ else
+ errcheck('cudnnGetConvolutionBackwardFilterAlgorithm',
+ cudnn.getHandle(),
+ self.iDesc[0], self.oDesc[0],
+ self.convDesc[0], self.weightDesc[0],
+ algSearchMode, algWorkspaceLimit, algType)
+ end
+ algType[0] = self.bwmode or algType[0]
+ self.bwdFilterAlgType = algType
+ local bufSize = torch.LongTensor(1)
+ errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize',
+ cudnn.getHandle(),
+ self.iDesc[0], self.oDesc[0],
+ self.convDesc[0], self.weightDesc[0],
+ algType[0], bufSize:data())
+ maxBufSize = math.max(maxBufSize, bufSize[1])
+
+ -- create backwardDataAlgorithm descriptors
+ local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1)
+ local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE'
+ local algWorkspaceLimit = self.workspace_limit
+ or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+ if self.fastest_mode or cudnn.fastest == true then
+ algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST'
+ end
+ if cudnn.benchmark then -- the manual auto-tuner is run
+ if autotunerCache[3][autotunerHash] then
+ algType[0] = autotunerCache[3][autotunerHash]
+ if cudnn.verbose then
+ print('Autotuning SC BWD: using cached algo = ', algType[0], ' for: ', autotunerHash)
+ end
+ else
+ local perfResults = ffi.new("cudnnConvolutionBwdDataAlgoPerf_t[?]", 1)
+ local intt = torch.IntTensor(1);
+ errcheck('cudnnFindConvolutionBackwardDataAlgorithm',
+ cudnn.getHandle(),
+ self.weightDesc[0], self.oDesc[0],
+ self.convDesc[0], self.iDesc[0],
+ 1, intt:data(), perfResults)
+ algType[0] = perfResults[0].algo
+ autotunerCache[3][autotunerHash] = perfResults[0].algo
+ if cudnn.verbose then
+ print(string.format(
+ "Autotuning backwardData: Time: %3.5f Memory: %8d Algorithm: %d"
+ .. " Weight: %15s Input: %15s Output: %15s\n",
+ perfResults[0].time, tonumber(perfResults[0].memory),
+ tonumber(perfResults[0].algo),
+ shape(self.weight), shape(input_slice),
+ shape(output_slice)))
+ end
+ end
+ else
+ errcheck('cudnnGetConvolutionBackwardDataAlgorithm',
+ cudnn.getHandle(),
+ self.weightDesc[0], self.oDesc[0],
+ self.convDesc[0], self.iDesc[0],
+ algSearchMode, algWorkspaceLimit, algType)
+ end
+ algType[0] = self.bdmode or algType[0]
+ self.bwdDataAlgType = algType
+ local bufSize = torch.LongTensor(1)
+ errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize',
+ cudnn.getHandle(),
+ self.weightDesc[0], self.oDesc[0],
+ self.convDesc[0], self.iDesc[0],
+ algType[0], bufSize:data())
+ maxBufSize = math.max(maxBufSize, bufSize[1])
+
+ self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace()
+ self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float
+ if maxBufSize > self.extraBufferSizeInBytes then
+ self.extraBuffer:resize(math.ceil(maxBufSize/4))
+ self.extraBufferSizeInBytes = maxBufSize
+ end
+ -----------------------------------------------------------------------
-- create offsets for groups
local iH, iW = input:size(3), input:size(4)
local kH, kW = self.kH, self.kW
@@ -158,14 +338,13 @@ function SpatialConvolution:createIODescriptors(input)
self.output:size(3),
self.output:size(4))
end
-
- end
+ end
end
local one = torch.FloatTensor({1});
local zero = torch.FloatTensor({0});
-function SpatialConvolution:makeContiguous(input, gradOutput)
+local function makeContiguous(self, input, gradOutput)
if not input:isContiguous() then
self._input = self._input or input.new()
self._input:typeAs(input):resizeAs(input):copy(input)
@@ -180,18 +359,17 @@ function SpatialConvolution:makeContiguous(input, gradOutput)
end
function SpatialConvolution:updateOutput(input)
- input = SpatialConvolution.makeContiguous(self, input)
+ if not self.weightDesc then self:resetWeightDescriptors() end
+ input = makeContiguous(self, input)
self:createIODescriptors(input)
- if not self.fwdAlgType then
- algo.setupForwardAlgorithm(self)
- end
+
for g = 0, self.groups - 1 do
errcheck('cudnnConvolutionForward', cudnn.getHandle(),
one:data(),
self.iDesc[0], input:data() + g*self.input_offset,
self.weightDesc[0], self.weight:data() + g*self.weight_offset,
- self.convDesc[0], self.fwdAlgType,
- self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(),
+ self.convDesc[0], self.fwdAlgType[0],
+ self.extraBuffer:data(), self.extraBufferSizeInBytes,
zero:data(),
self.oDesc[0], self.output:data() + g*self.output_offset);
end
@@ -209,12 +387,11 @@ end
function SpatialConvolution:updateGradInput(input, gradOutput)
if not self.gradInput then return end
self.gradInput:resizeAs(input)
- input, gradOutput = SpatialConvolution.makeContiguous(self, input, gradOutput)
- assert(gradOutput:dim() == self.nDim-1 or gradOutput:dim() == self.nDim, 'gradOutput has to be nDim or nDim-1');
+
+ input, gradOutput = makeContiguous(self, input, gradOutput)
+ assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D');
+ if not self.weightDesc then self:resetWeightDescriptors() end
self:createIODescriptors(input)
- if not self.bwdDataAlgType then
- algo.setupBackwardDataAlgorithm(self)
- end
for g = 0,self.groups - 1 do
errcheck('cudnnConvolutionBackwardData', cudnn.getHandle(),
@@ -222,8 +399,8 @@ function SpatialConvolution:updateGradInput(input, gradOutput)
self.weightDesc[0], self.weight:data() + g*self.weight_offset,
self.oDesc[0], gradOutput:data() + g*self.output_offset,
self.convDesc[0],
- self.bwdDataAlgType,
- self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(),
+ self.bwdDataAlgType[0],
+ self.extraBuffer:data(), self.extraBufferSizeInBytes,
zero:data(),
self.iDesc[0], self.gradInput:data() + g*self.input_offset);
end
@@ -236,13 +413,12 @@ function SpatialConvolution:accGradParameters(input, gradOutput, scale)
self.scaleT = self.scaleT:float()
scale = scale or 1.0
self.scaleT[1] = scale
- input, gradOutput = SpatialConvolution.makeContiguous(self, input, gradOutput)
- assert(gradOutput:dim() == self.nDim-1 or gradOutput:dim() == self.nDim, 'gradOutput has to be nDim or nDim-1');
- self:createIODescriptors(input)
- if not self.bwdFilterAlgType then
- algo.setupBackwardFilterAlgorithm(self)
- end
+ input, gradOutput = makeContiguous(self, input, gradOutput)
+
+ assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D');
+ if not self.weightDesc then self:resetWeightDescriptors() end
+ self:createIODescriptors(input)
-- gradBias
if self.bias then
@@ -260,8 +436,8 @@ function SpatialConvolution:accGradParameters(input, gradOutput, scale)
self.iDesc[0], input:data() + g*self.input_offset,
self.oDesc[0], gradOutput:data() + g*self.output_offset,
self.convDesc[0],
- self.bwdFilterAlgType,
- self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(),
+ self.bwdFilterAlgType[0],
+ self.extraBuffer:data(), self.extraBufferSizeInBytes,
one:data(),
self.weightDesc[0], self.gradWeight:data() + g*self.weight_offset);
end
@@ -279,6 +455,7 @@ function SpatialConvolution:clearDesc()
self.bwdDataAlgType = nil
self.bwdFilterAlgType = nil
self.extraBuffer = nil
+ self.extraBufferSizeInBytes = nil
self.scaleT = nil
end
@@ -293,8 +470,6 @@ end
function SpatialConvolution:clearState()
self:clearDesc()
- nn.utils.clear(self, 'extraBuffer', '_input', '_gradOutput')
+ nn.utils.clear(self, '_input', '_gradOutput')
return nn.Module.clearState(self)
end
-
-return SpatialConvolution
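
Note on the SpatialConvolution.lua changes above: the shared cudnn.algo helper is gone and algorithm selection now lives inside createIODescriptors. When cudnn.benchmark is set, the result of each cudnnFindConvolution*Algorithm search is memoized in autotunerCache, keyed by a string built from the weight, input-slice and output-slice shapes, so later layers with identical geometry skip the benchmark. A minimal sketch of that memoization pattern follows; pickAlgorithm and findAlgo are illustrative stand-ins for the inlined cudnnFind* calls.

-- Sketch of the shape-keyed autotuner cache. findAlgo stands in for the
-- expensive cudnnFindConvolution*Algorithm search; any layer with the same
-- weight/input/output geometry reuses the cached choice.
local cache = {}

local function shape(t)
   return table.concat(t:size():totable(), 'x')   -- e.g. "64x3x3x3"
end

local function pickAlgorithm(weight, input, output, findAlgo)
   local key = shape(weight) .. ';' .. shape(input) .. ';' .. shape(output)
   if not cache[key] then
      cache[key] = findAlgo(weight, input, output) -- run the search once per shape
   end
   return cache[key]
end
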
diff --git a/SpatialFullConvolution.lua b/SpatialFullConvolution.lua
index 1cdfb33..ff91a13 100644
--- a/SpatialFullConvolution.lua
+++ b/SpatialFullConvolution.lua
@@ -2,52 +2,98 @@ local SpatialFullConvolution, parent =
torch.class('cudnn.SpatialFullConvolution', 'nn.SpatialFullConvolution')
local ffi = require 'ffi'
local errcheck = cudnn.errcheck
-local algo = require 'cudnn.algo'
local autotunerCache = {}
autotunerCache[1] = {} -- forward
autotunerCache[2] = {} -- backwardFilter
autotunerCache[3] = {} -- backwardData
-local Convolution = cudnn.SpatialConvolution
-SpatialFullConvolution.nDim = 4
-
-- if you change the configuration of the module manually, call this
function SpatialFullConvolution:resetWeightDescriptors()
- self.weightDesc = Convolution.createWeightDescriptors(self)
+ assert(cudnn.typemap[torch.typename(self.weight)], 'Only Cuda supported duh!')
+ assert(cudnn.typemap[torch.typename(self.bias)] or not self.bias, 'Only Cuda supported duh!')
+ -- create filterDescriptor for weight
+ self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]')
+ errcheck('cudnnCreateFilterDescriptor', self.weightDesc)
local desc = torch.IntTensor({self.nInputPlane,
self.nOutputPlane,
self.kH, self.kW})
errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0],
cudnn.typemap[torch.typename(self.weight)], 'CUDNN_TENSOR_NCHW', 4,
desc:data());
+ local function destroyWDesc(d)
+ errcheck('cudnnDestroyFilterDescriptor', d[0]);
+ end
+ ffi.gc(self.weightDesc, destroyWDesc)
+
+ -- create descriptor for bias
+ if self.bias then
+ self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,1,1))
+ end
end
function SpatialFullConvolution:fastest(mode)
- return Convolution.fastest(self)
+ if mode == nil then mode = true end
+ self.fastest_mode = mode
+ self.iSize = self.iSize or torch.LongStorage(4)
+ self.iSize:fill(0)
+ return self
end
function SpatialFullConvolution:setMode(fmode, bdmode, bwmode)
- return Convolution.setMode(self, fmode, bdmode, bwmode)
+ if fmode ~= nil then
+ self.fmode = fmode
+ end
+ if bdmode ~= nil then
+ self.bdmode = bdmode
+ end
+ if bwmode ~= nil then
+ self.bwmode = bwmode
+ end
+ self.iSize = self.iSize or torch.LongStorage(4)
+ self.iSize:fill(0)
+ return self
end
function SpatialFullConvolution:resetMode()
- return Convolution.resetMode(self)
+ self.fmode = nil
+ self.bdmode = nil
+ self.bwmode = nil
+ return self
end
function SpatialFullConvolution:noBias()
- return Convolution.noBias(self)
+ self.bias = nil
+ self.gradBias = nil
+ return self
end
function SpatialFullConvolution:createIODescriptors(input)
- if Convolution.checkInputChanged(self, input) then
+ local batch = true
+ if input:dim() == 3 then
+ input = input:view(1, input:size(1), input:size(2), input:size(3))
+ batch = false
+ end
+ assert(input:dim() == 4 and input:isContiguous());
+ self.iSize = self.iSize or torch.LongStorage(4):fill(0)
+ if not self.iDesc or not self.oDesc or
+ input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2]
+ or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] then
+ self.iSize = input:size()
+
+ assert(self.nInputPlane == input:size(2), 'input has to contain: '
+ .. self.nInputPlane
+ .. ' feature maps, but received input of size: '
+ .. input:size(1) .. ' x ' .. input:size(2) ..
+ ' x ' .. input:size(3) .. ' x ' .. input:size(4))
+
-- create input descriptor
- local input_slice = input[{{},{1,self.nInputPlane},{},{}}]
- self.iDesc = cudnn.toDescriptor(input_slice)
+ local input_slice = {{},{1,self.nInputPlane},{},{}}
+ self.iDesc = cudnn.toDescriptor(input[input_slice])
-- create conv descriptor
- self.convDesc = cudnn.createDescriptors(1, 'struct cudnnConvolutionStruct*[?]',
- 'cudnnCreateConvolutionDescriptor', 'cudnnDestroyConvolutionDescriptor')
+ self.convDesc = ffi.new('struct cudnnConvolutionStruct*[1]')
+ errcheck('cudnnCreateConvolutionDescriptor', self.convDesc)
local pad = torch.IntTensor({self.padH, self.padW})
local stride = torch.IntTensor({self.dH, self.dW})
local upscale = torch.IntTensor({1,1})
@@ -55,6 +101,10 @@ function SpatialFullConvolution:createIODescriptors(input)
2, pad:data(),
stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION',
cudnn.configmap(torch.type(self.weight)));
+ local function destroyConvDesc(d)
+ errcheck('cudnnDestroyConvolutionDescriptor', d[0]);
+ end
+ ffi.gc(self.convDesc, destroyConvDesc)
-- get output shape, resize output
local iwidth = input:size(4)
@@ -65,12 +115,190 @@ function SpatialFullConvolution:createIODescriptors(input)
self.output:resize(oSize:long():storage())
-- create descriptor for output
- local output_slice = self.output[{{},{1,self.nOutputPlane},{},{}}]
- self.oDesc = cudnn.toDescriptor(output_slice)
+ local output_slice = {{},{1,self.nOutputPlane},{},{}}
+ self.oDesc = cudnn.toDescriptor(self.output[output_slice])
self.oDescForBias = cudnn.toDescriptor(self.output)
- algo.prepareHash(self, input_slice, output_slice)
+ -----------------------------------------------------------------------
+ local function shape(x)
+ local sz = x:size()
+ local str = ''
+ for i=1,sz:size() do
+ str = str .. sz[i] .. 'x'
+ end
+ if #str > 0 then
+ str = str:sub(1, #str-1)
+ end
+ return str
+ end
+ local autotunerHash = shape(self.weight) .. ';'
+ .. shape(input[input_slice]) .. ';'
+ .. shape(self.output[output_slice])
+ local maxBufSize = 0
+
+ -- create forwardAlgorithm descriptors
+ local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1)
+ local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
+ local algWorkspaceLimit = self.workspace_limit
+ or (self.nOutputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+
+ if self.fastest_mode or cudnn.fastest == true then
+ algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST'
+ end
+
+ if cudnn.benchmark then -- the manual auto-tuner is run
+ if autotunerCache[1][autotunerHash] then
+ algType[0] = autotunerCache[1][autotunerHash]
+ if cudnn.verbose then
+ print('Autotuning SFC: using cached algo = ', algType[0], ' for: ', autotunerHash)
+ end
+ else
+ local perfResults = ffi.new("cudnnConvolutionFwdAlgoPerf_t[?]", 1)
+ local intt = torch.IntTensor(1);
+ errcheck('cudnnFindConvolutionForwardAlgorithm',
+ cudnn.getHandle(),
+ self.oDesc[0], self.weightDesc[0],
+ self.convDesc[0], self.iDesc[0],
+ 1, intt:data(), perfResults)
+ algType[0] = perfResults[0].algo
+ autotunerCache[1][autotunerHash] = perfResults[0].algo
+ if cudnn.verbose then
+ print(string.format(
+ "Autotuning Forward: Time: %3.5f Memory: %8d Algorithm: %d"
+ .. " Weight: %15s Input: %15s Output: %15s",
+ perfResults[0].time, tonumber(perfResults[0].memory),
+ tonumber(perfResults[0].algo),
+ shape(self.weight), shape(input[input_slice]),
+ shape(self.output[output_slice])))
+ end
+ end
+ else
+ errcheck('cudnnGetConvolutionForwardAlgorithm',
+ cudnn.getHandle(),
+ self.oDesc[0], self.weightDesc[0],
+ self.convDesc[0], self.iDesc[0],
+ algSearchMode, algWorkspaceLimit, algType)
+ end
+ algType[0] = self.fmode or algType[0]
+ self.fwdAlgType = algType
+ local bufSize = torch.LongTensor(1)
+ errcheck('cudnnGetConvolutionForwardWorkspaceSize',
+ cudnn.getHandle(),
+ self.oDesc[0], self.weightDesc[0],
+ self.convDesc[0], self.iDesc[0],
+ algType[0], bufSize:data())
+ maxBufSize = math.max(maxBufSize, bufSize[1])
+
+ -- create backwardFilterAlgorithm descriptors
+ local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1)
+ local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE'
+ local algWorkspaceLimit = self.workspace_limit
+ or (self.nOutputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+ if self.fastest_mode or cudnn.fastest == true then
+ algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST'
+ end
+
+ if cudnn.benchmark then -- the manual auto-tuner is run
+ if autotunerCache[2][autotunerHash] then
+ algType[0] = autotunerCache[2][autotunerHash]
+ else
+ local perfResults = ffi.new("cudnnConvolutionBwdFilterAlgoPerf_t[?]", 1)
+ local intt = torch.IntTensor(1);
+ errcheck('cudnnFindConvolutionBackwardFilterAlgorithm',
+ cudnn.getHandle(),
+ self.oDesc[0], self.iDesc[0],
+ self.convDesc[0], self.weightDesc[0],
+ 1, intt:data(), perfResults)
+ algType[0] = perfResults[0].algo
+ autotunerCache[2][autotunerHash] = perfResults[0].algo
+ if cudnn.verbose then
+ print(string.format(
+ "Autotuning backwardFilter: Time: %3.5f Memory: %8d Algorithm: %d"
+ .. " Weight: %15s Input: %15s Output: %15s",
+ perfResults[0].time, tonumber(perfResults[0].memory),
+ tonumber(perfResults[0].algo),
+ shape(self.weight), shape(input[input_slice]),
+ shape(self.output[output_slice])))
+ end
+ end
+ else
+ errcheck('cudnnGetConvolutionBackwardFilterAlgorithm',
+ cudnn.getHandle(),
+ self.oDesc[0], self.iDesc[0],
+ self.convDesc[0], self.weightDesc[0],
+ algSearchMode, algWorkspaceLimit, algType)
+ end
+ algType[0] = self.bwmode or algType[0]
+ self.bwdFilterAlgType = algType
+ local bufSize = torch.LongTensor(1)
+ errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize',
+ cudnn.getHandle(),
+ self.oDesc[0], self.iDesc[0],
+ self.convDesc[0], self.weightDesc[0],
+ algType[0], bufSize:data())
+ maxBufSize = math.max(maxBufSize, bufSize[1])
+
+ -- create backwardDataAlgorithm descriptors
+ local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1)
+ local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE'
+ local algWorkspaceLimit = self.workspace_limit
+ or (self.nOutputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+ if self.fastest_mode or cudnn.fastest == true then
+ algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST'
+ end
+ if cudnn.benchmark then -- the manual auto-tuner is run
+ if autotunerCache[3][autotunerHash] then
+ algType[0] = autotunerCache[3][autotunerHash]
+ else
+ local perfResults = ffi.new("cudnnConvolutionBwdDataAlgoPerf_t[?]", 1)
+ local intt = torch.IntTensor(1);
+ errcheck('cudnnFindConvolutionBackwardDataAlgorithm',
+ cudnn.getHandle(),
+ self.weightDesc[0], self.iDesc[0],
+ self.convDesc[0], self.oDesc[0],
+ 1, intt:data(), perfResults)
+ algType[0] = perfResults[0].algo
+ autotunerCache[3][autotunerHash] = perfResults[0].algo
+ if cudnn.verbose then
+ print(string.format(
+ "Autotuning backwardData: Time: %3.5f Memory: %8d Algorithm: %d"
+ .. " Weight: %15s Input: %15s Output: %15s\n",
+ perfResults[0].time, tonumber(perfResults[0].memory),
+ tonumber(perfResults[0].algo),
+ shape(self.weight), shape(input[input_slice]),
+ shape(self.output[output_slice])))
+ end
+ end
+ else
+ errcheck('cudnnGetConvolutionBackwardDataAlgorithm',
+ cudnn.getHandle(),
+ self.weightDesc[0], self.iDesc[0],
+ self.convDesc[0], self.oDesc[0],
+ algSearchMode, algWorkspaceLimit, algType)
+ end
+ algType[0] = self.bdmode or algType[0]
+ self.bwdDataAlgType = algType
+ local bufSize = torch.LongTensor(1)
+ errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize',
+ cudnn.getHandle(),
+ self.weightDesc[0], self.iDesc[0],
+ self.convDesc[0], self.oDesc[0],
+ algType[0], bufSize:data())
+ maxBufSize = math.max(maxBufSize, bufSize[1])
+
+ self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace()
+ self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float
+ if maxBufSize > self.extraBufferSizeInBytes then
+ self.extraBuffer:resize(math.ceil(maxBufSize/4))
+ self.extraBufferSizeInBytes = maxBufSize
+ end
+
+ if not batch then
+ self.output = self.output:view(self.output:size(2),
+ self.output:size(3),
+ self.output:size(4))
+ end
end
end
@@ -78,11 +306,8 @@ local one = torch.FloatTensor({1});
local zero = torch.FloatTensor({0});
function SpatialFullConvolution:updateOutput(input)
+ if not self.weightDesc then self:resetWeightDescriptors() end
self:createIODescriptors(input)
- if not self.bwdDataAlgType then
- algo.setupBackwardDataAlgorithm(self, {self.weightDesc[0], self.iDesc[0],
- self.convDesc[0], self.oDesc[0]})
- end
-- Because SpatialFullConvolution is performing the adjoint of the forward
-- convolution operator, we need to swap the forward and backward passes.
@@ -90,8 +315,8 @@ function SpatialFullConvolution:updateOutput(input)
one:data(),
self.weightDesc[0], self.weight:data(),
self.iDesc[0], input:data(),
- self.convDesc[0], self.bwdDataAlgType,
- self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(),
+ self.convDesc[0], self.bwdDataAlgType[0],
+ self.extraBuffer:data(), self.extraBufferSizeInBytes,
zero:data(),
self.oDesc[0], self.output:data())
@@ -111,19 +336,16 @@ function SpatialFullConvolution:updateGradInput(input, gradOutput)
assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D');
assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous')
+ if not self.weightDesc then self:resetWeightDescriptors() end
self:createIODescriptors(input)
- if not self.fwdDataAlgType then
- algo.setupForwardAlgorithm(self, {self.oDesc[0], self.weightDesc[0],
- self.convDesc[0], self.iDesc[0]})
- end
errcheck('cudnnConvolutionForward', cudnn.getHandle(),
one:data(),
self.oDesc[0], gradOutput:data(),
self.weightDesc[0], self.weight:data(),
self.convDesc[0],
- self.fwdAlgType,
- self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(),
+ self.fwdAlgType[0],
+ self.extraBuffer:data(), self.extraBufferSizeInBytes,
zero:data(),
self.iDesc[0], self.gradInput:data());
return self.gradInput
@@ -138,11 +360,8 @@ function SpatialFullConvolution:accGradParameters(input, gradOutput, scale)
assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D');
assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous')
+ if not self.weightDesc then self:resetWeightDescriptors() end
self:createIODescriptors(input)
- if not self.bwdFilterAlgType then
- algo.setupBackwardFilterAlgorithm(self, {self.oDesc[0], self.iDesc[0],
- self.convDesc[0], self.weightDesc[0]})
- end
-- gradBias
if self.bias then
@@ -159,8 +378,8 @@ function SpatialFullConvolution:accGradParameters(input, gradOutput, scale)
self.oDesc[0], gradOutput:data(),
self.iDesc[0], input:data(),
self.convDesc[0],
- self.bwdFilterAlgType,
- self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(),
+ self.bwdFilterAlgType[0],
+ self.extraBuffer:data(), self.extraBufferSizeInBytes,
one:data(),
self.weightDesc[0], self.gradWeight:data())
end
@@ -177,6 +396,7 @@ function SpatialFullConvolution:clearDesc()
self.bwdDataAlgType = nil
self.bwdFilterAlgType = nil
self.extraBuffer = nil
+ self.extraBufferSizeInBytes = nil
end
function SpatialFullConvolution:write(f)
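
Note on SpatialFullConvolution.lua: as the comment in updateOutput says, the module computes the adjoint of the forward convolution, so the cuDNN calls are swapped (updateOutput uses cudnnConvolutionBackwardData, updateGradInput uses cudnnConvolutionForward). A hypothetical usage sketch, with sizes chosen so that nn.SpatialFullConvolution's output-size rule, oW = (iW - 1)*dW - 2*padW + kW + adjW, doubles the spatial extent; all concrete sizes are assumptions for illustration.

require 'cudnn'

-- Hypothetical sizes: (16 - 1)*2 - 2*1 + 4 + 0 = 32 in each spatial dimension.
local deconv = cudnn.SpatialFullConvolution(8, 4, 4, 4, 2, 2, 1, 1):cuda()
local input  = torch.CudaTensor(2, 8, 16, 16):fill(1)
local output = deconv:forward(input)              -- 2 x 4 x 32 x 32
local gradInput = deconv:backward(input, output)  -- same shape as input
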
diff --git a/TemporalConvolution.lua b/TemporalConvolution.lua
index 947cc4f..4648ffd 100644
--- a/TemporalConvolution.lua
+++ b/TemporalConvolution.lua
@@ -6,8 +6,6 @@ local TemporalConvolution, parent =
--it is recommended to pass padding parameter to this routine and use cudnn implicit padding facilities.
--limitation is that padding will be equal on both sides.
-local Convolution = cudnn.SpatialConvolution
-
function TemporalConvolution:__init(inputFrameSize, outputFrameSize,
kH, dH, padH)
local delayedReset = self.reset
@@ -16,8 +14,7 @@ function TemporalConvolution:__init(inputFrameSize, outputFrameSize,
local nOutputPlane = outputFrameSize
self.inputFrameSize = inputFrameSize
self.outputFrameSize = outputFrameSize
- self.nDim = 4
- Convolution.__init(self, nInputPlane, nOutputPlane, kW, kH, 1, dH,0,padH)
+ cudnn.SpatialConvolution.__init(self, nInputPlane, nOutputPlane, kW, kH, 1, dH,0,padH)
self.weight = self.weight:view(nOutputPlane,inputFrameSize*kH)
self.gradWeight = self.gradWeight:view(outputFrameSize, inputFrameSize*kH)
--self.dW and self.kW now have different meaning than in nn.TemporalConvolution, because
@@ -31,24 +28,24 @@ function TemporalConvolution:createIODescriptors(input)
or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] then
sizeChanged = true
end
- Convolution.createIODescriptors(self,input)
+ cudnn.SpatialConvolution.createIODescriptors(self,input)
if sizeChanged then
self.oSize = self.output:size()
end
end
function TemporalConvolution:fastest(mode)
- self = Convolution.fastest(self,mode)
+ self = cudnn.SpatialConvolution.fastest(self,mode)
return self
end
function TemporalConvolution:setMode(fmode, bdmode, bwmode)
- self = Convolution.setMode(self,fmode, bdmode, bwmode)
+ self = cudnn.SpatialConvolution.setMode(self,fmode, bdmode, bwmode)
return self
end
function TemporalConvolution:resetWeightDescriptors()
- Convolution.resetWeightDescriptors(self)
+ cudnn.SpatialConvolution.resetWeightDescriptors(self)
end
local function inputview(input)
@@ -66,7 +63,7 @@ function TemporalConvolution:updateOutput(input)
self._output = self._output or input.new()
if self.output:storage() then self._output:set(self.output:storage()) else self._output = self.output end
if self.buffer:storage() then self.output:set(self.buffer:storage(), 1, self.output:size()) else self.output = self.buffer end
- Convolution.updateOutput(self,_input)
+ cudnn.SpatialConvolution.updateOutput(self,_input)
self.buffer = self.output:view(self.oSize):transpose(2,3)
self.output = self._output:resize(self.buffer:size()):copy(self.buffer)
-- self.output here is always 4D, use input dimensions to properly view output
@@ -95,7 +92,7 @@ function TemporalConvolution:updateGradInput(input, gradOutput)
if not self.gradInput then return end
local _gradOutput = transposeGradOutput(gradOutput,self.buffer)
local _input = inputview(input)
- self.gradInput = Convolution.updateGradInput(self,_input, _gradOutput)
+ self.gradInput = cudnn.SpatialConvolution.updateGradInput(self,_input, _gradOutput)
if input:dim()==3 then
self.gradInput = self.gradInput:view(self.iSize[1],self.iSize[3],self.iSize[4])
else
@@ -109,7 +106,7 @@ function TemporalConvolution:accGradParameters(input,gradOutput,scale)
local _input = inputview(input)
-- transpose gradOutput (it will likely be transposed twice, hopefully, no big deal
local _gradOutput = transposeGradOutput(gradOutput,self.buffer)
- Convolution.accGradParameters(self,_input,_gradOutput,scale)
+ cudnn.SpatialConvolution.accGradParameters(self,_input,_gradOutput,scale)
end
function TemporalConvolution:clearDesc()
@@ -120,7 +117,7 @@ end
function TemporalConvolution:write(f)
self:clearDesc()
- Convolution.clearDesc(self)
+ cudnn.SpatialConvolution.clearDesc(self)
local var = {}
for k,v in pairs(self) do
var[k] = v
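
Note on TemporalConvolution.lua: the module-level alias local Convolution = cudnn.SpatialConvolution is dropped in favour of fully qualified cudnn.SpatialConvolution calls; the layer itself is still a 2D convolution applied along the time axis of a (batch x frames x features) input. A hypothetical usage sketch follows (constructor arguments match the __init shown above; the concrete sizes are assumptions).

require 'cudnn'

-- Hypothetical sizes: 16 sequences, 50 frames of 10 features each, kernel over
-- 5 consecutive frames, stride 1, no padding: (50 - 5)/1 + 1 = 46 output frames.
local tconv = cudnn.TemporalConvolution(10, 20, 5, 1):cuda()
local input = torch.CudaTensor(16, 50, 10):fill(1)
local output = tconv:forward(input)   -- 16 x 46 x 20
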
diff --git a/VolumetricConvolution.lua b/VolumetricConvolution.lua
index 73fd9ce..b255467 100644
--- a/VolumetricConvolution.lua
+++ b/VolumetricConvolution.lua
@@ -2,49 +2,83 @@ local VolumetricConvolution, parent
= torch.class('cudnn.VolumetricConvolution', 'nn.VolumetricConvolution')
local ffi = require 'ffi'
local errcheck = cudnn.errcheck
-local algo = require 'cudnn.algo'
-local Convolution = cudnn.SpatialConvolution
+local autotunerCache = {}
+autotunerCache[1] = {} -- forward
+autotunerCache[2] = {} -- backwardFilter
+autotunerCache[3] = {} -- backwardData
-function VolumetricConvolution:__init(nInputPlane, nOutputPlane,
- kT, kW, kH, dW, dH, padW, padH)
- self.nDim = 5
- self.kT = kT
- Convolution.__init(self,nInputPlane, nOutputPlane,
- kW, kH, dW, dH, padW, padH, 1)
- return self
-end
-- if you change the configuration of the module manually, call this
function VolumetricConvolution:resetWeightDescriptors()
+ assert(cudnn.typemap[torch.typename(self.weight)], 'Only Cuda supported duh!')
+ assert(cudnn.typemap[torch.typename(self.bias)] or not self.bias, 'Only Cuda supported duh!')
+ -- create filterDescriptor for weight
+ self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]')
+ errcheck('cudnnCreateFilterDescriptor', self.weightDesc)
local desc = torch.IntTensor({self.nOutputPlane, self.nInputPlane,
self.kT, self.kH, self.kW})
- Convolution.resetWeightDescriptors(self, desc)
+ errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0],
+ cudnn.typemap[torch.typename(self.weight)], 'CUDNN_TENSOR_NCHW', 5,
+ desc:data());
+ local function destroyWDesc(d)
+ errcheck('cudnnDestroyFilterDescriptor', d[0]);
+ end
+ ffi.gc(self.weightDesc, destroyWDesc)
+
+ -- create descriptor for bias
+ self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,
+ 1, 1))
end
function VolumetricConvolution:fastest(mode)
- return Convolution.fastest(self)
+ if mode == nil then mode = true end
+ self.fastest_mode = mode
+ self.iSize = self.iSize or torch.LongStorage(4)
+ self.iSize:fill(0)
+ return self
end
function VolumetricConvolution:setMode(fmode, bdmode, bwmode)
- return Convolution.setMode(self, fmode, bdmode, bwmode)
+ if fmode ~= nil then
+ self.fmode = fmode
+ end
+ if bdmode ~= nil then
+ self.bdmode = bdmode
+ end
+ if bwmode ~= nil then
+ self.bwmode = bwmode
+ end
+ self.iSize = self.iSize or torch.LongStorage(4)
+ self.iSize:fill(0)
+ return self
end
function VolumetricConvolution:resetMode()
- return Convolution.resetMode(self)
+ self.fmode = nil
+ self.bdmode = nil
+ self.bwmode = nil
+ return self
end
function VolumetricConvolution:createIODescriptors(input)
+ local batch = true
if input:dim() == 4 then
input = input:view(1, input:size(1), input:size(2),
input:size(3), input:size(4))
batch = false
end
- if Convolution.checkInputChanged(self, input) then
+ assert(input:dim() == 5 and input:isContiguous());
+ self.iSize = self.iSize or torch.LongStorage(4):fill(0)
+ if not self.iDesc or not self.oDesc or
+ input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2]
+ or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4]
+ or input:size(5) ~= self.iSize[5] then
+ self.iSize = input:size()
-- create input descriptor
self.iDesc = cudnn.toDescriptor(input)
-- create conv descriptor
- self.convDesc = cudnn.createDescriptors(1, 'struct cudnnConvolutionStruct*[?]',
- 'cudnnCreateConvolutionDescriptor', 'cudnnDestroyConvolutionDescriptor')
+ self.convDesc = ffi.new('struct cudnnConvolutionStruct*[1]')
+ errcheck('cudnnCreateConvolutionDescriptor', self.convDesc)
local pad = torch.IntTensor({self.padT, self.padH, self.padW})
local stride = torch.IntTensor({self.dT, self.dH, self.dW})
local upscale = torch.IntTensor({1,1,1})
@@ -52,6 +86,11 @@ function VolumetricConvolution:createIODescriptors(input)
3, pad:data(),
stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION',
cudnn.configmap(torch.type(self.weight)));
+ local function destroyConvDesc(d)
+ errcheck('cudnnDestroyConvolutionDescriptor', d[0]);
+ end
+ ffi.gc(self.convDesc, destroyConvDesc)
+
-- create output descriptor and resize output
local oSize = torch.IntTensor(5)
local oSizeD = oSize:data()
@@ -67,7 +106,181 @@ function VolumetricConvolution:createIODescriptors(input)
self.output:size(3)*self.output:size(4),
self.output:size(5)))
- algo.prepareHash(self, input, output)
+
+
+ -----------------------------------------------------------------------
+ local function shape(x)
+ return table.concat(x:size():totable(),'x')
+ end
+ local autotunerHash = shape(self.weight) .. ';'
+ .. shape(input) .. ';'
+ .. shape(self.output)
+
+ local maxBufSize = 0
+
+ -- create forwardAlgorithm descriptors
+ local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1)
+ local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
+ local algWorkspaceLimit = self.workspace_limit
+ or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+
+ if self.fastest_mode or cudnn.fastest == true then
+ algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST'
+ end
+
+ if cudnn.benchmark then -- the manual auto-tuner is run
+ if autotunerCache[1][autotunerHash] then
+ algType[0] = autotunerCache[1][autotunerHash]
+ if cudnn.verbose then
+ print('Autotuning VMC FW: using cached algo = ', algType[0], ' for: ', autotunerHash)
+ end
+ else
+ local perfResults = ffi.new("cudnnConvolutionFwdAlgoPerf_t[?]", 1)
+ local intt = torch.IntTensor(1);
+ errcheck('cudnnFindConvolutionForwardAlgorithm',
+ cudnn.getHandle(),
+ self.iDesc[0], self.weightDesc[0],
+ self.convDesc[0], self.oDesc[0],
+ 1, intt:data(), perfResults)
+ algType[0] = perfResults[0].algo
+ autotunerCache[1][autotunerHash] = perfResults[0].algo
+ if cudnn.verbose then
+ print(string.format(
+ "\nAutotuning VMC Forward: Time: %3.5f Memory: %8d Algorithm: %d"
+ .. " Weight: %15s Input: %15s Output: %15s",
+ perfResults[0].time, tonumber(perfResults[0].memory),
+ tonumber(perfResults[0].algo),
+ shape(self.weight), shape(input),
+ shape(self.output)))
+ end
+ end
+ else
+ errcheck('cudnnGetConvolutionForwardAlgorithm',
+ cudnn.getHandle(),
+ self.iDesc[0], self.weightDesc[0],
+ self.convDesc[0], self.oDesc[0],
+ algSearchMode, algWorkspaceLimit, algType)
+ end
+ algType[0] = self.fmode or algType[0]
+ self.fwdAlgType = algType
+ local bufSize = torch.LongTensor(1)
+ errcheck('cudnnGetConvolutionForwardWorkspaceSize',
+ cudnn.getHandle(),
+ self.iDesc[0], self.weightDesc[0],
+ self.convDesc[0], self.oDesc[0],
+ algType[0], bufSize:data())
+ maxBufSize = math.max(maxBufSize, bufSize[1])
+
+ -- create backwardFilterAlgorithm descriptors
+ local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1)
+ local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE'
+ local algWorkspaceLimit = self.workspace_limit
+ or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+ if self.fastest_mode or cudnn.fastest == true then
+ algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST'
+ end
+
+ if cudnn.benchmark then -- the manual auto-tuner is run
+ if autotunerCache[2][autotunerHash] then
+ algType[0] = autotunerCache[2][autotunerHash]
+ if cudnn.verbose then
+ print('Autotuning VMC BWF: using cached algo = ', algType[0], ' for: ', autotunerHash)
+ end
+ else
+ local perfResults = ffi.new("cudnnConvolutionBwdFilterAlgoPerf_t[?]", 1)
+ local intt = torch.IntTensor(1);
+ errcheck('cudnnFindConvolutionBackwardFilterAlgorithm',
+ cudnn.getHandle(),
+ self.iDesc[0], self.oDesc[0],
+ self.convDesc[0], self.weightDesc[0],
+ 1, intt:data(), perfResults)
+ algType[0] = perfResults[0].algo
+ autotunerCache[2][autotunerHash] = perfResults[0].algo
+ if cudnn.verbose then
+ print(string.format(
+ "Autotuning backwardFilter: Time: %3.5f Memory: %8d Algorithm: %d"
+ .. " Weight: %15s Input: %15s Output: %15s",
+ perfResults[0].time, tonumber(perfResults[0].memory),
+ tonumber(perfResults[0].algo),
+ shape(self.weight), shape(input),
+ shape(self.output)))
+ end
+ end
+ else
+ errcheck('cudnnGetConvolutionBackwardFilterAlgorithm',
+ cudnn.getHandle(),
+ self.iDesc[0], self.oDesc[0],
+ self.convDesc[0], self.weightDesc[0],
+ algSearchMode, algWorkspaceLimit, algType)
+ end
+ algType[0] = self.bwmode or algType[0]
+ self.bwdFilterAlgType = algType
+ local bufSize = torch.LongTensor(1)
+ errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize',
+ cudnn.getHandle(),
+ self.iDesc[0], self.oDesc[0],
+ self.convDesc[0], self.weightDesc[0],
+ algType[0], bufSize:data())
+ maxBufSize = math.max(maxBufSize, bufSize[1])
+
+ -- create backwardDataAlgorithm descriptors
+ local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1)
+ local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE'
+ local algWorkspaceLimit = self.workspace_limit
+ or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+ if self.fastest_mode or cudnn.fastest == true then
+ algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST'
+ end
+ if cudnn.benchmark then -- the manual auto-tuner is run
+ if autotunerCache[3][autotunerHash] then
+ algType[0] = autotunerCache[3][autotunerHash]
+ if cudnn.verbose then
+ print('Autotuning VMC BWD: using cached algo = ', algType[0], ' for: ', autotunerHash)
+ end
+ else
+ local perfResults = ffi.new("cudnnConvolutionBwdDataAlgoPerf_t[?]", 1)
+ local intt = torch.IntTensor(1);
+ errcheck('cudnnFindConvolutionBackwardDataAlgorithm',
+ cudnn.getHandle(),
+ self.weightDesc[0], self.oDesc[0],
+ self.convDesc[0], self.iDesc[0],
+ 1, intt:data(), perfResults)
+ algType[0] = perfResults[0].algo
+ autotunerCache[3][autotunerHash] = perfResults[0].algo
+ if cudnn.verbose then
+ print(string.format(
+ "Autotuning backwardData: Time: %3.5f Memory: %8d Algorithm: %d"
+ .. " Weight: %15s Input: %15s Output: %15s\n",
+ perfResults[0].time, tonumber(perfResults[0].memory),
+ tonumber(perfResults[0].algo),
+ shape(self.weight), shape(input),
+ shape(self.output)))
+ end
+ end
+ else
+ errcheck('cudnnGetConvolutionBackwardDataAlgorithm',
+ cudnn.getHandle(),
+ self.weightDesc[0], self.oDesc[0],
+ self.convDesc[0], self.iDesc[0],
+ algSearchMode, algWorkspaceLimit, algType)
+ end
+ algType[0] = self.bdmode or algType[0]
+ self.bwdDataAlgType = algType
+ local bufSize = torch.LongTensor(1)
+ errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize',
+ cudnn.getHandle(),
+ self.weightDesc[0], self.oDesc[0],
+ self.convDesc[0], self.iDesc[0],
+ algType[0], bufSize:data())
+ maxBufSize = math.max(maxBufSize, bufSize[1])
+
+ self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace()
+ self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float
+ if maxBufSize > self.extraBufferSizeInBytes then
+ self.extraBuffer:resize(math.ceil(maxBufSize/4))
+ self.extraBufferSizeInBytes = maxBufSize
+ end
+ -----------------------------------------------------------------------
if not batch then
self.output = self.output:view(self.output:size(2),
@@ -78,28 +291,119 @@ function VolumetricConvolution:createIODescriptors(input)
end
end
+local one = torch.FloatTensor({1});
+local zero = torch.FloatTensor({0});
+
+local function makeContiguous(self, input, gradOutput)
+ if not input:isContiguous() then
+ self._input = self._input or input.new()
+ self._input:typeAs(input):resizeAs(input):copy(input)
+ input = self._input
+ end
+ if gradOutput and not gradOutput:isContiguous() then
+ self._gradOutput = self._gradOutput or gradOutput.new()
+ self._gradOutput:typeAs(gradOutput):resizeAs(gradOutput):copy(gradOutput)
+ gradOutput = self._gradOutput
+ end
+ return input, gradOutput
+end
+
function VolumetricConvolution:updateOutput(input)
- return Convolution:updateOutput(input)
+ if not self.weightDesc then self:resetWeightDescriptors() end
+ input = makeContiguous(self, input)
+ self:createIODescriptors(input)
+ errcheck('cudnnConvolutionForward', cudnn.getHandle(),
+ one:data(),
+ self.iDesc[0], input:data(),
+ self.weightDesc[0], self.weight:data(),
+ self.convDesc[0], self.fwdAlgType[0],
+ self.extraBuffer:data(), self.extraBufferSizeInBytes,
+ zero:data(),
+ self.oDesc[0], self.output:data());
+ errcheck('cudnnAddTensor', cudnn.getHandle(),
+ one:data(),
+ self.biasDesc[0], self.bias:data(), one:data(),
+ self.oDescBias[0], self.output:data());
+ return self.output
end
function VolumetricConvolution:updateGradInput(input, gradOutput)
- return Convolution:updateGradInput(input)
+ if not self.gradInput then return end
+ self.gradInput:resizeAs(input)
+
+ input, gradOutput = makeContiguous(self, input, gradOutput)
+ assert(gradOutput:dim() == 4 or gradOutput:dim() == 5,
+ 'gradOutput has to be a 4D or 5D tensor');
+ if not self.weightDesc then self:resetWeightDescriptors() end
+ self:createIODescriptors(input)
+ errcheck('cudnnConvolutionBackwardData', cudnn.getHandle(),
+ one:data(),
+ self.weightDesc[0], self.weight:data(),
+ self.oDesc[0], gradOutput:data(),
+ self.convDesc[0],
+ self.bwdDataAlgType[0],
+ self.extraBuffer:data(), self.extraBufferSizeInBytes,
+ zero:data(),
+ self.iDesc[0], self.gradInput:data());
+ return self.gradInput
end
function VolumetricConvolution:accGradParameters(input, gradOutput, scale)
- return Convolution:accGradParameters(input, gradOutput, scale)
+ self.scaleT = self.scaleT or torch.FloatTensor(1):fill(1.0)
+ -- this line forces this member to always be on CPU (needed for cudnn)
+ self.scaleT = self.scaleT:float()
+
+ scale = scale or 1.0
+ self.scaleT[1] = scale
+ input, gradOutput = makeContiguous(self, input, gradOutput)
+ assert(gradOutput:dim() == 4 or gradOutput:dim() == 5,
+ 'gradOutput has to be a 4D or 5D tensor');
+ self:createIODescriptors(input)
+ if not self.weightDesc then self:resetWeightDescriptors() end
+ -- gradBias
+ errcheck('cudnnConvolutionBackwardBias', cudnn.getHandle(),
+ self.scaleT:data(),
+ self.oDescBias[0], gradOutput:data(),
+ one:data(),
+ self.biasDesc[0], self.gradBias:data());
+ -- gradWeight
+ errcheck('cudnnConvolutionBackwardFilter', cudnn.getHandle(),
+ self.scaleT:data(),
+ self.iDesc[0], input:data(),
+ self.oDesc[0], gradOutput:data(),
+ self.convDesc[0],
+ self.bwdFilterAlgType[0],
+ self.extraBuffer:data(), self.extraBufferSizeInBytes,
+ one:data(),
+ self.weightDesc[0], self.gradWeight:data());
end
function VolumetricConvolution:clearDesc()
- Convolution:clearDesc()
+ self.weightDesc = nil
+ self.biasDesc = nil
+ self.convDesc = nil
+ self.iDesc = nil
+ self.oDesc = nil
+ self.oDescBias = nil
+ self.fwdAlgType = nil
+ self.bwdDataAlgType = nil
+ self.bwdFilterAlgType = nil
+ self.extraBuffer = nil
+    self.extraBufferSizeInBytes = nil
+ self.scaleT = nil
end
function VolumetricConvolution:write(f)
- Convolution:write(f)
+ self:clearDesc()
+ local var = {}
+ for k,v in pairs(self) do
+ var[k] = v
+ end
+ f:writeObject(var)
end
function VolumetricConvolution:clearState()
- return Convolution:clearState()
+ self:clearDesc()
+ nn.utils.clear(self, 'extraBuffer', '_input', '_gradOutput')
+ return nn.Module.clearState(self)
end
-
-return VolumetricConvolution
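
Note on VolumetricConvolution.lua: the module now carries its own descriptor, autotuner and workspace logic, widened to 5D (NCDHW input, a 5-element filter descriptor, 3-element pad/stride/upscale vectors). A hypothetical usage sketch follows (the constructor is inherited from nn.VolumetricConvolution; the concrete sizes are assumptions).

require 'cudnn'

-- Hypothetical sizes: kernel 3x3x3, stride 1, padding 1, so depth/height/width
-- are preserved: (16 + 2*1 - 3)/1 + 1 = 16.
local vconv = cudnn.VolumetricConvolution(4, 8, 3, 3, 3, 1, 1, 1, 1, 1, 1):cuda()
local input = torch.CudaTensor(2, 4, 16, 16, 16):fill(1)
local output = vconv:forward(input)   -- 2 x 8 x 16 x 16 x 16
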
diff --git a/algo.lua b/algo.lua
deleted file mode 100644
index ba211ad..0000000
--- a/algo.lua
+++ /dev/null
@@ -1,129 +0,0 @@
-local ffi = require 'ffi'
-local errcheck = cudnn.errcheck
-
-local algo = {}
-local autotunerCache = {}
-autotunerCache['cudnnFindConvolutionForwardAlgorithm'] = {}
-autotunerCache['cudnnFindConvolutionBackwardFilterAlgorithm'] = {}
-autotunerCache['cudnnFindConvolutionBackwardDataAlgorithm'] = {}
-
-local function setupAlgo(self, algo_t, perf_t, findAPI, getAPI, wsAPI, algSearchMode, params)
-
- local algType = ffi.new(algo_t, 1)
-
- if cudnn.benchmark or cudnn.fastest then -- the manual auto-tuner is run
- local cachedAlgo = autotunerCache[findAPI][self.autotunerHash];
- if cachedAlgo then
- algType[0] = cachedAlgo
- if cudnn.verbose then
- print('\n', findAPI, ' using cached algo = ' , algType[0] , ' for: ', self.autotunerHash)
- end
- else
- local perfResults = ffi.new(perf_t, 1)
- local intt = torch.IntTensor(1)
- errcheck(findAPI,
- cudnn.getHandle(),
- params[1], params[2], params[3], params[4],
- 1, intt:data(), perfResults)
- algType[0] = perfResults[0].algo
- autotunerCache[findAPI][self.autotunerHash] = perfResults[0].algo
- if cudnn.verbose then
- print(string.format(
- "\n" .. findAPI .. " Time: %3.5f Memory: %8d Algorithm: %d"
- .. " hash: %45s",
- perfResults[0].time, tonumber(perfResults[0].memory),
- tonumber(perfResults[0].algo), self.autotunerHash ))
-
- end
- end
- else
-
- local algWorkspaceLimit = self.workspace_limit
- or (self.nInputPlane * self.kH * self.kW * self.weight.elementSize())
-
- errcheck(getAPI,
- cudnn.getHandle(),
- params[1], params[2], params[3], params[4],
- algSearchMode, algWorkspaceLimit, algType)
- if cudnn.verbose then
- print(string.format(
- "\n" .. getAPI .. " Limit: %d Algorithm: %d",
- tonumber(algWorkspaceLimit),
- tonumber(algType[0])))
- end
- end
- local bufSize = torch.LongTensor(1)
- errcheck(wsAPI,
- cudnn.getHandle(),
- params[1], params[2], params[3], params[4],
- algType[0], bufSize:data())
-
- self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace()
- local extraBufferSizeInBytes = self.extraBuffer:nElement() * self.extraBuffer.elementSize()
-
- if cudnn.verbose then
- print(string.format(
- "\n" .. wsAPI .. " returned bufSize: %d, current extraBufferSizeInBytes: %d, %d elements",
- tonumber(bufSize[1]), tonumber(extraBufferSizeInBytes), tonumber(self.extraBuffer:nElement())))
- end
-
- if extraBufferSizeInBytes < bufSize[1] then
- self.extraBuffer:resize(math.ceil(bufSize[1]/self.extraBuffer.elementSize()))
- end
- return algType[0]
-end
-
-function algo.prepareHash(self, input_slice, output_slice)
- local function shape(x)
- return table.concat(x:size():totable(),'x')
- end
- self.autotunerHash = shape(self.weight) .. ';'
- .. shape(input_slice) .. ';'
- .. shape(output_slice)
-
- self.fwdAlgType = nil
- self.bwdDataAlgType = nil
- self.bwdFilterAlgType = nil
-end
-
-function algo.setupForwardAlgorithm(self, params)
- local algSearchMode
- if self.fastest_mode or cudnn.benchmark == true or cudnn.fastest == true then
- algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST'
- else
- algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
- end
-
- params = params or { self.iDesc[0], self.weightDesc[0], self.convDesc[0], self.oDesc[0] }
- self.fwdAlgType = self.fmode or
- setupAlgo(self,"cudnnConvolutionFwdAlgo_t[?]", "cudnnConvolutionFwdAlgoPerf_t[?]",
- 'cudnnFindConvolutionForwardAlgorithm', 'cudnnGetConvolutionForwardAlgorithm',
- 'cudnnGetConvolutionForwardWorkspaceSize', algSearchMode, params)
-end
-
-function algo.setupBackwardFilterAlgorithm(self, params)
- local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE'
- if self.fastest_mode or cudnn.fastest == true then
- algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST'
- end
- params = params or { self.iDesc[0], self.oDesc[0], self.convDesc[0], self.weightDesc[0] }
- self.bwdFilterAlgType = self.bwmode or
- setupAlgo(self,"cudnnConvolutionBwdFilterAlgo_t[?]", "cudnnConvolutionBwdFilterAlgoPerf_t[?]",
- 'cudnnFindConvolutionBackwardFilterAlgorithm', 'cudnnGetConvolutionBackwardFilterAlgorithm',
- 'cudnnGetConvolutionBackwardFilterWorkspaceSize', algSearchMode,
- params)
-end
-
-function algo.setupBackwardDataAlgorithm(self, params)
- local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE'
- if self.fastest_mode or cudnn.fastest == true then
- algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST'
- end
- params = params or { self.weightDesc[0], self.oDesc[0], self.convDesc[0], self.iDesc[0] }
- self.bwdDataAlgType = self.bdmode or
- setupAlgo(self,"cudnnConvolutionBwdDataAlgo_t[?]", "cudnnConvolutionBwdDataAlgoPerf_t[?]",
- 'cudnnFindConvolutionBackwardDataAlgorithm', 'cudnnGetConvolutionBackwardDataAlgorithm',
- 'cudnnGetConvolutionBackwardDataWorkspaceSize', algSearchMode, params)
-end
-
-return algo
diff --git a/cudnn-scm-1.rockspec b/cudnn-scm-1.rockspec
index 3066d12..bd8072f 100644
--- a/cudnn-scm-1.rockspec
+++ b/cudnn-scm-1.rockspec
@@ -2,7 +2,7 @@ package = "cudnn"
version = "scm-1"
source = {
- url = "git://github.com/soumith/cudnn.torch.git"
+ url = "git://github.com/soumith/cudnn.torch.git",
}
description = {
diff --git a/functional.lua b/functional.lua
index 8eee7c9..cea9df9 100644
--- a/functional.lua
+++ b/functional.lua
@@ -105,7 +105,7 @@ cudnn.functional.Convolution2D_updateOutput = function(handle, input, weight, ou
local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
local algWorkspaceLimit = 0
if workspace then
- algWorkspaceLimit = workspace:nElement() * workspace:elementSize()
+ algWorkspaceLimit = workspace:nElement() * 4 -- 4 = sizeof float
end
errcheck('cudnnGetConvolutionForwardAlgorithm',
handle,
diff --git a/init.lua b/init.lua
index cc2089e..4b1c6e5 100644
--- a/init.lua
+++ b/init.lua
@@ -116,20 +116,6 @@ function cudnn.toDescriptor(t)
return descriptor
end
-function cudnn.createDescriptors(count, descs_type, create_func, destroy_func)
- local ds = ffi.new(descs_type, count)
- for i = 0, count - 1 do
- errcheck(create_func, ds + i)
- end
- local function destroyDescriptors(ds)
- for i = 0, count - 1 do
- errcheck(destroy_func, ds[i])
- end
- end
- ffi.gc(ds, destroyDescriptors)
- return ds
-end
-
local sharedBuffer = {}
for i=1,numDevices do
@@ -140,7 +126,7 @@ function cudnn.getSharedWorkspace()
local device = cutorch.getDevice()
local stream = cutorch.getStream() -- starts from 0
if not sharedBuffer[device][stream] then
- sharedBuffer[device][stream] = torch.CudaDoubleTensor(256)
+ sharedBuffer[device][stream] = torch.CudaTensor(1)
end
return sharedBuffer[device][stream]
end
@@ -179,4 +165,5 @@ require('cudnn.GRU')
require('cudnn.functional')
require('cudnn.convert')
+
return cudnn
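
Note on init.lua: cudnn.getSharedWorkspace now seeds each per-device, per-stream buffer with a one-element torch.CudaTensor (float) instead of a 256-element CudaDoubleTensor; the convolution modules grow it on demand and track its size as element count times 4 bytes. A minimal sketch of that grow-only policy; ensureWorkspace is an illustrative helper, not cudnn.torch API, and it assumes the buffer stays a float tensor.

-- Sketch of the grow-only workspace policy used by the convolution modules.
local FLOAT_BYTES = 4   -- the shared buffer is a float CudaTensor

local function ensureWorkspace(buffer, requiredBytes)
   local currentBytes = buffer:nElement() * FLOAT_BYTES
   if requiredBytes > currentBytes then
      buffer:resize(math.ceil(requiredBytes / FLOAT_BYTES))  -- grow, never shrink
      currentBytes = requiredBytes
   end
   return currentBytes
end
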
diff --git a/test/test.lua b/test/test.lua
index aa8ea7f..a4f9bcb 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -25,7 +25,7 @@ local testparams_float = {
}
-- TODO: find out why the errors are so huge
-local testparams_double_err = {
+local testparams_double = {
test_type = 'torch.CudaDoubleTensor',
precision_forward = 1e+2,
precision_backward = 1e+3, -- 1e+4,
@@ -33,14 +33,6 @@ local testparams_double_err = {
precision_io = 1e-5,
}
-local testparams_double = {
- test_type = 'torch.CudaDoubleTensor',
- precision_forward = 1e-4,
- precision_backward = 2e-2,
- precision_jac = 1e-3,
- precision_io = 1e-5,
-}
-
local testparams = testparams_half
local function cast(input)
@@ -185,11 +177,8 @@ function cudnntest.SpatialConvolution_forward_single()
cutorch.synchronize()
mytester:asserteq(rescuda:dim(), 3, 'error in dimension')
local error = rescuda:float() - groundtruth:float()
- if cudnn.verbose and error:abs():max() > tonumber(testparams.precision_forward) then
- print('\n==== rescuda:float():\n', rescuda:float(), '\n==== groundtruth:float():\n', groundtruth:float())
- end
mytester:assertlt(error:abs():max(), testparams.precision_forward,
- 'error on state (forward)')
+ 'error on state (forward) ')
-- IO
local ferr,berr = jac.testIO(gconv, cast(input))
@@ -1054,7 +1043,7 @@ function cudnntest.SpatialCrossMapLRN_batch()
local size = math.random(1,3)*2+1
local nbfeatures = math.random(3,8)
local alpha = math.random(1,100)/100
- local beta = math.random(1,100)/100
+ local beta = math.random(0,100)/100
local k = math.random(1,3)
local tm = {}
@@ -1519,8 +1508,8 @@ mytester = torch.Tester()
mytester:add(cudnntest)
if torch.random(1,2) == 1 then
- cudnn.benchmark = true -- run manual auto-tuner
- cudnn.verbose = true
+ cudnn.benchmark = true -- run manual auto-tuner
+-- cudnn.verbose = true
end
@@ -1530,21 +1519,20 @@ for i=1,cutorch.getDeviceCount() do
print('Running test on device: #' .. i .. ' : ' .. prop.name)
cutorch.setDevice(i)
--- double tensor may be broken
--- print'Testing torch.CudaDoubleTensor'
--- torch.setdefaulttensortype('torch.DoubleTensor')
--- testparams = testparams_double
--- mytester:run()
+
+ print'Testing torch.CudaHalfTensor'
+ testparams = testparams_half
+ mytester:run()
print'Testing torch.CudaTensor'
testparams = testparams_float
mytester:run()
+-- double tensor may be broken at some places, gets NaNs.
+-- print'Testing torch.CudaDoubleTensor'
+-- testparams = testparams_double
+-- mytester:run()
--- half tensor is broken on Pascal
- print'Testing torch.CudaHalfTensor: note there may be errors on 6.x (Pascal) cards'
- testparams = testparams_half
- mytester:run()
end
os.execute('rm -f modelTemp.t7')
diff --git a/test/test_groups.lua b/test/test_groups.lua
index 1675fdd..8b386b9 100644
--- a/test/test_groups.lua
+++ b/test/test_groups.lua
@@ -34,6 +34,6 @@ ccn2_gradWeight = ccn2_conv.gradWeight:t()
assert((cudnn_output - ccn2_output):abs():max() < 1e-4)
assert((cudnn_gradInput - ccn2_gradInput):abs():max() < 1e-4)
-assert((cudnn_gradWeight - ccn2_gradWeight):abs():max() < 1e-1)
+assert((cudnn_gradWeight - ccn2_gradWeight):abs():max() < 5e-2)
print 'no assertions'