Merge pull request #231 from borisfom/algo

Refactoring CUDNN Find
author: Boris Fomitchev <borisfom@users.noreply.github.com> 2016-08-05 01:04:59 +0300
committer: GitHub <noreply@github.com> 2016-08-05 01:04:59 +0300
commit: 103e6ed51df3a909053c2a524d9e6fc6b35cbf60 (patch)
tree: 951b8203eeee55ded736943365300acae19771ae
parent: b812efdd1c3465547f0e5b0e24d2053f95ebfe2e (diff)
parent: fb1bec17939eb26f94da6a22f410ad316730b9e4 (diff)
11 files changed, 333 insertions, 889 deletions
diff --git a/RNN.lua b/RNN.lua
index 7f89da9..eab2ce4 100644
--- a/RNN.lua
+++ b/RNN.lua
@@ -63,43 +63,29 @@ function RNN:reset(stdv)
    self.gradWeight:resizeAs(self.weight):zero()
 end
 
-function RNN:createDescriptors(count, descs_type, create_func, destroy_func)
-   local ds = ffi.new(descs_type, count)
-   for i = 0, count - 1 do
-      errcheck(create_func, ds + i)
-   end
-   local function destroyDescriptors(ds)
-      for i = 0, count - 1 do
-         errcheck(destroy_func, ds[i])
-      end
-   end
-   ffi.gc(ds, destroyDescriptors)
-   return ds
-end
-
 function RNN:createDropoutDescriptors(count)
-   return self:createDescriptors(count,
+   return cudnn.createDescriptors(count,
                             'cudnnDropoutDescriptor_t[?]',
                             'cudnnCreateDropoutDescriptor',
                             'cudnnDestroyDropoutDescriptor')
 end
 
 function RNN:createFilterDescriptors(count)
-   return self:createDescriptors(count,
+   return cudnn.createDescriptors(count,
                             'cudnnFilterDescriptor_t[?]',
                             'cudnnCreateFilterDescriptor',
                             'cudnnDestroyFilterDescriptor')
 end
 
 function RNN:createRNNDescriptors(count)
-   return self:createDescriptors(count,
+   return cudnn.createDescriptors(count,
                             'cudnnRNNDescriptor_t[?]',
                             'cudnnCreateRNNDescriptor',
                             'cudnnDestroyRNNDescriptor')
 end
 
 function RNN:createTensorDescriptors(count)
-   return self:createDescriptors(count,
+   return cudnn.createDescriptors(count,
                             'cudnnTensorDescriptor_t[?]',
                             'cudnnCreateTensorDescriptor',
                             'cudnnDestroyTensorDescriptor')
@@ -383,7 +369,7 @@ function RNN:updateOutput(input)
 	if self.cellOutput then
 	   self.cellInput = self.cellOutput:clone()
         end
-   end    
+   end
    if (self.batchFirst) then
       self.output = self.output:transpose(1, 2)
    end
diff --git a/SpatialConvolution.lua b/SpatialConvolution.lua
index 58c78b2..1656154 100644
--- a/SpatialConvolution.lua
+++ b/SpatialConvolution.lua
@@ -1,13 +1,9 @@
 local SpatialConvolution, parent =
     torch.class('cudnn.SpatialConvolution', 'nn.SpatialConvolution')
 local ffi = require 'ffi'
+local algo = require 'cudnn.algo'
 local errcheck = cudnn.errcheck
 
-local autotunerCache = {}
-autotunerCache[1] = {} -- forward
-autotunerCache[2] = {} -- backwardFilter
-autotunerCache[3] = {} -- backwardData
-
 function SpatialConvolution:__init(nInputPlane, nOutputPlane,
                             kW, kH, dW, dH, padW, padH, groups)
     local delayedReset = self.reset
@@ -26,39 +22,38 @@ function SpatialConvolution:__init(nInputPlane, nOutputPlane,
     self:reset()
     -- should nil for serialization, the reset will still work
     self.reset = nil
+    return self
 end
 
--- if you change the configuration of the module manually, call this
-function SpatialConvolution:resetWeightDescriptors()
-    assert(cudnn.typemap[torch.typename(self.weight)], 'Only Cuda supported duh!')
+function SpatialConvolution:createWeightDescriptors()
+    assert(cudnn.typemap[torch.typename(self.weight)] or not self.weight, 'Only Cuda supported duh!')
     assert(cudnn.typemap[torch.typename(self.bias)] or not self.bias, 'Only Cuda supported duh!')
-    -- for compatibility
-    self.groups = self.groups or 1
-    -- create filterDescriptor for weight
-    self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]')
-    errcheck('cudnnCreateFilterDescriptor', self.weightDesc)
-    local desc = torch.IntTensor({self.nOutputPlane/self.groups,
-                              self.nInputPlane/self.groups,
-                              self.kH, self.kW})
-    errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0],
-             cudnn.typemap[torch.typename(self.weight)], 'CUDNN_TENSOR_NCHW', 4,
-             desc:data());
-    local function destroyWDesc(d)
-        errcheck('cudnnDestroyFilterDescriptor', d[0]);
-    end
-    ffi.gc(self.weightDesc, destroyWDesc)
-
     -- create descriptor for bias
     if self.bias then
         self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,1,1))
     end
+    -- create filterDescriptor for weight
+    return cudnn.createDescriptors(1, 'struct cudnnFilterStruct*[?]',
+                                   'cudnnCreateFilterDescriptor', 'cudnnDestroyFilterDescriptor')
+end
+
+-- if you change the configuration of the module manually, call this
+function SpatialConvolution:resetWeightDescriptors(desc)
+    -- for compatibility
+    self.groups = self.groups or 1
+    self.weightDesc = SpatialConvolution.createWeightDescriptors(self)
+    desc = desc or torch.IntTensor({self.nOutputPlane/self.groups,
+                                    self.nInputPlane/self.groups,
+                                    self.kH, self.kW})
+    errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0],
+             cudnn.typemap[torch.typename(self.weight)], 'CUDNN_TENSOR_NCHW', self.nDim,
+             desc:data());
 end
 
 function SpatialConvolution:fastest(mode)
     if mode == nil then mode = true end
     self.fastest_mode = mode
-    self.iSize = self.iSize or torch.LongStorage(4)
-    self.iSize:fill(0)
+    self.iDesc = nil
     return self
 end
 
@@ -72,8 +67,7 @@ function SpatialConvolution:setMode(fmode, bdmode, bwmode)
     if bwmode ~= nil then
         self.bwmode = bwmode
     end
-    self.iSize = self.iSize or torch.LongStorage(4)
-    self.iSize:fill(0)
+    self.iDesc = nil
     return self
 end
 
@@ -90,32 +84,41 @@ function SpatialConvolution:noBias()
    return self
 end
 
-function SpatialConvolution:createIODescriptors(input)
-    local batch = true
-    if input:dim() == 3 then
-        input = input:view(1, input:size(1), input:size(2), input:size(3))
-        batch = false
+
+function SpatialConvolution:checkInputChanged(input)
+    self.nDim = self.nDim or 4
+    assert(input:dim() == self.nDim)
+    assert(input:isContiguous())
+    self.iSize = self.iSize or torch.LongStorage(self.nDim):fill(0)
+    self.groups = self.groups or 1
+    if not self.weightDesc then self:resetWeightDescriptors() end
+    if not self.iDesc or not self.oDesc or input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2]
+    or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] or (self.nDim==5 and input:size(5) ~= self.iSize[5]) then
+       self.iSize = input:size()
+
+       assert(self.nInputPlane == input:size(2), 'input has to contain: '
+                 .. self.nInputPlane
+                 .. ' feature maps, but received input of size: '
+                 .. input:size(1) .. ' x ' .. input:size(2) ..
+                 ' x ' .. input:size(3) .. ' x ' .. input:size(4))
+       return true
     end
-    assert(input:dim() == 4 and input:isContiguous());
-    self.iSize = self.iSize or torch.LongStorage(4):fill(0)
-    if not self.iDesc or not self.oDesc or
-        input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2]
-    or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] then
-        self.iSize = input:size()
-
-        assert(self.nInputPlane == input:size(2), 'input has to contain: '
-                   .. self.nInputPlane
-                   .. ' feature maps, but received input of size: '
-                   .. input:size(1) .. ' x ' .. input:size(2) ..
-                   ' x ' .. input:size(3) .. ' x ' .. input:size(4))
+    return false
+end
 
+function SpatialConvolution:createIODescriptors(input)
+   local batch = true
+   if input:dim() == 3 then
+      input = input:view(1, input:size(1), input:size(2), input:size(3))
+      batch = false
+   end
+   if SpatialConvolution.checkInputChanged(self, input) then
         -- create input descriptor
         local input_slice = input:narrow(2,1,self.nInputPlane/self.groups)
         self.iDesc = cudnn.toDescriptor(input_slice)
-
         -- create conv descriptor
-        self.convDesc = ffi.new('struct cudnnConvolutionStruct*[1]')
-        errcheck('cudnnCreateConvolutionDescriptor', self.convDesc)
+        self.convDesc = cudnn.createDescriptors(1, 'struct cudnnConvolutionStruct*[?]',
+                                                'cudnnCreateConvolutionDescriptor', 'cudnnDestroyConvolutionDescriptor')
         self.padH, self.padW = self.padH or 0, self.padW or 0
         local pad = torch.IntTensor({self.padH, self.padW})
         local stride = torch.IntTensor({self.dH, self.dW})
@@ -124,207 +127,24 @@ function SpatialConvolution:createIODescriptors(input)
                  2, pad:data(),
                  stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION',
                  cudnn.configmap(torch.type(self.weight)));
-        local function destroyConvDesc(d)
-            errcheck('cudnnDestroyConvolutionDescriptor', d[0]);
-        end
-        ffi.gc(self.convDesc, destroyConvDesc)
+
 
         -- get output shape, resize output
-        local oSize = torch.IntTensor(4)
+        local oSize = torch.IntTensor(self.nDim)
         local oSizeD = oSize:data()
         errcheck('cudnnGetConvolutionNdForwardOutputDim',
                  self.convDesc[0], self.iDesc[0],
-                 self.weightDesc[0], 4, oSizeD)
+                 self.weightDesc[0], self.nDim, oSizeD)
         oSize[2] = oSize[2] * self.groups
         self.output:resize(oSize:long():storage())
 
-        -- create descriptor for output
         local output_slice = self.output:narrow(2,1,self.nOutputPlane/self.groups)
+        -- create descriptor for output
         self.oDesc = cudnn.toDescriptor(output_slice)
         self.oDescForBias = cudnn.toDescriptor(self.output)
 
-        -----------------------------------------------------------------------
-        local function shape(x)
-            local sz = x:size()
-            local str = ''
-            for i=1,sz:size() do
-                str = str .. sz[i] .. 'x'
-            end
-            if #str > 0 then
-                str = str:sub(1, #str-1)
-            end
-            return str
-        end
-        local autotunerHash = shape(self.weight) .. ';'
-            .. shape(input_slice) .. ';'
-            .. shape(output_slice)
-
-        local maxBufSize = 0
-
-        -- create forwardAlgorithm descriptors
-        local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1)
-        local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
-        local algWorkspaceLimit = self.workspace_limit
-            or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
-
-        if self.fastest_mode or cudnn.fastest == true then
-            algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST'
-        end
-
-        if cudnn.benchmark then -- the manual auto-tuner is run
-            if autotunerCache[1][autotunerHash] then
-                algType[0] = autotunerCache[1][autotunerHash]
-                if cudnn.verbose then
-                   print('Autotuning SC FW: using cached algo = ', algType[0], ' for: ', autotunerHash)
-                end
-            else
-                local perfResults = ffi.new("cudnnConvolutionFwdAlgoPerf_t[?]", 1)
-                local intt = torch.IntTensor(1);
-                errcheck('cudnnFindConvolutionForwardAlgorithm',
-                         cudnn.getHandle(),
-                         self.iDesc[0], self.weightDesc[0],
-                         self.convDesc[0], self.oDesc[0],
-                         1, intt:data(), perfResults)
-                algType[0] = perfResults[0].algo
-                autotunerCache[1][autotunerHash] = perfResults[0].algo
-                if cudnn.verbose then
-                    print(string.format(
-                              "\nAutotuning SC     Forward: Time: %3.5f Memory: %8d Algorithm: %d"
-                                  .. " Weight: %15s Input: %15s Output: %15s",
-                              perfResults[0].time, tonumber(perfResults[0].memory),
-                              tonumber(perfResults[0].algo),
-                              shape(self.weight), shape(input_slice),
-                              shape(output_slice)))
-                end
-            end
-        else
-            errcheck('cudnnGetConvolutionForwardAlgorithm',
-                     cudnn.getHandle(),
-                     self.iDesc[0], self.weightDesc[0],
-                     self.convDesc[0], self.oDesc[0],
-                     algSearchMode, algWorkspaceLimit, algType)
-        end
-        algType[0] = self.fmode or algType[0]
-        self.fwdAlgType = algType
-        local bufSize = torch.LongTensor(1)
-        errcheck('cudnnGetConvolutionForwardWorkspaceSize',
-                 cudnn.getHandle(),
-                 self.iDesc[0], self.weightDesc[0],
-                 self.convDesc[0], self.oDesc[0],
-                 algType[0], bufSize:data())
-        maxBufSize = math.max(maxBufSize, bufSize[1])
-
-        -- create backwardFilterAlgorithm descriptors
-        local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1)
-        local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE'
-        local algWorkspaceLimit = self.workspace_limit
-            or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
-        if self.fastest_mode  or cudnn.fastest == true then
-            algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST'
-        end
-
-        if cudnn.benchmark then -- the manual auto-tuner is run
-            if autotunerCache[2][autotunerHash] then
-                algType[0] = autotunerCache[2][autotunerHash]
-                if cudnn.verbose then
-                   print('Autotuning SC BW: using cached algo = ', algType[0], ' for: ', autotunerHash)
-                end
-            else
-                local perfResults = ffi.new("cudnnConvolutionBwdFilterAlgoPerf_t[?]", 1)
-                local intt = torch.IntTensor(1);
-                errcheck('cudnnFindConvolutionBackwardFilterAlgorithm',
-                         cudnn.getHandle(),
-                         self.iDesc[0], self.oDesc[0],
-                         self.convDesc[0], self.weightDesc[0],
-                         1, intt:data(), perfResults)
-                algType[0] = perfResults[0].algo
-                autotunerCache[2][autotunerHash] = perfResults[0].algo
-                if cudnn.verbose then
-                    print(string.format(
-                              "Autotuning backwardFilter: Time: %3.5f Memory: %8d Algorithm: %d"
-                                  .. " Weight: %15s Input: %15s Output: %15s",
-                              perfResults[0].time, tonumber(perfResults[0].memory),
-                              tonumber(perfResults[0].algo),
-                              shape(self.weight), shape(input_slice),
-                              shape(output_slice)))
-                end
-            end
-        else
-            errcheck('cudnnGetConvolutionBackwardFilterAlgorithm',
-                     cudnn.getHandle(),
-                     self.iDesc[0], self.oDesc[0],
-                     self.convDesc[0], self.weightDesc[0],
-                     algSearchMode, algWorkspaceLimit, algType)
-        end
-        algType[0] = self.bwmode or algType[0]
-        self.bwdFilterAlgType = algType
-        local bufSize = torch.LongTensor(1)
-        errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize',
-                 cudnn.getHandle(),
-                 self.iDesc[0], self.oDesc[0],
-                 self.convDesc[0], self.weightDesc[0],
-                 algType[0], bufSize:data())
-        maxBufSize = math.max(maxBufSize, bufSize[1])
-
-        -- create backwardDataAlgorithm descriptors
-        local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1)
-        local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE'
-        local algWorkspaceLimit = self.workspace_limit
-            or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
-        if self.fastest_mode or cudnn.fastest == true then
-            algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST'
-        end
-        if cudnn.benchmark then -- the manual auto-tuner is run
-            if autotunerCache[3][autotunerHash] then
-                algType[0] = autotunerCache[3][autotunerHash]
-                if cudnn.verbose then
-                   print('Autotuning SC BWD: using cached algo = ', algType[0], ' for: ', autotunerHash)
-                end
-            else
-                local perfResults = ffi.new("cudnnConvolutionBwdDataAlgoPerf_t[?]", 1)
-                local intt = torch.IntTensor(1);
-                errcheck('cudnnFindConvolutionBackwardDataAlgorithm',
-                         cudnn.getHandle(),
-                         self.weightDesc[0], self.oDesc[0],
-                         self.convDesc[0], self.iDesc[0],
-                         1, intt:data(), perfResults)
-                algType[0] = perfResults[0].algo
-                autotunerCache[3][autotunerHash] = perfResults[0].algo
-                if cudnn.verbose then
-                    print(string.format(
-                              "Autotuning   backwardData: Time: %3.5f Memory: %8d Algorithm: %d"
-                                  .. " Weight: %15s Input: %15s Output: %15s\n",
-                              perfResults[0].time, tonumber(perfResults[0].memory),
-                              tonumber(perfResults[0].algo),
-                              shape(self.weight), shape(input_slice),
-                              shape(output_slice)))
-                end
-            end
-        else
-            errcheck('cudnnGetConvolutionBackwardDataAlgorithm',
-                     cudnn.getHandle(),
-                     self.weightDesc[0], self.oDesc[0],
-                     self.convDesc[0], self.iDesc[0],
-                     algSearchMode, algWorkspaceLimit, algType)
-        end
-        algType[0] = self.bdmode or algType[0]
-        self.bwdDataAlgType = algType
-        local bufSize = torch.LongTensor(1)
-        errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize',
-                 cudnn.getHandle(),
-                 self.weightDesc[0], self.oDesc[0],
-                 self.convDesc[0], self.iDesc[0],
-                 algType[0], bufSize:data())
-        maxBufSize = math.max(maxBufSize, bufSize[1])
-
-        self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace()
-        self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float
-        if maxBufSize > self.extraBufferSizeInBytes then
-            self.extraBuffer:resize(math.ceil(maxBufSize/4))
-            self.extraBufferSizeInBytes = maxBufSize
-        end
+        algo.prepareHash(self, input_slice, output_slice)
 
-        -----------------------------------------------------------------------
         -- create offsets for groups
         local iH, iW = input:size(3), input:size(4)
         local kH, kW = self.kH, self.kW
@@ -338,13 +158,14 @@ function SpatialConvolution:createIODescriptors(input)
                                            self.output:size(3),
                                            self.output:size(4))
         end
-    end
+
+   end
 end
 
 local one = torch.FloatTensor({1});
 local zero = torch.FloatTensor({0});
 
-local function makeContiguous(self, input, gradOutput)
+function SpatialConvolution:makeContiguous(input, gradOutput)
    if not input:isContiguous() then
       self._input = self._input or input.new()
       self._input:typeAs(input):resizeAs(input):copy(input)
@@ -359,17 +180,18 @@ local function makeContiguous(self, input, gradOutput)
 end
 
 function SpatialConvolution:updateOutput(input)
-    if not self.weightDesc then self:resetWeightDescriptors() end
-    input = makeContiguous(self, input)
+    input = SpatialConvolution.makeContiguous(self, input)
     self:createIODescriptors(input)
-
+    if not self.fwdAlgType then
+       algo.setupForwardAlgorithm(self)
+    end
     for g = 0, self.groups - 1 do
         errcheck('cudnnConvolutionForward', cudnn.getHandle(),
                  one:data(),
                  self.iDesc[0], input:data() + g*self.input_offset,
                  self.weightDesc[0], self.weight:data() + g*self.weight_offset,
-                 self.convDesc[0], self.fwdAlgType[0],
-                 self.extraBuffer:data(), self.extraBufferSizeInBytes,
+                 self.convDesc[0], self.fwdAlgType,
+                 self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(),
                  zero:data(),
                  self.oDesc[0], self.output:data() + g*self.output_offset);
     end
@@ -387,11 +209,12 @@ end
 function SpatialConvolution:updateGradInput(input, gradOutput)
     if not self.gradInput then return end
     self.gradInput:resizeAs(input)
-
-    input, gradOutput = makeContiguous(self, input, gradOutput)
-    assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D');
-    if not self.weightDesc then self:resetWeightDescriptors() end
+    input, gradOutput = SpatialConvolution.makeContiguous(self, input, gradOutput)
+    assert(gradOutput:dim() == self.nDim-1 or gradOutput:dim() == self.nDim, 'gradOutput has to be nDim or nDim-1');
     self:createIODescriptors(input)
+    if not self.bwdDataAlgType then
+       algo.setupBackwardDataAlgorithm(self)
+    end
 
     for g = 0,self.groups - 1 do
         errcheck('cudnnConvolutionBackwardData', cudnn.getHandle(),
@@ -399,8 +222,8 @@ function SpatialConvolution:updateGradInput(input, gradOutput)
                  self.weightDesc[0], self.weight:data() + g*self.weight_offset,
                  self.oDesc[0], gradOutput:data() + g*self.output_offset,
                  self.convDesc[0],
-                 self.bwdDataAlgType[0],
-                 self.extraBuffer:data(), self.extraBufferSizeInBytes,
+                 self.bwdDataAlgType,
+                 self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(),
                  zero:data(),
                  self.iDesc[0], self.gradInput:data() + g*self.input_offset);
     end
@@ -413,13 +236,14 @@ function SpatialConvolution:accGradParameters(input, gradOutput, scale)
     self.scaleT = self.scaleT:float()
     scale = scale or 1.0
     self.scaleT[1] = scale
-
-    input, gradOutput = makeContiguous(self, input, gradOutput)
-
-    assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D');
-    if not self.weightDesc then self:resetWeightDescriptors() end
+    input, gradOutput = SpatialConvolution.makeContiguous(self, input, gradOutput)
+    assert(gradOutput:dim() == self.nDim-1 or gradOutput:dim() == self.nDim, 'gradOutput has to be nDim or nDim-1');
     self:createIODescriptors(input)
 
+    if not self.bwdFilterAlgType then
+       algo.setupBackwardFilterAlgorithm(self)
+    end
+
     -- gradBias
     if self.bias then
         errcheck('cudnnConvolutionBackwardBias', cudnn.getHandle(),
@@ -436,8 +260,8 @@ function SpatialConvolution:accGradParameters(input, gradOutput, scale)
                  self.iDesc[0], input:data() + g*self.input_offset,
                  self.oDesc[0], gradOutput:data() + g*self.output_offset,
                  self.convDesc[0],
-                 self.bwdFilterAlgType[0],
-                 self.extraBuffer:data(), self.extraBufferSizeInBytes,
+                 self.bwdFilterAlgType,
+                 self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(),
                  one:data(),
                  self.weightDesc[0], self.gradWeight:data() + g*self.weight_offset);
     end
@@ -455,7 +279,6 @@ function SpatialConvolution:clearDesc()
     self.bwdDataAlgType = nil
     self.bwdFilterAlgType = nil
     self.extraBuffer = nil
-    self.extraBufferSizeInBytes = nil
     self.scaleT = nil
 end
 
@@ -470,6 +293,8 @@ end
 
 function SpatialConvolution:clearState()
    self:clearDesc()
-   nn.utils.clear(self, '_input', '_gradOutput')
+   nn.utils.clear(self, 'extraBuffer', '_input', '_gradOutput')
    return nn.Module.clearState(self)
 end
+
+return SpatialConvolution
diff --git a/SpatialFullConvolution.lua b/SpatialFullConvolution.lua
index ff91a13..1cdfb33 100644
--- a/SpatialFullConvolution.lua
+++ b/SpatialFullConvolution.lua
@@ -2,98 +2,52 @@ local SpatialFullConvolution, parent =
     torch.class('cudnn.SpatialFullConvolution', 'nn.SpatialFullConvolution')
 local ffi = require 'ffi'
 local errcheck = cudnn.errcheck
+local algo = require 'cudnn.algo'
 
 local autotunerCache = {}
 autotunerCache[1] = {} -- forward
 autotunerCache[2] = {} -- backwardFilter
 autotunerCache[3] = {} -- backwardData
 
+local Convolution = cudnn.SpatialConvolution
+SpatialFullConvolution.nDim = 4
+
 -- if you change the configuration of the module manually, call this
 function SpatialFullConvolution:resetWeightDescriptors()
-    assert(cudnn.typemap[torch.typename(self.weight)], 'Only Cuda supported duh!')
-    assert(cudnn.typemap[torch.typename(self.bias)] or not self.bias, 'Only Cuda supported duh!')
-    -- create filterDescriptor for weight
-    self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]')
-    errcheck('cudnnCreateFilterDescriptor', self.weightDesc)
+    self.weightDesc = Convolution.createWeightDescriptors(self)
     local desc = torch.IntTensor({self.nInputPlane,
                                   self.nOutputPlane,
                                   self.kH, self.kW})
     errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0],
              cudnn.typemap[torch.typename(self.weight)], 'CUDNN_TENSOR_NCHW', 4,
              desc:data());
-    local function destroyWDesc(d)
-        errcheck('cudnnDestroyFilterDescriptor', d[0]);
-    end
-    ffi.gc(self.weightDesc, destroyWDesc)
-
-    -- create descriptor for bias
-    if self.bias then
-        self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,1,1))
-    end
 end
 
 function SpatialFullConvolution:fastest(mode)
-    if mode == nil then mode = true end
-    self.fastest_mode = mode
-    self.iSize = self.iSize or torch.LongStorage(4)
-    self.iSize:fill(0)
-    return self
+   return Convolution.fastest(self)
 end
 
 function SpatialFullConvolution:setMode(fmode, bdmode, bwmode)
-    if fmode ~= nil then
-        self.fmode = fmode
-    end
-    if bdmode ~= nil then
-        self.bdmode = bdmode
-    end
-    if bwmode ~= nil then
-        self.bwmode = bwmode
-    end
-    self.iSize = self.iSize or torch.LongStorage(4)
-    self.iSize:fill(0)
-    return self
+   return Convolution.setMode(self, fmode, bdmode, bwmode)
 end
 
 function SpatialFullConvolution:resetMode()
-    self.fmode = nil
-    self.bdmode = nil
-    self.bwmode = nil
-    return self
+   return Convolution.resetMode(self)
 end
 
 function SpatialFullConvolution:noBias()
-   self.bias = nil
-   self.gradBias = nil
-   return self
+   return Convolution.noBias(self)
 end
 
 function SpatialFullConvolution:createIODescriptors(input)
-    local batch = true
-    if input:dim() == 3 then
-        input = input:view(1, input:size(1), input:size(2), input:size(3))
-        batch = false
-    end
-    assert(input:dim() == 4 and input:isContiguous());
-    self.iSize = self.iSize or torch.LongStorage(4):fill(0)
-    if not self.iDesc or not self.oDesc or
-        input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2]
-    or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] then
-        self.iSize = input:size()
-
-        assert(self.nInputPlane == input:size(2), 'input has to contain: '
-                   .. self.nInputPlane
-                   .. ' feature maps, but received input of size: '
-                   .. input:size(1) .. ' x ' .. input:size(2) ..
-                   ' x ' .. input:size(3) .. ' x ' .. input:size(4))
-
+    if Convolution.checkInputChanged(self, input) then
         -- create input descriptor
-        local input_slice = {{},{1,self.nInputPlane},{},{}}
-        self.iDesc = cudnn.toDescriptor(input[input_slice])
+        local input_slice = input[{{},{1,self.nInputPlane},{},{}}]
+        self.iDesc = cudnn.toDescriptor(input_slice)
 
         -- create conv descriptor
-        self.convDesc = ffi.new('struct cudnnConvolutionStruct*[1]')
-        errcheck('cudnnCreateConvolutionDescriptor', self.convDesc)
+        self.convDesc = cudnn.createDescriptors(1, 'struct cudnnConvolutionStruct*[?]',
+                                                'cudnnCreateConvolutionDescriptor', 'cudnnDestroyConvolutionDescriptor')
         local pad = torch.IntTensor({self.padH, self.padW})
         local stride = torch.IntTensor({self.dH, self.dW})
         local upscale = torch.IntTensor({1,1})
@@ -101,10 +55,6 @@ function SpatialFullConvolution:createIODescriptors(input)
                  2, pad:data(),
                  stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION',
                  cudnn.configmap(torch.type(self.weight)));
-        local function destroyConvDesc(d)
-            errcheck('cudnnDestroyConvolutionDescriptor', d[0]);
-        end
-        ffi.gc(self.convDesc, destroyConvDesc)
 
         -- get output shape, resize output
         local iwidth = input:size(4)
@@ -115,190 +65,12 @@ function SpatialFullConvolution:createIODescriptors(input)
         self.output:resize(oSize:long():storage())
 
         -- create descriptor for output
-        local output_slice = {{},{1,self.nOutputPlane},{},{}}
-        self.oDesc = cudnn.toDescriptor(self.output[output_slice])
+        local output_slice = self.output[{{},{1,self.nOutputPlane},{},{}}]
+        self.oDesc = cudnn.toDescriptor(output_slice)
         self.oDescForBias = cudnn.toDescriptor(self.output)
 
-        -----------------------------------------------------------------------
-        local function shape(x)
-            local sz = x:size()
-            local str = ''
-            for i=1,sz:size() do
-                str = str .. sz[i] .. 'x'
-            end
-            if #str > 0 then
-                str = str:sub(1, #str-1)
-            end
-            return str
-        end
-        local autotunerHash = shape(self.weight) .. ';'
-            .. shape(input[input_slice]) .. ';'
-            .. shape(self.output[output_slice])
+        algo.prepareHash(self, input_slice, output_slice)
 
-        local maxBufSize = 0
-
-        -- create forwardAlgorithm descriptors
-        local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1)
-        local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
-        local algWorkspaceLimit = self.workspace_limit
-            or (self.nOutputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
-
-        if self.fastest_mode or cudnn.fastest == true then
-            algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST'
-        end
-
-        if cudnn.benchmark then -- the manual auto-tuner is run
-            if autotunerCache[1][autotunerHash] then
-                algType[0] = autotunerCache[1][autotunerHash]
-                if cudnn.verbose then
-                   print('Autotuning SFC: using cached algo = ', algType[0], ' for: ', autotunerHash)
-                end
-            else
-                local perfResults = ffi.new("cudnnConvolutionFwdAlgoPerf_t[?]", 1)
-                local intt = torch.IntTensor(1);
-                errcheck('cudnnFindConvolutionForwardAlgorithm',
-                         cudnn.getHandle(),
-                         self.oDesc[0], self.weightDesc[0],
-                         self.convDesc[0], self.iDesc[0],
-                         1, intt:data(), perfResults)
-                algType[0] = perfResults[0].algo
-                autotunerCache[1][autotunerHash] = perfResults[0].algo
-                if cudnn.verbose then
-                    print(string.format(
-                              "Autotuning        Forward: Time: %3.5f Memory: %8d Algorithm: %d"
-                                  .. " Weight: %15s Input: %15s Output: %15s",
-                              perfResults[0].time, tonumber(perfResults[0].memory),
-                              tonumber(perfResults[0].algo),
-                              shape(self.weight), shape(input[input_slice]),
-                              shape(self.output[output_slice])))
-                end
-            end
-        else
-            errcheck('cudnnGetConvolutionForwardAlgorithm',
-                     cudnn.getHandle(),
-                     self.oDesc[0], self.weightDesc[0],
-                     self.convDesc[0], self.iDesc[0],
-                     algSearchMode, algWorkspaceLimit, algType)
-        end
-        algType[0] = self.fmode or algType[0]
-        self.fwdAlgType = algType
-        local bufSize = torch.LongTensor(1)
-        errcheck('cudnnGetConvolutionForwardWorkspaceSize',
-                 cudnn.getHandle(),
-                 self.oDesc[0], self.weightDesc[0],
-                 self.convDesc[0], self.iDesc[0],
-                 algType[0], bufSize:data())
-        maxBufSize = math.max(maxBufSize, bufSize[1])
-
-        -- create backwardFilterAlgorithm descriptors
-        local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1)
-        local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE'
-        local algWorkspaceLimit = self.workspace_limit
-            or (self.nOutputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
-        if self.fastest_mode  or cudnn.fastest == true then
-            algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST'
-        end
-
-        if cudnn.benchmark then -- the manual auto-tuner is run
-            if autotunerCache[2][autotunerHash] then
-                algType[0] = autotunerCache[2][autotunerHash]
-            else
-                local perfResults = ffi.new("cudnnConvolutionBwdFilterAlgoPerf_t[?]", 1)
-                local intt = torch.IntTensor(1);
-                errcheck('cudnnFindConvolutionBackwardFilterAlgorithm',
-                         cudnn.getHandle(),
-                         self.oDesc[0], self.iDesc[0],
-                         self.convDesc[0], self.weightDesc[0],
-                         1, intt:data(), perfResults)
-                algType[0] = perfResults[0].algo
-                autotunerCache[2][autotunerHash] = perfResults[0].algo
-                if cudnn.verbose then
-                    print(string.format(
-                              "Autotuning backwardFilter: Time: %3.5f Memory: %8d Algorithm: %d"
-                                  .. " Weight: %15s Input: %15s Output: %15s",
-                              perfResults[0].time, tonumber(perfResults[0].memory),
-                              tonumber(perfResults[0].algo),
-                              shape(self.weight), shape(input[input_slice]),
-                              shape(self.output[output_slice])))
-                end
-            end
-        else
-            errcheck('cudnnGetConvolutionBackwardFilterAlgorithm',
-                     cudnn.getHandle(),
-                     self.oDesc[0], self.iDesc[0],
-                     self.convDesc[0], self.weightDesc[0],
-                     algSearchMode, algWorkspaceLimit, algType)
-        end
-        algType[0] = self.bwmode or algType[0]
-        self.bwdFilterAlgType = algType
-        local bufSize = torch.LongTensor(1)
-        errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize',
-                 cudnn.getHandle(),
-                 self.oDesc[0], self.iDesc[0],
-                 self.convDesc[0], self.weightDesc[0],
-                 algType[0], bufSize:data())
-        maxBufSize = math.max(maxBufSize, bufSize[1])
-
-        -- create backwardDataAlgorithm descriptors
-        local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1)
-        local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE'
-        local algWorkspaceLimit = self.workspace_limit
-            or (self.nOutputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
-        if self.fastest_mode or cudnn.fastest == true then
-            algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST'
-        end
-        if cudnn.benchmark then -- the manual auto-tuner is run
-            if autotunerCache[3][autotunerHash] then
-                algType[0] = autotunerCache[3][autotunerHash]
-            else
-                local perfResults = ffi.new("cudnnConvolutionBwdDataAlgoPerf_t[?]", 1)
-                local intt = torch.IntTensor(1);
-                errcheck('cudnnFindConvolutionBackwardDataAlgorithm',
-                         cudnn.getHandle(),
-                         self.weightDesc[0], self.iDesc[0],
-                         self.convDesc[0], self.oDesc[0],
-                         1, intt:data(), perfResults)
-                algType[0] = perfResults[0].algo
-                autotunerCache[3][autotunerHash] = perfResults[0].algo
-                if cudnn.verbose then
-                    print(string.format(
-                              "Autotuning   backwardData: Time: %3.5f Memory: %8d Algorithm: %d"
-                                  .. " Weight: %15s Input: %15s Output: %15s\n",
-                              perfResults[0].time, tonumber(perfResults[0].memory),
-                              tonumber(perfResults[0].algo),
-                              shape(self.weight), shape(input[input_slice]),
-                              shape(self.output[output_slice])))
-                end
-            end
-        else
-            errcheck('cudnnGetConvolutionBackwardDataAlgorithm',
-                     cudnn.getHandle(),
-                     self.weightDesc[0], self.iDesc[0],
-                     self.convDesc[0], self.oDesc[0],
-                     algSearchMode, algWorkspaceLimit, algType)
-        end
-        algType[0] = self.bdmode or algType[0]
-        self.bwdDataAlgType = algType
-        local bufSize = torch.LongTensor(1)
-        errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize',
-                 cudnn.getHandle(),
-                 self.weightDesc[0], self.iDesc[0],
-                 self.convDesc[0], self.oDesc[0],
-                 algType[0], bufSize:data())
-        maxBufSize = math.max(maxBufSize, bufSize[1])
-
-        self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace()
-        self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float
-        if maxBufSize > self.extraBufferSizeInBytes then
-            self.extraBuffer:resize(math.ceil(maxBufSize/4))
-            self.extraBufferSizeInBytes = maxBufSize
-        end
-
-        if not batch then
-            self.output = self.output:view(self.output:size(2),
-                                           self.output:size(3),
-                                           self.output:size(4))
-        end
     end
 end
 
@@ -306,8 +78,11 @@ local one = torch.FloatTensor({1});
 local zero = torch.FloatTensor({0});
 
 function SpatialFullConvolution:updateOutput(input)
-    if not self.weightDesc then self:resetWeightDescriptors() end
     self:createIODescriptors(input)
+    if not self.bwdDataAlgType then
+       algo.setupBackwardDataAlgorithm(self, {self.weightDesc[0], self.iDesc[0],
+                                              self.convDesc[0], self.oDesc[0]})
+    end
 
     -- Because SpatialFullConvolution is performing the adjoint of the forward
     -- convolution operator, we need to swap the forward and backward passes.
@@ -315,8 +90,8 @@ function SpatialFullConvolution:updateOutput(input)
              one:data(),
              self.weightDesc[0], self.weight:data(),
              self.iDesc[0], input:data(),
-             self.convDesc[0], self.bwdDataAlgType[0],
-             self.extraBuffer:data(), self.extraBufferSizeInBytes,
+             self.convDesc[0], self.bwdDataAlgType,
+             self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(),
              zero:data(),
              self.oDesc[0], self.output:data())
 
@@ -336,16 +111,19 @@ function SpatialFullConvolution:updateGradInput(input, gradOutput)
 
     assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D');
     assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous')
-    if not self.weightDesc then self:resetWeightDescriptors() end
     self:createIODescriptors(input)
+    if not self.fwdDataAlgType then
+       algo.setupForwardAlgorithm(self, {self.oDesc[0], self.weightDesc[0],
+                                         self.convDesc[0], self.iDesc[0]})
+    end
 
     errcheck('cudnnConvolutionForward', cudnn.getHandle(),
              one:data(),
              self.oDesc[0], gradOutput:data(),
              self.weightDesc[0], self.weight:data(),
              self.convDesc[0],
-             self.fwdAlgType[0],
-             self.extraBuffer:data(), self.extraBufferSizeInBytes,
+             self.fwdAlgType,
+             self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(),
              zero:data(),
              self.iDesc[0], self.gradInput:data());
     return self.gradInput
@@ -360,8 +138,11 @@ function SpatialFullConvolution:accGradParameters(input, gradOutput, scale)
 
     assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D');
     assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous')
-    if not self.weightDesc then self:resetWeightDescriptors() end
     self:createIODescriptors(input)
+    if not self.bwdFilterAlgType then
+       algo.setupBackwardFilterAlgorithm(self, {self.oDesc[0], self.iDesc[0],
+                                                self.convDesc[0], self.weightDesc[0]})
+    end
 
     -- gradBias
     if self.bias then
@@ -378,8 +159,8 @@ function SpatialFullConvolution:accGradParameters(input, gradOutput, scale)
              self.oDesc[0], gradOutput:data(),
              self.iDesc[0], input:data(),
              self.convDesc[0],
-             self.bwdFilterAlgType[0],
-             self.extraBuffer:data(), self.extraBufferSizeInBytes,
+             self.bwdFilterAlgType,
+             self.extraBuffer:data(), self.extraBuffer:nElement() * self.extraBuffer.elementSize(),
              one:data(),
              self.weightDesc[0], self.gradWeight:data())
 end
@@ -396,7 +177,6 @@ function SpatialFullConvolution:clearDesc()
     self.bwdDataAlgType = nil
     self.bwdFilterAlgType = nil
     self.extraBuffer = nil
-    self.extraBufferSizeInBytes = nil
 end
 
 function SpatialFullConvolution:write(f)
diff --git a/TemporalConvolution.lua b/TemporalConvolution.lua
index 4648ffd..947cc4f 100644
--- a/TemporalConvolution.lua
+++ b/TemporalConvolution.lua
@@ -6,6 +6,8 @@ local TemporalConvolution, parent =
 --it is recommended to pass padding parameter to this routine and use cudnn implicit padding facilities.
 --limitation is that padding will be equal on both sides.
 
+local Convolution = cudnn.SpatialConvolution
+
 function TemporalConvolution:__init(inputFrameSize, outputFrameSize,
                             kH, dH, padH)
     local delayedReset = self.reset
@@ -14,7 +16,8 @@ function TemporalConvolution:__init(inputFrameSize, outputFrameSize,
     local nOutputPlane = outputFrameSize
     self.inputFrameSize = inputFrameSize
     self.outputFrameSize = outputFrameSize
-    cudnn.SpatialConvolution.__init(self, nInputPlane, nOutputPlane, kW, kH, 1, dH,0,padH)
+    self.nDim = 4
+    Convolution.__init(self, nInputPlane, nOutputPlane, kW, kH, 1, dH,0,padH)
     self.weight = self.weight:view(nOutputPlane,inputFrameSize*kH)
     self.gradWeight = self.gradWeight:view(outputFrameSize, inputFrameSize*kH)
 --self.dW and self.kW now have different meaning than in nn.TemporalConvolution, because
@@ -28,24 +31,24 @@ function TemporalConvolution:createIODescriptors(input)
     or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] then
        sizeChanged = true
     end
-    cudnn.SpatialConvolution.createIODescriptors(self,input)
+    Convolution.createIODescriptors(self,input)
     if sizeChanged then
        self.oSize = self.output:size()
     end
 end
 
 function TemporalConvolution:fastest(mode)
-    self = cudnn.SpatialConvolution.fastest(self,mode)
+    self = Convolution.fastest(self,mode)
     return self
 end
 
 function TemporalConvolution:setMode(fmode, bdmode, bwmode)
-    self = cudnn.SpatialConvolution.setMode(self,fmode, bdmode, bwmode)
+    self = Convolution.setMode(self,fmode, bdmode, bwmode)
     return self
 end
 
 function TemporalConvolution:resetWeightDescriptors()
-    cudnn.SpatialConvolution.resetWeightDescriptors(self)
+    Convolution.resetWeightDescriptors(self)
 end
 
 local function inputview(input)
@@ -63,7 +66,7 @@ function TemporalConvolution:updateOutput(input)
    self._output = self._output or input.new()
    if self.output:storage() then self._output:set(self.output:storage()) else self._output = self.output end
    if self.buffer:storage() then self.output:set(self.buffer:storage(), 1, self.output:size()) else self.output = self.buffer end
-   cudnn.SpatialConvolution.updateOutput(self,_input)
+   Convolution.updateOutput(self,_input)
    self.buffer = self.output:view(self.oSize):transpose(2,3)
    self.output  = self._output:resize(self.buffer:size()):copy(self.buffer)
    -- self.output here is always 4D, use input dimensions to properly view output
@@ -92,7 +95,7 @@ function TemporalConvolution:updateGradInput(input, gradOutput)
    if not self.gradInput then return end
    local _gradOutput = transposeGradOutput(gradOutput,self.buffer)
    local _input = inputview(input)
-   self.gradInput = cudnn.SpatialConvolution.updateGradInput(self,_input, _gradOutput)
+   self.gradInput = Convolution.updateGradInput(self,_input, _gradOutput)
    if input:dim()==3 then
       self.gradInput = self.gradInput:view(self.iSize[1],self.iSize[3],self.iSize[4])
    else
@@ -106,7 +109,7 @@ function TemporalConvolution:accGradParameters(input,gradOutput,scale)
     local _input = inputview(input)
 -- transpose gradOutput (it will likely be transposed twice, hopefully, no big deal
     local _gradOutput = transposeGradOutput(gradOutput,self.buffer)
-    cudnn.SpatialConvolution.accGradParameters(self,_input,_gradOutput,scale)
+    Convolution.accGradParameters(self,_input,_gradOutput,scale)
 end
 
 function TemporalConvolution:clearDesc()
@@ -117,7 +120,7 @@ end
 
 function TemporalConvolution:write(f)
   self:clearDesc()
-  cudnn.SpatialConvolution.clearDesc(self)
+  Convolution.clearDesc(self)
     local var = {}
     for k,v in pairs(self) do
         var[k] = v
diff --git a/VolumetricConvolution.lua b/VolumetricConvolution.lua
index b255467..73fd9ce 100644
--- a/VolumetricConvolution.lua
+++ b/VolumetricConvolution.lua
@@ -2,83 +2,49 @@ local VolumetricConvolution, parent
    = torch.class('cudnn.VolumetricConvolution', 'nn.VolumetricConvolution')
 local ffi = require 'ffi'
 local errcheck = cudnn.errcheck
+local algo = require 'cudnn.algo'
 
-local autotunerCache = {}
-autotunerCache[1] = {} -- forward
-autotunerCache[2] = {} -- backwardFilter
-autotunerCache[3] = {} -- backwardData
+local Convolution = cudnn.SpatialConvolution
 
+function VolumetricConvolution:__init(nInputPlane, nOutputPlane,
+                                      kT, kW, kH, dW, dH, padW, padH)
+   self.nDim = 5
+   self.kT = kT
+   Convolution.__init(self,nInputPlane, nOutputPlane,
+                      kW, kH, dW, dH, padW, padH, 1)
+   return self
+end
 -- if you change the configuration of the module manually, call this
 function VolumetricConvolution:resetWeightDescriptors()
-   assert(cudnn.typemap[torch.typename(self.weight)], 'Only Cuda supported duh!')
-   assert(cudnn.typemap[torch.typename(self.bias)] or not self.bias, 'Only Cuda supported duh!')
-   -- create filterDescriptor for weight
-   self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]')
-   errcheck('cudnnCreateFilterDescriptor', self.weightDesc)
    local desc = torch.IntTensor({self.nOutputPlane, self.nInputPlane,
                              self.kT, self.kH, self.kW})
-   errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0],
-            cudnn.typemap[torch.typename(self.weight)], 'CUDNN_TENSOR_NCHW', 5,
-            desc:data());
-   local function destroyWDesc(d)
-      errcheck('cudnnDestroyFilterDescriptor', d[0]);
-   end
-   ffi.gc(self.weightDesc, destroyWDesc)
-
-   -- create descriptor for bias
-   self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,
-                                                     1, 1))
+   Convolution.resetWeightDescriptors(self, desc)
 end
 
 function VolumetricConvolution:fastest(mode)
-   if mode == nil then mode = true end
-   self.fastest_mode = mode
-   self.iSize = self.iSize or torch.LongStorage(4)
-   self.iSize:fill(0)
-   return self
+   return Convolution.fastest(self)
 end
 
 function VolumetricConvolution:setMode(fmode, bdmode, bwmode)
-   if fmode ~= nil then
-      self.fmode = fmode
-   end
-   if bdmode ~= nil then
-      self.bdmode = bdmode
-   end
-   if bwmode ~= nil then
-      self.bwmode = bwmode
-   end
-   self.iSize = self.iSize or torch.LongStorage(4)
-   self.iSize:fill(0)
-   return self
+   return Convolution.setMode(self, fmode, bdmode, bwmode)
 end
 
 function VolumetricConvolution:resetMode()
-   self.fmode = nil
-   self.bdmode = nil
-   self.bwmode = nil
-   return self
+   return Convolution.resetMode(self)
 end
 
 function VolumetricConvolution:createIODescriptors(input)
-   local batch = true
    if input:dim() == 4 then
       input = input:view(1, input:size(1), input:size(2),
                          input:size(3), input:size(4))
       batch = false
    end
-   assert(input:dim() == 5 and input:isContiguous());
-   self.iSize = self.iSize or torch.LongStorage(4):fill(0)
-   if not self.iDesc or not self.oDesc or
-      input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2]
-   or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4]
-   or input:size(5) ~= self.iSize[5] then
-         self.iSize = input:size()
+   if Convolution.checkInputChanged(self, input) then
          -- create input descriptor
          self.iDesc = cudnn.toDescriptor(input)
          -- create conv descriptor
-         self.convDesc = ffi.new('struct cudnnConvolutionStruct*[1]')
-         errcheck('cudnnCreateConvolutionDescriptor', self.convDesc)
+         self.convDesc = cudnn.createDescriptors(1, 'struct cudnnConvolutionStruct*[?]',
+                                                 'cudnnCreateConvolutionDescriptor', 'cudnnDestroyConvolutionDescriptor')
          local pad = torch.IntTensor({self.padT, self.padH, self.padW})
          local stride = torch.IntTensor({self.dT, self.dH, self.dW})
          local upscale = torch.IntTensor({1,1,1})
@@ -86,11 +52,6 @@ function VolumetricConvolution:createIODescriptors(input)
                   3, pad:data(),
                   stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION',
                   cudnn.configmap(torch.type(self.weight)));
-         local function destroyConvDesc(d)
-            errcheck('cudnnDestroyConvolutionDescriptor', d[0]);
-         end
-         ffi.gc(self.convDesc, destroyConvDesc)
-
          -- create output descriptor and resize output
          local oSize = torch.IntTensor(5)
          local oSizeD = oSize:data()
@@ -106,181 +67,7 @@ function VolumetricConvolution:createIODescriptors(input)
                              self.output:size(3)*self.output:size(4),
                              self.output:size(5)))
 
-
-
-        -----------------------------------------------------------------------
-        local function shape(x)
- 	   return table.concat(x:size():totable(),'x')
-        end
-        local autotunerHash = shape(self.weight) .. ';'
-           .. shape(input) .. ';'
-           .. shape(self.output)
-
-        local maxBufSize = 0
-
-        -- create forwardAlgorithm descriptors
-        local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1)
-        local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
-        local algWorkspaceLimit = self.workspace_limit
-            or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
-
-        if self.fastest_mode or cudnn.fastest == true then
-            algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST'
-        end
-
-        if cudnn.benchmark then -- the manual auto-tuner is run
-            if autotunerCache[1][autotunerHash] then
-                algType[0] = autotunerCache[1][autotunerHash]
-                if cudnn.verbose then
-                   print('Autotuning VMC FW: using cached algo = ', algType[0], ' for: ', autotunerHash)
-                end
-            else
-                local perfResults = ffi.new("cudnnConvolutionFwdAlgoPerf_t[?]", 1)
-                local intt = torch.IntTensor(1);
-                errcheck('cudnnFindConvolutionForwardAlgorithm',
-                         cudnn.getHandle(),
-                         self.iDesc[0], self.weightDesc[0],
-                         self.convDesc[0], self.oDesc[0],
-                         1, intt:data(), perfResults)
-                algType[0] = perfResults[0].algo
-                autotunerCache[1][autotunerHash] = perfResults[0].algo
-                if cudnn.verbose then
-                    print(string.format(
-                              "\nAutotuning VMC    Forward: Time: %3.5f Memory: %8d Algorithm: %d"
-                                  .. " Weight: %15s Input: %15s Output: %15s",
-                              perfResults[0].time, tonumber(perfResults[0].memory),
-                              tonumber(perfResults[0].algo),
-                              shape(self.weight), shape(input),
-                              shape(self.output)))
-                end
-            end
-        else
-            errcheck('cudnnGetConvolutionForwardAlgorithm',
-                     cudnn.getHandle(),
-                     self.iDesc[0], self.weightDesc[0],
-                     self.convDesc[0], self.oDesc[0],
-                     algSearchMode, algWorkspaceLimit, algType)
-        end
-        algType[0] = self.fmode or algType[0]
-        self.fwdAlgType = algType
-        local bufSize = torch.LongTensor(1)
-        errcheck('cudnnGetConvolutionForwardWorkspaceSize',
-                 cudnn.getHandle(),
-                 self.iDesc[0], self.weightDesc[0],
-                 self.convDesc[0], self.oDesc[0],
-                 algType[0], bufSize:data())
-        maxBufSize = math.max(maxBufSize, bufSize[1])
-
-        -- create backwardFilterAlgorithm descriptors
-        local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1)
-        local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE'
-        local algWorkspaceLimit = self.workspace_limit
-            or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
-        if self.fastest_mode  or cudnn.fastest == true then
-            algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST'
-        end
-
-        if cudnn.benchmark then -- the manual auto-tuner is run
-            if autotunerCache[2][autotunerHash] then
-                algType[0] = autotunerCache[2][autotunerHash]
-                if cudnn.verbose then
-                   print('Autotuning VMC BWF: using cached algo = ', algType[0], ' for: ', autotunerHash)
-                end
-            else
-                local perfResults = ffi.new("cudnnConvolutionBwdFilterAlgoPerf_t[?]", 1)
-                local intt = torch.IntTensor(1);
-                errcheck('cudnnFindConvolutionBackwardFilterAlgorithm',
-                         cudnn.getHandle(),
-                         self.iDesc[0], self.oDesc[0],
-                         self.convDesc[0], self.weightDesc[0],
-                         1, intt:data(), perfResults)
-                algType[0] = perfResults[0].algo
-                autotunerCache[2][autotunerHash] = perfResults[0].algo
-                if cudnn.verbose then
-                    print(string.format(
-                              "Autotuning backwardFilter: Time: %3.5f Memory: %8d Algorithm: %d"
-                                  .. " Weight: %15s Input: %15s Output: %15s",
-                              perfResults[0].time, tonumber(perfResults[0].memory),
-                              tonumber(perfResults[0].algo),
-                              shape(self.weight), shape(input),
-                              shape(self.output)))
-                end
-            end
-        else
-            errcheck('cudnnGetConvolutionBackwardFilterAlgorithm',
-                     cudnn.getHandle(),
-                     self.iDesc[0], self.oDesc[0],
-                     self.convDesc[0], self.weightDesc[0],
-                     algSearchMode, algWorkspaceLimit, algType)
-        end
-        algType[0] = self.bwmode or algType[0]
-        self.bwdFilterAlgType = algType
-        local bufSize = torch.LongTensor(1)
-        errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize',
-                 cudnn.getHandle(),
-                 self.iDesc[0], self.oDesc[0],
-                 self.convDesc[0], self.weightDesc[0],
-                 algType[0], bufSize:data())
-        maxBufSize = math.max(maxBufSize, bufSize[1])
-
-        -- create backwardDataAlgorithm descriptors
-        local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1)
-        local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE'
-        local algWorkspaceLimit = self.workspace_limit
-            or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
-        if self.fastest_mode or cudnn.fastest == true then
-            algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST'
-        end
-        if cudnn.benchmark then -- the manual auto-tuner is run
-            if autotunerCache[3][autotunerHash] then
-                algType[0] = autotunerCache[3][autotunerHash]
-                if cudnn.verbose then
-                   print('Autotuning VMC BWD: using cached algo = ', algType[0], ' for: ', autotunerHash)
-                end
-            else
-                local perfResults = ffi.new("cudnnConvolutionBwdDataAlgoPerf_t[?]", 1)
-                local intt = torch.IntTensor(1);
-                errcheck('cudnnFindConvolutionBackwardDataAlgorithm',
-                         cudnn.getHandle(),
-                         self.weightDesc[0], self.oDesc[0],
-                         self.convDesc[0], self.iDesc[0],
-                         1, intt:data(), perfResults)
-                algType[0] = perfResults[0].algo
-                autotunerCache[3][autotunerHash] = perfResults[0].algo
-                if cudnn.verbose then
-                    print(string.format(
-                              "Autotuning   backwardData: Time: %3.5f Memory: %8d Algorithm: %d"
-                                  .. " Weight: %15s Input: %15s Output: %15s\n",
-                              perfResults[0].time, tonumber(perfResults[0].memory),
-                              tonumber(perfResults[0].algo),
-                              shape(self.weight), shape(input),
-                              shape(self.output)))
-                end
-            end
-        else
-            errcheck('cudnnGetConvolutionBackwardDataAlgorithm',
-                     cudnn.getHandle(),
-                     self.weightDesc[0], self.oDesc[0],
-                     self.convDesc[0], self.iDesc[0],
-                     algSearchMode, algWorkspaceLimit, algType)
-        end
-        algType[0] = self.bdmode or algType[0]
-        self.bwdDataAlgType = algType
-        local bufSize = torch.LongTensor(1)
-        errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize',
-                 cudnn.getHandle(),
-                 self.weightDesc[0], self.oDesc[0],
-                 self.convDesc[0], self.iDesc[0],
-                 algType[0], bufSize:data())
-        maxBufSize = math.max(maxBufSize, bufSize[1])
-
-        self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace()
-        self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float
-        if maxBufSize > self.extraBufferSizeInBytes then
-            self.extraBuffer:resize(math.ceil(maxBufSize/4))
-            self.extraBufferSizeInBytes = maxBufSize
-        end
-        -----------------------------------------------------------------------
+         algo.prepareHash(self, input, output)
 
          if not batch then
             self.output = self.output:view(self.output:size(2),
@@ -291,119 +78,28 @@ function VolumetricConvolution:createIODescriptors(input)
    end
 end
 
-local one = torch.FloatTensor({1});
-local zero = torch.FloatTensor({0});
-
-local function makeContiguous(self, input, gradOutput)
-   if not input:isContiguous() then
-      self._input = self._input or input.new()
-      self._input:typeAs(input):resizeAs(input):copy(input)
-      input = self._input
-   end
-   if gradOutput and not gradOutput:isContiguous() then
-      self._gradOutput = self._gradOutput or gradOutput.new()
-      self._gradOutput:typeAs(gradOutput):resizeAs(gradOutput):copy(gradOutput)
-      gradOutput = self._gradOutput
-   end
-   return input, gradOutput
-end
-
 function VolumetricConvolution:updateOutput(input)
-   if not self.weightDesc then self:resetWeightDescriptors() end
-   input = makeContiguous(self, input)
-   self:createIODescriptors(input)
-   errcheck('cudnnConvolutionForward', cudnn.getHandle(),
-            one:data(),
-            self.iDesc[0], input:data(),
-            self.weightDesc[0], self.weight:data(),
-            self.convDesc[0], self.fwdAlgType[0],
-            self.extraBuffer:data(), self.extraBufferSizeInBytes,
-            zero:data(),
-            self.oDesc[0], self.output:data());
-   errcheck('cudnnAddTensor', cudnn.getHandle(),
-            one:data(),
-            self.biasDesc[0], self.bias:data(), one:data(),
-            self.oDescBias[0], self.output:data());
-   return self.output
+   return Convolution:updateOutput(input)
 end
 
 function VolumetricConvolution:updateGradInput(input, gradOutput)
-   if not self.gradInput then return end
-   self.gradInput:resizeAs(input)
-
-   input, gradOutput = makeContiguous(self, input, gradOutput)
-   assert(gradOutput:dim() == 4 or gradOutput:dim() == 5,
-          'gradOutput has to be a 4D or 5D tensor');
-   if not self.weightDesc then self:resetWeightDescriptors() end
-   self:createIODescriptors(input)
-   errcheck('cudnnConvolutionBackwardData', cudnn.getHandle(),
-        one:data(),
-        self.weightDesc[0], self.weight:data(),
-        self.oDesc[0], gradOutput:data(),
-        self.convDesc[0],
-        self.bwdDataAlgType[0],
-        self.extraBuffer:data(), self.extraBufferSizeInBytes,
-        zero:data(),
-        self.iDesc[0], self.gradInput:data());
-   return self.gradInput
+   return Convolution:updateGradInput(input)
 end
 
 function VolumetricConvolution:accGradParameters(input, gradOutput, scale)
-   self.scaleT = self.scaleT or torch.FloatTensor(1):fill(1.0)
-   -- this line forces this member to always be on CPU (needed for cudnn)
-   self.scaleT = self.scaleT:float()
-
-   scale = scale or 1.0
-   self.scaleT[1] = scale
-   input, gradOutput = makeContiguous(self, input, gradOutput)
-   assert(gradOutput:dim() == 4 or gradOutput:dim() == 5,
-          'gradOutput has to be a 4D or 5D tensor');
-   self:createIODescriptors(input)
-   if not self.weightDesc then self:resetWeightDescriptors() end
-   -- gradBias
-   errcheck('cudnnConvolutionBackwardBias', cudnn.getHandle(),
-            self.scaleT:data(),
-            self.oDescBias[0], gradOutput:data(),
-            one:data(),
-            self.biasDesc[0], self.gradBias:data());
-   -- gradWeight
-   errcheck('cudnnConvolutionBackwardFilter', cudnn.getHandle(),
-        self.scaleT:data(),
-        self.iDesc[0], input:data(),
-        self.oDesc[0], gradOutput:data(),
-        self.convDesc[0],
-        self.bwdFilterAlgType[0],
-        self.extraBuffer:data(), self.extraBufferSizeInBytes,
-        one:data(),
-        self.weightDesc[0], self.gradWeight:data());
+   return Convolution:accGradParameters(input, gradOutput, scale)
 end
 
 function VolumetricConvolution:clearDesc()
-   self.weightDesc = nil
-   self.biasDesc = nil
-   self.convDesc = nil
-   self.iDesc = nil
-   self.oDesc = nil
-   self.oDescBias = nil
-   self.fwdAlgType = nil
-   self.bwdDataAlgType = nil
-   self.bwdFilterAlgType = nil
-   self.extraBuffer = nil
-   self.extraBufferInBytes = nil
-   self.scaleT = nil
+   Convolution:clearDesc()
 end
 
 function VolumetricConvolution:write(f)
-   self:clearDesc()
-   local var = {}
-   for k,v in pairs(self) do
-      var[k] = v
-   end
-   f:writeObject(var)
+   Convolution:write(f)
 end
 
 function VolumetricConvolution:clearState()
-   self:clearDesc()
-   nn.utils.clear(self, 'extraBuffer', '_input', '_gradOutput')
-   return nn.Module.clearState(self)
+   return Convolution:clearState()
 end
+
+return VolumetricConvolution
diff --git a/algo.lua b/algo.lua
new file mode 100644
index 0000000..ba211ad
--- /dev/null
+++ b/algo.lua
@@ -0,0 +1,129 @@
+local ffi = require 'ffi'
+local errcheck = cudnn.errcheck
+
+local algo = {}
+local autotunerCache = {}
+autotunerCache['cudnnFindConvolutionForwardAlgorithm'] = {}
+autotunerCache['cudnnFindConvolutionBackwardFilterAlgorithm'] = {}
+autotunerCache['cudnnFindConvolutionBackwardDataAlgorithm'] = {}
+
+local function setupAlgo(self, algo_t, perf_t, findAPI, getAPI, wsAPI, algSearchMode, params)
+
+        local algType = ffi.new(algo_t, 1)
+
+        if cudnn.benchmark or cudnn.fastest then -- the manual auto-tuner is run
+           local cachedAlgo = autotunerCache[findAPI][self.autotunerHash];
+            if cachedAlgo then
+               algType[0] = cachedAlgo
+                if cudnn.verbose then
+                   print('\n', findAPI, ' using cached algo = ' , algType[0] , ' for: ', self.autotunerHash)
+                end
+            else
+                local perfResults = ffi.new(perf_t, 1)
+                local intt = torch.IntTensor(1)
+                errcheck(findAPI,
+                         cudnn.getHandle(),
+                         params[1], params[2], params[3], params[4],
+                         1, intt:data(), perfResults)
+                algType[0] = perfResults[0].algo
+                autotunerCache[findAPI][self.autotunerHash] = perfResults[0].algo
+                if cudnn.verbose then
+                    print(string.format(
+                              "\n" .. findAPI .. " Time: %3.5f Memory: %8d Algorithm: %d"
+                                  .. " hash: %45s",
+                              perfResults[0].time, tonumber(perfResults[0].memory),
+                              tonumber(perfResults[0].algo), self.autotunerHash ))
+
+                end
+            end
+        else
+
+           local algWorkspaceLimit = self.workspace_limit
+              or (self.nInputPlane * self.kH * self.kW * self.weight.elementSize())
+
+            errcheck(getAPI,
+                     cudnn.getHandle(),
+                     params[1], params[2], params[3], params[4],
+                     algSearchMode, algWorkspaceLimit, algType)
+                if cudnn.verbose then
+                   print(string.format(
+                     "\n" .. getAPI .. " Limit: %d Algorithm: %d",
+                     tonumber(algWorkspaceLimit),
+                     tonumber(algType[0])))
+                end
+        end
+        local bufSize = torch.LongTensor(1)
+        errcheck(wsAPI,
+                 cudnn.getHandle(),
+                 params[1], params[2], params[3], params[4],
+                 algType[0], bufSize:data())
+
+        self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace()
+        local extraBufferSizeInBytes = self.extraBuffer:nElement() * self.extraBuffer.elementSize()
+
+       if cudnn.verbose then
+           print(string.format(
+                    "\n" .. wsAPI .. " returned bufSize: %d, current extraBufferSizeInBytes: %d, %d elements",
+                    tonumber(bufSize[1]), tonumber(extraBufferSizeInBytes), tonumber(self.extraBuffer:nElement())))
+        end
+
+        if extraBufferSizeInBytes < bufSize[1] then
+           self.extraBuffer:resize(math.ceil(bufSize[1]/self.extraBuffer.elementSize()))
+        end
+        return algType[0]
+end
+
+function algo.prepareHash(self, input_slice, output_slice)
+   local function shape(x)
+      return table.concat(x:size():totable(),'x')
+   end
+   self.autotunerHash = shape(self.weight) .. ';'
+      .. shape(input_slice) .. ';'
+      .. shape(output_slice)
+
+   self.fwdAlgType = nil
+   self.bwdDataAlgType = nil
+   self.bwdFilterAlgType = nil
+end
+
+function algo.setupForwardAlgorithm(self, params)
+   local algSearchMode
+   if self.fastest_mode  or cudnn.benchmark == true or cudnn.fastest == true then
+      algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST'
+   else
+      algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
+   end
+
+   params = params or { self.iDesc[0], self.weightDesc[0], self.convDesc[0], self.oDesc[0] }
+   self.fwdAlgType = self.fmode or
+      setupAlgo(self,"cudnnConvolutionFwdAlgo_t[?]", "cudnnConvolutionFwdAlgoPerf_t[?]",
+                'cudnnFindConvolutionForwardAlgorithm', 'cudnnGetConvolutionForwardAlgorithm',
+                'cudnnGetConvolutionForwardWorkspaceSize', algSearchMode, params)
+end
+
+function algo.setupBackwardFilterAlgorithm(self, params)
+   local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE'
+   if self.fastest_mode  or cudnn.fastest == true then
+      algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST'
+   end
+   params = params or { self.iDesc[0], self.oDesc[0], self.convDesc[0], self.weightDesc[0] }
+   self.bwdFilterAlgType = self.bwmode or
+      setupAlgo(self,"cudnnConvolutionBwdFilterAlgo_t[?]", "cudnnConvolutionBwdFilterAlgoPerf_t[?]",
+                'cudnnFindConvolutionBackwardFilterAlgorithm', 'cudnnGetConvolutionBackwardFilterAlgorithm',
+                'cudnnGetConvolutionBackwardFilterWorkspaceSize', algSearchMode,
+                params)
+end
+
+function algo.setupBackwardDataAlgorithm(self, params)
+   local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE'
+   if self.fastest_mode  or cudnn.fastest == true then
+      algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST'
+   end
+   params =  params or { self.weightDesc[0], self.oDesc[0], self.convDesc[0], self.iDesc[0] }
+   self.bwdDataAlgType = self.bdmode or
+      setupAlgo(self,"cudnnConvolutionBwdDataAlgo_t[?]", "cudnnConvolutionBwdDataAlgoPerf_t[?]",
+                'cudnnFindConvolutionBackwardDataAlgorithm', 'cudnnGetConvolutionBackwardDataAlgorithm',
+                'cudnnGetConvolutionBackwardDataWorkspaceSize', algSearchMode, params)
+end
+
+return algo
diff --git a/cudnn-scm-1.rockspec b/cudnn-scm-1.rockspec
index bd8072f..3066d12 100644
--- a/cudnn-scm-1.rockspec
+++ b/cudnn-scm-1.rockspec
@@ -2,7 +2,7 @@ package = "cudnn"
 version = "scm-1"
 
 source = {
-   url = "git://github.com/soumith/cudnn.torch.git",
+   url = "git://github.com/soumith/cudnn.torch.git"
 }
 
 description = {
diff --git a/functional.lua b/functional.lua
index cea9df9..8eee7c9 100644
--- a/functional.lua
+++ b/functional.lua
@@ -105,7 +105,7 @@ cudnn.functional.Convolution2D_updateOutput = function(handle, input, weight, ou
    local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
    local algWorkspaceLimit = 0
    if workspace then
-       algWorkspaceLimit = workspace:nElement() * 4 -- 4 = sizeof float
+       algWorkspaceLimit = workspace:nElement() * workspace:elementSize()
    end
    errcheck('cudnnGetConvolutionForwardAlgorithm',
             handle,
diff --git a/init.lua b/init.lua
index 4b1c6e5..cc2089e 100644
--- a/init.lua
+++ b/init.lua
@@ -116,6 +116,20 @@ function cudnn.toDescriptor(t)
    return descriptor
 end
 
+function cudnn.createDescriptors(count, descs_type, create_func, destroy_func)
+   local ds = ffi.new(descs_type, count)
+   for i = 0, count - 1 do
+      errcheck(create_func, ds + i)
+   end
+   local function destroyDescriptors(ds)
+      for i = 0, count - 1 do
+         errcheck(destroy_func, ds[i])
+      end
+   end
+   ffi.gc(ds, destroyDescriptors)
+   return ds
+end
+
 
 local sharedBuffer = {}
 for i=1,numDevices do
@@ -126,7 +140,7 @@ function cudnn.getSharedWorkspace()
     local device = cutorch.getDevice()
     local stream = cutorch.getStream() -- starts from 0
     if not sharedBuffer[device][stream] then
-        sharedBuffer[device][stream] = torch.CudaTensor(1)
+        sharedBuffer[device][stream] = torch.CudaDoubleTensor(256)
     end
     return sharedBuffer[device][stream]
 end
@@ -165,5 +179,4 @@ require('cudnn.GRU')
 require('cudnn.functional')
 require('cudnn.convert')
 
-
 return cudnn
diff --git a/test/test.lua b/test/test.lua
index a4f9bcb..aa8ea7f 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -25,7 +25,7 @@ local testparams_float = {
 }
 
 -- TODO: find out why the errors are so huge
-local testparams_double = {
+local testparams_double_err = {
    test_type = 'torch.CudaDoubleTensor',
    precision_forward = 1e+2,
    precision_backward = 1e+3, -- 1e+4,
@@ -33,6 +33,14 @@ local testparams_double = {
    precision_io = 1e-5,
 }
 
+local testparams_double = {
+   test_type = 'torch.CudaDoubleTensor',
+   precision_forward = 1e-4,
+   precision_backward = 2e-2,
+   precision_jac = 1e-3,
+   precision_io = 1e-5,
+}
+
 local testparams = testparams_half
 
 local function cast(input)
@@ -177,8 +185,11 @@ function cudnntest.SpatialConvolution_forward_single()
      cutorch.synchronize()
      mytester:asserteq(rescuda:dim(), 3, 'error in dimension')
      local error = rescuda:float() - groundtruth:float()
+     if cudnn.verbose and error:abs():max() > tonumber(testparams.precision_forward) then
+        print('\n==== rescuda:float():\n', rescuda:float(),  '\n==== groundtruth:float():\n', groundtruth:float())
+     end
      mytester:assertlt(error:abs():max(), testparams.precision_forward,
-                       'error on state (forward) ')
+                       'error on state (forward)')
 
      -- IO
      local ferr,berr = jac.testIO(gconv, cast(input))
@@ -1043,7 +1054,7 @@ function cudnntest.SpatialCrossMapLRN_batch()
    local size = math.random(1,3)*2+1
    local nbfeatures = math.random(3,8)
    local alpha = math.random(1,100)/100
-   local beta  = math.random(0,100)/100
+   local beta  = math.random(1,100)/100
    local k = math.random(1,3)
 
    local tm = {}
@@ -1508,8 +1519,8 @@ mytester = torch.Tester()
 mytester:add(cudnntest)
 
 if torch.random(1,2) == 1 then
-   cudnn.benchmark = true -- run manual auto-tuner
---   cudnn.verbose = true
+    cudnn.benchmark = true -- run manual auto-tuner
+    cudnn.verbose = true
 end
 
 
@@ -1519,20 +1530,21 @@ for i=1,cutorch.getDeviceCount() do
    print('Running test on device: #' .. i .. ' : ' .. prop.name)
 
    cutorch.setDevice(i)
-
-   print'Testing torch.CudaHalfTensor'
-   testparams = testparams_half
-   mytester:run()
+--   double tensor may be broken
+--   print'Testing torch.CudaDoubleTensor'
+--   torch.setdefaulttensortype('torch.DoubleTensor')
+--   testparams = testparams_double
+--   mytester:run()
 
    print'Testing torch.CudaTensor'
    testparams = testparams_float
    mytester:run()
 
---   double tensor may be broken at some places, gets NaNs.
---   print'Testing torch.CudaDoubleTensor'
---   testparams = testparams_double
---   mytester:run()
 
+--   half tensor is broken on Pascal
+   print'Testing torch.CudaHalfTensor: note there may be errors on 6.x (Pascal) cards'
+   testparams = testparams_half
+   mytester:run()
 end
 
 os.execute('rm -f modelTemp.t7')
diff --git a/test/test_groups.lua b/test/test_groups.lua
index 8b386b9..1675fdd 100644
--- a/test/test_groups.lua
+++ b/test/test_groups.lua
@@ -34,6 +34,6 @@ ccn2_gradWeight = ccn2_conv.gradWeight:t()
 
 assert((cudnn_output - ccn2_output):abs():max() < 1e-4)
 assert((cudnn_gradInput - ccn2_gradInput):abs():max() < 1e-4)
-assert((cudnn_gradWeight - ccn2_gradWeight):abs():max() < 5e-2)
+assert((cudnn_gradWeight - ccn2_gradWeight):abs():max() < 1e-1)
 
 print 'no assertions'
author	Boris Fomitchev <borisfom@users.noreply.github.com>	2016-08-05 01:04:59 +0300
committer	GitHub <noreply@github.com>	2016-08-05 01:04:59 +0300
commit	103e6ed51df3a909053c2a524d9e6fc6b35cbf60 (patch)
tree	951b8203eeee55ded736943365300acae19771ae
parent	b812efdd1c3465547f0e5b0e24d2053f95ebfe2e (diff)
parent	fb1bec17939eb26f94da6a22f410ad316730b9e4 (diff)