github.com/soumith/cudnn.torch.git

author    soumith <soumith@fb.com>  2015-09-15 20:32:36 +0300
committer soumith <soumith@fb.com>  2015-09-15 20:32:36 +0300
commit    4d5c3db15efc87fe4220fc06486a8d7be759dcc2 (patch)
tree      87548ebc6a6c6113d952569d1ab72ccf6052ebb0
parent    97f41c48602a345344bb5f76e73e4b2fbf7eb679 (diff)
whitespace cleanups, fixing logsoftmax test
-rw-r--r--  SpatialConvolution.lua     610
-rw-r--r--  VolumetricConvolution.lua   86
-rw-r--r--  ffi.lua                    308
-rw-r--r--  functional.lua               2
-rw-r--r--  test/test.lua               14
5 files changed, 506 insertions, 514 deletions
diff --git a/SpatialConvolution.lua b/SpatialConvolution.lua
index fe7f027..04c9319 100644
--- a/SpatialConvolution.lua
+++ b/SpatialConvolution.lua
@@ -1,146 +1,144 @@
local SpatialConvolution, parent =
- torch.class('cudnn.SpatialConvolution', 'nn.SpatialConvolution')
+ torch.class('cudnn.SpatialConvolution', 'nn.SpatialConvolution')
local ffi = require 'ffi'
local errcheck = cudnn.errcheck
function SpatialConvolution:__init(nInputPlane, nOutputPlane,
kW, kH, dW, dH, padW, padH, groups)
- local delayedReset = self.reset
- self.reset = function() end
- parent.__init(self, nInputPlane, nOutputPlane, kW, kH, dW, dH)
- self.reset = delayedReset
- self.padW = padW or 0
- self.padH = padH or 0
- self.groups = groups or 1
- assert(nInputPlane % self.groups == 0,
- 'nInputPlane should be divisible by nGroups')
- assert(nOutputPlane % self.groups == 0,
- 'nOutputPlane should be divisible by nGroups')
- self.weight = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)
- self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)
- self:reset()
- self.iSize = torch.LongStorage(4):fill(0)
- self.fastest_mode = true
+ local delayedReset = self.reset
+ self.reset = function() end
+ parent.__init(self, nInputPlane, nOutputPlane, kW, kH, dW, dH)
+ self.reset = delayedReset
+ self.padW = padW or 0
+ self.padH = padH or 0
+ self.groups = groups or 1
+ assert(nInputPlane % self.groups == 0,
+ 'nInputPlane should be divisible by nGroups')
+ assert(nOutputPlane % self.groups == 0,
+ 'nOutputPlane should be divisible by nGroups')
+ self.weight = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)
+ self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane/self.groups, kW, kH)
+ self:reset()
+ self.iSize = torch.LongStorage(4):fill(0)
+ self.fastest_mode = true
end
-- if you change the configuration of the module manually, call this
function SpatialConvolution:resetWeightDescriptors()
- assert(torch.typename(self.weight) == 'torch.CudaTensor',
- 'Only Cuda supported duh!')
- assert(torch.typename(self.bias) == 'torch.CudaTensor',
- 'Only Cuda supported duh!')
- -- for compatibility
- self.groups = self.groups or 1
- -- create filterDescriptor for weight
- self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]')
- errcheck('cudnnCreateFilterDescriptor', self.weightDesc)
- local desc = torch.IntTensor({self.nOutputPlane/self.groups,
- self.nInputPlane/self.groups,
- self.kH, self.kW})
- errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0],
- 'CUDNN_DATA_FLOAT', 4,
- desc:data());
- local function destroyWDesc(d)
- errcheck('cudnnDestroyFilterDescriptor', d[0]);
- end
- ffi.gc(self.weightDesc, destroyWDesc)
-
- -- create descriptor for bias
- self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,1,1))
+ assert(torch.typename(self.weight) == 'torch.CudaTensor',
+ 'Only Cuda supported duh!')
+ assert(torch.typename(self.bias) == 'torch.CudaTensor',
+ 'Only Cuda supported duh!')
+ -- for compatibility
+ self.groups = self.groups or 1
+ -- create filterDescriptor for weight
+ self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]')
+ errcheck('cudnnCreateFilterDescriptor', self.weightDesc)
+ local desc = torch.IntTensor({self.nOutputPlane/self.groups,
+ self.nInputPlane/self.groups,
+ self.kH, self.kW})
+ errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0],
+ 'CUDNN_DATA_FLOAT', 4,
+ desc:data());
+ local function destroyWDesc(d)
+ errcheck('cudnnDestroyFilterDescriptor', d[0]);
+ end
+ ffi.gc(self.weightDesc, destroyWDesc)
+
+ -- create descriptor for bias
+ self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,1,1))
end
function SpatialConvolution:fastest(mode)
- if mode == nil then mode = true end
- self.fastest_mode = mode
- self.iSize:fill(0)
- return self
+ if mode == nil then mode = true end
+ self.fastest_mode = mode
+ return self
end
function SpatialConvolution:setMode(fmode, bdmode, bwmode)
- if fmode ~= nil then
- self.fmode = fmode
- end
- if bdmode ~= nil then
- self.bdmode = bdmode
- end
- if bwmode ~= nil then
- self.bwmode = bwmode
- end
- self.iSize:fill(0)
- return self
+ if fmode ~= nil then
+ self.fmode = fmode
+ end
+ if bdmode ~= nil then
+ self.bdmode = bdmode
+ end
+ if bwmode ~= nil then
+ self.bwmode = bwmode
+ end
+ return self
end
function SpatialConvolution:resetMode()
- self.fmode = nil
- self.bdmode = nil
- self.bwmode = nil
- return self
+ self.fmode = nil
+ self.bdmode = nil
+ self.bwmode = nil
+ return self
end
function SpatialConvolution:createIODescriptors(input)
- local batch = true
- if input:dim() == 3 then
- input = input:view(1, input:size(1), input:size(2), input:size(3))
- batch = false
- end
- assert(input:dim() == 4 and input:isContiguous());
- if not self.iDesc or not self.oDesc or
- input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2]
- or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] then
- self.iSize = input:size()
-
- -- resize gradInput
- if self.gradInput then self.gradInput:resizeAs(input); end
- assert(self.nInputPlane == input:size(2), 'input has to contain: '
+ local batch = true
+ if input:dim() == 3 then
+ input = input:view(1, input:size(1), input:size(2), input:size(3))
+ batch = false
+ end
+ assert(input:dim() == 4 and input:isContiguous());
+ if not self.iDesc or not self.oDesc or
+ input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2]
+ or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4] then
+ self.iSize = input:size()
+
+ -- resize gradInput
+ if self.gradInput then self.gradInput:resizeAs(input); end
+ assert(self.nInputPlane == input:size(2), 'input has to contain: '
.. self.nInputPlane
.. ' feature maps, but received input of size: '
.. input:size(1) .. ' x ' .. input:size(2) ..
' x ' .. input:size(3) .. ' x ' .. input:size(4))
- -- create input descriptor
- local input_slice = {{},{1,self.nInputPlane/self.groups},{},{}}
- self.iDesc = cudnn.toDescriptor(input[input_slice])
-
- -- create conv descriptor
- self.convDesc = ffi.new('struct cudnnConvolutionStruct*[1]')
- errcheck('cudnnCreateConvolutionDescriptor', self.convDesc)
- local pad = torch.IntTensor({self.padH, self.padW})
- local stride = torch.IntTensor({self.dH, self.dW})
- local upscale = torch.IntTensor({1,1})
- errcheck('cudnnSetConvolutionNdDescriptor_v3', self.convDesc[0],
- 2, pad:data(),
- stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION',
- 'CUDNN_DATA_FLOAT');
- local function destroyConvDesc(d)
+ -- create input descriptor
+ local input_slice = {{},{1,self.nInputPlane/self.groups},{},{}}
+ self.iDesc = cudnn.toDescriptor(input[input_slice])
+
+ -- create conv descriptor
+ self.convDesc = ffi.new('struct cudnnConvolutionStruct*[1]')
+ errcheck('cudnnCreateConvolutionDescriptor', self.convDesc)
+ local pad = torch.IntTensor({self.padH, self.padW})
+ local stride = torch.IntTensor({self.dH, self.dW})
+ local upscale = torch.IntTensor({1,1})
+ errcheck('cudnnSetConvolutionNdDescriptor_v3', self.convDesc[0],
+ 2, pad:data(),
+ stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION',
+ 'CUDNN_DATA_FLOAT');
+ local function destroyConvDesc(d)
errcheck('cudnnDestroyConvolutionDescriptor', d[0]);
- end
- ffi.gc(self.convDesc, destroyConvDesc)
-
- -- get output shape, resize output
- local oSize = torch.IntTensor(4)
- local oSizeD = oSize:data()
- errcheck('cudnnGetConvolutionNdForwardOutputDim',
- self.convDesc[0], self.iDesc[0],
- self.weightDesc[0], 4, oSizeD)
- oSize[2] = oSize[2] * self.groups
- self.output:resize(oSize:long():storage())
-
- -- create descriptor for output
- local output_slice = {{},{1,self.nOutputPlane/self.groups},{},{}}
- self.oDesc = cudnn.toDescriptor(self.output[output_slice])
- self.oDescForBias = cudnn.toDescriptor(self.output)
-
- -----------------------------------------------------------------------
- local maxBufSize = 0
-
- -- create forwardAlgorithm descriptors
- local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1)
- local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
- local algWorkspaceLimit = self.workspace_limit
- or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
-
- if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST' end
- if cudnn.benchmark then -- the manual auto-tuner is run
+ end
+ ffi.gc(self.convDesc, destroyConvDesc)
+
+ -- get output shape, resize output
+ local oSize = torch.IntTensor(4)
+ local oSizeD = oSize:data()
+ errcheck('cudnnGetConvolutionNdForwardOutputDim',
+ self.convDesc[0], self.iDesc[0],
+ self.weightDesc[0], 4, oSizeD)
+ oSize[2] = oSize[2] * self.groups
+ self.output:resize(oSize:long():storage())
+
+ -- create descriptor for output
+ local output_slice = {{},{1,self.nOutputPlane/self.groups},{},{}}
+ self.oDesc = cudnn.toDescriptor(self.output[output_slice])
+ self.oDescForBias = cudnn.toDescriptor(self.output)
+
+ -----------------------------------------------------------------------
+ local maxBufSize = 0
+
+ -- create forwardAlgorithm descriptors
+ local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1)
+ local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
+ local algWorkspaceLimit = self.workspace_limit
+ or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+
+ if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST' end
+ if cudnn.benchmark then -- the manual auto-tuner is run
local perfResults = ffi.new("cudnnConvolutionFwdAlgoPerf_t[?]", 1)
local intt = torch.IntTensor(1);
errcheck('cudnnFindConvolutionForwardAlgorithm',
@@ -150,36 +148,36 @@ function SpatialConvolution:createIODescriptors(input)
1, intt:data(), perfResults)
algType[0] = perfResults[0].algo
if cudnn.verbose then
- print('AutoTuning:', perfResults[0].time,
- tonumber(perfResults[0].memory), tonumber(perfResults[0].algo))
+ print('AutoTuning:', perfResults[0].time,
+ tonumber(perfResults[0].memory), tonumber(perfResults[0].algo))
end
- else
+ else
errcheck('cudnnGetConvolutionForwardAlgorithm',
cudnn.getHandle(),
self.iDesc[0], self.weightDesc[0],
self.convDesc[0], self.oDesc[0],
algSearchMode, algWorkspaceLimit, algType)
- end
- algType[0] = self.fmode or algType[0]
- self.fwdAlgType = algType
- local bufSize = torch.LongTensor(1)
- errcheck('cudnnGetConvolutionForwardWorkspaceSize',
- cudnn.getHandle(),
- self.iDesc[0], self.weightDesc[0],
- self.convDesc[0], self.oDesc[0],
- algType[0], bufSize:data())
- maxBufSize = math.max(maxBufSize, bufSize[1])
-
- -- create backwardFilterAlgorithm descriptors
- local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1)
- local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE'
- local algWorkspaceLimit = self.workspace_limit
- or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
- if self.fastest_mode then
+ end
+ algType[0] = self.fmode or algType[0]
+ self.fwdAlgType = algType
+ local bufSize = torch.LongTensor(1)
+ errcheck('cudnnGetConvolutionForwardWorkspaceSize',
+ cudnn.getHandle(),
+ self.iDesc[0], self.weightDesc[0],
+ self.convDesc[0], self.oDesc[0],
+ algType[0], bufSize:data())
+ maxBufSize = math.max(maxBufSize, bufSize[1])
+
+ -- create backwardFilterAlgorithm descriptors
+ local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1)
+ local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE'
+ local algWorkspaceLimit = self.workspace_limit
+ or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+ if self.fastest_mode then
algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST'
- end
+ end
- if cudnn.benchmark then -- the manual auto-tuner is run
+ if cudnn.benchmark then -- the manual auto-tuner is run
local perfResults = ffi.new("cudnnConvolutionBwdFilterAlgoPerf_t[?]", 1)
local intt = torch.IntTensor(1);
errcheck('cudnnFindConvolutionBackwardFilterAlgorithm',
@@ -189,35 +187,35 @@ function SpatialConvolution:createIODescriptors(input)
1, intt:data(), perfResults)
algType[0] = perfResults[0].algo
if cudnn.verbose then
- print('AutoTuning:', perfResults[0].time,
- tonumber(perfResults[0].memory), tonumber(perfResults[0].algo))
+ print('AutoTuning:', perfResults[0].time,
+ tonumber(perfResults[0].memory), tonumber(perfResults[0].algo))
end
- else
+ else
errcheck('cudnnGetConvolutionBackwardFilterAlgorithm',
cudnn.getHandle(),
self.iDesc[0], self.oDesc[0],
self.convDesc[0], self.weightDesc[0],
algSearchMode, algWorkspaceLimit, algType)
- end
- algType[0] = self.bwmode or algType[0]
- self.bwdFilterAlgType = algType
- local bufSize = torch.LongTensor(1)
- errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize',
- cudnn.getHandle(),
- self.iDesc[0], self.oDesc[0],
- self.convDesc[0], self.weightDesc[0],
- algType[0], bufSize:data())
- maxBufSize = math.max(maxBufSize, bufSize[1])
-
- -- create backwardDataAlgorithm descriptors
- local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1)
- local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE'
- local algWorkspaceLimit = self.workspace_limit
- or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
- if self.fastest_mode then
+ end
+ algType[0] = self.bwmode or algType[0]
+ self.bwdFilterAlgType = algType
+ local bufSize = torch.LongTensor(1)
+ errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize',
+ cudnn.getHandle(),
+ self.iDesc[0], self.oDesc[0],
+ self.convDesc[0], self.weightDesc[0],
+ algType[0], bufSize:data())
+ maxBufSize = math.max(maxBufSize, bufSize[1])
+
+ -- create backwardDataAlgorithm descriptors
+ local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1)
+ local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE'
+ local algWorkspaceLimit = self.workspace_limit
+ or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+ if self.fastest_mode then
algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST'
- end
- if cudnn.benchmark then -- the manual auto-tuner is run
+ end
+ if cudnn.benchmark then -- the manual auto-tuner is run
local perfResults = ffi.new("cudnnConvolutionBwdDataAlgoPerf_t[?]", 1)
local intt = torch.IntTensor(1);
errcheck('cudnnFindConvolutionBackwardDataAlgorithm',
@@ -227,183 +225,183 @@ function SpatialConvolution:createIODescriptors(input)
1, intt:data(), perfResults)
algType[0] = perfResults[0].algo
if cudnn.verbose then
- print('AutoTuning:', perfResults[0].time,
- tonumber(perfResults[0].memory), tonumber(perfResults[0].algo))
+ print('AutoTuning:', perfResults[0].time,
+ tonumber(perfResults[0].memory), tonumber(perfResults[0].algo))
end
- else
+ else
errcheck('cudnnGetConvolutionBackwardDataAlgorithm',
cudnn.getHandle(),
self.weightDesc[0], self.oDesc[0],
self.convDesc[0], self.iDesc[0],
algSearchMode, algWorkspaceLimit, algType)
- end
- algType[0] = self.bdmode or algType[0]
- self.bwdDataAlgType = algType
- local bufSize = torch.LongTensor(1)
- errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize',
- cudnn.getHandle(),
- self.weightDesc[0], self.oDesc[0],
- self.convDesc[0], self.iDesc[0],
- algType[0], bufSize:data())
- maxBufSize = math.max(maxBufSize, bufSize[1])
-
- self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace()
- self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float
- if maxBufSize > self.extraBufferSizeInBytes then
- self.extraBuffer:resize(math.ceil(maxBufSize/4))
- self.extraBufferSizeInBytes = maxBufSize
- end
-
- -----------------------------------------------------------------------
- -- create offsets for groups
- local iH, iW = input:size(3), input:size(4)
- local kH, kW = self.kH, self.kW
- local oH, oW = oSize[3], oSize[4]
- self.input_offset = self.nInputPlane / self.groups * iH * iW
- self.output_offset = self.nOutputPlane / self.groups * oH * oW
- self.weight_offset = self.nInputPlane / self.groups * self.nOutputPlane / self.groups * kH * kW
-
- if not batch then
+ end
+ algType[0] = self.bdmode or algType[0]
+ self.bwdDataAlgType = algType
+ local bufSize = torch.LongTensor(1)
+ errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize',
+ cudnn.getHandle(),
+ self.weightDesc[0], self.oDesc[0],
+ self.convDesc[0], self.iDesc[0],
+ algType[0], bufSize:data())
+ maxBufSize = math.max(maxBufSize, bufSize[1])
+
+ self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace()
+ self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float
+ if maxBufSize > self.extraBufferSizeInBytes then
+ self.extraBuffer:resize(math.ceil(maxBufSize/4))
+ self.extraBufferSizeInBytes = maxBufSize
+ end
+
+ -----------------------------------------------------------------------
+ -- create offsets for groups
+ local iH, iW = input:size(3), input:size(4)
+ local kH, kW = self.kH, self.kW
+ local oH, oW = oSize[3], oSize[4]
+ self.input_offset = self.nInputPlane / self.groups * iH * iW
+ self.output_offset = self.nOutputPlane / self.groups * oH * oW
+ self.weight_offset = self.nInputPlane / self.groups * self.nOutputPlane / self.groups * kH * kW
+
+ if not batch then
self.gradInput = self.gradInput:view(self.gradInput:size(2),
self.gradInput:size(3),
self.gradInput:size(4))
self.output = self.output:view(self.output:size(2),
self.output:size(3),
self.output:size(4))
- end
- end
+ end
+ end
end
local one = torch.FloatTensor({1});
local zero = torch.FloatTensor({0});
function SpatialConvolution:updateOutput(input)
- if not self.weightDesc then self:resetWeightDescriptors() end
- self:createIODescriptors(input)
-
- local prevStream
- local streamQueue = {}
- if self.groups > 1 then -- try to do stream parallelization
- prevStream = cutorch.getStream()
-
- --[[
- Only if prevStream is 0, then do parallelization.
- The justification for this is that this is a hard problem; there is no
- way to know if one is doing other kinds of stream-parallelization
- (like GPUConcat), and if that's the case, streams are already
- being ideally exploited.
- --]]
-
- if prevStream == 0 then
- cutorch.reserveStreams(self.groups)
- for i=1,self.groups do
- cutorch.streamWaitFor(i, {prevStream})
- end
- end
- end
-
- for g = 0, self.groups - 1 do
- -- stream-parallelize if appropriate
- if self.groups > 1 and prevStream == 0 then
- cutorch.setStream(g + 1)
- table.insert(streamQueue, g + 1)
- end
-
- errcheck('cudnnConvolutionForward', cudnn.getHandle(),
- one:data(),
- self.iDesc[0], input:data() + g*self.input_offset,
- self.weightDesc[0], self.weight:data() + g*self.weight_offset,
- self.convDesc[0], self.fwdAlgType[0],
- self.extraBuffer:data(), self.extraBufferSizeInBytes,
- zero:data(),
- self.oDesc[0], self.output:data() + g*self.output_offset);
- end
-
- if prevStream == 0 then
- cutorch.setStream(prevStream)
- cutorch.streamWaitFor(prevStream, streamQueue)
- end
-
- -- add bias
- errcheck('cudnnAddTensor', cudnn.getHandle(),
- 'CUDNN_ADD_SAME_C',
- one:data(), self.biasDesc[0], self.bias:data(),
- one:data(), self.oDescForBias[0], self.output:data())
-
- return self.output
+ if not self.weightDesc then self:resetWeightDescriptors() end
+ self:createIODescriptors(input)
+
+ local prevStream
+ local streamQueue = {}
+ if self.groups > 1 then -- try to do stream parallelization
+ prevStream = cutorch.getStream()
+
+ --[[
+ Only if prevStream is 0, then do parallelization.
+ The justification for this is that this is a hard problem; there is no
+ way to know if one is doing other kinds of stream-parallelization
+ (like GPUConcat), and if that's the case, streams are already
+ being ideally exploited.
+ --]]
+
+ if prevStream == 0 then
+ cutorch.reserveStreams(self.groups)
+ for i=1,self.groups do
+ cutorch.streamWaitFor(i, {prevStream})
+ end
+ end
+ end
+
+ for g = 0, self.groups - 1 do
+ -- stream-parallelize if appropriate
+ if self.groups > 1 and prevStream == 0 then
+ cutorch.setStream(g + 1)
+ table.insert(streamQueue, g + 1)
+ end
+
+ errcheck('cudnnConvolutionForward', cudnn.getHandle(),
+ one:data(),
+ self.iDesc[0], input:data() + g*self.input_offset,
+ self.weightDesc[0], self.weight:data() + g*self.weight_offset,
+ self.convDesc[0], self.fwdAlgType[0],
+ self.extraBuffer:data(), self.extraBufferSizeInBytes,
+ zero:data(),
+ self.oDesc[0], self.output:data() + g*self.output_offset);
+ end
+
+ if prevStream == 0 then
+ cutorch.setStream(prevStream)
+ cutorch.streamWaitFor(prevStream, streamQueue)
+ end
+
+ -- add bias
+ errcheck('cudnnAddTensor', cudnn.getHandle(),
+ 'CUDNN_ADD_SAME_C',
+ one:data(), self.biasDesc[0], self.bias:data(),
+ one:data(), self.oDescForBias[0], self.output:data())
+
+ return self.output
end
function SpatialConvolution:updateGradInput(input, gradOutput)
- if not self.gradInput then return end
-
- assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D');
- assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous')
- if not self.weightDesc then self:resetWeightDescriptors() end
- self:createIODescriptors(input)
-
- for g = 0,self.groups - 1 do
- errcheck('cudnnConvolutionBackwardData_v3', cudnn.getHandle(),
- one:data(),
- self.weightDesc[0], self.weight:data() + g*self.weight_offset,
- self.oDesc[0], gradOutput:data() + g*self.output_offset,
- self.convDesc[0],
- self.bwdDataAlgType[0],
- self.extraBuffer:data(), self.extraBufferSizeInBytes,
- zero:data(),
- self.iDesc[0], self.gradInput:data() + g*self.input_offset);
- end
- return self.gradInput
+ if not self.gradInput then return end
+
+ assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D');
+ assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous')
+ if not self.weightDesc then self:resetWeightDescriptors() end
+ self:createIODescriptors(input)
+
+ for g = 0,self.groups - 1 do
+ errcheck('cudnnConvolutionBackwardData_v3', cudnn.getHandle(),
+ one:data(),
+ self.weightDesc[0], self.weight:data() + g*self.weight_offset,
+ self.oDesc[0], gradOutput:data() + g*self.output_offset,
+ self.convDesc[0],
+ self.bwdDataAlgType[0],
+ self.extraBuffer:data(), self.extraBufferSizeInBytes,
+ zero:data(),
+ self.iDesc[0], self.gradInput:data() + g*self.input_offset);
+ end
+ return self.gradInput
end
function SpatialConvolution:accGradParameters(input, gradOutput, scale)
- self.scaleT = self.scaleT or torch.FloatTensor(1):fill(1.0)
- -- this line forces this member to always be on CPU (needed for cudnn)
- self.scaleT = self.scaleT:float()
- scale = scale or 1.0
- self.scaleT[1] = scale
-
- assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D');
- assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous')
- if not self.weightDesc then self:resetWeightDescriptors() end
- self:createIODescriptors(input)
-
- -- gradBias
- errcheck('cudnnConvolutionBackwardBias', cudnn.getHandle(),
+ self.scaleT = self.scaleT or torch.FloatTensor(1):fill(1.0)
+ -- this line forces this member to always be on CPU (needed for cudnn)
+ self.scaleT = self.scaleT:float()
+ scale = scale or 1.0
+ self.scaleT[1] = scale
+
+ assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D');
+ assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous')
+ if not self.weightDesc then self:resetWeightDescriptors() end
+ self:createIODescriptors(input)
+
+ -- gradBias
+ errcheck('cudnnConvolutionBackwardBias', cudnn.getHandle(),
self.scaleT:data(),
self.oDescForBias[0], gradOutput:data(),
one:data(),
self.biasDesc[0], self.gradBias:data())
- for g = 0, self.groups - 1 do
- -- gradWeight
- errcheck('cudnnConvolutionBackwardFilter_v3', cudnn.getHandle(),
- self.scaleT:data(),
- self.iDesc[0], input:data() + g*self.input_offset,
- self.oDesc[0], gradOutput:data() + g*self.output_offset,
- self.convDesc[0],
- self.bwdFilterAlgType[0],
- self.extraBuffer:data(), self.extraBufferSizeInBytes,
- one:data(),
- self.weightDesc[0], self.gradWeight:data() + g*self.weight_offset);
- end
+ for g = 0, self.groups - 1 do
+ -- gradWeight
+ errcheck('cudnnConvolutionBackwardFilter_v3', cudnn.getHandle(),
+ self.scaleT:data(),
+ self.iDesc[0], input:data() + g*self.input_offset,
+ self.oDesc[0], gradOutput:data() + g*self.output_offset,
+ self.convDesc[0],
+ self.bwdFilterAlgType[0],
+ self.extraBuffer:data(), self.extraBufferSizeInBytes,
+ one:data(),
+ self.weightDesc[0], self.gradWeight:data() + g*self.weight_offset);
+ end
end
function SpatialConvolution:write(f)
- self.weightDesc = nil
- self.biasDesc = nil
- self.convDesc = nil
- self.iDesc = nil
- self.oDesc = nil
- self.oDescForBias = nil
- self.algType = nil
- self.fwdAlgType = nil
- self.bwdDataAlgType = nil
- self.bwdFilterAlgType = nil
- self.extraBuffer = nil
- self.extraBufferSizeInBytes = nil
- local var = {}
- for k,v in pairs(self) do
- var[k] = v
- end
- f:writeObject(var)
+ self.weightDesc = nil
+ self.biasDesc = nil
+ self.convDesc = nil
+ self.iDesc = nil
+ self.oDesc = nil
+ self.oDescForBias = nil
+ self.algType = nil
+ self.fwdAlgType = nil
+ self.bwdDataAlgType = nil
+ self.bwdFilterAlgType = nil
+ self.extraBuffer = nil
+ self.extraBufferSizeInBytes = nil
+ local var = {}
+ for k,v in pairs(self) do
+ var[k] = v
+ end
+ f:writeObject(var)
end
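
A minimal usage sketch of the algorithm-selection API exercised above (editorial illustration, not part of the commit): fastest() trades workspace for speed, cudnn.benchmark runs the exhaustive auto-tuner once per input geometry, and setMode() pins explicit algorithms. The enum names come from the cuDNN v3 headers declared in ffi.lua below; the module and input sizes are arbitrary.

require 'cudnn'

-- 16 -> 32 planes, 3x3 kernel, stride 1, pad 1, 2 groups
local conv = cudnn.SpatialConvolution(16, 32, 3, 3, 1, 1, 1, 1, 2):cuda()

-- option 1: prefer the fastest algorithms regardless of workspace size
conv:fastest()

-- option 2: run the exhaustive auto-tuner instead of the heuristic
cudnn.benchmark = true
cudnn.verbose = true   -- prints the 'AutoTuning: ...' lines seen above

-- option 3: pin explicit algorithms (forward, backward-data, backward-filter)
conv:setMode('CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM',
             'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1',
             'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1')

local output = conv:forward(torch.CudaTensor(8, 16, 64, 64))
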
diff --git a/VolumetricConvolution.lua b/VolumetricConvolution.lua
index fd2447d..4c73ce7 100644
--- a/VolumetricConvolution.lua
+++ b/VolumetricConvolution.lua
@@ -41,7 +41,6 @@ end
function VolumetricConvolution:fastest(mode)
if mode == nil then mode = true end
self.fastest_mode = mode
- self.iSize:fill(0)
return self
end
@@ -55,7 +54,6 @@ function VolumetricConvolution:setMode(fmode, bdmode, bwmode)
if bwmode ~= nil then
self.bwmode = bwmode
end
- self.iSize:fill(0)
return self
end
@@ -92,7 +90,7 @@ function VolumetricConvolution:createIODescriptors(input)
errcheck('cudnnSetConvolutionNdDescriptor_v3', self.convDesc[0],
3, pad:data(),
stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION',
- 'CUDNN_DATA_FLOAT');
+ 'CUDNN_DATA_FLOAT');
local function destroyConvDesc(d)
errcheck('cudnnDestroyConvolutionDescriptor', d[0]);
end
@@ -112,20 +110,20 @@ function VolumetricConvolution:createIODescriptors(input)
self.output:size(2),
self.output:size(3)*self.output:size(4),
self.output:size(5)))
- -----------------------------------------------------------------
- local maxBufSize = 0
+ -----------------------------------------------------------------
+ local maxBufSize = 0
-- create forwardAlgorithm descriptors for
local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1)
- local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
- local algWorkspaceLimit = self.workspace_limit
- or (self.nInputPlane * self.kT * self.kH * self.kW * 4) -- 4 = sizeof int/float.
- if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST' end
+ local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
+ local algWorkspaceLimit = self.workspace_limit
+ or (self.nInputPlane * self.kT * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+ if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST' end
errcheck('cudnnGetConvolutionForwardAlgorithm',
cudnn.getHandle(),
self.iDesc[0], self.weightDesc[0],
self.convDesc[0], self.oDesc[0],
algSearchMode, algWorkspaceLimit, algType)
- algType[0] = self.fmode or algType[0]
+ algType[0] = self.fmode or algType[0]
self.fwdAlgType = algType
local bufSize = torch.LongTensor(1)
errcheck('cudnnGetConvolutionForwardWorkspaceSize',
@@ -133,20 +131,20 @@ function VolumetricConvolution:createIODescriptors(input)
self.iDesc[0], self.weightDesc[0],
self.convDesc[0], self.oDesc[0],
algType[0], bufSize:data())
- maxBufSize = math.max(maxBufSize, bufSize[1])
+ maxBufSize = math.max(maxBufSize, bufSize[1])
- -- create backwardFilterAlgorithm descriptors for
+ -- create backwardFilterAlgorithm descriptors for
local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1)
- local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE'
- local algWorkspaceLimit = self.workspace_limit
- or (self.nInputPlane * self.kT * self.kH * self.kW * 4) -- 4 = sizeof int/float.
- if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST' end
+ local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE'
+ local algWorkspaceLimit = self.workspace_limit
+ or (self.nInputPlane * self.kT * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+ if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST' end
errcheck('cudnnGetConvolutionBackwardFilterAlgorithm',
cudnn.getHandle(),
self.iDesc[0], self.oDesc[0],
self.convDesc[0], self.weightDesc[0],
algSearchMode, algWorkspaceLimit, algType)
- algType[0] = self.bwmode or algType[0]
+ algType[0] = self.bwmode or algType[0]
self.bwdFilterAlgType = algType
local bufSize = torch.LongTensor(1)
errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize',
@@ -154,20 +152,20 @@ function VolumetricConvolution:createIODescriptors(input)
self.iDesc[0], self.oDesc[0],
self.convDesc[0], self.weightDesc[0],
algType[0], bufSize:data())
- maxBufSize = math.max(maxBufSize, bufSize[1])
+ maxBufSize = math.max(maxBufSize, bufSize[1])
- -- create backwardDataAlgorithm descriptors for
+ -- create backwardDataAlgorithm descriptors for
local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1)
- local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE'
- local algWorkspaceLimit = self.workspace_limit
- or (self.nInputPlane * self.kT * self.kH * self.kW * 4) -- 4 = sizeof int/float.
- if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST' end
+ local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE'
+ local algWorkspaceLimit = self.workspace_limit
+ or (self.nInputPlane * self.kT * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+ if self.fastest_mode then algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST' end
errcheck('cudnnGetConvolutionBackwardDataAlgorithm',
cudnn.getHandle(),
self.weightDesc[0], self.oDesc[0],
self.convDesc[0], self.iDesc[0],
algSearchMode, algWorkspaceLimit, algType)
- algType[0] = self.bdmode or algType[0]
+ algType[0] = self.bdmode or algType[0]
self.bwdDataAlgType = algType
local bufSize = torch.LongTensor(1)
errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize',
@@ -175,16 +173,16 @@ function VolumetricConvolution:createIODescriptors(input)
self.weightDesc[0], self.oDesc[0],
self.convDesc[0], self.iDesc[0],
algType[0], bufSize:data())
- maxBufSize = math.max(maxBufSize, bufSize[1])
+ maxBufSize = math.max(maxBufSize, bufSize[1])
- self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace()
- self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float
+ self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace()
+ self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float
if maxBufSize > self.extraBufferSizeInBytes then
self.extraBuffer:resize(math.ceil(maxBufSize/4))
self.extraBufferSizeInBytes = maxBufSize
end
- -----------------------------------------------------------------
+ -----------------------------------------------------------------
if not batch then
self.gradInput = self.gradInput:view(self.gradInput:size(2),
self.gradInput:size(3),
@@ -226,14 +224,14 @@ function VolumetricConvolution:updateGradInput(input, gradOutput)
if not self.weightDesc then self:resetWeightDescriptors() end
self:createIODescriptors(input)
errcheck('cudnnConvolutionBackwardData_v3', cudnn.getHandle(),
- one:data(),
- self.weightDesc[0], self.weight:data(),
- self.oDesc[0], gradOutput:data(),
- self.convDesc[0],
- self.bwdDataAlgType[0],
- self.extraBuffer:data(), self.extraBufferSizeInBytes,
- zero:data(),
- self.iDesc[0], self.gradInput:data());
+ one:data(),
+ self.weightDesc[0], self.weight:data(),
+ self.oDesc[0], gradOutput:data(),
+ self.convDesc[0],
+ self.bwdDataAlgType[0],
+ self.extraBuffer:data(), self.extraBufferSizeInBytes,
+ zero:data(),
+ self.iDesc[0], self.gradInput:data());
return self.gradInput
end
@@ -256,14 +254,14 @@ function VolumetricConvolution:accGradParameters(input, gradOutput, scale)
self.biasDesc[0], self.gradBias:data());
-- gradWeight
errcheck('cudnnConvolutionBackwardFilter_v3', cudnn.getHandle(),
- self.scaleT:data(),
- self.iDesc[0], input:data(),
- self.oDesc[0], gradOutput:data(),
- self.convDesc[0],
- self.bwdFilterAlgType[0],
- self.extraBuffer:data(), self.extraBufferSizeInBytes,
- one:data(),
- self.weightDesc[0], self.gradWeight:data());
+ self.scaleT:data(),
+ self.iDesc[0], input:data(),
+ self.oDesc[0], gradOutput:data(),
+ self.convDesc[0],
+ self.bwdFilterAlgType[0],
+ self.extraBuffer:data(), self.extraBufferSizeInBytes,
+ one:data(),
+ self.weightDesc[0], self.gradWeight:data());
end
function VolumetricConvolution:write(f)
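
For reference, the default workspace cap used by both convolution modules when self.workspace_limit is unset is just the byte size of one filter's input taps; a small sketch of the arithmetic, with hypothetical sizes:

-- default cap: nInputPlane * kT * kH * kW floats at 4 bytes each
-- (the spatial module uses the same formula without the kT factor)
local nInputPlane, kT, kH, kW = 16, 3, 3, 3
print(nInputPlane * kT * kH * kW * 4)   --> 1728 bytes

-- a module can raise the cap before its descriptors are (re)built, e.g.
-- conv.workspace_limit = 64 * 1024 * 1024   -- 64 MiB
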
diff --git a/ffi.lua b/ffi.lua
index d749744..2bb68b7 100644
--- a/ffi.lua
+++ b/ffi.lua
@@ -111,13 +111,13 @@ cudnnStatus_t
cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc );
cudnnStatus_t
cudnnSetConvolutionNdDescriptor_v3( cudnnConvolutionDescriptor_t convDesc,
- int arrayLength,
- const int padA[],
- const int filterStrideA[],
- const int upscaleA[],
- cudnnConvolutionMode_t mode,
- cudnnDataType_t dataType
- );
+ int arrayLength,
+ const int padA[],
+ const int filterStrideA[],
+ const int upscaleA[],
+ cudnnConvolutionMode_t mode,
+ cudnnDataType_t dataType
+ );
cudnnStatus_t
cudnnGetConvolutionNdForwardOutputDim(
@@ -156,14 +156,14 @@ typedef struct {
cudnnStatus_t
cudnnFindConvolutionForwardAlgorithm(cudnnHandle_t handle,
- const cudnnTensorDescriptor_t srcDesc,
- const cudnnFilterDescriptor_t filterDesc,
- const cudnnConvolutionDescriptor_t convDesc,
- const cudnnTensorDescriptor_t destDesc,
- const int requestedCount,
- int *returnedCount,
- cudnnConvolutionFwdAlgoPerf_t *perfResults
- );
+ const cudnnTensorDescriptor_t srcDesc,
+ const cudnnFilterDescriptor_t filterDesc,
+ const cudnnConvolutionDescriptor_t convDesc,
+ const cudnnTensorDescriptor_t destDesc,
+ const int requestedCount,
+ int *returnedCount,
+ cudnnConvolutionFwdAlgoPerf_t *perfResults
+ );
cudnnStatus_t cudnnGetConvolutionForwardAlgorithm( cudnnHandle_t handle,
@@ -243,42 +243,42 @@ cudnnStatus_t cudnnFindConvolutionBackwardFilterAlgorithm( cudnnHandle_t handle,
cudnnStatus_t
cudnnGetConvolutionBackwardFilterAlgorithm(
- cudnnHandle_t handle,
- const cudnnTensorDescriptor_t srcDesc,
- const cudnnTensorDescriptor_t diffDesc,
- const cudnnConvolutionDescriptor_t convDesc,
- const cudnnFilterDescriptor_t gradDesc,
- cudnnConvolutionBwdFilterPreference_t preference,
- size_t memoryLimitInbytes,
- cudnnConvolutionBwdFilterAlgo_t *algo
- );
+ cudnnHandle_t handle,
+ const cudnnTensorDescriptor_t srcDesc,
+ const cudnnTensorDescriptor_t diffDesc,
+ const cudnnConvolutionDescriptor_t convDesc,
+ const cudnnFilterDescriptor_t gradDesc,
+ cudnnConvolutionBwdFilterPreference_t preference,
+ size_t memoryLimitInbytes,
+ cudnnConvolutionBwdFilterAlgo_t *algo
+ );
cudnnStatus_t
cudnnGetConvolutionBackwardFilterWorkspaceSize(
- cudnnHandle_t handle,
- const cudnnTensorDescriptor_t srcDesc,
- const cudnnTensorDescriptor_t diffDesc,
- const cudnnConvolutionDescriptor_t convDesc,
- const cudnnFilterDescriptor_t gradDesc,
- cudnnConvolutionBwdFilterAlgo_t algo,
- size_t *sizeInBytes
- );
+ cudnnHandle_t handle,
+ const cudnnTensorDescriptor_t srcDesc,
+ const cudnnTensorDescriptor_t diffDesc,
+ const cudnnConvolutionDescriptor_t convDesc,
+ const cudnnFilterDescriptor_t gradDesc,
+ cudnnConvolutionBwdFilterAlgo_t algo,
+ size_t *sizeInBytes
+ );
cudnnStatus_t cudnnConvolutionBackwardFilter_v3(
- cudnnHandle_t handle,
- const void *alpha,
- const cudnnTensorDescriptor_t srcDesc,
- const void *srcData,
- const cudnnTensorDescriptor_t diffDesc,
- const void *diffData,
- const cudnnConvolutionDescriptor_t convDesc,
- cudnnConvolutionBwdFilterAlgo_t algo,
- void *workSpace,
- size_t workSpaceSizeInBytes,
- const void *beta,
- const cudnnFilterDescriptor_t gradDesc,
- void *gradData
- );
+ cudnnHandle_t handle,
+ const void *alpha,
+ const cudnnTensorDescriptor_t srcDesc,
+ const void *srcData,
+ const cudnnTensorDescriptor_t diffDesc,
+ const void *diffData,
+ const cudnnConvolutionDescriptor_t convDesc,
+ cudnnConvolutionBwdFilterAlgo_t algo,
+ void *workSpace,
+ size_t workSpaceSizeInBytes,
+ const void *beta,
+ const cudnnFilterDescriptor_t gradDesc,
+ void *gradData
+ );
typedef enum
{
@@ -312,42 +312,42 @@ cudnnStatus_t cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle_t handle,
);
cudnnStatus_t cudnnGetConvolutionBackwardDataAlgorithm(
- cudnnHandle_t handle,
- const cudnnFilterDescriptor_t filterDesc,
- const cudnnTensorDescriptor_t diffDesc,
- const cudnnConvolutionDescriptor_t convDesc,
- const cudnnTensorDescriptor_t gradDesc,
- cudnnConvolutionBwdDataPreference_t preference,
- size_t memoryLimitInbytes,
- cudnnConvolutionBwdDataAlgo_t *algo
- );
+ cudnnHandle_t handle,
+ const cudnnFilterDescriptor_t filterDesc,
+ const cudnnTensorDescriptor_t diffDesc,
+ const cudnnConvolutionDescriptor_t convDesc,
+ const cudnnTensorDescriptor_t gradDesc,
+ cudnnConvolutionBwdDataPreference_t preference,
+ size_t memoryLimitInbytes,
+ cudnnConvolutionBwdDataAlgo_t *algo
+ );
cudnnStatus_t cudnnGetConvolutionBackwardDataWorkspaceSize(
- cudnnHandle_t handle,
- const cudnnFilterDescriptor_t filterDesc,
- const cudnnTensorDescriptor_t diffDesc,
- const cudnnConvolutionDescriptor_t convDesc,
- const cudnnTensorDescriptor_t gradDesc,
- cudnnConvolutionBwdDataAlgo_t algo,
- size_t *sizeInBytes
- );
+ cudnnHandle_t handle,
+ const cudnnFilterDescriptor_t filterDesc,
+ const cudnnTensorDescriptor_t diffDesc,
+ const cudnnConvolutionDescriptor_t convDesc,
+ const cudnnTensorDescriptor_t gradDesc,
+ cudnnConvolutionBwdDataAlgo_t algo,
+ size_t *sizeInBytes
+ );
cudnnStatus_t cudnnConvolutionBackwardData_v3(
- cudnnHandle_t handle,
- const void *alpha,
- const cudnnFilterDescriptor_t filterDesc,
- const void *filterData,
- const cudnnTensorDescriptor_t diffDesc,
- const void *diffData,
- const cudnnConvolutionDescriptor_t convDesc,
- cudnnConvolutionBwdDataAlgo_t algo,
- void *workSpace,
- size_t workSpaceSizeInBytes,
- const void *beta,
- const cudnnTensorDescriptor_t gradDesc,
- void *gradData
- );
+ cudnnHandle_t handle,
+ const void *alpha,
+ const cudnnFilterDescriptor_t filterDesc,
+ const void *filterData,
+ const cudnnTensorDescriptor_t diffDesc,
+ const void *diffData,
+ const cudnnConvolutionDescriptor_t convDesc,
+ cudnnConvolutionBwdDataAlgo_t algo,
+ void *workSpace,
+ size_t workSpaceSizeInBytes,
+ const void *beta,
+ const cudnnTensorDescriptor_t gradDesc,
+ void *gradData
+ );
typedef enum
@@ -365,19 +365,19 @@ typedef enum
/* Function to perform forward softmax */
cudnnStatus_t cudnnSoftmaxForward( cudnnHandle_t handle,
- cudnnSoftmaxAlgorithm_t algorithm,
- cudnnSoftmaxMode_t mode,
- const void *alpha,
- const cudnnTensorDescriptor_t srcDesc,
- const void *srcData,
- const void *beta,
- const cudnnTensorDescriptor_t destDesc,
- void *destData
- );
+ cudnnSoftmaxAlgorithm_t algorithm,
+ cudnnSoftmaxMode_t mode,
+ const void *alpha,
+ const cudnnTensorDescriptor_t srcDesc,
+ const void *srcData,
+ const void *beta,
+ const cudnnTensorDescriptor_t destDesc,
+ void *destData
+ );
/* Function to perform backward softmax */
cudnnStatus_t cudnnSoftmaxBackward(
- cudnnHandle_t handle,
+ cudnnHandle_t handle,
cudnnSoftmaxAlgorithm_t algorithm,
cudnnSoftmaxMode_t mode,
const void *alpha,
@@ -399,15 +399,15 @@ typedef enum
} cudnnPoolingMode_t;
cudnnStatus_t cudnnCreatePoolingDescriptor(
- cudnnPoolingDescriptor_t *poolingDesc);
+ cudnnPoolingDescriptor_t *poolingDesc);
cudnnStatus_t cudnnSetPoolingNdDescriptor(
- cudnnPoolingDescriptor_t poolingDesc,
+ cudnnPoolingDescriptor_t poolingDesc,
const cudnnPoolingMode_t mode,
int nbDims,
const int windowDimA[],
const int paddingA[],
const int strideA[]
- );
+ );
cudnnStatus_t cudnnGetPoolingNdDescriptor(
const cudnnPoolingDescriptor_t poolingDesc,
@@ -421,12 +421,12 @@ cudnnStatus_t cudnnGetPoolingNdDescriptor(
cudnnStatus_t cudnnGetPoolingNdForwardOutputDim(
const cudnnPoolingDescriptor_t poolingDesc,
- const cudnnTensorDescriptor_t inputTensorDesc,
- int nbDims,
- int outputTensorDimA[]);
+ const cudnnTensorDescriptor_t inputTensorDesc,
+ int nbDims,
+ int outputTensorDimA[]);
cudnnStatus_t cudnnDestroyPoolingDescriptor(
- cudnnPoolingDescriptor_t poolingDesc );
+ cudnnPoolingDescriptor_t poolingDesc );
cudnnStatus_t cudnnPoolingForward( cudnnHandle_t handle,
const cudnnPoolingDescriptor_t poolingDesc,
@@ -491,46 +491,46 @@ typedef enum
} cudnnLRNMode_t;
cudnnStatus_t cudnnSetLRNDescriptor(
- cudnnLRNDescriptor_t normDesc,
- unsigned lrnN,
- double lrnAlpha,
- double lrnBeta,
- double lrnK);
+ cudnnLRNDescriptor_t normDesc,
+ unsigned lrnN,
+ double lrnAlpha,
+ double lrnBeta,
+ double lrnK);
cudnnStatus_t cudnnGetLRNDescriptor(
- cudnnLRNDescriptor_t normDesc,
- unsigned* lrnN,
- double* lrnAlpha,
- double* lrnBeta,
- double* lrnK);
+ cudnnLRNDescriptor_t normDesc,
+ unsigned* lrnN,
+ double* lrnAlpha,
+ double* lrnBeta,
+ double* lrnK);
cudnnStatus_t cudnnDestroyLRNDescriptor( cudnnLRNDescriptor_t lrnDesc );
cudnnStatus_t cudnnLRNCrossChannelForward(
- cudnnHandle_t handle,
- cudnnLRNDescriptor_t normDesc,
- cudnnLRNMode_t lrnMode,
- const void* alpha,
- const cudnnTensorDescriptor_t srcDesc,
- const void *srcData,
- const void *beta,
- const cudnnTensorDescriptor_t destDesc,
- void *destData);
+ cudnnHandle_t handle,
+ cudnnLRNDescriptor_t normDesc,
+ cudnnLRNMode_t lrnMode,
+ const void* alpha,
+ const cudnnTensorDescriptor_t srcDesc,
+ const void *srcData,
+ const void *beta,
+ const cudnnTensorDescriptor_t destDesc,
+ void *destData);
cudnnStatus_t cudnnLRNCrossChannelBackward(
- cudnnHandle_t handle,
- cudnnLRNDescriptor_t normDesc,
- cudnnLRNMode_t lrnMode,
- const void* alpha,
- const cudnnTensorDescriptor_t srcDesc,
- const void *srcData,
- const cudnnTensorDescriptor_t srcDiffDesc,
- const void *srcDiffData,
- const cudnnTensorDescriptor_t destDesc,
- const void *destData,
- const void *beta,
- const cudnnTensorDescriptor_t destDiffDesc,
- void *destDiffData);
+ cudnnHandle_t handle,
+ cudnnLRNDescriptor_t normDesc,
+ cudnnLRNMode_t lrnMode,
+ const void* alpha,
+ const cudnnTensorDescriptor_t srcDesc,
+ const void *srcData,
+ const cudnnTensorDescriptor_t srcDiffDesc,
+ const void *srcDiffData,
+ const cudnnTensorDescriptor_t destDesc,
+ const void *destData,
+ const void *beta,
+ const cudnnTensorDescriptor_t destDiffDesc,
+ void *destDiffData);
typedef enum
{
@@ -538,36 +538,36 @@ typedef enum
} cudnnDivNormMode_t;
cudnnStatus_t cudnnDivisiveNormalizationForward(
- cudnnHandle_t handle,
- cudnnLRNDescriptor_t normDesc,
- cudnnDivNormMode_t mode,
- const void *alpha,
- const cudnnTensorDescriptor_t srcDesc,
- const void *srcData,
- const void *srcMeansData,
- void *tempData,
- void *tempData2,
- const void *beta,
- const cudnnTensorDescriptor_t destDesc,
- void *destData
- );
+ cudnnHandle_t handle,
+ cudnnLRNDescriptor_t normDesc,
+ cudnnDivNormMode_t mode,
+ const void *alpha,
+ const cudnnTensorDescriptor_t srcDesc,
+ const void *srcData,
+ const void *srcMeansData,
+ void *tempData,
+ void *tempData2,
+ const void *beta,
+ const cudnnTensorDescriptor_t destDesc,
+ void *destData
+ );
cudnnStatus_t cudnnDivisiveNormalizationBackward(
- cudnnHandle_t handle,
- cudnnLRNDescriptor_t normDesc,
- cudnnDivNormMode_t mode,
- const void *alpha,
- const cudnnTensorDescriptor_t srcDesc,
- const void *srcData,
- const void *srcMeansData,
- const void *srcDiffData,
- void *tempData,
- void *tempData2,
- const void *betaData,
- const cudnnTensorDescriptor_t destDataDesc,
- void *destDataDiff,
- void *destMeansDiff
- );
+ cudnnHandle_t handle,
+ cudnnLRNDescriptor_t normDesc,
+ cudnnDivNormMode_t mode,
+ const void *alpha,
+ const cudnnTensorDescriptor_t srcDesc,
+ const void *srcData,
+ const void *srcMeansData,
+ const void *srcDiffData,
+ void *tempData,
+ void *tempData2,
+ const void *betaData,
+ const cudnnTensorDescriptor_t destDataDesc,
+ void *destDataDiff,
+ void *destMeansDiff
+ );
]]
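
The declarations above feed LuaJIT's FFI; below is a simplified sketch of the errcheck pattern used throughout the modules (the real wrapper in cudnn.torch's init.lua also resolves the library path from the environment). It assumes the cdef block above has already been registered:

local ffi = require 'ffi'
local C = ffi.load('cudnn')

-- look up the cdef'd symbol by name, call it, and raise on failure;
-- LuaJIT allows comparing an enum value against its constant's name
local function errcheck(f, ...)
   local status = C[f](...)
   if status ~= 'CUDNN_STATUS_SUCCESS' then
      error('Error in cuDNN: ' .. ffi.string(C.cudnnGetErrorString(status)))
   end
end

-- e.g., creating a convolution descriptor as in SpatialConvolution.lua:
local convDesc = ffi.new('struct cudnnConvolutionStruct*[1]')
errcheck('cudnnCreateConvolutionDescriptor', convDesc)
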
diff --git a/functional.lua b/functional.lua
index 5a343c5..66bb4d6 100644
--- a/functional.lua
+++ b/functional.lua
@@ -181,7 +181,7 @@ cudnn.functional.Convolution2D_updateGradInput = function(handle, input, weight,
weightDesc[0], weight:data(),
oDesc[0], gradOutput:data(),
convDesc[0],
- algType[0],
+ algType[0],
NULL, 0,
zero:data(),
iDesc[0], gradInput:data());
diff --git a/test/test.lua b/test/test.lua
index 4062425..8c22ece 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -692,15 +692,11 @@ end
function cudnntest.LogSoftMax_batch()
local bs = math.random(1,32)
local from = math.random(1,32)
- local outi = math.random(1,64)
- local outj = math.random(1,64)
- local ini = outi
- local inj = outj
- local input = torch.randn(bs,from,inj,ini):cuda()
- local gradOutput = torch.randn(bs,from,outj,outi):cuda()
+ local input = torch.randn(bs,from):cuda()
+ local gradOutput = torch.randn(bs,from):cuda()
local sconv = nn.LogSoftMax():cuda()
- local groundtruth = sconv:forward(input:view(bs,-1))
+ local groundtruth = sconv:forward(input)
local groundgrad = sconv:backward(input, gradOutput)
cutorch.synchronize()
local gconv = cudnn.LogSoftMax():cuda()
@@ -713,8 +709,8 @@ function cudnntest.LogSoftMax_batch()
local rescuda = gconv:forward(input)
local resgrad = gconv:backward(input, gradOutput)
cutorch.synchronize()
- mytester:asserteq(rescuda:dim(), 4, 'error in dimension')
- mytester:asserteq(resgrad:dim(), 4, 'error in dimension')
+ mytester:asserteq(rescuda:dim(), 2, 'error in dimension')
+ mytester:asserteq(resgrad:dim(), 2, 'error in dimension')
local error = rescuda:float() - groundtruth:float()
mytester:assertlt(error:abs():max(),