author    | Natalia Gimelshein <ngimelshein@nvidia.com> | 2016-10-06 02:47:28 +0300
committer | Natalia Gimelshein <ngimelshein@nvidia.com> | 2016-10-08 02:03:02 +0300
commit    | 068a0d2a85a3090d324656a2d7cf238952e8a91f (patch)
tree      | b2fdc08004b181bdfaf89e1db8a9884d55d10288
parent    | 94eb9baea49c9e69f8688de08cca9a6a05f35e9a (diff)
make VolumetricFullConvolution use find
-rw-r--r-- | VolumetricFullConvolution.lua | 329
-rw-r--r-- | test/test.lua                 |   2
2 files changed, 63 insertions, 268 deletions
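The change deletes this file's private autotunerCache — three shape-keyed tables covering the forward, backwardFilter, and backwardData passes — and delegates algorithm selection to the shared cudnn.find module, the same path cudnn.SpatialConvolution already uses. As a plain-Lua illustration of the caching the deleted code implemented (a hedged sketch, not code from this commit; the `benchmark` callback stands in for cudnn's cudnnFindConvolution*Algorithm calls):

```lua
-- Sketch of the shape-keyed algorithm memoization that the deleted
-- autotunerCache implemented per layer, and that cudnn.find centralizes.
local cache = { forward = {}, backwardFilter = {}, backwardData = {} }

local function shape(sizes)
   return table.concat(sizes, 'x')   -- e.g. {16,8,3,3,3} -> "16x8x3x3x3"
end

local function algorithmFor(pass, weightSize, inputSize, outputSize, benchmark)
   -- key mirrors the deleted autotunerHash: weight;input;output shapes
   local key = shape(weightSize) .. ';' .. shape(inputSize) .. ';' .. shape(outputSize)
   local hit = cache[pass][key]
   if hit then return hit end        -- reuse the previously benchmarked algo
   local algo = benchmark()          -- expensive: time the candidate algorithms
   cache[pass][key] = algo
   return algo
end

-- first call benchmarks; a second call with the same shapes is a cache hit
local a1 = algorithmFor('forward', {16,8,3,3,3}, {4,16,8,8,8}, {4,8,10,10,10},
                        function() return 1 end)
local a2 = algorithmFor('forward', {16,8,3,3,3}, {4,16,8,8,8}, {4,8,10,10,10},
                        function() error('should have been a cache hit') end)
assert(a1 == a2)
```

One cache per convolution pass, keyed by weight/input/output shapes, is exactly the structure of the deleted code in the first file below; moving it into cudnn.find also removes the per-layer workspace-sizing logic that each module used to duplicate.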
diff --git a/VolumetricFullConvolution.lua b/VolumetricFullConvolution.lua
index 3cc43a3..d0d00f2 100644
--- a/VolumetricFullConvolution.lua
+++ b/VolumetricFullConvolution.lua
@@ -1,65 +1,32 @@
 local VolumetricFullConvolution, parent = torch.class('cudnn.VolumetricFullConvolution', 'nn.VolumetricFullConvolution')
 local ffi = require 'ffi'
-local errcheck = cudnn.errcheck
+local find = require 'cudnn.find'
+local errcheck = find.errcheck

-local autotunerCache = {}
-autotunerCache[1] = {} -- forward
-autotunerCache[2] = {} -- backwardFilter
-autotunerCache[3] = {} -- backwardData
+local Convolution = cudnn.SpatialConvolution

 -- if you change the configuration of the module manually, call this
 function VolumetricFullConvolution:resetWeightDescriptors()
-    assert(cudnn.typemap[torch.typename(self.weight)], 'Only Cuda supported duh!')
-    assert(cudnn.typemap[torch.typename(self.bias)] or not self.bias, 'Only Cuda supported duh!')
-    -- create filterDescriptor for weight
-    self.weightDesc = ffi.new('struct cudnnFilterStruct*[1]')
-    errcheck('cudnnCreateFilterDescriptor', self.weightDesc)
     local desc = torch.IntTensor({self.nInputPlane, self.nOutputPlane,
                                   self.kT, self.kH, self.kW})
-    errcheck('cudnnSetFilterNdDescriptor', self.weightDesc[0],
-             cudnn.typemap[torch.typename(self.weight)], 'CUDNN_TENSOR_NCHW', 5,
-             desc:data());
-    local function destroyWDesc(d)
-        errcheck('cudnnDestroyFilterDescriptor', d[0]);
-    end
-    ffi.gc(self.weightDesc, destroyWDesc)
-
-    -- create descriptor for bias
-    self.biasDesc = cudnn.toDescriptor(self.bias:view(1, self.nOutputPlane,
-                                                      1, 1))
+    return Convolution.resetWeightDescriptors(self,desc)
 end

 function VolumetricFullConvolution:fastest(mode)
-    if mode == nil then mode = true end
-    self.fastest_mode = mode
-    self.iSize = self.iSize or torch.LongStorage(5)
-    self.iSize:fill(0)
-    return self
+    return Convolution.fastest(self, mode)
 end
+
 function VolumetricFullConvolution:setMode(fmode, bdmode, bwmode)
-    if fmode ~= nil then
-        self.fmode = fmode
-    end
-    if bdmode ~= nil then
-        self.bdmode = bdmode
-    end
-    if bwmode ~= nil then
-        self.bwmode = bwmode
-    end
-    self.iSize = self.iSize or torch.LongStorage(5)
-    self.iSize:fill(0)
-    return self
+    return Convolution.setMode(self, fmode, bdmode, bwmode)
 end

 function VolumetricFullConvolution:resetMode()
-    self.fmode = nil
-    self.bdmode = nil
-    self.bwmode = nil
-    return self
+    return Convolution.resetMode(self)
 end
+
 function VolumetricFullConvolution:createIODescriptors(input)
     local batch = true
     if input:dim() == 4 then
@@ -69,27 +36,21 @@ function VolumetricFullConvolution:createIODescriptors(input)
     end
     assert(input:dim() == 5 and input:isContiguous());
     self.iSize = self.iSize or torch.LongStorage(5):fill(0)
-    if not self.iDesc or not self.oDesc or
-        input:size(1) ~= self.iSize[1] or input:size(2) ~= self.iSize[2]
-    or input:size(3) ~= self.iSize[3] or input:size(4) ~= self.iSize[4]
-    or input:size(5) ~= self.iSize[5] then
-        self.iSize = input:size()
+    if Convolution.checkInputChanged(self, input) then
+        local input_slice = input[{{},{1,self.nInputPlane},{},{}}]
+        self.iDesc = cudnn.toDescriptor(input_slice)
         -- create input descriptor
-        self.iDesc = cudnn.toDescriptor(input)
+--        self.iDesc = cudnn.toDescriptor(input)
         -- create conv descriptor
-        self.convDesc = ffi.new('struct cudnnConvolutionStruct*[1]')
-        errcheck('cudnnCreateConvolutionDescriptor', self.convDesc)
-        local pad = torch.IntTensor({self.padT, self.padH, self.padW})
-        local stride = torch.IntTensor({self.dT, self.dH, self.dW})
+        self.convDesc = cudnn.createDescriptors(1, 'struct cudnnConvolutionStruct*[?]',
+            'cudnnCreateConvolutionDescriptor', 'cudnnDestroyConvolutionDescriptor')
+        self.pad = torch.IntTensor({self.padT, self.padH, self.padW})
+        self.stride = torch.IntTensor({self.dT, self.dH, self.dW})
         local upscale = torch.IntTensor({1,1,1})
-        errcheck('cudnnSetConvolutionNdDescriptor', self.convDesc[0],
-                 3, pad:data(),
-                 stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION',
+        errcheck(self, 'cudnnSetConvolutionNdDescriptor', self.convDesc[0],
+                 3, self.pad:data(),
+                 self.stride:data(), upscale:data(), 'CUDNN_CROSS_CORRELATION',
                  cudnn.configmap(torch.type(self.weight)));
-        local function destroyConvDesc(d)
-            errcheck('cudnnDestroyConvolutionDescriptor', d[0]);
-        end
-        ffi.gc(self.convDesc, destroyConvDesc)

         -- get output shape, resize output
         local iwidth = input:size(5)
@@ -102,196 +63,23 @@ function VolumetricFullConvolution:createIODescriptors(input)
         self.output:resize(oSize:long():storage())

         -- create descriptor for output
-        local output_slice = {{},{1,self.nOutputPlane},{},{}}
-        self.oDesc = cudnn.toDescriptor(self.output[output_slice])
-        self.oDescBias = cudnn.toDescriptor(
+        local output_slice = self.output[{{},{1,self.nOutputPlane},{},{}}]
+        self.oDesc = cudnn.toDescriptor(output_slice)
+        self.oDescForBias = cudnn.toDescriptor(
             self.output:view(self.output:size(1),
                              self.output:size(2),
                              self.output:size(3)*self.output:size(4),
                              self.output:size(5)))
-
-        -----------------------------------------------------------------------
-        local function shape(x)
-            return table.concat(x:size():totable(),'x')
-        end
-        local autotunerHash = shape(self.weight) .. ';'
-            .. shape(input) .. ';'
-            .. shape(self.output)
-
-        local maxBufSize = 0
-
-        -- create forwardAlgorithm descriptors
-        local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1)
-        local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
-        local algWorkspaceLimit = self.workspace_limit
-            or (self.nOutputPlane * self.kT * self.kH * self.kW * 4) -- 4 = sizeof int/float.
-
-        if self.fastest_mode or cudnn.fastest == true then
-            algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST'
-        end
-
-        if cudnn.benchmark then -- the manual auto-tuner is run
-            if autotunerCache[1][autotunerHash] then
-                algType[0] = autotunerCache[1][autotunerHash]
-                if cudnn.verbose then
-                    print('Autotuning VMC FW: using cached algo = ', algType[0], ' for: ', autotunerHash)
-                end
-            else
-                local perfResults = ffi.new("cudnnConvolutionFwdAlgoPerf_t[?]", 1)
-                local intt = torch.IntTensor(1);
-                errcheck('cudnnFindConvolutionForwardAlgorithm',
-                         cudnn.getHandle(),
-                         self.oDesc[0], self.weightDesc[0],
-                         self.convDesc[0], self.iDesc[0],
-                         1, intt:data(), perfResults)
-                algType[0] = perfResults[0].algo
-                autotunerCache[1][autotunerHash] = perfResults[0].algo
-                if cudnn.verbose then
-                    print(string.format(
-                              "\nAutotuning VMC Forward: Time: %3.5f Memory: %8d Algorithm: %d"
-                                  .. " Weight: %15s Input: %15s Output: %15s",
-                              perfResults[0].time, tonumber(perfResults[0].memory),
-                              tonumber(perfResults[0].algo),
-                              shape(self.weight), shape(input),
-                              shape(self.output)))
-                end
-            end
-        else
-            errcheck('cudnnGetConvolutionForwardAlgorithm',
-                     cudnn.getHandle(),
-                     self.oDesc[0], self.weightDesc[0],
-                     self.convDesc[0], self.iDesc[0],
-                     algSearchMode, algWorkspaceLimit, algType)
-        end
-        algType[0] = self.fmode or algType[0]
-        self.fwdAlgType = algType
-        local bufSize = torch.LongTensor(1)
-        errcheck('cudnnGetConvolutionForwardWorkspaceSize',
-                 cudnn.getHandle(),
-                 self.oDesc[0], self.weightDesc[0],
-                 self.convDesc[0], self.iDesc[0],
-                 algType[0], bufSize:data())
-        maxBufSize = math.max(maxBufSize, bufSize[1])
-
-        -- create backwardFilterAlgorithm descriptors
-        local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1)
-        local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE'
-        local algWorkspaceLimit = self.workspace_limit
-            or (self.nInputPlane * self.kT * self.kH * self.kW * 4) -- 4 = sizeof int/float.
-        if self.fastest_mode or cudnn.fastest == true then
-            algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST'
-        end
-
-        if cudnn.benchmark then -- the manual auto-tuner is run
-            if autotunerCache[2][autotunerHash] then
-                algType[0] = autotunerCache[2][autotunerHash]
-                if cudnn.verbose then
-                    print('Autotuning VMC BWF: using cached algo = ', algType[0], ' for: ', autotunerHash)
-                end
-            else
-                local perfResults = ffi.new("cudnnConvolutionBwdFilterAlgoPerf_t[?]", 1)
-                local intt = torch.IntTensor(1);
-                errcheck('cudnnFindConvolutionBackwardFilterAlgorithm',
-                         cudnn.getHandle(),
-                         self.oDesc[0], self.iDesc[0],
-                         self.convDesc[0], self.weightDesc[0],
-                         1, intt:data(), perfResults)
-                algType[0] = perfResults[0].algo
-                autotunerCache[2][autotunerHash] = perfResults[0].algo
-                if cudnn.verbose then
-                    print(string.format(
-                              "Autotuning backwardFilter: Time: %3.5f Memory: %8d Algorithm: %d"
-                                  .. " Weight: %15s Input: %15s Output: %15s",
-                              perfResults[0].time, tonumber(perfResults[0].memory),
-                              tonumber(perfResults[0].algo),
-                              shape(self.weight), shape(input),
-                              shape(self.output)))
-                end
-            end
-        else
-            errcheck('cudnnGetConvolutionBackwardFilterAlgorithm',
-                     cudnn.getHandle(),
-                     self.oDesc[0], self.iDesc[0],
-                     self.convDesc[0], self.weightDesc[0],
-                     algSearchMode, algWorkspaceLimit, algType)
-        end
-        algType[0] = self.bwmode or algType[0]
-        self.bwdFilterAlgType = algType
-        local bufSize = torch.LongTensor(1)
-        errcheck('cudnnGetConvolutionBackwardFilterWorkspaceSize',
-                 cudnn.getHandle(),
-                 self.oDesc[0], self.iDesc[0],
-                 self.convDesc[0], self.weightDesc[0],
-                 algType[0], bufSize:data())
-        maxBufSize = math.max(maxBufSize, bufSize[1])
-
-        -- create backwardDataAlgorithm descriptors
-        local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1)
-        local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE'
-        local algWorkspaceLimit = self.workspace_limit
-            or (self.nOutputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
-        if self.fastest_mode or cudnn.fastest == true then
-            algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST'
-        end
-        if cudnn.benchmark then -- the manual auto-tuner is run
-            if autotunerCache[3][autotunerHash] then
-                algType[0] = autotunerCache[3][autotunerHash]
-                if cudnn.verbose then
-                    print('Autotuning VMC BWD: using cached algo = ', algType[0], ' for: ', autotunerHash)
-                end
-            else
-                local perfResults = ffi.new("cudnnConvolutionBwdDataAlgoPerf_t[?]", 1)
-                local intt = torch.IntTensor(1);
-                errcheck('cudnnFindConvolutionBackwardDataAlgorithm',
-                         cudnn.getHandle(),
-                         self.weightDesc[0], self.iDesc[0],
-                         self.convDesc[0], self.oDesc[0],
-                         1, intt:data(), perfResults)
-                algType[0] = perfResults[0].algo
-                autotunerCache[3][autotunerHash] = perfResults[0].algo
-                if cudnn.verbose then
-                    print(string.format(
-                              "Autotuning backwardData: Time: %3.5f Memory: %8d Algorithm: %d"
-                                  .. " Weight: %15s Input: %15s Output: %15s\n",
-                              perfResults[0].time, tonumber(perfResults[0].memory),
-                              tonumber(perfResults[0].algo),
-                              shape(self.weight), shape(input),
-                              shape(self.output)))
-                end
-            end
-        else
-            errcheck('cudnnGetConvolutionBackwardDataAlgorithm',
-                     cudnn.getHandle(),
-                     self.weightDesc[0], self.iDesc[0],
-                     self.convDesc[0], self.oDesc[0],
-                     algSearchMode, algWorkspaceLimit, algType)
-        end
-        algType[0] = self.bdmode or algType[0]
-        self.bwdDataAlgType = algType
-        local bufSize = torch.LongTensor(1)
-        errcheck('cudnnGetConvolutionBackwardDataWorkspaceSize',
-                 cudnn.getHandle(),
-                 self.weightDesc[0], self.iDesc[0],
-                 self.convDesc[0], self.oDesc[0],
-                 algType[0], bufSize:data())
-        maxBufSize = math.max(maxBufSize, bufSize[1])
-
-        self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace()
-        self.extraBuffer = self.extraBuffer:cuda() -- always force float
-        self.extraBufferSizeInBytes =
-            self.extraBuffer:nElement() * 4 -- extraBuffer is always float
-        if maxBufSize > self.extraBufferSizeInBytes then
-            self.extraBuffer:resize(math.ceil(maxBufSize / 4))
-            self.extraBufferSizeInBytes = maxBufSize
-        end
-        -----------------------------------------------------------------------
-
-        if not batch then
+        self.input_offset = 0
+        self.output_offset = 0
+        self.weight_offset = 0
+        find:prepare(self, input_slice, output_slice)
+        if not batch then
             self.output = self.output:view(self.output:size(2),
                                            self.output:size(3),
                                            self.output:size(4),
                                            self.output:size(5))
-        end
+        end
     end
 end

@@ -315,23 +103,30 @@ end
 function VolumetricFullConvolution:updateOutput(input)
     if not self.weightDesc then self:resetWeightDescriptors() end
     self:createIODescriptors(input)
-
+    local finder = find.get()
     -- Because SpatialFullConvolution is performing the adjoint of the forward
     -- convolution operator, we need to swap the forward and backward passes.
-    errcheck('cudnnConvolutionBackwardData', cudnn.getHandle(),
+
+
+    local bwdDataAlgo = finder:backwardDataAlgorithm(self, {self.weightDesc[0], self.weight,
+                                                            self.iDesc[0],self.input_slice,
+                                                            self.convDesc[0], self.oDesc[0], self.output_slice})
+    local extraBuffer, extraBufferSize = cudnn.getSharedWorkspace()
+
+    errcheck(self, 'cudnnConvolutionBackwardData', cudnn.getHandle(),
             cudnn.scalar(input, 1),
             self.weightDesc[0], self.weight:data(),
             self.iDesc[0], input:data(),
-            self.convDesc[0], self.bwdDataAlgType[0],
-            self.extraBuffer:data(), self.extraBufferSizeInBytes,
+            self.convDesc[0], bwdDataAlgo,
+            extraBuffer, extraBufferSize,
             cudnn.scalar(input, 0),
             self.oDesc[0], self.output:data())

     -- add bias
     if self.bias then
-        errcheck('cudnnAddTensor', cudnn.getHandle(),
+        errcheck(self, 'cudnnAddTensor', cudnn.getHandle(),
                  cudnn.scalar(input, 1), self.biasDesc[0], self.bias:data(),
-                 cudnn.scalar(input, 1), self.oDescBias[0], self.output:data())
+                 cudnn.scalar(input, 1), self.oDescForBias[0], self.output:data())
     end

     return self.output
@@ -345,14 +140,19 @@ function VolumetricFullConvolution:updateGradInput(input, gradOutput)
     assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous')
     if not self.weightDesc then self:resetWeightDescriptors() end
     self:createIODescriptors(input)
+    local finder = find.get()
+    local fwdAlgo = finder:forwardAlgorithm(self, {self.oDesc[0], self.output_slice,
+                                                   self.weightDesc[0], self.weight,
+                                                   self.convDesc[0], self.iDesc[0], self.input_slice})
+    local extraBuffer, extraBufferSize = cudnn.getSharedWorkspace()

-    errcheck('cudnnConvolutionForward', cudnn.getHandle(),
+    errcheck(self,'cudnnConvolutionForward', cudnn.getHandle(),
             cudnn.scalar(input, 1),
             self.oDesc[0], gradOutput:data(),
             self.weightDesc[0], self.weight:data(),
             self.convDesc[0],
-            self.fwdAlgType[0],
-            self.extraBuffer:data(), self.extraBufferSizeInBytes,
+            fwdAlgo,
+            extraBuffer, extraBufferSize,
             cudnn.scalar(input, 0),
             self.iDesc[0], self.gradInput:data());
     return self.gradInput
@@ -372,36 +172,31 @@ function VolumetricFullConvolution:accGradParameters(input, gradOutput, scale)
     self:createIODescriptors(input)
     if not self.weightDesc then self:resetWeightDescriptors() end
     -- gradBias
-    errcheck('cudnnConvolutionBackwardBias', cudnn.getHandle(),
+
+    local finder = find.get()
+    local bwdFilterAlgo = finder:backwardFilterAlgorithm(self, {self.oDesc[0], self.output_slice,
+                                                                self.iDesc[0], self.input_slice,
+                                                                self.convDesc[0], self.weightDesc[0], self.weight})
+    errcheck(self, 'cudnnConvolutionBackwardBias', cudnn.getHandle(),
             self.scaleT:data(),
-            self.oDescBias[0], gradOutput:data(),
+            self.oDescForBias[0], gradOutput:data(),
             cudnn.scalar(input, 1),
             self.biasDesc[0], self.gradBias:data());
+    local extraBuffer, extraBufferSize = cudnn.getSharedWorkspace()
     -- gradWeight
-    errcheck('cudnnConvolutionBackwardFilter', cudnn.getHandle(),
+    errcheck(self, 'cudnnConvolutionBackwardFilter', cudnn.getHandle(),
             self.scaleT:data(),
             self.oDesc[0], gradOutput:data(),
             self.iDesc[0], input:data(),
             self.convDesc[0],
-            self.bwdFilterAlgType[0],
-            self.extraBuffer:data(), self.extraBufferSizeInBytes,
+            bwdFilterAlgo,
+            extraBuffer, extraBufferSize,
             cudnn.scalar(input, 1),
             self.weightDesc[0], self.gradWeight:data());
 end

 function VolumetricFullConvolution:clearDesc()
-    self.weightDesc = nil
-    self.biasDesc = nil
-    self.convDesc = nil
-    self.iDesc = nil
-    self.oDesc = nil
-    self.oDescBias = nil
-    self.fwdAlgType = nil
-    self.bwdDataAlgType = nil
-    self.bwdFilterAlgType = nil
-    self.extraBuffer = nil
-    self.extraBufferInBytes = nil
-    self.scaleT = nil
+    return Convolution.clearDesc(self)
 end

 function VolumetricFullConvolution:write(f)
diff --git a/test/test.lua b/test/test.lua
index 86c4d70..40d730c 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -944,7 +944,7 @@ mytester:add(cudnntest)

 -- cudnn.verbose=true
 -- cudnn.find.verbose=true
-cudnn.useFindEx=true
+cudnn.useFindEx=false

 for i = 1, cutorch.getDeviceCount() do
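For trying the patched module, a hypothetical usage sketch (not part of this commit; requires a CUDA-enabled Torch install with cudnn.torch, and the layer sizes are illustrative):

```lua
require 'cudnn'

-- flags referenced in this commit's code and test change
cudnn.benchmark = true    -- run the algorithm search (now inside cudnn.find)
cudnn.fastest   = false   -- honor workspace limits instead of always picking fastest
cudnn.useFindEx = false   -- the test hunk above now disables FindEx

-- nInputPlane=8, nOutputPlane=4, 3x3x3 kernel, stride 2, no padding
local m = cudnn.VolumetricFullConvolution(8, 4, 3, 3, 3, 2, 2, 2):cuda()
local input = torch.CudaTensor(2, 8, 8, 8, 8):normal()
local output = m:forward(input)   -- first call triggers the algorithm search
print(output:size())              -- 2x4x17x17x17 here, since (8-1)*2 + 3 = 17
```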