-rw-r--r-- | BatchNormalization.lua           | 33
-rw-r--r-- | Pointwise.lua                    | 11
-rw-r--r-- | Pooling.lua                      | 11
-rw-r--r-- | Pooling3D.lua                    | 13
-rw-r--r-- | SpatialConvolution.lua           | 39
-rw-r--r-- | SpatialCrossMapLRN.lua           | 12
-rw-r--r-- | SpatialDivisiveNormalization.lua | 12
-rw-r--r-- | SpatialFullConvolution.lua       | 28
-rw-r--r-- | SpatialSoftMax.lua               | 12
-rw-r--r-- | TemporalConvolution.lua          | 6
-rw-r--r-- | VolumetricConvolution.lua        | 39
-rw-r--r-- | VolumetricMaxPooling.lua         | 3
-rw-r--r-- | functional.lua                   | 28
-rw-r--r-- | init.lua                         | 38
-rw-r--r-- | test/test.lua                    | 34
15 files changed, 184 insertions(+), 135 deletions(-)
diff --git a/BatchNormalization.lua b/BatchNormalization.lua
index 77db075..ac77e4f 100644
--- a/BatchNormalization.lua
+++ b/BatchNormalization.lua
@@ -56,10 +56,6 @@ function BatchNormalization:createIODescriptors(input)
    end
 end
 
-local one = torch.FloatTensor({1});
-local zero = torch.FloatTensor({0});
-local scaleTens = torch.FloatTensor(1);
-
 function BatchNormalization:updateOutput(input)
    self:createIODescriptors(input)
 
@@ -70,13 +66,13 @@ function BatchNormalization:updateOutput(input)
    if self.train then
       errcheck('cudnnBatchNormalizationForwardTraining',
-         cudnn.getHandle(), self.mode, one:data(), zero:data(),
+         cudnn.getHandle(), self.mode, cudnn.scalar(input, 1), cudnn.scalar(input, 0),
          self.iDesc[0], input:data(), self.oDesc[0], self.output:data(),
          self.sDesc[0], self.weight:data(), self.bias:data(),
          self.momentum, self.running_mean:data(), self.running_var:data(), self.eps, self.save_mean:data(), self.save_std:data());
    else
       errcheck('cudnnBatchNormalizationForwardInference',
-         cudnn.getHandle(), self.mode, one:data(), zero:data(),
+         cudnn.getHandle(), self.mode, cudnn.scalar(input, 1), cudnn.scalar(input, 0),
          self.iDesc[0], input:data(), self.oDesc[0], self.output:data(),
          self.sDesc[0], self.weight:data(), self.bias:data(),
          self.running_mean:data(), self.running_var:data(), self.eps);
@@ -85,17 +81,26 @@ function BatchNormalization:updateOutput(input)
 end
 
 local function backward(self,input,gradOutput, scale)
+   self.scaleT = self.scaleT or self.weight.new(1)
+   -- this line forces this member to always be on CPU (needed for cudnn)
+   self.scaleT = torch.type(self.weight) == 'torch.CudaDoubleTensor'
+      and self.scaleT:double() or self.scaleT:float()
+   scale = scale or 1.0
+   self.scaleT[1] = scale
+
    assert(gradOutput:isContiguous())
    self:createIODescriptors(input)
    self.gradInput:resizeAs(input)
-   scale = scale or 1
-   scaleTens:fill(scale)
    errcheck('cudnnBatchNormalizationBackward',
-      cudnn.getHandle(), self.mode, one:data(), zero:data(), scaleTens:data(), one:data(),
-      self.iDesc[0], input:data(), self.iDesc[0], gradOutput:data(), self.iDesc[0], self.gradInput:data(),
-      -- input is bottom, gradOutput is topDiff, self.gradInput is resultBottomDiff
-      self.sDesc[0], self.weight:data(), self.gradWeight:data(), self.gradBias:data(),
-      self.eps, self.save_mean:data(), self.save_std:data());
+      cudnn.getHandle(), self.mode, cudnn.scalar(input, 1),
+      cudnn.scalar(input, 0), self.scaleT:data(), cudnn.scalar(input, 1),
+      self.iDesc[0], input:data(), self.iDesc[0],
+      gradOutput:data(), self.iDesc[0], self.gradInput:data(),
+      -- input is bottom, gradOutput is topDiff,
+      -- self.gradInput is resultBottomDiff
+      self.sDesc[0], self.weight:data(), self.gradWeight:data(),
+      self.gradBias:data(), self.eps, self.save_mean:data(),
+      self.save_std:data());
    return self.gradInput
 end
@@ -139,7 +144,7 @@ end
 function BatchNormalization:type(type, tensorCache)
    local _type = type == 'torch.CudaHalfTensor' and 'torch.CudaTensor' or type
-   parent.type(self, _type, tensorCache)
+   parent.type(self, _type, tensorCache)
    self.output = self.output:type(type)
    self.gradInput = self.gradInput:type(type)
    return self
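The scaleT idiom introduced above recurs throughout this commit: cuDNN takes its alpha/beta blending factors and the accGradParameters scale as host (CPU) pointers whose element type must match the compute type, float for torch.CudaTensor and torch.CudaHalfTensor, double for torch.CudaDoubleTensor. A minimal sketch of the idiom, using a hypothetical hostScale helper that is not part of the commit:

   -- Lazily allocate a 1-element tensor and pin it to the CPU in the
   -- precision cuDNN expects for this data type; pass scaleT:data()
   -- as the scale argument of the cuDNN call.
   local function hostScale(weight, scaleT, scale)
      scaleT = scaleT or weight.new(1)
      scaleT = torch.type(weight) == 'torch.CudaDoubleTensor'
         and scaleT:double() or scaleT:float()
      scaleT[1] = scale or 1.0
      return scaleT
   end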
diff --git a/Pointwise.lua b/Pointwise.lua
index 9cfe0f2..76e6499 100644
--- a/Pointwise.lua
+++ b/Pointwise.lua
@@ -37,17 +37,14 @@ function Pointwise:createIODescriptors(input)
    end
 end
 
-local one = torch.FloatTensor({1});
-local zero = torch.FloatTensor({0});
-
 function Pointwise:updateOutput(input)
    self:createIODescriptors(input)
    if self.inplace then self.output:set(input) end
    errcheck('cudnnActivationForward',
             cudnn.getHandle(), self.activDesc[0],
-            one:data(),
+            cudnn.scalar(input, 1),
             self.iDesc[0], input:data(),
-            zero:data(),
+            cudnn.scalar(input, 0),
             self.iDesc[0], self.output:data());
    return self.output
 end
@@ -67,11 +64,11 @@ function Pointwise:updateGradInput(input, gradOutput)
    end
    errcheck('cudnnActivationBackward',
             cudnn.getHandle(), self.activDesc[0],
-            one:data(),
+            cudnn.scalar(input, 1),
             self.iDesc[0], self.output:data(),
             self.iDesc[0], gradOutput:data(),
             self.iDesc[0], input:data(),
-            zero:data(),
+            cudnn.scalar(input, 0),
             self.iDesc[0], self.gradInput:data());
    return self.gradInput
 end
diff --git a/Pooling.lua b/Pooling.lua
index 53220f1..9e13abf 100644
--- a/Pooling.lua
+++ b/Pooling.lua
@@ -75,17 +75,14 @@ function Pooling:createIODescriptors(input)
    end
 end
 
-local one = torch.FloatTensor({1});
-local zero = torch.FloatTensor({0});
-
 function Pooling:updateOutput(input)
    if not self.poolDesc then self:resetPoolDescriptors() end
    self:createIODescriptors(input)
    errcheck('cudnnPoolingForward', cudnn.getHandle(), self.poolDesc[0],
-            one:data(),
+            cudnn.scalar(input, 1),
             self.iDesc[0], input:data(),
-            zero:data(),
+            cudnn.scalar(input, 0),
             self.oDesc[0], self.output:data());
    return self.output
 end
@@ -102,11 +99,11 @@ function Pooling:updateGradInput(input, gradOutput)
    self:createIODescriptors(input)
    errcheck('cudnnPoolingBackward', cudnn.getHandle(), self.poolDesc[0],
-            one:data(),
+            cudnn.scalar(input, 1),
             self.oDesc[0], self.output:data(),
             self.oDesc[0], gradOutput:data(),
             self.iDesc[0], input:data(),
-            zero:data(),
+            cudnn.scalar(input, 0),
             self.iDesc[0], self.gradInput:data());
    return self.gradInput
 end
diff --git a/Pooling3D.lua b/Pooling3D.lua
index e4c0218..b488c9a 100644
--- a/Pooling3D.lua
+++ b/Pooling3D.lua
@@ -29,7 +29,7 @@ end
 
 function Pooling:resetPoolDescriptors()
    -- create pooling descriptor
-   self.padT = self.padT or 0
+   self.padT = self.padT or 0
    self.padW = self.padW or 0
    self.padH = self.padH or 0
    self.poolDesc = ffi.new('struct cudnnPoolingStruct*[1]')
@@ -83,17 +83,14 @@ function Pooling:createIODescriptors(input)
    end
 end
 
-local one = torch.FloatTensor({1});
-local zero = torch.FloatTensor({0});
-
 function Pooling:updateOutput(input)
    if not self.poolDesc then self:resetPoolDescriptors() end
    self:createIODescriptors(input)
    errcheck('cudnnPoolingForward', cudnn.getHandle(), self.poolDesc[0],
-            one:data(),
+            cudnn.scalar(input, 1),
             self.iDesc[0], input:data(),
-            zero:data(),
+            cudnn.scalar(input, 0),
             self.oDesc[0], self.output:data());
    return self.output
 end
@@ -112,11 +109,11 @@ function Pooling:updateGradInput(input, gradOutput)
    self:createIODescriptors(input)
    errcheck('cudnnPoolingBackward', cudnn.getHandle(), self.poolDesc[0],
-            one:data(),
+            cudnn.scalar(input, 1),
             self.oDesc[0], self.output:data(),
             self.oDesc[0], gradOutput:data(),
             self.iDesc[0], input:data(),
-            zero:data(),
+            cudnn.scalar(input, 0),
             self.iDesc[0], self.gradInput:data());
    return self.gradInput
 end
diff --git a/SpatialConvolution.lua b/SpatialConvolution.lua
index 58c78b2..8680586 100644
--- a/SpatialConvolution.lua
+++ b/SpatialConvolution.lua
@@ -165,7 +165,7 @@ function SpatialConvolution:createIODescriptors(input)
       local algType = ffi.new("cudnnConvolutionFwdAlgo_t[?]", 1)
       local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
       local algWorkspaceLimit = self.workspace_limit
-         or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+         or (self.nInputPlane * self.kH * self.kW * cudnn.sizeof(self.weight))
 
       if self.fastest_mode or cudnn.fastest == true then
          algSearchMode = 'CUDNN_CONVOLUTION_FWD_PREFER_FASTEST'
@@ -218,7 +218,7 @@ function SpatialConvolution:createIODescriptors(input)
      local algType = ffi.new("cudnnConvolutionBwdFilterAlgo_t[?]", 1)
      local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE'
      local algWorkspaceLimit = self.workspace_limit
-        or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+        or (self.nInputPlane * self.kH * self.kW * cudnn.sizeof(self.weight))
      if self.fastest_mode or cudnn.fastest == true then
         algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST'
      end
@@ -270,7 +270,7 @@ function SpatialConvolution:createIODescriptors(input)
      local algType = ffi.new("cudnnConvolutionBwdDataAlgo_t[?]", 1)
      local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE'
      local algWorkspaceLimit = self.workspace_limit
-        or (self.nInputPlane * self.kH * self.kW * 4) -- 4 = sizeof int/float.
+        or (self.nInputPlane * self.kH * self.kW * cudnn.sizeof(self.weight))
      if self.fastest_mode or cudnn.fastest == true then
         algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST'
      end
@@ -318,10 +318,12 @@ function SpatialConvolution:createIODescriptors(input)
      maxBufSize = math.max(maxBufSize, bufSize[1])
 
      self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace()
-     self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float
+     self.extraBuffer = self.extraBuffer:cuda() -- always force float
+     self.extraBufferSizeInBytes =
+        self.extraBuffer:nElement() * 4 -- extraBuffer is always float
      if maxBufSize > self.extraBufferSizeInBytes then
-        self.extraBuffer:resize(math.ceil(maxBufSize/4))
-        self.extraBufferSizeInBytes = maxBufSize
+        self.extraBuffer:resize(math.ceil(maxBufSize / 4))
+        self.extraBufferSizeInBytes = maxBufSize
      end
 
      -----------------------------------------------------------------------
@@ -341,8 +343,8 @@ function SpatialConvolution:createIODescriptors(input)
    end
 end
 
-local one = torch.FloatTensor({1});
-local zero = torch.FloatTensor({0});
+
+
 
 local function makeContiguous(self, input, gradOutput)
    if not input:isContiguous() then
@@ -365,20 +367,20 @@ function SpatialConvolution:updateOutput(input)
    for g = 0, self.groups - 1 do
       errcheck('cudnnConvolutionForward', cudnn.getHandle(),
-               one:data(),
+               cudnn.scalar(input, 1),
                self.iDesc[0], input:data() + g*self.input_offset,
                self.weightDesc[0], self.weight:data() + g*self.weight_offset,
                self.convDesc[0], self.fwdAlgType[0],
                self.extraBuffer:data(), self.extraBufferSizeInBytes,
-               zero:data(),
+               cudnn.scalar(input, 0),
                self.oDesc[0], self.output:data() + g*self.output_offset);
    end
 
    -- add bias
    if self.bias then
      errcheck('cudnnAddTensor', cudnn.getHandle(),
-              one:data(), self.biasDesc[0], self.bias:data(),
-              one:data(), self.oDescForBias[0], self.output:data())
+              cudnn.scalar(input, 1), self.biasDesc[0], self.bias:data(),
+              cudnn.scalar(input, 1), self.oDescForBias[0], self.output:data())
    end
 
    return self.output
@@ -395,22 +397,23 @@ function SpatialConvolution:updateGradInput(input, gradOutput)
    for g = 0,self.groups - 1 do
       errcheck('cudnnConvolutionBackwardData', cudnn.getHandle(),
-               one:data(),
+               cudnn.scalar(input, 1),
                self.weightDesc[0], self.weight:data() + g*self.weight_offset,
                self.oDesc[0], gradOutput:data() + g*self.output_offset,
                self.convDesc[0],
                self.bwdDataAlgType[0],
                self.extraBuffer:data(), self.extraBufferSizeInBytes,
-               zero:data(),
+               cudnn.scalar(input, 0),
                self.iDesc[0], self.gradInput:data() +
                g*self.input_offset);
    end
    return self.gradInput
 end
 
 function SpatialConvolution:accGradParameters(input, gradOutput, scale)
-   self.scaleT = self.scaleT or torch.FloatTensor(1):fill(1.0)
+   self.scaleT = self.scaleT or self.weight.new(1)
    -- this line forces this member to always be on CPU (needed for cudnn)
-   self.scaleT = self.scaleT:float()
+   self.scaleT = torch.type(self.weight) == 'torch.CudaDoubleTensor'
+      and self.scaleT:double() or self.scaleT:float()
    scale = scale or 1.0
    self.scaleT[1] = scale
@@ -425,7 +428,7 @@ function SpatialConvolution:accGradParameters(input, gradOutput, scale)
      errcheck('cudnnConvolutionBackwardBias', cudnn.getHandle(),
               self.scaleT:data(),
               self.oDescForBias[0], gradOutput:data(),
-              one:data(),
+              cudnn.scalar(input, 1),
               self.biasDesc[0], self.gradBias:data())
    end
@@ -438,7 +441,7 @@ function SpatialConvolution:accGradParameters(input, gradOutput, scale)
               self.convDesc[0],
               self.bwdFilterAlgType[0],
               self.extraBuffer:data(), self.extraBufferSizeInBytes,
-              one:data(),
+              cudnn.scalar(input, 1),
               self.weightDesc[0], self.gradWeight:data() + g*self.weight_offset);
    end
 end
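The other recurring change in SpatialConvolution is the default workspace limit: the old hardcoded 4 assumed 4-byte elements, which undercounts by half for double weights. cudnn.sizeof(self.weight), added in init.lua further down, scales the limit with the element size. A worked illustration with made-up dimensions:

   -- Hypothetical sizes, for illustration only.
   local nInputPlane, kH, kW = 64, 3, 3
   local limitFloat  = nInputPlane * kH * kW * 4  -- old default: 2304 bytes
   local limitDouble = nInputPlane * kH * kW * 8  -- double weights: 4608 bytes
   -- the module now computes this generically:
   --   self.nInputPlane * self.kH * self.kW * cudnn.sizeof(self.weight)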
diff --git a/SpatialCrossMapLRN.lua b/SpatialCrossMapLRN.lua
index 1f4ba33..4d75571 100644
--- a/SpatialCrossMapLRN.lua
+++ b/SpatialCrossMapLRN.lua
@@ -48,8 +48,8 @@ function LRN:createIODescriptors(input)
    end
 end
 
-local one = torch.FloatTensor({1});
-local zero = torch.FloatTensor({0});
+
+
 
 function LRN:updateOutput(input)
    if self.K then self.k, self.K = self.K, nil end
@@ -58,9 +58,9 @@ function LRN:updateOutput(input)
    errcheck('cudnnLRNCrossChannelForward', cudnn.getHandle(),
             self.LRNDesc[0],
             'CUDNN_LRN_CROSS_CHANNEL_DIM1',
-            one:data(),
+            cudnn.scalar(input, 1),
             self.iDesc[0], input:data(),
-            zero:data(),
+            cudnn.scalar(input, 0),
             self.iDesc[0], self.output:data());
    return self.output
 end
@@ -80,11 +80,11 @@ function LRN:updateGradInput(input, gradOutput)
    errcheck('cudnnLRNCrossChannelBackward', cudnn.getHandle(),
             self.LRNDesc[0],
             'CUDNN_LRN_CROSS_CHANNEL_DIM1',
-            one:data(),
+            cudnn.scalar(input, 1),
             self.iDesc[0], self.output:data(),
             self.iDesc[0], gradOutput:data(),
             self.iDesc[0], input:data(),
-            zero:data(),
+            cudnn.scalar(input, 0),
             self.iDesc[0], self.gradInput:data());
    return self.gradInput
 end
diff --git a/SpatialDivisiveNormalization.lua b/SpatialDivisiveNormalization.lua
index 3462b61..6917ec8 100644
--- a/SpatialDivisiveNormalization.lua
+++ b/SpatialDivisiveNormalization.lua
@@ -52,8 +52,8 @@ function DivisiveNorm:createIODescriptors(input)
    end
 end
 
-local one = torch.FloatTensor({1});
-local zero = torch.FloatTensor({0});
+
+
 
 function DivisiveNorm:updateOutput(input)
    if not self.DivisiveNormDesc then self:resetPoolDescriptors() end
@@ -61,9 +61,9 @@ function DivisiveNorm:updateOutput(input)
    errcheck('cudnnDivisiveNormCrossChannelForward', cudnn.getHandle(),
             self.DivisiveNormDesc[0],
             'CUDNN_DivisiveNorm_CROSS_CHANNEL_DIM1',
-            one:data(),
+            cudnn.scalar(input, 1),
             self.iDesc[0], input:data(),
-            zero:data(),
+            cudnn.scalar(input, 0),
             self.iDesc[0], self.output:data());
    return self.output
 end
@@ -80,11 +80,11 @@ function DivisiveNorm:updateGradInput(input, gradOutput)
    errcheck('cudnnDivisiveNormCrossChannelBackward', cudnn.getHandle(),
             self.DivisiveNormDesc[0],
             'CUDNN_DivisiveNorm_CROSS_CHANNEL_DIM1',
-            one:data(),
+            cudnn.scalar(input, 1),
             self.iDesc[0], self.output:data(),
             self.iDesc[0], gradOutput:data(),
             self.iDesc[0], input:data(),
-            zero:data(),
+            cudnn.scalar(input, 0),
             self.iDesc[0], self.gradInput:data());
    return self.gradInput
 end
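For context on the one/zero arguments being replaced everywhere: most cuDNN routines blend their result into the destination as y = alpha * op(x) + beta * y. The forward and backward passes here pass alpha = 1, beta = 0 to overwrite the destination, while accGradParameters passes beta = 1 so gradients accumulate into gradWeight/gradBias. A toy single-element illustration, not cuDNN code:

   -- y = alpha * op(x) + beta * y, written out for one element
   local function blend(alpha, opx, beta, y)
      return alpha * opx + beta * y
   end
   print(blend(1, 5, 0, 99))  -- 5: beta = 0 discards the old destination value
   print(blend(1, 5, 1, 2))   -- 7: beta = 1 accumulates into the destination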
diff --git a/SpatialFullConvolution.lua b/SpatialFullConvolution.lua
index ff91a13..e937893 100644
--- a/SpatialFullConvolution.lua
+++ b/SpatialFullConvolution.lua
@@ -302,8 +302,8 @@ function SpatialFullConvolution:createIODescriptors(input)
    end
 end
 
-local one = torch.FloatTensor({1});
-local zero = torch.FloatTensor({0});
+
+
 
 function SpatialFullConvolution:updateOutput(input)
    if not self.weightDesc then self:resetWeightDescriptors() end
@@ -312,19 +312,19 @@ function SpatialFullConvolution:updateOutput(input)
    -- Because SpatialFullConvolution is performing the adjoint of the forward
    -- convolution operator, we need to swap the forward and backward passes.
    errcheck('cudnnConvolutionBackwardData', cudnn.getHandle(),
-            one:data(),
+            cudnn.scalar(input, 1),
             self.weightDesc[0], self.weight:data(),
             self.iDesc[0], input:data(),
             self.convDesc[0], self.bwdDataAlgType[0],
             self.extraBuffer:data(), self.extraBufferSizeInBytes,
-            zero:data(),
+            cudnn.scalar(input, 0),
             self.oDesc[0], self.output:data())
 
    -- add bias
    if self.bias then
      errcheck('cudnnAddTensor', cudnn.getHandle(),
-              one:data(), self.biasDesc[0], self.bias:data(),
-              one:data(), self.oDescForBias[0], self.output:data())
+              cudnn.scalar(input, 1), self.biasDesc[0], self.bias:data(),
+              cudnn.scalar(input, 1), self.oDescForBias[0], self.output:data())
    end
 
    return self.output
@@ -340,25 +340,27 @@ function SpatialFullConvolution:updateGradInput(input, gradOutput)
    self:createIODescriptors(input)
    errcheck('cudnnConvolutionForward', cudnn.getHandle(),
-            one:data(),
+            cudnn.scalar(input, 1),
             self.oDesc[0], gradOutput:data(),
             self.weightDesc[0], self.weight:data(),
             self.convDesc[0],
             self.fwdAlgType[0],
             self.extraBuffer:data(), self.extraBufferSizeInBytes,
-            zero:data(),
+            cudnn.scalar(input, 0),
             self.iDesc[0], self.gradInput:data());
    return self.gradInput
 end
 
 function SpatialFullConvolution:accGradParameters(input, gradOutput, scale)
-   self.scaleT = self.scaleT or torch.FloatTensor(1):fill(1.0)
+   self.scaleT = self.scaleT or self.weight.new(1)
    -- this line forces this member to always be on CPU (needed for cudnn)
-   self.scaleT = self.scaleT:float()
+   self.scaleT = torch.type(self.weight) == 'torch.CudaDoubleTensor'
+      and self.scaleT:double() or self.scaleT:float()
    scale = scale or 1.0
    self.scaleT[1] = scale
 
-   assert(gradOutput:dim() == 3 or gradOutput:dim() == 4, 'gradOutput has to be 3D or 4D');
+   assert(gradOutput:dim() == 3 or gradOutput:dim() == 4,
+          'gradOutput has to be 3D or 4D');
    assert(gradOutput:isContiguous(), 'gradOutput has to be contiguous')
    if not self.weightDesc then self:resetWeightDescriptors() end
    self:createIODescriptors(input)
@@ -368,7 +370,7 @@ function SpatialFullConvolution:accGradParameters(input, gradOutput, scale)
      errcheck('cudnnConvolutionBackwardBias', cudnn.getHandle(),
               self.scaleT:data(),
               self.oDescForBias[0], gradOutput:data(),
-              one:data(),
+              cudnn.scalar(input, 1),
               self.biasDesc[0], self.gradBias:data())
    end
@@ -380,7 +382,7 @@ function SpatialFullConvolution:accGradParameters(input, gradOutput, scale)
             self.convDesc[0],
             self.bwdFilterAlgType[0],
             self.extraBuffer:data(), self.extraBufferSizeInBytes,
-            one:data(),
+            cudnn.scalar(input, 1),
             self.weightDesc[0], self.gradWeight:data())
 end
diff --git a/SpatialSoftMax.lua b/SpatialSoftMax.lua
index 167eb1f..7b52bdd 100644
--- a/SpatialSoftMax.lua
+++ b/SpatialSoftMax.lua
@@ -50,17 +50,17 @@ function SpatialSoftMax:createIODescriptors(input)
    end
 end
 
-local one = torch.FloatTensor({1});
-local zero = torch.FloatTensor({0});
+
+
 
 function SpatialSoftMax:updateOutput(input)
    self:createIODescriptors(input)
    errcheck('cudnnSoftmaxForward',
             cudnn.getHandle(),
             self.algorithm, self.mode,
-            one:data(),
+            cudnn.scalar(input, 1),
             self.iDesc[0], input:data(),
-            zero:data(),
+            cudnn.scalar(input, 0),
             self.oDesc[0], self.output:data());
    return self.output
 end
@@ -77,10 +77,10 @@ function SpatialSoftMax:updateGradInput(input, gradOutput)
    errcheck('cudnnSoftmaxBackward',
             cudnn.getHandle(),
             self.algorithm, self.mode,
-            one:data(),
+            cudnn.scalar(input, 1),
             self.oDesc[0], self.output:data(),
             self.oDesc[0], gradOutput:data(),
-            zero:data(),
+            cudnn.scalar(input, 0),
             self.iDesc[0], self.gradInput:data());
    return self.gradInput
 end
diff --git a/TemporalConvolution.lua b/TemporalConvolution.lua
index 4648ffd..001d192 100644
--- a/TemporalConvolution.lua
+++ b/TemporalConvolution.lua
@@ -102,9 +102,9 @@ function TemporalConvolution:updateGradInput(input, gradOutput)
 end
 
 function TemporalConvolution:accGradParameters(input,gradOutput,scale)
---2d (4d) view of input
-   local _input = inputview(input)
--- transpose gradOutput (it will likely be transposed twice, hopefully, no big deal
+   --2d (4d) view of input
+   local _input = inputview(input)
+   -- transpose gradOutput (it will likely be transposed twice, hopefully, no big deal
    local _gradOutput = transposeGradOutput(gradOutput,self.buffer)
    cudnn.SpatialConvolution.accGradParameters(self,_input,_gradOutput,scale)
 end
diff --git a/VolumetricConvolution.lua b/VolumetricConvolution.lua
index b255467..3f32c3d 100644
--- a/VolumetricConvolution.lua
+++ b/VolumetricConvolution.lua
@@ -275,10 +275,12 @@ function VolumetricConvolution:createIODescriptors(input)
      maxBufSize = math.max(maxBufSize, bufSize[1])
 
      self.extraBuffer = self.extraBuffer or cudnn.getSharedWorkspace()
-     self.extraBufferSizeInBytes = self.extraBuffer:nElement() * 4 -- float
+     self.extraBuffer = self.extraBuffer:cuda() -- always force float
+     self.extraBufferSizeInBytes =
+        self.extraBuffer:nElement() * 4 -- extraBuffer is always float
      if maxBufSize > self.extraBufferSizeInBytes then
-        self.extraBuffer:resize(math.ceil(maxBufSize/4))
-        self.extraBufferSizeInBytes = maxBufSize
+        self.extraBuffer:resize(math.ceil(maxBufSize / 4))
+        self.extraBufferSizeInBytes = maxBufSize
      end
 
      -----------------------------------------------------------------------
@@ -291,8 +293,8 @@ function VolumetricConvolution:createIODescriptors(input)
    end
 end
 
-local one = torch.FloatTensor({1});
-local zero = torch.FloatTensor({0});
+
+
 
 local function makeContiguous(self, input, gradOutput)
    if not input:isContiguous() then
@@ -313,16 +315,16 @@ function VolumetricConvolution:updateOutput(input)
    input = makeContiguous(self, input)
    self:createIODescriptors(input)
    errcheck('cudnnConvolutionForward', cudnn.getHandle(),
-            one:data(),
+            cudnn.scalar(input, 1),
             self.iDesc[0], input:data(),
             self.weightDesc[0], self.weight:data(),
             self.convDesc[0], self.fwdAlgType[0],
             self.extraBuffer:data(), self.extraBufferSizeInBytes,
-            zero:data(),
+            cudnn.scalar(input, 0),
             self.oDesc[0], self.output:data());
    errcheck('cudnnAddTensor', cudnn.getHandle(),
-            one:data(),
-            self.biasDesc[0], self.bias:data(), one:data(),
+            cudnn.scalar(input, 1),
+            self.biasDesc[0], self.bias:data(), cudnn.scalar(input, 1),
             self.oDescBias[0], self.output:data());
    return self.output
 end
@@ -337,24 +339,25 @@ function VolumetricConvolution:updateGradInput(input, gradOutput)
    if not self.weightDesc then self:resetWeightDescriptors() end
    self:createIODescriptors(input)
    errcheck('cudnnConvolutionBackwardData', cudnn.getHandle(),
-            one:data(),
+            cudnn.scalar(input, 1),
             self.weightDesc[0], self.weight:data(),
             self.oDesc[0], gradOutput:data(),
             self.convDesc[0], self.bwdDataAlgType[0],
             self.extraBuffer:data(), self.extraBufferSizeInBytes,
-            zero:data(),
+            cudnn.scalar(input, 0),
             self.iDesc[0], self.gradInput:data());
    return self.gradInput
 end
 
 function VolumetricConvolution:accGradParameters(input, gradOutput, scale)
-   self.scaleT = self.scaleT or torch.FloatTensor(1):fill(1.0)
-   -- this line forces this member to always be on CPU (needed for cudnn)
-   self.scaleT = self.scaleT:float()
+   self.scaleT = self.scaleT or self.weight.new(1)
+   -- this line forces this member to always be on CPU (needed for cudnn)
+   self.scaleT = torch.type(self.weight) == 'torch.CudaDoubleTensor'
+      and self.scaleT:double() or self.scaleT:float()
+   scale = scale or 1.0
+   self.scaleT[1] = scale
 
-   scale = scale or 1.0
-   self.scaleT[1] = scale
    input, gradOutput = makeContiguous(self, input, gradOutput)
    assert(gradOutput:dim() == 4 or gradOutput:dim() == 5,
          'gradOutput has to be a 4D or 5D tensor');
@@ -364,7 +367,7 @@ function VolumetricConvolution:accGradParameters(input, gradOutput, scale)
    errcheck('cudnnConvolutionBackwardBias', cudnn.getHandle(),
             self.scaleT:data(),
             self.oDescBias[0], gradOutput:data(),
-            one:data(),
+            cudnn.scalar(input, 1),
             self.biasDesc[0], self.gradBias:data());
    -- gradWeight
    errcheck('cudnnConvolutionBackwardFilter', cudnn.getHandle(),
@@ -374,7 +377,7 @@ function VolumetricConvolution:accGradParameters(input, gradOutput, scale)
             self.convDesc[0],
             self.bwdFilterAlgType[0],
             self.extraBuffer:data(), self.extraBufferSizeInBytes,
-            one:data(),
+            cudnn.scalar(input, 1),
             self.weightDesc[0], self.gradWeight:data());
 end
diff --git a/VolumetricMaxPooling.lua b/VolumetricMaxPooling.lua
index cf4d60b..e87d29c 100644
--- a/VolumetricMaxPooling.lua
+++ b/VolumetricMaxPooling.lua
@@ -1,4 +1,5 @@
-local VolumetricMaxPooling, parent = torch.class('cudnn.VolumetricMaxPooling', 'cudnn._Pooling3D')
+local VolumetricMaxPooling, parent = torch.class('cudnn.VolumetricMaxPooling',
+                                                 'cudnn._Pooling3D')
 
 function VolumetricMaxPooling:updateOutput(input)
    self.mode = 'CUDNN_POOLING_MAX'
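Both convolution modules above now force the shared workspace to a plain float CudaTensor regardless of the layer's data type, so capacity is tracked in bytes (4 per element) and resizes convert bytes back to element counts. A sketch of the accounting, assuming maxBufSize is a byte count returned by the cudnnGetConvolution*WorkspaceSize queries:

   local extraBuffer = cudnn.getSharedWorkspace():cuda() -- always float
   local haveBytes = extraBuffer:nElement() * 4          -- 4 bytes per float
   local maxBufSize = 1000                               -- example byte count
   if maxBufSize > haveBytes then
      extraBuffer:resize(math.ceil(maxBufSize / 4))      -- -> 250 elements
   end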
diff --git a/functional.lua b/functional.lua
index cea9df9..24c6030 100644
--- a/functional.lua
+++ b/functional.lua
@@ -13,8 +13,8 @@ end
 
 cudnn.functional = {}
 
-local one = torch.FloatTensor({1});
-local zero = torch.FloatTensor({0});
+
+
 
 local function Batch2D(t)
    return t:view(1, t:size(1), t:size(2), t:size(3))
@@ -28,8 +28,8 @@ cudnn.functional.bias2D_updateOutput = function(handle, bias, output)
    local biasDesc = cudnn.toDescriptor(bias:view(1, bias:nElement(),1,1))
    local oDesc = cudnn.toDescriptor(output)
    errcheck('cudnnAddTensor', handle,
-            one:data(), biasDesc[0], bias:data(),
-            one:data(), oDesc[0], output:data())
+            cudnn.scalar(output, 1), biasDesc[0], bias:data(),
+            cudnn.scalar(output, 1), oDesc[0], output:data())
 end
 
 -- accumulates the gradients into gradBias.
@@ -43,7 +43,7 @@ cudnn.functional.bias2D_accGradParameters = function(handle, gradOutput, gradBia
    errcheck('cudnnConvolutionBackwardBias', handle,
             scaleT:data(),
             oDesc[0], gradOutput:data(),
-            one:data(),
+            cudnn.scalar(gradOutput, 1),
             biasDesc[0], gradBias:data())
 end
@@ -115,12 +115,12 @@ cudnn.functional.Convolution2D_updateOutput = function(handle, input, weight, ou
    -- do convolution
    errcheck('cudnnConvolutionForward', handle,
-            one:data(),
+            cudnn.scalar(input, 1),
             iDesc[0], input:data(),
             weightDesc[0], weight:data(),
             convDesc[0], algType[0],
             workspace and workspace:data() or nil, algWorkspaceLimit,
-            zero:data(),
+            cudnn.scalar(input, 0),
             oDesc[0], output:data());
 end
@@ -176,13 +176,13 @@ cudnn.functional.Convolution2D_updateGradInput = function(handle, input, weight,
    -- do convolution
    errcheck('cudnnConvolutionBackwardData', handle,
-            one:data(),
+            cudnn.scalar(input, 1),
             weightDesc[0], weight:data(),
             oDesc[0], gradOutput:data(),
             convDesc[0],
             algType[0],
             NULL, 0,
-            zero:data(),
+            cudnn.scalar(input, 0),
             iDesc[0], gradInput:data());
@@ -249,7 +249,7 @@ cudnn.functional.Convolution2D_accGradParameters = function(handle, input, gradW
             convDesc[0],
             algType[0],
             NULL, 0,
-            one:data(),
+            cudnn.scalar(input, 1),
             weightDesc[0], gradWeight:data());
 end
@@ -298,9 +298,9 @@ cudnn.functional.Pooling_updateOutput = function(handle, mode, input, output,
    -- pool
    errcheck('cudnnPoolingForward', handle, poolDesc[0],
-            one:data(),
+            cudnn.scalar(input, 1),
             iDesc[0], input:data(),
-            zero:data(),
+            cudnn.scalar(input, 0),
             oDesc[0], output:data());
 end
@@ -361,11 +361,11 @@ cudnn.functional.Pooling_updateGradInput = function(handle, mode, input, output,
    -- pool
    errcheck('cudnnPoolingBackward', handle, poolDesc[0],
-            one:data(),
+            cudnn.scalar(input, 1),
             oDesc[0], output:data(),
             oDesc[0], gradOutput:data(),
             iDesc[0], input:data(),
-            zero:data(),
+            cudnn.scalar(input, 0),
             iDesc[0], gradInput:data());
 end
diff --git a/init.lua b/init.lua
--- a/init.lua
+++ b/init.lua
@@ -36,6 +36,42 @@ cudnn.typemap = {
    ['torch.CudaHalfTensor'] = 'CUDNN_DATA_HALF',
    ['torch.CudaTensor'] = 'CUDNN_DATA_FLOAT',
    ['torch.CudaDoubleTensor'] = 'CUDNN_DATA_DOUBLE',
 }
 
+cudnn.typemap = {
+   ['torch.CudaHalfTensor'] = 'CUDNN_DATA_HALF',
+   ['torch.CudaTensor'] = 'CUDNN_DATA_FLOAT',
+   ['torch.CudaDoubleTensor'] = 'CUDNN_DATA_DOUBLE',
+}
+
+local sizeofmap = {
+   ['torch.CudaHalfTensor'] = ffi.sizeof('half'),
+   ['torch.CudaTensor'] = ffi.sizeof('float'),
+   ['torch.CudaDoubleTensor'] = ffi.sizeof('double'),
+}
+
+function cudnn.sizeof(t)
+   return sizeofmap[torch.type(t)]
+end
+
+local onemap = {
+   ['torch.CudaHalfTensor'] = torch.FloatTensor({1}),
+   ['torch.CudaTensor'] = torch.FloatTensor({1}),
+   ['torch.CudaDoubleTensor'] = torch.DoubleTensor({1}),
+}
+local zeromap = {
+   ['torch.CudaHalfTensor'] = torch.FloatTensor({0}),
+   ['torch.CudaTensor'] = torch.FloatTensor({0}),
+   ['torch.CudaDoubleTensor'] = torch.DoubleTensor({0}),
+}
+function cudnn.scalar(t, val)
+   if val == 1 then
+      return onemap[torch.type(t)]:data()
+   elseif val == 0 then
+      return zeromap[torch.type(t)]:data()
+   else
+      error('unknown scalar')
+   end
+end
+
 -- TODO: determine if device supports true half and use true half on it
 -- so far use float for half and float, double for double
 local function determineHalfCapability(dev)
@@ -126,7 +162,7 @@ function cudnn.getSharedWorkspace()
    local device = cutorch.getDevice()
    local stream = cutorch.getStream() -- starts from 0
    if not sharedBuffer[device][stream] then
-      sharedBuffer[device][stream] = torch.CudaTensor(1)
+      sharedBuffer[device][stream] = torch.CudaTensor(1)
    end
    return sharedBuffer[device][stream]
 end
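A usage sketch for the two helpers added to init.lua above; the values in the comments are what the maps imply:

   local x = torch.CudaDoubleTensor(4, 4)
   print(cudnn.sizeof(x))            -- 8: bytes per double element
   local alpha = cudnn.scalar(x, 1)  -- host pointer into a DoubleTensor holding 1
   local beta  = cudnn.scalar(x, 0)  -- host pointer into a DoubleTensor holding 0
   -- CudaTensor and CudaHalfTensor both map to FloatTensor constants, since
   -- cuDNN computes in float for both; any value other than 0 or 1 raises
   -- the 'unknown scalar' error.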
diff --git a/test/test.lua b/test/test.lua
index ef0a5a3..0c9e852 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -43,6 +43,7 @@ end
 function torch.CudaHalfTensor:__sub(b)
    return self:cuda() - b:cuda()
 end
+
 function torch.CudaHalfTensor:abs()
    return self:cuda():abs():cudaHalf()
 end
@@ -55,6 +56,10 @@ function torch.CudaHalfTensor:mean()
    return self:cuda():mean()
 end
 
+function torch.CudaDoubleTensor:__sub(b)
+   return self:cuda() - b:cuda()
+end
+
 function torch.CudaDoubleTensor:mean()
    return self:cuda():mean()
 end
@@ -98,20 +103,23 @@ local function testLayer(nnlayer, cudnnlayer, input, gradOutput, scale,
          precision = testparams.precision_backward
       end
       mytester:assertlt(error, precision, 'error on ' .. name
-                        .. ' batchMode = ' .. tostring(batchMode)
-                        .. ' ' .. description)
+                        .. ', batchMode = ' .. tostring(batchMode)
+                        .. ', type = ' .. torch.type(res[name])
+                        .. ', ' .. description)
    end
 
    -- IO
    local ferr,berr = jac.testIO(cudnnlayer, cast(input))
    mytester:assertlt(ferr, testparams.precision_io,
                      torch.typename(cudnnlayer) .. ' - i/o forward err '
-                     .. ' batchMode = ' .. tostring(batchMode)
-                     .. ' ' .. description)
+                     .. ', batchMode = ' .. tostring(batchMode)
+                     .. ', type = ' .. torch.type(res[name])
+                     .. ', ' .. description)
    mytester:assertlt(berr, testparams.precision_io,
                      torch.typename(cudnnlayer) .. ' - i/o backward err '
-                     .. ' batchMode = ' .. tostring(batchMode)
-                     .. ' ' .. description)
+                     .. ', batchMode = ' .. tostring(batchMode)
+                     .. ', type = ' .. torch.type(res[name])
+                     .. ', ' .. description)
 end
 
 function cudnntest.SpatialConvolution()
@@ -779,21 +787,21 @@ for i = 1, 1 do -- cutorch.getDeviceCount() do
    testparams = testparams_float
    mytester:run()
 
+   -- double tensor may be broken at some places, gets NaNs.
+   print'Testing torch.CudaDoubleTensor'
+   testparams = testparams_double
+   mytester:run()
+
    print(
-[[Half and Double Tensor tests are disabled due to missing functionality.
+       [[Half Tensor tests are disabled due to missing functionality.
 They will be enabled once fully fixed and functional.
-See https://github.com/soumith/cudnn.torch/issues/226 and https://github.com/soumith/cudnn.torch/issues/225 for progress
+See https://github.com/soumith/cudnn.torch/issues/225 for progress
 ]])
-
    -- Developers, do not commit uncommented regions until bindings fixed
    -- print'Testing torch.CudaHalfTensor'
    -- testparams = testparams_half
    -- mytester:run()
-
-   -- double tensor may be broken at some places, gets NaNs.
-   -- print'Testing torch.CudaDoubleTensor'
-   -- testparams = testparams_double
-   -- mytester:run()
 end
 end