github.com/soumith/cudnn.torch.git
author     Soumith Chintala <soumith@gmail.com>    2016-12-09 02:48:29 +0300
committer  GitHub <noreply@github.com>             2016-12-09 02:48:29 +0300
commit     51c16a76f6b148e1abb264215e83432ded2dcdeb (patch)
tree       f8116f46fc36717104b9f324943dee3350acd8a5
parent     970d7249e5c680d20ecac98edebe1f507feeecac (diff)
parent     488205c66cafc4f180213318e68295b5a84633e7 (diff)
Merge pull request #290 from NVIDIA/master
Improved existing 16->32 fallback. Added performance-based fallback.
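
The gist of the change, as a rough Lua sketch rather than the literal patch (layer.convDescData, cudnn.setConvolutionDescriptor and the CUDNN_DATA_* names are taken from the diff below; padH/padW/dH/dW stand in for the layer's padding and stride):

    -- keep the convolution descriptor parameters as a plain Lua table on the layer
    layer.convDescData = {
       padA          = {padH, padW},
       filterStrideA = {dH, dW},
       dataType      = 'CUDNN_DATA_HALF',   -- native FP16 requested
    }
    layer.convDesc = cudnn.setConvolutionDescriptor(layer.convDescData)

    -- fallback: if no FP16 algorithm is found, or (new in this PR) a benchmarked
    -- FP32 algorithm turns out to be faster, rewrite the same table and push it
    -- back into the existing cuDNN descriptor
    layer.convDescData.dataType = 'CUDNN_DATA_FLOAT'
    cudnn.setConvolutionDescriptor(layer.convDescData, layer.convDesc)
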
-rw-r--r--  SpatialConvolution.lua          7
-rw-r--r--  SpatialFullConvolution.lua     10
-rw-r--r--  TemporalConvolution.lua         2
-rw-r--r--  VolumetricConvolution.lua       7
-rw-r--r--  VolumetricFullConvolution.lua   7
-rw-r--r--  ffi.lua                         6
-rw-r--r--  find.lua                      200
-rw-r--r--  functional.lua                 28
-rw-r--r--  init.lua                       15
-rw-r--r--  test/test.lua                  37
10 files changed, 174 insertions, 145 deletions
diff --git a/SpatialConvolution.lua b/SpatialConvolution.lua
index 9b24591..830a7e6 100644
--- a/SpatialConvolution.lua
+++ b/SpatialConvolution.lua
@@ -127,12 +127,13 @@ function SpatialConvolution:createIODescriptors(input)
self.pad = {self.padH, self.padW}
self.stride = {self.dH, self.dW}
- self.convDesc = cudnn.setConvolutionDescriptor(
- { padA = self.pad,
+ self.convDescData = { padA = self.pad,
filterStrideA = self.stride,
upscaleA = {1,1},
dataType = cudnn.configmap(torch.type(self.weight))
- })
+ }
+
+ self.convDesc = cudnn.setConvolutionDescriptor(self.convDescData)
-- get output shape, resize output
local oSize = torch.IntTensor(4)
diff --git a/SpatialFullConvolution.lua b/SpatialFullConvolution.lua
index c41e7e2..0ba5cd5 100644
--- a/SpatialFullConvolution.lua
+++ b/SpatialFullConvolution.lua
@@ -47,11 +47,11 @@ function SpatialFullConvolution:createIODescriptors(input)
self.pad = {self.padH, self.padW}
self.stride = {self.dH, self.dW}
- self.convDesc = cudnn.setConvolutionDescriptor(
- { padA = self.pad,
- filterStrideA = self.stride,
- dataType = cudnn.configmap(torch.type(self.weight))
- })
+ self.convDescData = { padA = self.pad,
+ filterStrideA = self.stride,
+ dataType = cudnn.configmap(torch.type(self.weight))
+ }
+ self.convDesc = cudnn.setConvolutionDescriptor(self.convDescData)
-- get output shape, resize output
local iwidth = input:size(4)
diff --git a/TemporalConvolution.lua b/TemporalConvolution.lua
index 87f7775..22400ce 100644
--- a/TemporalConvolution.lua
+++ b/TemporalConvolution.lua
@@ -37,7 +37,7 @@ function TemporalConvolution:createIODescriptors(input)
end
function TemporalConvolution:fastest(mode)
- self = cudnn.SpatialConvolution.fastest(self,mode)
+ cudnn.SpatialConvolution.fastest(self,mode)
return self
end
diff --git a/VolumetricConvolution.lua b/VolumetricConvolution.lua
index 64d0925..9a337cc 100644
--- a/VolumetricConvolution.lua
+++ b/VolumetricConvolution.lua
@@ -43,10 +43,9 @@ function VolumetricConvolution:createIODescriptors(input)
if mathtype == 'CUDNN_DATA_HALF' then
mathtype = 'CUDNN_DATA_FLOAT'
end
- self.convDesc = cudnn.setConvolutionDescriptor(
- { padA = self.pad, filterStrideA = self.stride,
- dataType = mathtype
- })
+ self.convDescData = { padA = self.pad, filterStrideA = self.stride,
+ dataType = mathtype }
+ self.convDesc = cudnn.setConvolutionDescriptor(self.convDescData)
local oSize = torch.IntTensor(5)
errcheck('cudnnGetConvolutionNdForwardOutputDim',
diff --git a/VolumetricFullConvolution.lua b/VolumetricFullConvolution.lua
index 8f8bac6..d62b37e 100644
--- a/VolumetricFullConvolution.lua
+++ b/VolumetricFullConvolution.lua
@@ -45,10 +45,9 @@ function VolumetricFullConvolution:createIODescriptors(input)
-- create conv descriptor
self.pad = {self.padT, self.padH, self.padW}
self.stride = {self.dT, self.dH, self.dW}
- self.convDesc = cudnn.setConvolutionDescriptor(
- { padA = self.pad, filterStrideA = self.stride,
- dataType = cudnn.configmap(torch.type(self.weight))
- })
+ self.convDescData = { padA = self.pad, filterStrideA = self.stride,
+ dataType = cudnn.configmap(torch.type(self.weight))}
+ self.convDesc = cudnn.setConvolutionDescriptor(self.convDescData)
-- get output shape, resize output
local iwidth = input:size(5)
diff --git a/ffi.lua b/ffi.lua
index d5b5f8c..458e382 100644
--- a/ffi.lua
+++ b/ffi.lua
@@ -1614,10 +1614,10 @@ end
-- check cuDNN version
cudnn.version = tonumber(cudnn.C.cudnnGetVersion())
-if cudnn.version < 5005 then
- error('These bindings are for version 5005 or above, '
+if cudnn.version < 5005 or cudnn.version >= 6000 then
+ error('These bindings are for CUDNN 5.x (5005 <= cudnn.version < 6000), '
.. 'while the loaded CuDNN is version: ' .. cudnn.version
- .. ' \nAre you using an older version of CuDNN?')
+ .. ' \nAre you using an older or newer version of CuDNN?')
end
-- check GPU driver version
diff --git a/find.lua b/find.lua
index 376a035..aa2e8f3 100644
--- a/find.lua
+++ b/find.lua
@@ -2,7 +2,12 @@ local ffi = require 'ffi'
local find = {}
find.__index = find
---find.verbose=true
+
+-- default is to get verbose on errors
+find.verbose=false
+find.verboseError=true
+find.verboseFallback=true
+
-- constants to index array tables below
local Fwd, BwdFilter, BwdData = 1, 2, 3
@@ -64,54 +69,19 @@ local bwdDataAlgoNames = {
local algoNames = {fwdAlgoNames, bwdFilterAlgoNames, bwdDataAlgoNames}
--- this function is here and not in init.lua (and has the suffix) as generic
--- getConvolutionDescriptor methood should have native lua tables instead of FFI
--- (like setConvolutionDescriptor does, to be used with it)
--- However this is counterproductive for the purposes it's used in this module
-local function getConvolutionDescriptor_ffi(desc)
- local CUDNN_DIM_MAX=8
- local data = {
- dim_p = ffi.new('int[1]'),
- padA = ffi.new('int[?]', CUDNN_DIM_MAX),
- filterStrideA = ffi.new('int[?]', CUDNN_DIM_MAX),
- upscaleA = ffi.new('int[?]', CUDNN_DIM_MAX),
- mode_p = ffi.new('cudnnConvolutionMode_t[1]'),
- math_p = ffi.new('cudnnDataType_t[1]')
- }
-
- local status = cudnn.call('cudnnGetConvolutionNdDescriptor', desc[0], CUDNN_DIM_MAX,
- data.dim_p, data.padA, data.filterStrideA,
- data.upscaleA, data.mode_p, data.math_p)
- if (status ~= ffi.C.CUDNN_STATUS_SUCCESS) then
- if find.verbose or find.verboseError then
- print("cudnnGetConvolutionNdDescriptor failed: ", tonumber(status))
- return nil
- end
+local function convDataString(layer)
+ local info = ''
+ if layer.convDescData then
+ local desc = layer.convDescData
+ info = ' convDesc=[mode : ' .. desc.mode .. ' datatype : ' .. desc.dataType .. ']'
end
-
- data.arrayLength = data.dim_p[0]
- data.mode = data.mode_p[0]
- data.dataType = data.math_p[0]
- return data
+ return info .. ' hash=' .. layer.autotunerHash
end
local function verboseCall(layer, f, ...)
- if find.verbose then
- print("find:verboseCall: calling " .. f .. ", hash: ", layer.autotunerHash)
- end
local status = cudnn.call(f, ...)
if (status ~= ffi.C.CUDNN_STATUS_SUCCESS) and (find.verbose or find.verboseError) then
- local prefix = "find:verboseCall:"
- print( prefix .. f .. " failed: ", tonumber(status))
- if layer.convDesc then
- local desc = getConvolutionDescriptor_ffi(layer.convDesc)
- if desc then
- print (prefix .. ' conv desc mode : ', desc.mode, ' datatype : ', desc.datatype)
- end
- end
- end
- if find.verbose then
- print("find:verboseCall: success, " .. f )
+ print("\n" .. f .. " failed: ", tonumber(status), convDataString(layer))
end
return status
end
@@ -123,36 +93,39 @@ local function checkedCall(layer, f, ...)
local str = ffi.string(cudnn.C.cudnnGetErrorString(status))
error('Error in CuDNN: ' .. str .. ' ('..f..')')
end
+ return status
end
find.checkedCall = checkedCall
local function noFallback(layer)
- if find.verbose then
- print("find.defaultFallback: verboseCall failed for: ", layer.autotunerHash)
+ if find.verbose or find.verboseFallback then
+ print("\nfind.defaultFallback: verboseCall failed for: ", convDataString(layer))
end
return false
end
+local function fallbackWarning(layer, msg)
+ if find.verbose or find.verboseFallback then
+ print("\n *** find.verboseFallback: " .. msg ..
+ "\n *** Falling back to 32-bit math for: " .. convDataString(layer))
+ print(" *** [ Set cudnn.find.verboseFallback to false to disable this message ] *** ")
+ print(" *** [ Alternatively, you may force CUDNN to always operate on CudaHalfTensors via 32-bit float conversion, in Lua: ] ***\n"
+ .." *** [ cudnn.configureMath({ ['torch.CudaHalfTensor'] = 'CUDNN_DATA_FLOAT'} ] ***")
+ print(" *** [ Note: result may be faster or slower than native FP16, depending on your GPU and CUDNN operations ] *** ")
+ end
+end
+
local function defaultFallback(layer, replay)
-- read conv descriptor
- local convDescData = getConvolutionDescriptor_ffi(layer.convDesc)
-
- if convDescData and convDescData.dataType == ffi.C.CUDNN_DATA_HALF then
- if find.verbose then
- if replay then
- print("find.defaultFallback: replay for ", layer.autotunerHash)
- else
- print("find.defaultFallback: no 16-bit float algo found, will try 32 bits for ", layer.autotunerHash)
- end
- end
- -- using direct FFI call, not cudnn.setConvolutionDescriptor, for efficiency and clarity
- checkedCall(layer, 'cudnnSetConvolutionNdDescriptor', layer.convDesc[0],
- convDescData.arrayLength,
- convDescData.padA,
- convDescData.filterStrideA,
- convDescData.upscaleA,
- convDescData.mode,
- ffi.C.CUDNN_DATA_FLOAT)
+ local convDescData = layer.convDescData
+ if convDescData and convDescData.dataType == "CUDNN_DATA_HALF" then
+ fallbackWarning(layer, replay
+ and "16->32 bit fallback replay "
+ or "No native FP16 algo found, will try 32-bit math")
+ -- update our record with fallback value
+ convDescData.dataType = "CUDNN_DATA_FLOAT"
+ -- update the descriptor in CUDNN
+ cudnn.setConvolutionDescriptor(convDescData, layer.convDesc)
return true
else
return false
@@ -358,6 +331,10 @@ function find:setupAlgo(layer, findAPI_idx, algSearchMode, params)
local function callCudnn(layer)
local ret = 0
validResults = 0
+ if not layer.convDesc or not layer.convDesc[0] then
+ error("No convDesc set on layer!")
+ end
+
if self.algoFamily == FindExFamily then
-- query temp workspace size
local tempWorkspace, tempWorkspaceSize = cudnn.getSharedWorkspace()
@@ -375,6 +352,10 @@ function find:setupAlgo(layer, findAPI_idx, algSearchMode, params)
else
-- GetFamily: emulate findXXX results layout
numPerfResults[0]=1
+ perfResults[0].algo = 0
+ perfResults[0].memory = 0
+ perfResults[0].status = 1
+
local algWorkspaceLimit = layer.workspace_limit
or (layer.nInputPlane * layer.kH * layer.kW * layer.weight.elementSize())
@@ -382,6 +363,10 @@ function find:setupAlgo(layer, findAPI_idx, algSearchMode, params)
cudnn.getHandle(),
params[1], params[3], layer.convDesc[0], params[6],
algSearchMode, algWorkspaceLimit, algType[findAPI_idx])
+ if ret ~= 0 then
+ return ret
+ end
+
local retAlgo = algType[findAPI_idx][0]
if find.verbose then
print(string.format(
@@ -395,6 +380,9 @@ function find:setupAlgo(layer, findAPI_idx, algSearchMode, params)
cudnn.getHandle(),
params[1], params[3], layer.convDesc[0], params[6],
retAlgo, bufSize:data())
+ if ret ~= 0 then
+ return ret
+ end
if find.verbose then
print(string.format(
"\n" .. getWSAlgos[findAPI_idx] .. ": bufSize: %d, current ws: %d",
@@ -427,31 +415,75 @@ function find:setupAlgo(layer, findAPI_idx, algSearchMode, params)
local fallback = ''
if (useFallback) then fallback = "[FALLBACK]" end
print(string.format(
- "\n" .. API .. " algo: %s (%d, status: %d), memory: %8d, count: %d"
- .. " hash: %45s " .. cacheHit .. fallback,
+ "\n" .. API .. " algo[%d]: %s (%d, status: %d), time: %.04f, memory: %8d, count: %d"
+ .. " %s " .. cacheHit .. fallback,
+ validResults,
algoNames[findAPI_idx][cachedAlgo[validResults].algo+1], cachedAlgo[validResults].algo, cachedAlgo[validResults].status,
- cachedAlgo[validResults].memory, r, layer.autotunerHash))
+ cachedAlgo[validResults].time, cachedAlgo[validResults].memory, r, convDataString(layer)))
end
end
end
- if validResults < 1 and find.verbose then
- print("Could not find any valid convolution algorithms for sizes: " .. layer.autotunerHash)
- -- todo: add case of multi-stream not fitting in size
+ if validResults < 1 then
return 1
end
return 0
end
+
+ local function performanceFallback(layer)
+ -- read conv descriptor
+ local convDescData = layer.convDescData
+
+ if convDescData and convDescData.dataType == "CUDNN_DATA_HALF" then
+ local savedResults = cachedAlgo
+ local savedNum = validResults
+ cachedAlgo = {}
+ validResults = 0
+ useFallback = true
+
+ -- update our record with fallback value
+ layer.convDescData.dataType = "CUDNN_DATA_FLOAT"
+ -- update the descriptor in CUDNN
+ cudnn.setConvolutionDescriptor(layer.convDescData, layer.convDesc)
+ -- do the actual call
+ local status = callCudnn(layer)
+ -- check if we got better results with float32
+ if status == 0 and validResults > 0 and cachedAlgo[1].time < savedResults[1].time then
+ if find.verbose or find.verboseFallback then
+ local msg = string.format("find.performanceFallback: found 32-bit float op is faster (%f) than FP16(%f), memory increase: %fM",
+ cachedAlgo[1].time, savedResults[1].time,
+ (tonumber(cachedAlgo[1].memory)-tonumber(savedResults[1].memory))/Meg)
+ fallbackWarning(layer, msg)
+ end
+ return
+ end
+ -- restore if we didn't
+ cachedAlgo = savedResults
+ validResults = savedNum
+ -- update our record with fallback value
+ layer.convDescData.dataType = "CUDNN_DATA_HALF"
+ -- update the descriptor in CUDNN
+ cudnn.setConvolutionDescriptor(layer.convDescData, layer.convDesc)
+
+ end
+ end
+
-- do the actual call
local status = callCudnn(layer)
if status ~= 0 or validResults < 1 then
if self.fallback and self.fallback(layer) then
- useFallback = true;
+ useFallback = true
status = callCudnn(layer)
- if status ~= 0 or validResults < 1 then
- error ("Fallback attempt failed for " .. API .. ', sizes: ' .. layer.autotunerHash)
- end
+ end
+ -- check again
+ if status ~= 0 or validResults < 1 then
+ error (API .. ' failed, sizes: ' .. convDataString(layer))
+ end
+ else
+ -- if we are running Find or FindEx in native fp16, check if this algo is actually faster in pseudo-fp16 (32-bit math)
+ if self.algoFamily ~= GetFamily then
+ performanceFallback(layer)
end
end
self:store(layer, findAPI_idx, cachedAlgo)
@@ -475,9 +507,9 @@ function find:setupAlgo(layer, findAPI_idx, algSearchMode, params)
local fallback = ""
if (useFallback) then fallback = "[FALLBACK]" end
print(string.format(
- "\n" .. API .. ": %s(%d)[%d of %d] Workspace: %8fM (current ws size %fM, max: %dM free: %dM) hash: %45s" .. cacheHit .. fallback,
+ "\n" .. API .. ": %s(%d)[%d of %d] Workspace: %8fM (current ws size %fM, max: %dM free: %dM) %s" .. cacheHit .. fallback,
algoNames[findAPI_idx][cachedAlgo[retAlgo].algo+1], cachedAlgo[retAlgo].algo, retAlgo, #cachedAlgo,
- tonumber(cachedAlgo[retAlgo].memory)/Meg, curWorkspaceSize/Meg, self.maxWorkspaceSize/Meg, freeMemory/Meg, layer.autotunerHash))
+ tonumber(cachedAlgo[retAlgo].memory)/Meg, curWorkspaceSize/Meg, self.maxWorkspaceSize/Meg, freeMemory/Meg, convDataString(layer)))
end
return cachedAlgo[retAlgo].algo
end
@@ -513,9 +545,9 @@ end
function find:forwardAlgorithm(layer, params)
- if layer.fmode then
- setupWS(layer, params, layer.fmode, Fwd)
- return layer.fmode
+ if layer.fmode then
+ setupWS(layer, params, layer.fmode, Fwd)
+ return layer.fmode
end
local algSearchMode = 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT'
if layer.fastest_mode or cudnn.fastest == true then
@@ -526,9 +558,9 @@ end
function find:backwardFilterAlgorithm(layer, params)
-- Check if we are in "sticky" mode
- if layer.bwmode then
- setupWS(layer, params, layer.bwmode, BwdFilter)
- return layer.bwmode
+ if layer.bwmode then
+ setupWS(layer, params, layer.bwmode, BwdFilter)
+ return layer.bwmode
end
local algSearchMode = 'CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE'
if layer.fastest_mode or cudnn.fastest == true then
@@ -540,9 +572,9 @@ end
function find:backwardDataAlgorithm(layer, params)
-- Check if we are in "sticky" mode
- if layer.bdmode then
- setupWS(layer, params, layer.bdmode, BwdData)
- return layer.bdmode
+ if layer.bdmode then
+ setupWS(layer, params, layer.bdmode, BwdData)
+ return layer.bdmode
end
local algSearchMode = 'CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE'
if layer.fastest_mode or cudnn.fastest == true then
diff --git a/functional.lua b/functional.lua
index e877cec..5385ffb 100644
--- a/functional.lua
+++ b/functional.lua
@@ -73,11 +73,10 @@ cudnn.functional.Convolution2D_updateOutput = function(handle, input, weight, ou
filterDimA = {nOutputPlane, nInputPlane, kH, kW}})
-- create a convolution descriptor
- local convDesc = cudnn.setConvolutionDescriptor(
- { padA = {padH, padW},
+ local convDescData = { padA = {padH, padW},
filterStrideA = {strideH, strideW},
dataType = getMathType(weight) }
- );
+ local convDesc = cudnn.setConvolutionDescriptor(convDescData);
-- create input descriptor
local iDesc = cudnn.toDescriptor(input)
@@ -97,6 +96,7 @@ cudnn.functional.Convolution2D_updateOutput = function(handle, input, weight, ou
local oDesc = cudnn.toDescriptor(output)
local layer = {
+ convDescData = convDescData,
convDesc = convDesc,
weight = weight,
nInputPlane = nInputPlane,
@@ -141,17 +141,17 @@ cudnn.functional.Convolution2D_updateGradInput = function(handle, input, weight,
filterDimA = {nOutputPlane, nInputPlane, kH, kW} })
-- create a convolution descriptor
- local convDesc = cudnn.setConvolutionDescriptor(
- { padA = {padH, padW},
- filterStrideA = {strideH, strideW},
- dataType = getMathType(weight)
- }
- );
+ local convDescData = { padA = {padH, padW},
+ filterStrideA = {strideH, strideW},
+ dataType = getMathType(weight)
+ }
+ local convDesc = cudnn.setConvolutionDescriptor(convDescData);
-- create input, output descriptor
local iDesc = cudnn.toDescriptor(input)
local oDesc = cudnn.toDescriptor(output)
local layer = {
+ convDescData = convDescData,
convDesc = convDesc,
weight = weight,
nInputPlane = nInputPlane,
@@ -193,11 +193,10 @@ cudnn.functional.Convolution2D_accGradParameters = function(handle, input, gradW
local weightDesc = cudnn.setFilterDescriptor({ dataType = cudnn.typemap[torch.type(input)],
filterDimA = {nOutputPlane, nInputPlane, kH, kW}})
-- create a convolution descriptor
- local convDesc = cudnn.setConvolutionDescriptor(
- { padA = {padH, padW},
- filterStrideA = {strideH, strideW},
- dataType = getMathType(gradWeight) }
- );
+ local convDescData = { padA = {padH, padW},
+ filterStrideA = {strideH, strideW},
+ dataType = getMathType(gradWeight) }
+ local convDesc = cudnn.setConvolutionDescriptor(convDescData);
-- create input, output descriptor
local iDesc = cudnn.toDescriptor(input)
@@ -205,6 +204,7 @@ cudnn.functional.Convolution2D_accGradParameters = function(handle, input, gradW
local layer = {
convDesc = convDesc,
+ convDescData = convDescData,
weight = gradWeight,
nInputPlane = nInputPlane,
nOutputPlane = nOutputPlane,
diff --git a/init.lua b/init.lua
index 6c8abd7..b4ba8eb 100644
--- a/init.lua
+++ b/init.lua
@@ -16,9 +16,6 @@ cudnn.fastest = false
-- Warning: this option is experimental and assumes at least 2 warmup iterations!
cudnn.useFindEx = false
--- if true, use 'pseudo-fp16' (half storage, float math) even if true fp16 math is available
-cudnn.useFloatMathForHalf = false
-
-- amount of memory to use on 1st iteration for FindEx
cudnn.initialWorkspaceBytes = 1024
@@ -209,17 +206,19 @@ end
function cudnn.setConvolutionDescriptor(data, desc)
- local dim = data.arrayLength or #data.padA
- local upscale = data.upscaleA or torch.IntStorage(dim):fill(1)
+ if not data.arrayLength then data.arrayLength = #data.padA end
+ if not data.upscaleA then data.upscaleA = torch.IntStorage(data.arrayLength):fill(1) end
+ if not data.mode then data.mode = 'CUDNN_CROSS_CORRELATION' end
+
local myDesc = desc or cudnn.createDescriptors(
1, 'struct cudnnConvolutionStruct*[?]',
'cudnnCreateConvolutionDescriptor', 'cudnnDestroyConvolutionDescriptor')
errcheck('cudnnSetConvolutionNdDescriptor', myDesc[0],
- dim,
+ data.arrayLength,
torch.IntTensor(data.padA):data(),
torch.IntTensor(data.filterStrideA):data(),
- torch.IntTensor(upscale):data(),
- data.mode or 'CUDNN_CROSS_CORRELATION',
+ torch.IntTensor(data.upscaleA):data(),
+ data.mode,
data.dataType)
return myDesc
end
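
With this change setConvolutionDescriptor fills the missing defaults (arrayLength, upscaleA, mode) into the caller's table instead of keeping them in locals, so the table can be stored on the layer and re-submitted later for the FP16 fallback. A minimal usage sketch under those assumptions:

    local data = {
       padA          = {1, 1},
       filterStrideA = {1, 1},
       dataType      = 'CUDNN_DATA_FLOAT',
    }
    -- creates a new cudnnConvolutionDescriptor and sets it from the table;
    -- afterwards data.arrayLength == 2, data.upscaleA is an IntStorage of ones
    -- and data.mode == 'CUDNN_CROSS_CORRELATION'
    local desc = cudnn.setConvolutionDescriptor(data)

    -- passing an existing descriptor as the second argument updates it in place
    data.dataType = 'CUDNN_DATA_HALF'
    cudnn.setConvolutionDescriptor(data, desc)
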
diff --git a/test/test.lua b/test/test.lua
index 2b69fa2..46723fc 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -11,7 +11,7 @@ local jac = nn.Jacobian
local testparams_half = {
test_type = 'torch.CudaHalfTensor',
precision_forward = 2e-1,
- precision_backward = 8,
+ precision_backward = 10,
precision_jac = 1e-3,
precision_io = 1e-1,
}
@@ -131,7 +131,7 @@ function cudnntest.SpatialConvolution()
local input = torch.randn(bs,from,inj,ini):cuda()
local gradOutput = torch.randn(bs,to,outj,outi):cuda()
local sconv = nn.SpatialConvolution(from,to,ki,kj,si,sj):cuda()
- local gconv = cast(cudnn.SpatialConvolution(from,to,ki,kj,si,sj)):fastest()
+ local gconv = cast(cudnn.SpatialConvolution(from,to,ki,kj,si,sj))
gconv.weight:copy(sconv.weight)
gconv.bias:copy(sconv.bias)
@@ -162,7 +162,7 @@ function cudnntest.SpatialFullConvolution()
local input = torch.randn(bs,from,inj,ini):cuda()
local gradOutput = torch.randn(bs,to,outj,outi):cuda()
local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj):cuda()
- local gconv = cast(cudnn.SpatialFullConvolution(from,to,ki,kj,si,sj):cuda():fastest())
+ local gconv = cast(cudnn.SpatialFullConvolution(from,to,ki,kj,si,sj):cuda())
gconv.weight:copy(sconv.weight)
gconv.bias:copy(sconv.bias)
@@ -189,7 +189,7 @@ function cudnntest.TemporalConvolution()
local input = torch.randn(bs,ini,inputFrameSize):cuda()
local gradOutput = torch.randn(bs,outi,outputFrameSize):cuda()
local sconv = nn.TemporalConvolution(inputFrameSize,outputFrameSize, ki, si):cuda()
- local gconv = cast(cudnn.TemporalConvolution(inputFrameSize,outputFrameSize, ki, si):cuda():fastest())
+ local gconv = cast(cudnn.TemporalConvolution(inputFrameSize,outputFrameSize, ki, si):cuda())
gconv.weight:copy(sconv.weight:view(gconv.weight:size()))
gconv.bias:copy(sconv.bias)
@@ -225,7 +225,7 @@ function cudnntest.TemporalConvolution_padding_batch()
local groundweight = sconv.gradWeight
local groundbias = sconv.gradBias
- local gconv = cast(cudnn.TemporalConvolution(inputFrameSize,outputFrameSize, ki, si,pad_h):cuda():fastest())
+ local gconv = cast(cudnn.TemporalConvolution(inputFrameSize,outputFrameSize, ki, si,pad_h):cuda())
gconv.weight:copy(sconv.weight:view(gconv.weight:size()))
gconv.bias:copy(sconv.bias)
gconv:forward(cast(input))
@@ -330,10 +330,14 @@ function cudnntest.VolumetricFullConvolution()
local outk = (ink-1)*sk+kk
local scale = math.random()
+ if testparams.test_type == 'torch.CudaDoubleTensor' then
+ return
+ end
+
local input = torch.randn(bs,from,ink,inj,ini):cuda()
local gradOutput = torch.randn(bs,to,outk,outj,outi):cuda()
local sconv = nn.VolumetricFullConvolution(from,to,kk,ki,kj,sk,si,sj):cuda()
- local gconv = cast(cudnn.VolumetricFullConvolution(from,to,kk,ki,kj,sk,si,sj):cuda():fastest())
+ local gconv = cast(cudnn.VolumetricFullConvolution(from,to,kk,ki,kj,sk,si,sj):cuda())
gconv.weight:copy(sconv.weight)
gconv.bias:copy(sconv.bias)
@@ -936,18 +940,21 @@ mytester:add(cudnntest)
cudnn.verbose=false
cudnn.find.verbose=false
+-- this is the default, keep it for demo of 16->32 bit float fallback
+cudnn.find.verboseFallback=true
cudnn.useFindEx=false
-for i = 1, cutorch.getDeviceCount() do
- cudnn.configureMath()
+for i = 1, 1 do -- cutorch.getDeviceCount() do
- for _, benchmark in ipairs({true, false}) do
+ for _, benchmark, fast in ipairs({true, false}) do
cudnn.benchmark = benchmark
--- cudnn.reset()
+ -- use random fastest() test for non-benchmark case
+ if not benchmark then cudnn.fastest = tostring(math.random(0,1)) end
+
local prop = cutorch.getDeviceProperties(i)
print('Running test on device: #' .. i .. ' : ' .. prop.name
- .. ' with benchmark = ' .. tostring(cudnn.benchmark))
+ .. ' with benchmark = ' .. tostring(cudnn.benchmark) .. ' and fastest = ' .. tostring(cudnn.fastest))
cutorch.setDevice(i)
@@ -958,14 +965,6 @@ for i = 1, cutorch.getDeviceCount() do
print( 'Testing torch.CudaHalfTensor, torch.cudnn fp16 math is : ', cudnn.configmap('torch.CudaHalfTensor' ),
', cutorch.hasFastHalfInstructions() is ', cutorch.hasFastHalfInstructions())
- if cudnn.configmap('torch.CudaHalfTensor') ~= 'CUDNN_DATA_FLOAT' then
- print([[ Warning: 32-bit float math is forced for CudaHalfTensor test
- even though native fast 16-bit float math is available for this device.
- The reason is cudnn convolution algo find methods for fp16 and certain size combinations may fail.
- This should be fixed in next release.]])
- cudnn.configureMath({ ['torch.CudaHalfTensor'] = 'CUDNN_DATA_FLOAT'})
- end
-
testparams = testparams_half
mytester:run()
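
For reference, the user-facing knobs this test exercises, as one would set them in a training script (names taken from the patch; whether the 32-bit path is actually faster depends on the GPU and cuDNN build):

    -- print a message whenever the 16->32 bit fallback kicks in (default: true)
    cudnn.find.verboseFallback = true

    -- or bypass native FP16 altogether: keep half-precision storage but force
    -- 32-bit float math for all CudaHalfTensor convolutions
    cudnn.configureMath({ ['torch.CudaHalfTensor'] = 'CUDNN_DATA_FLOAT' })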