require 'cunn'
require 'optim'

-- If fbcunn and fbnn exist we'll also profile DataParallel
local profileDp = pcall(function() require 'fbcunn'; require 'fbnn' end)
local baseGpu = 1  -- First GPU to use
local numGpus = cutorch.getDeviceCount()

torch.setdefaulttensortype('torch.DoubleTensor')
torch.setnumthreads(8)
cutorch.setDevice(baseGpu)
cutorch.reserveStreams(1)

local typenames = {
   'torch.CudaTensor',
   'torch.CudaDoubleTensor',
}

local t2cpu = {
   ['torch.CudaTensor'] = 'torch.FloatTensor',
   ['torch.CudaDoubleTensor'] = 'torch.DoubleTensor',
}

local function checkHalf()
   if cutorch.hasHalf then
      table.insert(typenames, 'torch.CudaHalfTensor')
      t2cpu['torch.CudaHalfTensor'] = 'torch.HalfTensor'
   end
end

local function half_max_error(maxabs)
   -- Half has a 10-bit mantissa, so the representable spacing near maxabs is
   -- 2^floor(log2(maxabs)) * 2^-10; arbitrarily double that precision limit.
   return 2 * ((maxabs and (2^(math.floor(math.log(maxabs) / math.log(2)))) * (2^(-10))) or 0)
end

-- Comparison tolerance for a given tensor type; half needs a looser,
-- magnitude-dependent bound
function precision(typename, max_error)
   if typename == 'torch.CudaHalfTensor' then
      return 5e-2 + half_max_error(max_error)
   else
      return 1e-5
   end
end

-- Create an instance of the test framework
local mytester = torch.Tester()
local test = torch.TestSuite()

local function copyTable(x)  -- Shallow copy
   local ret = {}
   for key, value in pairs(x) do
      ret[key] = value
   end
   return ret
end

local function createSplitNetwork(dim, dimSize)
   local split = nn.ConcatTable()
   for i = 1, dimSize do
      split:add(nn.Narrow(dim, i, 1))
   end
   return split
end

-- Build a binary classifier that takes in a table of tensors and outputs
-- a table of tensors. We will split the BATCHES across GPUs.
local function buildNet(width, height, pool, feat, filt, tableInOut, numConvs)
   local net = nn.Sequential()
   if tableInOut then
      net:add(nn.JoinTable(2))  -- Join R,G,B tensors into RGB
   end
   assert(math.fmod(filt, 2) == 1)
   for i = 1, numConvs do
      local fin = 3
      if (i > 1) then fin = feat end
      net:add(nn.SpatialConvolutionMM(fin, feat, filt, filt, 1, 1, (filt - 1) / 2))
      net:add(nn.Threshold())
   end
   net:add(nn.SpatialMaxPooling(pool, pool))
   net:add(nn.Reshape(width * height * feat / (pool * pool)))
   net:add(nn.Linear(width * height * feat / (pool * pool), 2))
   -- net:add(nn.SoftMax()) -- This is fake anyway, so just do regression :-)
   if tableInOut then
      net:add(createSplitNetwork(2, 2))
   end
   return net
end

local function serialize(net)
   net:clearState()
   -- Millisecond timestamp used as a unique temp filename
   local uniq = sys.execute('echo "$(($(date +%s%N)/1000000))"')
   local f = torch.DiskFile(string.format('/tmp/%s', uniq), 'w')
   f:binary()
   f:writeObject(net)
   f:close()
   return string.format('/tmp/%s', uniq)
end

local function deserialize(file)
   local f = torch.DiskFile(file)
   f:binary()
   local net = f:readObject()
   f:close()
   os.execute(string.format('rm %s', file))
   return net
end

function test.DataParallelTable()
   for k, typename in ipairs(typenames) do
      test_DataParallelTable(typename)
   end
end

function test_DataParallelTable(gtype)
   local width = 16
   local height = 16
   local pool = 4
   local feat = 8
   local filt = 5
   local numConvs = 2
   local numSgdSteps = 10
   local syncGpuCpuParamsEvery = 4
   assert(numGpus > 1)

   -- test for various batchSizes, not necessarily multiples of nGpus:
   for _, batchSize in ipairs {2 * numGpus, 9, 15} do
      collectgarbage()

      -- Build a CPU model
      local cpuClassifier = buildNet(width, height, pool, feat, filt, true, numConvs)

      -- Build a multi-GPU model
      local gClassifier = nn.DataParallelTable(1):type(gtype)
      for i = 1, numGpus do
         local curGpu = math.fmod(baseGpu + (i - 1) - 1, cutorch.getDeviceCount()) + 1
         cutorch.setDevice(curGpu)
         gClassifier:add(cpuClassifier:clone():type(gtype), curGpu)
      end
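      -- Note on the fmod arithmetic above: it assigns towers to devices
      -- round-robin starting at baseGpu, converting between Lua's 1-indexed
      -- device ids and 0-indexed modulo arithmetic. For example, with
      -- baseGpu = 1 and 2 devices, towers 1, 2, 3, 4 land on GPUs 1, 2, 1, 2.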
      cutorch.setDevice(baseGpu)

      -- Now wrap them in layers that will split up the input tensor and join
      -- the output tensor (I know this seems stupid - and it is - but we need
      -- to test DataParallelTable under table inputs and when it is embedded
      -- in a network.)
      local cNet = nn.Sequential()
      cNet:add(createSplitNetwork(2, 3))
      cNet:add(cpuClassifier)
      cNet:add(nn.JoinTable(2))
      cNet:type(gtype)

      local gNet = nn.Sequential()
      gNet:add(createSplitNetwork(2, 3))
      gNet:add(gClassifier)
      gNet:add(nn.JoinTable(2):type(gtype))
      gNet:get(1):type(gtype)
      gNet:get(3):type(gtype)

      -- Force in a serialization / deserialization pass ------------
      local file = serialize(gNet)
      gNet = nil
      collectgarbage()
      collectgarbage()
      gNet = deserialize(file)
      ----------------------------------------------------------------

      local cInput = torch.rand(batchSize, 3, height, width):type(gtype)
      local gInput = cInput:type(gtype)
      local cTarget = torch.rand(batchSize, 2):type(gtype)
      local gTarget = cTarget:type(gtype)

      local cParams, cGradParams = cNet:getParameters()
      local gParams, gGradParams = gNet:getParameters()
      assert(cutorch.getDevice() == baseGpu,
             'getParameters: didn\'t restore GPU state')

      -- Set up an MSE optimizer on the GPU and CPU
      local optimStateCpu = {
         learningRate = 0.1,  -- Artificially big learning rate
         weightDecay = 0,
         momentum = 0.9,
         dampening = 0,
         learningRateDecay = 0,
         nesterov = true,
      }
      local optimStateGpu = copyTable(optimStateCpu)
      local optimMethod = optim.sgd

      local criterionCpu = nn.MSECriterion():type(gtype)
      local criterionGpu = criterionCpu:clone():type(gtype)

      for i = 1, numSgdSteps do
         collectgarbage()

         local fevalCpu = function(x)
            if x ~= cParams then cParams:copy(x) end
            cNet:zeroGradParameters()
            -- FPROP + BPROP on CPU
            local output = cNet:forward(cInput)
            local err = criterionCpu:forward(output, cTarget)
            local gradOutput = criterionCpu:backward(output, cTarget)
            local gradInput = cNet:backward(cInput, gradOutput)
            return err, cGradParams
         end

         local fevalGpu = function(x)
            if x ~= gParams then gParams:copy(x) end
            gNet:zeroGradParameters()
            assert(cutorch.getDevice() == baseGpu,
                   'zeroGradParameters: didn\'t restore GPU state')
            -- FPROP + BPROP on GPU
            local output = gNet:forward(gInput)
            assert(cutorch.getDevice() == baseGpu,
                   'DataParallelTable:forward didn\'t restore GPU state')
            local err = criterionGpu:forward(output, gTarget)
            local gradOutput = criterionGpu:backward(output, gTarget)
            local gradInput = gNet:backward(gInput, gradOutput)
            assert(cutorch.getDevice() == baseGpu,
                   'DataParallelTable:backward didn\'t restore GPU state')
            return err, gGradParams
         end

         -- Perform an SGD step on the GPU and CPU
         optimMethod(fevalCpu, cParams, optimStateCpu)
         optimMethod(fevalGpu, gParams, optimStateGpu)
         gNet:findModules('nn.DataParallelTable')[1]:syncParameters()
         assert(cutorch.getDevice() == baseGpu,
                'DataParallelTable:syncParameters didn\'t restore GPU state')

         -- Now make sure that everything is the same
         local cOutput = cNet.output
         local gOutput = gNet.output
         local cGradInput = cNet.gradInput
         local gGradInput = gNet.gradInput

         mytester:assertlt((cOutput:double() - gOutput:double()):abs():max(),
            precision(gtype, cOutput:clone():double():abs():max()),
            'fprop error ' .. gtype)
         mytester:assertlt(
            (criterionCpu.gradInput:double() - criterionGpu.gradInput:double()):abs():max(),
            precision(gtype, criterionGpu.gradInput:clone():double():abs():max()),
            'CRITERION BPROP error ' .. gtype)
         mytester:assertlt((cParams:double() - gParams:double()):abs():max(),
            precision(gtype, cParams:clone():double():abs():max()),
            'parameters error ' .. gtype)
         mytester:assertlt((cGradParams:double() - gGradParams:double()):abs():max(),
            precision(gtype, cGradParams:clone():double():abs():max()),
            'BPROP error (gradParams) ' .. gtype)
         mytester:assertlt((cGradInput:double() - gGradInput:double()):abs():max(),
            precision(gtype, cGradInput:clone():double():abs():max()),
            'BPROP error (gradInput) ' .. gtype)

         -- Sync the CPU and GPU weights every few "epochs" to prevent floating
         -- point drift between SGD iterations (i.e., they will eventually
         -- diverge after enough iterations)
         if math.fmod(i, syncGpuCpuParamsEvery) == 0 then
            local cp = cNet:parameters()
            local gp = gNet:get(2):get(1):parameters()
            assert(#cp == #gp)
            for j = 1, #cp do
               cp[j]:copy(gp[j])
            end
         end
      end
   end
end
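-- A minimal sketch (for documentation only, never invoked by the test runner)
-- of the DataParallelTable pattern the test above exercises: replicate a
-- module on each GPU, forward a batch that is split along dimension 1,
-- backprop, update, then broadcast tower 1's parameters. The nn.Linear sizes
-- and the learning rate are arbitrary placeholders, not part of the tests.
local function exampleDataParallelTableUsage()
   local dpt = nn.DataParallelTable(1)  -- split minibatches along dim 1
   for i = 1, numGpus do
      cutorch.withDevice(i, function()
         dpt:add(nn.Linear(10, 10):cuda(), i)
      end)
   end
   local input = torch.CudaTensor(2 * numGpus, 10):uniform()
   local output = dpt:forward(input)  -- scatter, fprop on each tower, gather
   dpt:zeroGradParameters()
   dpt:backward(input, output:clone():uniform())
   dpt:updateParameters(0.1)
   dpt:syncParameters()  -- copy tower 1's weights to the other towers
   return output
end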
function test.DataParallelTable_smallBatch()
   for k, typename in ipairs(typenames) do
      test_DataParallelTable_smallBatch(typename)
   end
end

function test_DataParallelTable_smallBatch(gtype)
   local net = nn.SpatialConvolution(3, 3, 3, 5):type(gtype)
   local dpt = nn.DataParallelTable(1):type(gtype)
   for i = 1, numGpus do
      cutorch.withDevice(i, function()
         dpt:add(net:clone():type(gtype), i)
      end)
   end

   -- Check for batches that are smaller than numGpus or don't divide evenly
   for _, batchSize in ipairs{numGpus - 1, 2 * numGpus - 1} do
      local input = torch[gtype:match('torch.(%a+)')](batchSize, 3, 10, 10):uniform(-1, 1)

      -- Check that forward works as expected
      local output = dpt:forward(input)
      local expected = net:forward(input)
      assert((expected - output):abs():max() < precision(gtype,
             expected:clone():abs():max()), 'unexpected output')

      local gradOutput = output:clone():uniform(-1, 1)
      local gradInput = dpt:updateGradInput(input, gradOutput)
      local expected = net:updateGradInput(input, gradOutput)
      assert((expected - gradInput):abs():max() < precision(gtype,
             expected:clone():abs():max()), 'unexpected gradInput')
   end
end

function test.DataParallelTable_emptyTensor()
   for k, typename in ipairs(typenames) do
      test_DataParallelTable_emptyTensor(typename)
   end
end

function test_DataParallelTable_emptyTensor(gtype)
   local net = nn.Sequential():add(nn.SelectTable(2)):add(nn.Linear(10, 2)):type(gtype)
   local dpt = nn.DataParallelTable(1):type(gtype)
   for i = 1, numGpus do
      cutorch.withDevice(i, function()
         dpt:add(net:clone():type(gtype), i)
      end)
   end

   local input = {torch[gtype:match('torch.(%a+)')](0),
                  torch[gtype:match('torch.(%a+)')](numGpus, 10):fill(1)}
   local output = dpt:forward(input)
   local expected = net:forward(input)
   assert((output - expected):abs():max() < precision(gtype,
          expected:clone():abs():max()), 'unexpected output')

   local gradOutput = output:clone():uniform(-1, 1)
   local gradInput = dpt:backward(input, gradOutput)
   local expected = net:backward(input, gradOutput)
   assert((expected[2] - gradInput[2]):abs():max() < precision(gtype,
          expected[2]:clone():abs():max()), 'unexpected gradInput')
end

function test.DataParallelTable_type()
   for k, typename in ipairs(typenames) do
      test_DataParallelTable_type(typename)
   end
end

function test_DataParallelTable_type(gtype)
   local ctype = t2cpu[gtype]
   local net = nn.SpatialConvolution(3, 3, 3, 5):type(ctype)
   local dpt = nn.DataParallelTable(1):type(gtype)
   for i = 1, numGpus do
      cutorch.withDevice(i, function()
         dpt:add(net:clone(), i)
      end)
   end
   dpt:type(gtype)

   local ok = pcall(function() dpt:type(ctype) end)
   assert(not ok, 'should not be able to call DataParallelTable:type(' .. ctype .. ')')
end
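-- Note on the torch[gtype:match('torch.(%a+)')] idiom used throughout these
-- tests: the pattern capture strips the 'torch.' prefix (for example,
-- 'torch.CudaTensor' yields 'CudaTensor'), and indexing the torch table with
-- that name returns the matching tensor constructor, so
-- torch[('torch.CudaTensor'):match('torch.(%a+)')](4, 3) builds a 4x3
-- CudaTensor.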
function test.DataParallelTable_sync()
   for k, typename in ipairs(typenames) do
      test_DataParallelTable_sync(typename)
   end
end

function test_DataParallelTable_sync(gtype)
   -- Test that DataParallelTable automatically syncParameters in updateOutput
   -- if you forget to call :syncParameters()
   local nSteps = 10
   local net = nn.Sequential()
      :add(nn.Linear(10, 10))
      :add(nn.ReLU(true))
      :add(nn.Linear(10, 10))
      :type(gtype)
   local dpt = nn.DataParallelTable(1):type(gtype)
   for i = 1, numGpus do
      cutorch.withDevice(i, function()
         dpt:add(net:clone(), i)
      end)
   end
   local criterion = nn.MSECriterion():type(gtype)

   local optimState = {
      learningRate = 1,
      momentum = 0,
   }

   local input = torch[gtype:match('torch.(%a+)')](numGpus, 10)
   local target = torch[gtype:match('torch.(%a+)')](numGpus, 10)

   local function feval(net)
      local params, gradParams = net:getParameters()
      return params, function(x)
         net:zeroGradParameters()
         local output = net:forward(input)
         local err = criterion:forward(output, target)
         local gradOutput = criterion:backward(output, target)
         local gradInput = net:backward(input, gradOutput)
         return err, gradParams
      end
   end

   local paramsDpt, fevalDpt = feval(dpt)
   local paramsBase, fevalBase = feval(net)

   for i = 1, nSteps do
      input:uniform(-1, 1)
      target:uniform(-1, 1)
      optim.sgd(fevalDpt, paramsDpt, optimState)
      optim.sgd(fevalBase, paramsBase, optimState)
   end

   assert((paramsDpt - paramsBase):abs():max() < precision(gtype,
          paramsDpt:clone():abs():max()), 'parameters do not match')
end

function test.DataParallelTable_serialize()
   for k, typename in ipairs(typenames) do
      test_DataParallelTable_serialize(typename)
   end
end

function test_DataParallelTable_serialize(gtype)
   -- Test serialization after getParameters()
   local net = nn.Linear(10, 10):type(gtype)
   local dpt = nn.DataParallelTable(1):type(gtype)
   for i = 1, numGpus do
      cutorch.withDevice(i, function()
         dpt:add(net:clone():type(gtype), i)
      end)
   end
   dpt:getParameters()
   dpt = deserialize(serialize(dpt))

   local input = torch[gtype:match('torch.(%a+)')](numGpus, 10):uniform(-1, 1)

   -- Check that forward works as expected
   local output = dpt:forward(input)
   assert(output and output:sum() ~= 0, 'unexpected output')

   -- Zero the weights on the first tower and sync parameters
   -- to check that Tensors are pointing to the proper storages
   dpt.flattenedParams[1][1]:zero()
   dpt:syncParameters()

   output = dpt:forward(input)
   assert(output:sum() == 0, 'weights not zeroed')
end

function test.DataParallelTable_flattenParameters()
   for k, typename in ipairs(typenames) do
      test_DataParallelTable_flattenParameters(typename)
   end
end

function test_DataParallelTable_flattenParameters(gtype)
   -- Wrap only a part of a network with data parallel table and
   -- check if the correct number of parameters have been copied
   local seq = nn.Sequential()
   local layer1 = nn.Linear(10, 10):type(gtype)
   local layer2 = nn.Linear(10, 5):type(gtype)
   local dpt = nn.DataParallelTable(1, true, true):threads():type(gtype)
   dpt:add(layer2, torch.range(1, numGpus):totable())
   seq:add(layer1):add(dpt)
   seq:getParameters()
   local input = torch.randn(7, 10):type(gtype)
   seq:forward(input)
   -- There are 55 parameters in layer 2 (10x5 = 50 weights + 5 biases)
   assert(dpt.flattenedParams[1][1]:size(1) == 55,
          'Incorrect number of parameters copied')
   -- Check grad weights
   assert(dpt.flattenedParams[1][2]:size(1) == 55,
          'Incorrect number of gradients copied')
end
function test.DataParallelTable_misc()
   for k, typename in ipairs(typenames) do
      test_DataParallelTable_misc(typename)
   end
end

function test_DataParallelTable_misc(gtype)
   local net = nn.Sequential()
      :add(nn.Linear(3, 10))
      :add(nn.ReLU())
      :add(nn.Linear(10, 7))
   local dpt = nn.DataParallelTable(1):type(gtype)
      :add(net, torch.range(1, numGpus):totable())
      :threads()
      :type(gtype)

   local input = torch.randn(8, 3):type(gtype)
   local output = dpt:forward(input)

   -- check that clone works
   dpt = dpt:clone()
   local output2 = dpt:forward(input)
   assert((output2 - output):abs():max() == 0)

   -- check findModules and listModules
   local modules = dpt:listModules()
   assert(#modules == #net:listModules() + 1)
   assert(torch.type(modules[1]) == 'nn.DataParallelTable')
   assert(torch.type(modules[2]) == 'nn.Sequential')

   assert(#dpt:findModules('nn.ReLU') == 1)
end

function test.DataParallelTable_noGradInput()
   for k, typename in ipairs(typenames) do
      test_DataParallelTable_noGradInput(typename)
   end
end

function test_DataParallelTable_noGradInput(gtype)
   local net = nn.Sequential()
      :add(nn.LookupTable(10, 10))
      :add(nn.Linear(10, 7))
      :add(nn.ReLU())
      :type(gtype)
   local dpt = nn.DataParallelTable(1)
      :add(net, torch.range(1, numGpus):totable())
      :threads()
      :type(gtype)

   local input = torch.Tensor(5):random(10):type(gtype)
   local output1 = net:forward(input):clone()
   local gradOutput = output1:clone():uniform(-1, 1)
   local gradInput1 = net:backward(input, gradOutput):clone()

   local output2 = dpt:forward(input)
   local gradInput2 = dpt:backward(input, gradOutput)

   mytester:assertlt((output1 - output2):abs():max(),
      precision(gtype, output1:clone():abs():max()), 'forward prop error')
   mytester:asserteq(gradInput2:nElement(), gradInput1:nElement())
end

function test.DataParallelTable_accGradParameters()
   for k, typename in ipairs(typenames) do
      test_DataParallelTable_accGradParameters(typename)
   end
end

function test_DataParallelTable_accGradParameters(gtype)
   local net = nn.Sequential()
      :add(nn.Linear(3, 10))
      :add(nn.ReLU())
      :add(nn.Linear(10, 7))
      :type(gtype)

   local inputs = {}
   local gradOutputs = {}
   for i = 1, 3 do
      inputs[i] = torch.randn(8, 3):type(gtype)
      gradOutputs[i] = torch.randn(8, 7):type(gtype)
   end

   -- Each config is a set of DataParallelTable constructor arguments:
   -- {dimension, flattenParams, useNCCL}
   local configs = {
      {1, false, false},
      {1, true, false},
   }

   local function accumulateGradient(m)
      m:zeroGradParameters()
      for i = 1, #inputs do
         m:forward(inputs[i])
         m:backward(inputs[i], gradOutputs[i])
      end
      m:updateParameters(0.5)
   end

   local base = net:clone()
   accumulateGradient(base)
   local expected = base:forward(inputs[1])

   for _, config in ipairs(configs) do
      local dpt = nn.DataParallelTable(table.unpack(config))
         :add(net:clone(), torch.range(1, numGpus):totable()):type(gtype)
      accumulateGradient(dpt)
      local output = dpt:forward(inputs[1])
      mytester:assertlt((output - expected):abs():max(),
         precision(gtype, expected:clone():abs():max()), 'invalid output ' .. gtype)
   end
end

function test.DataParallelTable_apply()
   for k, typename in ipairs(typenames) do
      test_DataParallelTable_apply(typename)
   end
end

function test_DataParallelTable_apply(gtype)
   local net = nn.Sequential()
      :add(nn.Linear(3, 10))
      :add(nn.ReLU())
      :add(nn.Linear(10, 7))
      :type(gtype)

   local inputs = {}
   local gradOutputs = {}
   for i = 1, 3 do
      inputs[i] = torch.randn(8, 3):type(gtype)
      gradOutputs[i] = torch.randn(8, 7):type(gtype)
   end

   local configs = {
      {1, false, false},
      {1, true, false},
   }

   local function trainNetwork(m)
      -- Test that apply doesn't break everything. This will be very slow
      -- in the training loop, but should still be correct.
      local function emptyFn() end
      m:apply(emptyFn)
      for i = 1, #inputs do
         m:zeroGradParameters()
         m:forward(inputs[i])
         m:backward(inputs[i], gradOutputs[i])
         m:updateParameters(0.1)
         m:apply(emptyFn)
      end
   end

   local base = net:clone()
   trainNetwork(base)
   local expected = base:forward(inputs[1])

   for _, usethreads in ipairs{false, true} do
      for _, config in ipairs(configs) do
         local dpt = nn.DataParallelTable(table.unpack(config))
            :add(net:clone(), torch.range(1, numGpus):totable()):type(gtype)
         if usethreads then
            dpt:threads()
         end
         trainNetwork(dpt)
         local output = dpt:forward(inputs[1])
         mytester:assertlt((output - expected):abs():max(),
            precision(gtype, expected:clone():abs():max()),
            'invalid output: flatten=' .. tostring(config[2]) ..
            ' threads=' .. tostring(usethreads))
      end
   end
end

function test.DataParallelTable_streams()
   for k, typename in ipairs(typenames) do
      test_DataParallelTable_streams(typename)
   end
end

function test_DataParallelTable_streams(gtype)
   local net = nn.Sequential()
      :add(nn.Linear(3, 10))
      :add(nn.ReLU())
      :add(nn.Linear(10, 7))
      :type(gtype)

   local input = torch.randn(8, 3):type(gtype)
   local gradOutput = torch.randn(8, 7):type(gtype)
   local gOutput = net:forward(input):clone()
   net:zeroGradParameters()
   local gGradInput = net:backward(input, gradOutput):clone()

   local configs = {
      {1, false, false},
      {1, true, false},
      {1, true, true},
   }

   local function test(dpt)
      local output = dpt:forward(input)
      dpt:zeroGradParameters()
      local gradInput = dpt:backward(input, gradOutput)
      mytester:assert((output - gOutput):abs():max() == 0, 'invalid output')
      mytester:assert((gradInput - gGradInput):abs():max() == 0,
                      'invalid gradInput')
   end

   for _, stream in ipairs{0, 1} do
      cutorch.setStream(stream)
      for _, config in ipairs(configs) do
         for _, threads in ipairs{false, true} do
            local dpt = nn.DataParallelTable(table.unpack(config))
               :add(net, torch.range(1, numGpus):totable())
               :type(gtype)
            if threads then
               dpt:threads(function()
                  cutorch.reserveStreams(1)
                  cutorch.setStream(stream)
               end)
            end
            test(dpt)
         end
      end
   end
   cutorch.setStream(0)
end

function test.DataParallelTable_emptyData()
   for k, typename in ipairs(typenames) do
      test_DataParallelTable_emptyData(typename)
   end
end

function test_DataParallelTable_emptyData(gtype)
   local function eq(a, b)
      if not torch.isTensor(a) then
         local res = true
         for i = 1, #a do
            res = res and eq(a[i], b[i])
         end
         return res
      end
      return a:clone():add(-b):abs():max() == 0
   end

   local identity = nn.Linear(5, 5)
   identity.bias:zero()
   identity.weight = torch.eye(5)

   local a = nn.DataParallelTable(1)
   a:add(identity, torch.range(1, numGpus):totable())
   a:type(gtype)

   local inputs = {
      torch.range(1, numGpus * 5):reshape(numGpus, 5):type(gtype),
      torch.range(1, 5):reshape(1, 5):type(gtype),
      torch.range(1, 10):reshape(2, 5):type(gtype),
   }
   for _, input in ipairs(inputs) do
      local output = a:forward(input)
      local gradInput = a:backward(input, output)
      mytester:assert(eq(input, output))
      mytester:assert(eq(input, gradInput))
   end

   a = nn.DataParallelTable(1)
   a:add(nn.ParallelTable():add(identity):add(identity),
         torch.range(1, numGpus):totable())
   a:type(gtype)
   for _, input in ipairs(inputs) do
      input = {input, input}
      local output = a:forward(input)
      local gradInput = a:backward(input, output)
      mytester:assert(eq(input, output))
      mytester:assert(eq(input, gradInput))
   end
end
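-- Note on test_DataParallelTable_streams above: cutorch stream selection is
-- per-thread state, which is presumably why the :threads() initialization
-- callback has to call cutorch.reserveStreams(1) and cutorch.setStream(stream)
-- itself inside each worker rather than inheriting the main thread's stream.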
function test.ProfileDataParallelTable()
   for k, typename in ipairs(typenames) do
      test_ProfileDataParallelTable(typename)
   end
end

function test_ProfileDataParallelTable(gtype)
   local width = 32
   local height = 32
   local pool = 4
   local feat = 128
   local filt = 7
   local numConvs = 4
   local numRepeats = 10

   local modulesToTest = {}
   modulesToTest['DataParallelTable'] = nn.DataParallelTable
   if profileDp then
      modulesToTest['DataParallel'] = nn.DataParallel
   end

   local deviceCount = numGpus
   assert(deviceCount > 1)

   for moduleName, module in pairs(modulesToTest) do
      for numGpus = 1, deviceCount do
         collectgarbage()
         print('Profiling ' .. moduleName .. ' with ' .. numGpus .. ' gpus')
         local batchSize = 2 * 3 * 4
         assert(math.fmod(batchSize, numGpus) == 0)

         -- Build a CPU model
         local cNet = buildNet(width, height, pool, feat, filt, false, numConvs)

         -- Build a multi-GPU model
         local gNet = module(1)
         if (moduleName == 'DataParallel') then
            cutorch.setDevice(baseGpu)
            gNet:type(gtype)
         elseif (moduleName == 'DataParallelTable') then
            gNet:type(gtype)
         end
         for i = 1, numGpus do
            local curGpu = math.fmod(baseGpu + (i - 1) - 1, cutorch.getDeviceCount()) + 1
            cutorch.setDevice(curGpu)
            gNet:add(cNet:clone():type(gtype), curGpu)
         end
         cutorch.setDevice(baseGpu)

         local input = torch.rand(batchSize, 3, height, width):type(gtype)
         local target = torch.rand(batchSize, 2):type(gtype)

         local gParams, gGradParams
         if (moduleName == 'DataParallelTable') then
            -- Force in a serialization / deserialization pass ------------
            local file = serialize(gNet)
            gNet = nil
            collectgarbage()
            collectgarbage()
            gNet = deserialize(file)
            ----------------------------------------------------------------
            gParams, gGradParams = gNet:getParameters()
         end

         -- Set up an MSE optimizer on the GPU
         local optimState = {
            learningRate = 0.1,
            weightDecay = 0,
            momentum = 0.9,
            dampening = 0,
            learningRateDecay = 0,
            nesterov = true,
         }
         local optimMethod = optim.sgd
         local criterion = nn.MSECriterion():type(gtype)
         local timeGpuNet = 0

         local opt
         if (moduleName == 'DataParallel') then
            opt = nn.Optim(gNet, optimState)
         end

         -- Call forward and backward once to hide allocations in profile
         do
            local output = gNet:forward(input)
            gNet:backward(input, output)
         end

         for i = 1, numRepeats do
            collectgarbage()

            local fevalGpu = function(x)
               if x ~= gParams then gParams:copy(x) end
               gNet:zeroGradParameters()
               local output = gNet:forward(input)
               local err = criterion:forward(output, target)
               local gradOutput = criterion:backward(output, target)
               local gradInput = gNet:backward(input, gradOutput)
               return err, gGradParams
            end

            -- Perform an SGD step and profile it
            sys.tic()
            if (moduleName == 'DataParallelTable') then
               optimMethod(fevalGpu, gParams, optimState)
               gNet:findModules('nn.DataParallelTable')[1]:syncParameters()
            else
               opt:optimize(optim.sgd, input, target, criterion)
            end
            cutorch.synchronize()
            timeGpuNet = timeGpuNet + sys.toc()

            collectgarbage()
         end

         print('  Time per FPROP+BPROP: ' .. timeGpuNet / numRepeats)
      end
   end
end

-- Now run the test above
--checkHalf()  -- half not enabled yet for DataParallelTable
mytester:add(test)
mytester:run()