
github.com/torch/cunn.git
author    ngimel <ngimelshein@nvidia.com>  2015-12-04 01:23:31 +0300
committer ngimel <ngimelshein@nvidia.com>  2015-12-04 01:23:31 +0300
commit    a9f790b62dff5fc3c0c8ba88d8fad9349ee6944a (patch)
tree      144918e072f42602d5c936f7b3e46add29579b6a /DataParallelTable.lua
parent    b905dbc5b2ebc2e253b2806b01cba5d99dc168a8 (diff)
add nccl support to DataParallelTable
Diffstat (limited to 'DataParallelTable.lua')
-rw-r--r--  DataParallelTable.lua  84
1 file changed, 48 insertions(+), 36 deletions(-)
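
This change lets nn.DataParallelTable use NCCL collectives when parameters are flattened: gradient accumulation goes through nccl.reduce and parameter synchronization through nccl.bcast, falling back to the existing per-GPU add/copy loops when the nccl Lua module cannot be loaded. A minimal usage sketch of the new third constructor argument is below; the toy model, GPU ids, and tensor shapes are illustrative assumptions, not part of this commit.

require 'cutorch'
require 'cunn'

local gpus = {1, 2}                             -- assumed GPU ids

-- split dimension, flattenParams, and the new usenccl flag
local dpt = nn.DataParallelTable(1, true, true)
for _, gpu in ipairs(gpus) do
   cutorch.setDevice(gpu)
   dpt:add(nn.Linear(10, 10):cuda(), gpu)       -- toy replica on each GPU
end
cutorch.setDevice(gpus[1])

local input = torch.CudaTensor(8, 10):uniform()
local gradOutput = torch.CudaTensor(8, 10):uniform()

-- Used like any nn.Module; when require('nccl') succeeds, the backward pass
-- reduces the flattened gradients with nccl.reduce, and syncParameters()
-- broadcasts the flattened parameters with nccl.bcast (see the diff below).
dpt:forward(input)
dpt:backward(input, gradOutput)
dpt:syncParameters()                            -- typically after updating parameters on the base GPU

Note that the "x ~= nil and x or true" idiom used for both flattenParams and usenccl evaluates to true even when false is passed explicitly, so in practice NCCL is disabled only when require('nccl') fails.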
diff --git a/DataParallelTable.lua b/DataParallelTable.lua
index edb84c8..78dcc20 100644
--- a/DataParallelTable.lua
+++ b/DataParallelTable.lua
@@ -185,7 +185,7 @@ end
local DataParallelTable, parent = torch.class('nn.DataParallelTable',
   'nn.Container')
-function DataParallelTable:__init(dimension, flattenParams)
+function DataParallelTable:__init(dimension, flattenParams, usenccl)
   parent.__init(self)
   if not dimension then
      error "must specify a dimension!"
@@ -199,8 +199,15 @@ function DataParallelTable:__init(dimension, flattenParams)
   self.outputGpu = {} -- outputs for each gpu
   self.gradInputGpu = {} -- gradInput for each gpu
   self.flattenParams = flattenParams ~= nil and flattenParams or true
+   self.usenccl = (usenccl ~= nil and usenccl or true) and self.flattenParams
   self.flattenedParamsGpu = {} --flattened parameters for each gpu
   self.flattenedGradParamsGpu = {} --flattened parameters for each gpu
+   if self.usenccl then
+      if not pcall(function() require('nccl') end) then
+         print("warning: could not load nccl, falling back to default communication")
+         self.usenccl=false
+      end
+   end
end
@@ -226,7 +233,7 @@ end
function DataParallelTable:flattenParameters()
-   if #self.modules == 1 then return end
+   if #self.modules == 1 then self.flattenParams = false return end
   local prevGpuid = cutorch.getDevice()
   local sizetmp,stridetmp
   for i=1, #self.modules do
@@ -382,51 +389,56 @@ function DataParallelTable:accGradParameters(input, gradOutput, scale)
   -- Accumulate the gradients onto one GPU (the first one)
   -- TODO: Parallelize this (ie a parallel merge)
-   local baseParams, baseGradParams
-   if self.flattenParams then
-      baseGradParams = self.flattenedGradParamsGpu[baseGpuIndex]
+   if self.flattenParams and self.usenccl then
+      nccl.reduce(self.flattenedGradParamsGpu, nil,true,baseGpuIndex)
   else
-      _, baseGradParams = self.modules[baseGpuIndex]:parameters()
-   end
-   for i, module in ipairs(self.modules) do
-      if (i ~= baseGpuIndex) then
-         local gradParams
-         if self.flattenParams then
-            gradParams = self.flattenedGradParamsGpu[i]
-         else
-            _, gradParams = self.modules[i]:parameters()
-         end
-         deepTensorsAdd(baseGradParams, gradParams) -- dst, src
-         cutorch.synchronize()
+      local baseParams, baseGradParams
+      if self.flattenParams then
+         baseGradParams = self.flattenedGradParamsGpu[baseGpuIndex]
+      else
+         _, baseGradParams = self.modules[baseGpuIndex]:parameters()
+      end
+      for i, module in ipairs(self.modules) do
+         if (i ~= baseGpuIndex) then
+            local gradParams
+            if self.flattenParams then
+               gradParams = self.flattenedGradParamsGpu[i]
+            else
+               _, gradParams = self.modules[i]:parameters()
+            end
+            deepTensorsAdd(baseGradParams, gradParams) -- dst, src
+            cutorch.synchronize()
+         end
      end
   end
-
-
   setDevice(prevGpuid)
end

function DataParallelTable:syncParameters()
   local prevGpuid = cutorch.getDevice()
-   local baseParams
-   if self.flattenParams then
-      baseParams = self.flattenedParamsGpu[baseGpuIndex]
+   if self.flattenParams and self.usenccl then
+      nccl.bcast(self.flattenedParamsGpu, true,baseGpuIndex)
   else
-      baseParams, _ = self.modules[baseGpuIndex]:parameters()
-   end
-   -- TODO: Parallelize this (ie a parallel copy)
-   for i, module in ipairs(self.modules) do
-      if (i ~= baseGpuIndex) then
-         local params
-         if self.flattenParams then
-            params = self.flattenedParamsGpu[i]
-         else
-            params, _ = self.modules[i]:parameters()
-         end
-         deepTensorsCopy(params, baseParams) -- dst, src
+      local baseParams
+      if self.flattenParams then
+         baseParams = self.flattenedParamsGpu[baseGpuIndex]
+      else
+         baseParams, _ = self.modules[baseGpuIndex]:parameters()
      end
+      -- TODO: Parallelize this (ie a parallel copy)
+      for i, module in ipairs(self.modules) do
+         if (i ~= baseGpuIndex) then
+            local params
+            if self.flattenParams then
+               params = self.flattenedParamsGpu[i]
+            else
+               params, _ = self.modules[i]:parameters()
+            end
+            deepTensorsCopy(params, baseParams) -- dst, src
+         end
+      end
+      cutorch.synchronize()
   end
-   cutorch.synchronize()
-
   setDevice(prevGpuid)
end