author    | ngimel <ngimelshein@nvidia.com>          | 2015-12-04 01:23:31 +0300
committer | ngimel <ngimelshein@nvidia.com>          | 2015-12-04 01:23:31 +0300
commit    | a9f790b62dff5fc3c0c8ba88d8fad9349ee6944a (patch)
tree      | 144918e072f42602d5c936f7b3e46add29579b6a /DataParallelTable.lua
parent    | b905dbc5b2ebc2e253b2806b01cba5d99dc168a8 (diff)
add nccl support to DataParallelTable
Diffstat (limited to 'DataParallelTable.lua')
-rw-r--r-- | DataParallelTable.lua | 84
1 file changed, 48 insertions, 36 deletions
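Usage note: the change adds an optional third constructor argument, usenccl, gated on flattenParams. (As written, the `x ~= nil and x or true` idiom used for both flags coerces an explicit false back to true, so NCCL is effectively requested whenever flattening is on and the package loads.) Below is a minimal usage sketch, assuming the classic dpt:add(module, gpuid) API of this era's DataParallelTable; buildModel, the GPU ids, and input are illustrative, not part of the patch:

```lua
require 'cunn'

-- Hypothetical two-GPU setup; buildModel() stands in for any network factory.
local dpt = nn.DataParallelTable(1, true, true) -- dimension, flattenParams, usenccl
for gpu = 1, 2 do
   cutorch.setDevice(gpu)
   dpt:add(buildModel():cuda(), gpu)
end
cutorch.setDevice(1)

-- If require('nccl') fails inside __init, a warning is printed and
-- self.usenccl is cleared, so training transparently falls back to the
-- deepTensorsAdd/deepTensorsCopy path kept in the else branches below.
local output = dpt:forward(input) -- input: a hypothetical CudaTensor minibatch
```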
diff --git a/DataParallelTable.lua b/DataParallelTable.lua
index edb84c8..78dcc20 100644
--- a/DataParallelTable.lua
+++ b/DataParallelTable.lua
@@ -185,7 +185,7 @@ end
 local DataParallelTable, parent = torch.class('nn.DataParallelTable',
                                               'nn.Container')
 
-function DataParallelTable:__init(dimension, flattenParams)
+function DataParallelTable:__init(dimension, flattenParams, usenccl)
    parent.__init(self)
    if not dimension then
       error "must specify a dimension!"
@@ -199,8 +199,15 @@ function DataParallelTable:__init(dimension, flattenParams)
    self.outputGpu = {} -- outputs for each gpu
    self.gradInputGpu = {} -- gradInput for each gpu
    self.flattenParams = flattenParams ~= nil and flattenParams or true
+   self.usenccl = (usenccl ~= nil and usenccl or true) and self.flattenParams
    self.flattenedParamsGpu = {} --flattened parameters for each gpu
    self.flattenedGradParamsGpu = {} --flattened parameters for each gpu
+   if self.usenccl then
+      if not pcall(function() require('nccl') end) then
+         print("warning: could not load nccl, falling back to default communication")
+         self.usenccl=false
+      end
+   end
 end
@@ -226,7 +233,7 @@ end
 
 function DataParallelTable:flattenParameters()
-   if #self.modules == 1 then return end
+   if #self.modules == 1 then self.flattenParams = false return end
    local prevGpuid = cutorch.getDevice()
    local sizetmp,stridetmp
    for i=1, #self.modules do
@@ -382,51 +389,56 @@ function DataParallelTable:accGradParameters(input, gradOutput, scale)
 
    -- Accumulate the gradients onto one GPU (the first one)
    -- TODO: Parallelize this (ie a parallel merge)
-   local baseParams, baseGradParams
-   if self.flattenParams then
-      baseGradParams = self.flattenedGradParamsGpu[baseGpuIndex]
+   if self.flattenParams and self.usenccl then
+      nccl.reduce(self.flattenedGradParamsGpu, nil,true,baseGpuIndex)
    else
-      _, baseGradParams = self.modules[baseGpuIndex]:parameters()
-   end
-   for i, module in ipairs(self.modules) do
-      if (i ~= baseGpuIndex) then
-         local gradParams
-         if self.flattenParams then
-            gradParams = self.flattenedGradParamsGpu[i]
-         else
-            _, gradParams = self.modules[i]:parameters()
-         end
-         deepTensorsAdd(baseGradParams, gradParams) -- dst, src
-         cutorch.synchronize()
+      local baseParams, baseGradParams
+      if self.flattenParams then
+         baseGradParams = self.flattenedGradParamsGpu[baseGpuIndex]
+      else
+         _, baseGradParams = self.modules[baseGpuIndex]:parameters()
+      end
+      for i, module in ipairs(self.modules) do
+         if (i ~= baseGpuIndex) then
+            local gradParams
+            if self.flattenParams then
+               gradParams = self.flattenedGradParamsGpu[i]
+            else
+               _, gradParams = self.modules[i]:parameters()
+            end
+            deepTensorsAdd(baseGradParams, gradParams) -- dst, src
+            cutorch.synchronize()
+         end
       end
    end
-
-   setDevice(prevGpuid)
 end
 
 function DataParallelTable:syncParameters()
    local prevGpuid = cutorch.getDevice()
-   local baseParams
-   if self.flattenParams then
-      baseParams = self.flattenedParamsGpu[baseGpuIndex]
+   if self.flattenParams and self.usenccl then
+      nccl.bcast(self.flattenedParamsGpu, true,baseGpuIndex)
    else
-      baseParams, _ = self.modules[baseGpuIndex]:parameters()
-   end
-   -- TODO: Parallelize this (ie a parallel copy)
-   for i, module in ipairs(self.modules) do
-      if (i ~= baseGpuIndex) then
-         local params
-         if self.flattenParams then
-            params = self.flattenedParamsGpu[i]
-         else
-            params, _ = self.modules[i]:parameters()
-         end
-         deepTensorsCopy(params, baseParams) -- dst, src
+      local baseParams
+      if self.flattenParams then
+         baseParams = self.flattenedParamsGpu[baseGpuIndex]
+      else
+         baseParams, _ = self.modules[baseGpuIndex]:parameters()
+      end
+      -- TODO: Parallelize this (ie a parallel copy)
+      for i, module in ipairs(self.modules) do
+         if (i ~= baseGpuIndex) then
+            local params
+            if self.flattenParams then
+               params = self.flattenedParamsGpu[i]
+            else
+               params, _ = self.modules[i]:parameters()
+            end
+            deepTensorsCopy(params, baseParams) -- dst, src
+         end
+      end
+      cutorch.synchronize()
    end
-   cutorch.synchronize()
-   setDevice(prevGpuid)
 end
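A note on the two NCCL calls, inferred from their call sites above rather than from nccl.torch documentation: nccl.reduce(tensors, nil, true, root) appears to sum a table of identically-sized per-GPU tensors into the root entry, and nccl.bcast(tensors, true, root) to copy the root entry out to every other GPU — the same results the replaced deepTensorsAdd/deepTensorsCopy loops produced one GPU at a time. A reference sketch of those semantics in plain cutorch, assuming table index i corresponds to GPU i:

```lua
-- Result-equivalent versions of the collectives used in the diff, written
-- with ordinary cutorch ops. Illustration only: NCCL performs ring-style
-- GPU-to-GPU transfers instead of funneling data through one device.

-- reduce-to-root: sum every per-GPU buffer into tensors[root]
local function referenceReduce(tensors, root)
   local prevGpuid = cutorch.getDevice()
   cutorch.setDevice(root) -- assumes table index i == GPU id i
   local tmp = tensors[root].new():resizeAs(tensors[root])
   for i, t in ipairs(tensors) do
      if i ~= root then
         tmp:copy(t)            -- cross-GPU copy onto root's device
         tensors[root]:add(tmp)
      end
   end
   cutorch.setDevice(prevGpuid)
end

-- broadcast-from-root: overwrite every other buffer with tensors[root]
local function referenceBcast(tensors, root)
   for i, t in ipairs(tensors) do
      if i ~= root then
         t:copy(tensors[root])  -- cutorch handles the cross-GPU copy
      end
   end
end
```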