author     Clement Farabet <clement.farabet@gmail.com>  2011-09-01 02:05:59 +0400
committer  Clement Farabet <clement.farabet@gmail.com>  2011-09-01 02:05:59 +0400
commit     24219cba24be5a75d6666d1ca2a9a6ea0bde252a (patch)
tree       3de1331ff9124dc5fffac19223e1ff016b0979c3
parent     cec2df99480f81c14b9e782ef84cb6440fd5044e (diff)
Re-organized BatchOptimization modules.
-rw-r--r--  BatchOptimization.lua |  87
-rw-r--r--  LBFGSOptimization.lua | 277
-rw-r--r--  SGDOptimization.lua   |  99
-rw-r--r--  init.lua              |   2
4 files changed, 101 insertions, 364 deletions
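
Note: the net effect of the commit is a template-method split. nn.BatchOptimization keeps the batch machinery (the self.evaluate closure, evalCounter/sampleCounter, and the map/reduce plumbing), while nn.SGDOptimization and nn.LBFGSOptimization become subclasses that only supply an optimize() callback. A minimal Lua sketch of that subclass contract follows; the ToyOptimization class is hypothetical and not part of the commit:

   -- Sketch only: forward(), inherited from nn.BatchOptimization, builds
   -- self.evaluate for the current mini-batch and then calls self:optimize()
   -- if the subclass defines it.
   local Toy,parent = torch.class('nn.ToyOptimization', 'nn.BatchOptimization')

   function Toy:__init(...)
      -- parent sets module, criterion, parameters/gradParameters,
      -- evalCounter and sampleCounter
      parent.__init(self, ...)
   end

   function Toy:optimize()
      self.evaluate()                                  -- average f(X) + df/dX
      self.parameters:add(-0.01, self.gradParameters)  -- one fixed-rate step
   end
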
diff --git a/BatchOptimization.lua b/BatchOptimization.lua
index 0951ae4..4caa064 100644
--- a/BatchOptimization.lua
+++ b/BatchOptimization.lua
@@ -12,12 +12,6 @@ function Batch:__init(...)
       {arg='module', type='nn.Module', help='a module to train', req=true},
       {arg='criterion', type='nn.Criterion',
        help='a criterion to estimate the error', req=true},
-      {arg='maxIterations', type='number',
-       help='maximum nb of iterations per pass (0 = no max)', default=0},
-      {arg='maxLineSearch', type='number',
-       help='maximum nb of steps in line search', default=20},
-      {arg='sparsity', type='number',
-       help='sparsity coef (Orthantwise C)', default=0},
       {arg='parallelize', type='number',
        help='parallelize onto N cores (experimental!)', default=1},
       {arg='verbose', type='number',
@@ -25,10 +19,11 @@ function Batch:__init(...)
    )
    self.parameters = nnx.flattenParameters(nnx.getParameters(self.module))
    self.gradParameters = nnx.flattenParameters(nnx.getGradParameters(self.module))
+   self.evalCounter = 0
+   self.sampleCounter = 0
    if self.parallelize > 1 then
       self:setup_mapreduce()
    end
-   batch = {}
 end
 
 function Batch:forward(inputs, targets, options)
@@ -46,8 +41,13 @@ function Batch:forward_sequential(inputs, targets, options)
    --     + self.parameters contains the current X vector
    --     + self.gradParameters contains the estimated dF/dX vector
    --     + self.output contains the estimated (average) F(X)
-   batch.evaluate
+   self.evaluate
       = function()
+           -- verbose
+           if self.verbose >= 2 then
+              print('<BatchOptimization> evaluating f(X) + df/dX')
+           end
+           local _t_ = sys.clock()
           -- reset gradients
           self.gradParameters:zero()
           -- f is the average of all criterions
@@ -70,13 +70,28 @@ function Batch:forward_sequential(inputs, targets, options)
                 self.posthook(self, {inputs[i], targets[i], options[i]})
              end
           end
+          -- update evaluation counter
+          self.evalCounter = self.evalCounter + 1
           -- normalize gradients
           self.gradParameters:div(#inputs)
+          -- verbose
+          if self.verbose >= 2 then
+             print('<BatchOptimization> ' .. self.evalCounter .. 'th evaluation took ' .. (sys.clock() - _t_) .. ' sec')
+          end
           -- return average f(X)
-          return self.output/#inputs
+          self.output = self.output/#inputs
+          return self.output
        end
 
-   -- (3) return current output after optimization
+   -- (2) optimization callback
+   if self.optimize then
+      self:optimize()
+   end
+
+   -- (3) update sample counter
+   self.sampleCounter = self.sampleCounter + #inputs
+
+   -- (4) return current output after optimization
    return self.output
 end
 
@@ -90,8 +105,8 @@ function Batch:forward_mapreduce(inputs, targets, options)
       if type(self.prehook) == 'string' then
          parallel.children:send(self.prehook)
       else
-         print('\r<BatchOptimization> WARNING: when using para||el mode, hooks should be')
-         print('\r<BatchOptimization> WARNING: defined as strings. User prehook ignored.')
+         print('\r<BatchOptimization> WARNING: when using para||el mode,'..
+               ' hooks should be defined as strings. User prehook ignored.')
          parallel.children:send('')
       end
    else
@@ -101,8 +116,8 @@ function Batch:forward_mapreduce(inputs, targets, options)
       if type(self.posthook) == 'string' then
         parallel.children:send(self.posthook)
      else
-         print('\r<BatchOptimization> WARNING: when using para||el mode, hooks should be')
-         print('<\rBatchOptimization> WARNING: defined as strings. User posthook ignored.')
+         print('\r<BatchOptimization> WARNING: when using para||el mode,'..
+               ' hooks should be defined as strings. User posthook ignored.')
         parallel.children:send('')
      end
   else
@@ -142,30 +157,41 @@ function Batch:forward_mapreduce(inputs, targets, options)
    --     + self.parameters contains the current X vector
    --     + self.gradParameters contains the estimated dF/dX vector
    --     + self.output contains the estimated (average) F(X)
-   batch.evaluate
+   self.evaluate
       = function()
-           batch.evaluate_map()
-           return batch.evaluate_reduce()
+           -- verbose
+           if self.verbose >= 2 then
+              print('<BatchOptimization> evaluating f(X) + df/dX')
+           end
+           local _t_ = sys.clock()
+           -- do map/reduce
+           self.evaluate_map()
+           self.evaluate_reduce()
+           -- update evaluation counter
+           self.evalCounter = self.evalCounter + 1
+           -- verbose
+           if self.verbose >= 2 then
+              print('<BatchOptimization> ' .. self.evalCounter .. 'th evaluation took ' .. (sys.clock() - _t_) .. ' sec')
+           end
+           return self.output
        end
 
    -- (1a) the map part of the evaluation: compute partial gradients
    --      in separate threads
-   batch.evaluate_map
+   self.evaluate_map
       = function()
           -- transmit new parameters to all workers
           parallel.children:send(self.parameters)
           -- then wait for all workers to return their partial gradParameters + outputs
-          for t = 1,P do
-             gradParametersPartial[t] = parallel.children[t]:receive()
-             outputsPartial[t] = parallel.children[t]:receive()
-          end
+          gradParametersPartial = parallel.children:receive()
+          outputsPartial = parallel.children:receive()
           -- force cleanup
           collectgarbage()
        end
 
    -- (1b) the reduce part of the evaluation: accumulate all
    --      partial estimates of the gradients
-   batch.evaluate_reduce
+   self.evaluate_reduce
       = function()
           -- accumulate partial gradients, and average
           self.gradParameters:zero()
@@ -178,14 +204,22 @@ function Batch:forward_mapreduce(inputs, targets, options)
           for t = 1,P do
              self.output = self.output + outputsPartial[t]
           end
-          return self.output/#inputs
+          self.output = self.output/#inputs
        end
 
+   -- (2) optimization callback
+   if self.optimize then
+      self:optimize()
+   end
+
    -- (3) reset workers so they're ready for next mini-batch
    parallel.children:send('break')
 
-   -- (4) return current output after optimization
-   return self.output/#inputs
+   -- (4) update sample counter
+   self.sampleCounter = self.sampleCounter + #inputs
+
+   -- (5) return current output after optimization
+   return self.output
 end
 
@@ -194,7 +228,6 @@ function Batch:setup_mapreduce ()
       xerror('install parallel for Lua to enable parallel computing (luarocks install parallel)',
              'nn.BatchOptimization')
    end
-   parallel.setSharedSize(4*1024*1024)
    local P = self.parallelize
 
    -- (1) define code for workers
diff --git a/LBFGSOptimization.lua b/LBFGSOptimization.lua
index 976226d..720b7ba 100644
--- a/LBFGSOptimization.lua
+++ b/LBFGSOptimization.lua
@@ -1,13 +1,10 @@
-local LBFGS,parent = torch.class('nn.LBFGSOptimization', 'nn.Optimization')
+local LBFGS,parent = torch.class('nn.LBFGSOptimization', 'nn.BatchOptimization')
 
 function LBFGS:__init(...)
    require 'liblbfgs'
-   parent.__init(self)
+   parent.__init(self, ...)
    xlua.unpack_class(self, {...},
       'LBFGSOptimization', nil,
-      {arg='module', type='nn.Module', help='a module to train', req=true},
-      {arg='criterion', type='nn.Criterion',
-       help='a criterion to estimate the error', req=true},
       {arg='maxIterations', type='number',
        help='maximum nb of iterations per pass (0 = no max)', default=0},
       {arg='maxLineSearch', type='number',
@@ -15,277 +12,17 @@ function LBFGS:__init(...)
       {arg='sparsity', type='number',
        help='sparsity coef (Orthantwise C)', default=0},
       {arg='parallelize', type='number',
-       help='parallelize onto N cores (experimental!)', default=1},
-      {arg='verbose', type='number',
-       help='verbose level during training [0-2]', default=0}
+       help='parallelize onto N cores (experimental!)', default=1}
    )
    self.parameters = nnx.flattenParameters(nnx.getParameters(self.module))
    self.gradParameters = nnx.flattenParameters(nnx.getGradParameters(self.module))
-   if self.parallelize > 1 then
-      self:setup_mapreduce()
-   end
 end
 
-function LBFGS:forward(inputs, targets, options)
-   options = options or {}
-   if self.parallelize > 1 then
-      return self:forward_mapreduce(inputs, targets, options)
-   else
-      return self:forward_sequential(inputs, targets, options)
-   end
-end
-
-function LBFGS:forward_sequential(inputs, targets, options)
-   -- (1) construct a closure that compute f(inputs) + df/dW
-   --     after each call to that function:
-   --     + self.parameters contains the current X vector
-   --     + self.gradParameters contains the estimated dF/dX vector
-   --     + self.output contains the estimated (average) F(X)
-   lbfgs.evaluate
-      = function()
-           -- verbose
-           if self.verbose >= 2 then print('<LBFGSOptimization> evaluating f(X) + df/dX') end
-           local _t_ = sys.clock()
-           -- reset gradients
-           self.gradParameters:zero()
-           -- f is the average of all criterions
-           self.output = 0
-           -- given all inputs, evaluate gradients
-           for i = 1,#inputs do
-              -- user hook
-              if self.prehook then
-                 self.prehook(self, {inputs[i], targets[i], options[i]})
-              end
-              -- estimate f
-              local output = self.module:forward(inputs[i])
-              local err = self.criterion:forward(output, targets[i])
-              self.output = self.output + err
-              -- estimate df/dW
-              local df_do = self.criterion:backward(output, targets[i])
-              self.module:backward(inputs[i], df_do)
-              -- user hook
-              if self.posthook then
-                 self.posthook(self, {inputs[i], targets[i], options[i]})
-              end
-           end
-           -- normalize gradients
-           self.gradParameters:div(#inputs)
-           -- verbose
-           if self.verbose >= 2 then print('<LBFGSOptimization> f(X) + df/dX computed in ' .. (sys.clock() - _t_) .. ' sec') end
-           -- return average f(X)
-           return self.output/#inputs
-        end
-
-   -- (2) the magic function: will update the parameter vector
-   --     according to the l-BFGS method
-   self.output = lbfgs.run(self.parameters, self.gradParameters,
-                           self.maxIterations, self.maxLineSearch,
-                           self.sparsity, self.verbose)
-
-   -- (3) return current output after optimization
-   return self.output
-end
-
-function LBFGS:forward_mapreduce(inputs, targets, options)
-   -- parameters
-   local P = self.parallelize
-
-   -- transmit user hooks, if defined
-   if not self.hooksets then
-      if self.prehook then
-         if type(self.prehook) == 'string' then
-            parallel.children:send(self.prehook)
-         else
-            print('\r<LBFGSOptimization> WARNING: when using para||el mode, hooks should be defined as strings. User prehook ignored.')
-            parallel.children:send('')
-         end
-      else
-         parallel.children:send('')
-      end
-      if self.posthook then
-         if type(self.posthook) == 'string' then
-            parallel.children:send(self.posthook)
-         else
-            print('\r<LBFGSOptimization> WARNING: when using para||el mode, hooks should be defined as strings. User posthook ignored.')
-            parallel.children:send('')
-         end
-      else
-         parallel.children:send('')
-      end
-      self.hooksets = true
-   end
-
-   -- (0a) replicate output and gradParameters
-   local outputsPartial = {}
-   local gradParametersPartial = {}
-
-   -- (0b) divide input/target batch into N batches
-   local inputss = {}
-   local targetss = {}
-   local optionss = {}
-   for t = 1,P do
-      inputss[t] = {}
-      targetss[t] = {}
-      optionss[t] = {}
-      for i = t,#inputs,P do
-         table.insert(inputss[t], inputs[i])
-         table.insert(targetss[t], targets[i])
-         if options then table.insert(optionss[t], options[i]) end
-      end
-   end
-
-   -- (0c) send mini-batch to all workers
-   for t = 1,P do
-      parallel.children[t]:send(inputss[t])
-      parallel.children[t]:send(targetss[t])
-      parallel.children[t]:send(optionss[t])
-   end
-
-   -- (1) construct a closure that compute f(inputs) + df/dW
-   --     after each call to that function:
-   --     + self.parameters contains the current X vector
-   --     + self.gradParameters contains the estimated dF/dX vector
-   --     + self.output contains the estimated (average) F(X)
-   lbfgs.evaluate
-      = function()
-           lbfgs.evaluate_map()
-           return lbfgs.evaluate_reduce()
-        end
-
-   -- (1a) the map part of the evaluation: compute partial gradients
-   --      in separate threads
-   lbfgs.evaluate_map
-      = function()
-           -- verbose
-           if self.verbose >= 2 then print('<LBFGSOptimization> evaluating f(X) + df/dX') end
-           local _t_ = sys.clock()
-           -- transmit new parameters to all workers
-           parallel.children:send(self.parameters)
-           -- then wait for all workers to return their partial gradParameters + outputs
-           gradParametersPartial = parallel.children:receive()
-           outputsPartial = parallel.children:receive()
-           -- force cleanup
-           collectgarbage()
-           -- verbose
-           if self.verbose >= 2 then print('<LBFGSOptimization> f(X) + df/dX computed in ' .. (sys.clock() - _t_) .. ' sec') end
-        end
-
-   -- (1b) the reduce part of the evaluation: accumulate all
-   --      partial estimates of the gradients
-   lbfgs.evaluate_reduce
-      = function()
-           -- accumulate partial gradients, and average
-           self.gradParameters:zero()
-           for t = 1,P do
-              self.gradParameters:add(gradParametersPartial[t])
-           end
-           self.gradParameters:div(#inputs)
-           -- return average f(X)
-           self.output = 0
-           for t = 1,P do
-              self.output = self.output + outputsPartial[t]
-           end
-           return self.output/#inputs
-        end
-
-   -- (2) the magic function: will update the parameter vector
-   --     according to the l-BFGS method
+function LBFGS:optimize()
+   lbfgs.evaluate = self.evaluate
+   -- the magic function: will update the parameter vector
+   -- according to the l-BFGS method
    self.output = lbfgs.run(self.parameters, self.gradParameters,
                            self.maxIterations, self.maxLineSearch,
                            self.sparsity, self.verbose)
-
-   -- (3) reset workers so they're ready for next mini-batch
-   parallel.children:send('break')
-
-   -- (4) return current output after optimization
-   return self.output
-end
-
-function LBFGS:setup_mapreduce ()
-   -- (0) startup parallel package
-   if not xrequire 'parallel' then
-      xerror('install parallel for Lua to enable parallel computing (luarocks install parallel)',
-             'nn.LBFGSOptimization')
-   end
-   parallel.setSharedSize(4*1024*1024)
-   local P = self.parallelize
-
-   -- (1) define code for workers
-   local worker_code = [[
-         -- require packages
-         require 'nnx'
-
-         -- retrieve module + criterion at startup
-         module = parallel.parent:receive()
-         criterion = parallel.parent:receive()
-
-         -- create fake optimizer, for hooks
-         optimizer = {module=module, criterion=criterion}
-
-         -- retrieve optional prehook/posthook
-         prehook = parallel.parent:receive()
-         posthook = parallel.parent:receive()
-         if prehook ~= '' then loadstring(prehook)() else prehook = nil end
-         if posthook ~= '' then loadstring(posthook)() else posthook = nil end
-
-         -- get pointer to parameter and gradParameter vectors
-         parameters = nnx.flattenParameters(nnx.getParameters(module))
-         gradParameters = nnx.flattenParameters(nnx.getGradParameters(module))
-
-         -- outter loop: mini-batches
-         while true do
-            -- receive new mini-batch
-            inputs = parallel.parent:receive()
-            if type(inputs) == 'string' and inputs == 'break' then break end
-            targets = parallel.parent:receive()
-            options = parallel.parent:receive()
-
-            -- inner loop: evaluations
-            while true do
-               -- receive new set of parameters
-               newParameters = parallel.parent:receive()
-               if type(newParameters) == 'string' and newParameters == 'break' then break end
-               parameters:copy(newParameters)
-
-               -- reset gradients
-               gradParameters:zero()
-               -- f is the average of all criterions
-               local f_x = 0
-               -- evaluate gradients on inputs for this thread
-               for i = 1,#inputs do
-                  -- user hook
-                  if prehook then
-                     prehook(optimizer, {inputs[i], targets[i], options[i]})
-                  end
-                  -- estimate f
-                  local output = module:forward(inputs[i])
-                  local err = criterion:forward(output, targets[i])
-                  f_x = f_x + err
-                  -- estimate df/dW
-                  local df_do = criterion:backward(output, targets[i])
-                  module:backward(inputs[i], df_do)
-                  -- user hook
-                  if posthook then
-                     posthook(optimizer, {inputs[i], targets[i], options[i]})
-                  end
-               end
-
-               -- now send back gradParameters + partial output
-               parallel.parent:send(gradParameters)
-               parallel.parent:send(f_x)
-
-               -- force cleanup
-               collectgarbage()
-            end
-         end
-   ]]
-
-   -- (2) startup all workers
-   for t = 1,P do
-      parallel.run(worker_code)
-   end
-
-   -- (3) and send them the module + criterion architecture
-   parallel.children:send(self.module)
-   parallel.children:send(self.criterion)
 end
diff --git a/SGDOptimization.lua b/SGDOptimization.lua
index 88d63d0..ddbf220 100644
--- a/SGDOptimization.lua
+++ b/SGDOptimization.lua
@@ -1,81 +1,48 @@
-local SGD,parent = torch.class('nn.SGDOptimization', 'nn.Optimization')
+local SGD,parent = torch.class('nn.SGDOptimization', 'nn.BatchOptimization')
 
 function SGD:__init(...)
-   parent.__init(self)
+   parent.__init(self,...)
    xlua.unpack_class(self, {...},
       'SGDOptimization', nil,
-      {arg='module', type='nn.Module', help='a module to train', req=true},
-      {arg='criterion', type='nn.Criterion', help='a criterion to estimate the error', req=true},
-      {arg='learningRate', type='number', help='learning rate (W = W - rate*dE/dW)', default=1e-2},
-      {arg='learningRateDecay', type='number', help='learning rate decay (lr_t = lr_0 / (1 + samplesSeen*lrDecay))', default=0},
-      {arg='weightDecay', type='number', help='amount of weight decay (W = W - decay*W)', default=0},
-      {arg='momentum', type='number', help='amount of momentum on weights (dE/W = dE/dW*(1-momentum) + prev(dE/dW)*momentum)', default=0}
+      {arg='maxIterations', type='number',
+       help='maximum nb of iterations per pass', default=1},
+      {arg='learningRate', type='number',
+       help='learning rate (W = W - rate*dE/dW)', default=1e-2},
+      {arg='learningRateDecay', type='number',
+       help='learning rate decay (lr_t = lr_0 / (1 + samplesSeen*lrDecay))', default=0},
+      {arg='weightDecay', type='number',
+       help='amount of weight decay (W = W - decay*W)', default=0},
+      {arg='momentum', type='number',
+       help='amount of momentum on weights (dE/W = dE/dW*(1-momentum) + prev(dE/dW)*momentum)', default=0}
    )
    self.parameters = nnx.flattenParameters(nnx.getParameters(self.module))
    self.gradParameters = nnx.flattenParameters(nnx.getGradParameters(self.module))
-   self.samplesSeen = 0
 end
 
-function SGD:forward(inputs, targets, options)
-   options = options or {}
-
-   -- reset gradients
-   self.gradParameters:zero()
-
-   -- f is the average of all criterions
-   self.output = 0
-
-   -- given all inputs, evaluate gradients
-   for i = 1,#inputs do
-      -- user hook
-      if self.prehook then
-         self.prehook(self, {inputs[i], targets[i], options[i]})
-      end
-
-      -- estimate f
-      local output = self.module:forward(inputs[i])
-      local err = self.criterion:forward(output, targets[i])
-      self.output = self.output + err
-
-      -- estimate df/dW
-      local df_do = self.criterion:backward(output, targets[i])
-      self.module:backward(inputs[i], df_do)
-
-      -- user hook
-      if self.posthook then
-         self.posthook(self, {inputs[i], targets[i], options[i]})
+function SGD:optimize()
+   -- optimize N times
+   for i = 1,self.maxIterations do
+      -- evaluate f(X) + df/dX
+      self.evaluate()
+
+      -- apply momentum
+      if self.momentum ~= 0 then
+         if not self.currentGradParameters then
+            self.currentGradParameters = torch.Tensor():resizeAs(self.gradParameters):copy(self.gradParameters)
+         else
+            self.currentGradParameters:mul(self.momentum):add(1-self.momentum, self.gradParameters)
+         end
+      else
+         self.currentGradParameters = self.gradParameters
       end
-   end
-   -- update counter of samples seen
-   self.samplesSeen = self.samplesSeen + #inputs
-
-   -- renorm f
-   self.output = self.output / #inputs
-
-   -- normalize gradients
-   self.gradParameters:div(#inputs)
-
-   -- apply momentum
-   if self.momentum ~= 0 then
-      if not self.currentGradParameters then
-         self.currentGradParameters = torch.Tensor():resizeAs(self.gradParameters):copy(self.gradParameters)
-      else
-         self.currentGradParameters:mul(self.momentum):add(1-self.momentum, self.gradParameters)
+      -- weight decay
+      if self.weightDecay ~= 0 then
+         self.parameters:add(-self.weightDecay, self.parameters)
       end
-   else
-      self.currentGradParameters = self.gradParameters
-   end
-   -- weight decay
-   if self.weightDecay ~= 0 then
-      self.parameters:add(-self.weightDecay, self.parameters)
+      -- update parameters
+      local learningRate = self.learningRate / (1 + self.sampleCounter*self.learningRateDecay)
+      self.parameters:add(-learningRate, self.currentGradParameters)
    end
-
-   -- update parameters
-   local learningRate = self.learningRate / (1 + self.samplesSeen*self.learningRateDecay)
-   self.parameters:add(-learningRate, self.currentGradParameters)
-
-   -- return current output
-   return self.output
 end
diff --git a/init.lua b/init.lua
--- a/init.lua
+++ b/init.lua
@@ -100,9 +100,9 @@ torch.include('nnx', 'SpatialSparseCriterion.lua')
 
 -- optimizations:
 torch.include('nnx', 'Optimization.lua')
+torch.include('nnx', 'BatchOptimization.lua')
 torch.include('nnx', 'SGDOptimization.lua')
 torch.include('nnx', 'LBFGSOptimization.lua')
-torch.include('nnx', 'BatchOptimization.lua')
 
 -- trainers:
 torch.include('nnx', 'Trainer.lua')
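
For context, a hypothetical end-to-end use of the refactored classes. Nothing below ships with this commit; the model, criterion, and data are placeholders, and only the constructor arguments and the forward() signature are taken from the diff above:

   require 'nnx'

   -- toy model + criterion, for illustration only
   local mlp = nn.Sequential()
   mlp:add(nn.Linear(10,5))
   mlp:add(nn.Tanh())
   mlp:add(nn.Linear(5,1))
   local criterion = nn.MSECriterion()

   -- SGD now inherits module/criterion/parallelize/verbose handling
   -- from nn.BatchOptimization
   local optimizer = nn.SGDOptimization{module = mlp,
                                        criterion = criterion,
                                        learningRate = 1e-2,
                                        learningRateDecay = 1e-4,
                                        momentum = 0.9}

   -- BatchOptimization:forward() expects tables of samples, one tensor per entry
   local inputs, targets = {}, {}
   for i = 1,16 do
      inputs[i]  = torch.randn(10)
      targets[i] = torch.randn(1)
   end

   -- builds self.evaluate, runs SGD:optimize(), bumps sampleCounter
   -- (which drives lr_t = lr_0 / (1 + sampleCounter*lrDecay)), and
   -- returns the average f(X) over the batch
   local err = optimizer:forward(inputs, targets)
   print('average batch error: ' .. err)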