Merge branch 'genetic'

Conflicts: BatchOptimization.lua nnx-1.0-1.rockspec
author: Marco Scoffier <github@metm.org> 2011-09-28 01:20:52 +0400
committer: Marco Scoffier <github@metm.org> 2011-09-28 01:20:52 +0400
commit: 31b97b114f144aa286493c9a83acfd2b8ed32ff7 (patch)
tree: c5bc888232e743616edad2f5b38ac41c65bbf9e4
parent: 07a8194cf5d9c737af1b291d2b5b057a1f369437 (diff)
parent: 2f609a3e56dfb05f3b5246690b341cb44c32ba9e (diff)
4 files changed, 268 insertions, 2 deletions
diff --git a/BatchOptimization.lua b/BatchOptimization.lua
index f25cdc9..5255e57 100644
--- a/BatchOptimization.lua
+++ b/BatchOptimization.lua
@@ -130,6 +130,10 @@ function Batch:forward_mapreduce(inputs, targets, options)
          self.children[t]:send(targets)
          self.children[t]:send(options)
       end
+<<<<<<< HEAD
+=======
+         
+>>>>>>> genetic
    else
       -- (0b) divide input/target batch into N batches, based on speed
       -- of each worker
diff --git a/GeneticSGDOptimization.lua b/GeneticSGDOptimization.lua
new file mode 100644
index 0000000..412b7bf
--- /dev/null
+++ b/GeneticSGDOptimization.lua
@@ -0,0 +1,261 @@
+local GenSGD,parent = torch.class('nn.GeneticSGDOptimization',
+                                  'nn.BatchOptimization')
+
+-- this module parallelizes SGD in a particular way.  It sends out the
+-- same batch to each of several workers, each with a different learning
+-- rate.  The workers run and the parameters from the best worker and
+-- it's learning rate are kept for the next batch.
+
+function GenSGD:__init(...)
+   parent.__init(self,...)
+   xlua.unpack_class(self, {...},
+                     'GenSGDOptimization', nil,
+                     {arg='maxIterations', type='number',
+                      help='maximum nb of iterations per pass', default=1},
+                     {arg='learningRate', type='number',
+                      help='learning rate (W = W - rate*dE/dW)', default=1e-2},
+                     {arg='learningRateDecay', type='number',
+                      help='learning rate decay (lr_t = lr_0 / (1 + samplesSeen*lrDecay))',
+                      default=0},
+                     {arg='weightDecay', type='number',
+                      help='amount of weight decay (W = W - decay*W)', default=0},
+                     {arg='momentum', type='number',
+                      help='amount of momentum on weights (dE/W = dE/dW*(1-momentum) + prev(dE/dW)*momentum)', default=0}
+                  )
+   require 'lab'
+   if self.parallelize < 2 then
+      xerror('GenSGD needs to work on several processors: set parallelize',
+             'nn.GenSGDOptimization')
+   end
+   -- change the mapper to send the same batch to each worker
+   self.copyBatch = true
+   -- create default parameter set which will be randomized for each worker
+   self.baseParameters = { momentum          = self.momentum,
+                           weightDecay       = self.weightDecay,
+                           learningRate      = self.learningRate,
+                           learningRateDecay = self.learningRateDecay,
+                           sampleCounter     = self.sampleCounter
+                        }
+end
+
+function ldn(n,m,s)
+   -- pdf = lambda s,m,x: exp(-(log(x)-m)**2 / (2.*s**2)) / ( x*sqrt(2.*pi*s**2) )
+   local x = lab.rand(n)
+   local num = x:log():add(-1*m)
+   num = num:cmul(num):mul(-1):div(2 * s * s):exp():cdiv(x:mul(math.sqrt(2*math.pi*s*s)))
+   return num
+end
+
+-- we are changing the way we map and reduce.  It would be nice to
+-- change gradParametersPartial to ParametersPartial, as the logic is
+-- different for this kind of parallelization.
+function GenSGD:map_hook()
+   local P = self.parallelize
+   -- transmit new parameters to all workers
+   self.children:join()
+   self.children:send(self.parameters)
+   -- randomize learning rate (could randomize other bits).  Using a
+   -- log normal around the base rate.
+   -- local n = ldn(P,self.learningRate,5)
+   local n = lab.rand(P)*self.learningRate
+   -- n[1] = self.learningRate
+   self.baseParameters.sampleCounter = self.sampleCounter
+
+   for t = 1,P do
+      self.baseParameters.learningRate = n[t]
+      self.children[t]:send(self.baseParameters)
+   end
+   -- then wait for all workers to return their Parameters + outputs
+   -- should rename this to parametersParallel and optionsParallel
+   gradParametersPartial = self.children:receive()
+   outputsPartial = self.children:receive()
+   -- force cleanup
+   collectgarbage()
+end
+
+function GenSGD:reduce_hook()
+   local P = self.parallelize
+   local id = 0
+   local mx = 1e9
+   for t = 1,P do
+      if outputsPartial[t].f_x < mx then
+         id = t
+         mx = outputsPartial[t].f_x
+      end
+   end
+   if id == 0 then
+      xerror('diverging','nn.GenSGDOptimization')
+   else
+      self.baseParameters = outputsPartial[id]
+      self.learningRate = self.baseParameters.learningRate
+      self.output = self.baseParameters.f_x
+      print('Chose: '..self.learningRate..' b/c '..self.output)
+      -- in this case we get the parameters back directly
+      self.parameters:copy(gradParametersPartial[id])
+   end
+end
+
+function GenSGD:optimize()
+   self.evaluate()
+end
+
+-- optimization (could do others in this mode)
+GenSGD.optimizer =
+   function (module,params)
+      -- apply momentum (store in the module)
+      if params.momentum ~= 0 then
+         if not module.currentGradParameters then
+            module.currentGradParameters =
+               torch.Tensor():resizeAs(module.gradParameters):copy(module.gradParameters)
+         else
+            module.currentGradParameters:mul(params.momentum):add(1-params.momentum, module.gradParameters)
+         end
+      else
+         module.currentGradParameters = module.gradParameters
+      end
+
+      -- weight decay
+      if params.weightDecay ~= 0 then
+         module.parameters:add(-params.weightDecay, module.parameters)
+      end
+
+      -- update parameters
+      local learningRate =
+         params.learningRate / (1 + params.sampleCounter*params.learningRateDecay)
+      module.parameters:add(-learningRate, module.currentGradParameters)
+      -- make keep track of final rate
+      params.learningRate = learningRate
+   end
+
+function GenSGD:setup_mapreduce ()
+   -- (0) startup parallel package
+   if not xrequire 'parallel' then
+      xerror('install parallel for Lua to enable parallel computing (luarocks install parallel)',
+             'nn.GenSGDOptimization')
+   end
+   local worker_code  =
+      function()
+         -- require packages
+         require 'nnx'
+
+         -- retrieve optional code to setup worker
+         precode = parallel.parent:receive()
+         if type(precode) == 'function' then precode() end
+
+         -- retrieve module + criterion + optimimzer at startup
+         parallel.yield()
+
+         module    = parallel.parent:receive()
+         criterion = parallel.parent:receive()
+         optimizer = parallel.parent:receive()
+
+         -- retrieve optional prehook/posthook
+         prehook = parallel.parent:receive()
+         posthook = parallel.parent:receive()
+         if type(prehook) ~= 'function' then prehook = nil end
+         if type(posthook) ~= 'function' then posthook = nil end
+
+         -- I don't understand this [MS]
+         -- get pointer to parameter and gradParameter vectors
+         -- (this assumes that parameters+gradParameters are already flat parameters:
+         --  it should be the case, as the parent process flattens them at __init)
+         function check(tocheck)
+            for i = 2,#tocheck do
+               if tocheck[i]:storage() ~= tocheck[i-1]:storage() then
+                  print('<BatchOptimization> error: inconsistent parameter vector (not flat)')
+                  return
+               end
+            end
+         end
+         tableParameters = nnx.getParameters(module)
+         tableGradParameters = nnx.getGradParameters(module)
+         check(tableParameters)
+         check(tableGradParameters)
+         parameters = torch.Tensor():set(tableParameters[1]:storage())
+         gradParameters = torch.Tensor():set(tableGradParameters[1]:storage())
+
+         -- outer loop: mini-batches
+         while true do
+            -- sync
+            if parallel.yield() == 'break' then break end
+
+            -- receive new mini-batch
+            inputs  = parallel.parent:receive()
+            targets = parallel.parent:receive()
+            options = parallel.parent:receive()
+
+            -- inner loop: evaluations
+            while true do
+               -- sync
+               if parallel.yield() == 'break' then break end
+
+               -- receive new set of parameters
+               parameters:copy(parallel.parent:receive())
+               -- receive the learning rate etc. parameters which are
+               -- tweaked for each thread
+               optimization_parameters = parallel.parent:receive()
+
+               -- evaluate gradients on inputs for this thread and perform
+               -- SGD on these inputs
+               -- reset gradients
+               gradParameters:zero()
+
+               module.parameters = parameters
+               module.gradParameters = gradParameters
+
+               for i = 1,#inputs do
+                  -- estimate f
+                  local output = module:forward(inputs[i])
+                  local err = criterion:forward(output, targets[i])
+                  -- estimate df/dW
+                  local df_do = criterion:backward(output, targets[i])
+                  module:backward(inputs[i], df_do)
+                  module:accGradParameters(inputs[i], df_do)
+                  optimizer(module,optimization_parameters)
+               end
+               -- we need the result averaged over all the samples _after_
+               -- the gradient steps so do one more loop to fprop through
+               -- the samples and collect the error _after_ the optimization
+               local f_x = 0
+               for i = 1,#inputs do
+                  -- estimate f
+                  local output = module:forward(inputs[i])
+                  local err = criterion:forward(output, targets[i])
+                  f_x = f_x + err
+               end
+               -- in this case send back parameters themselves b/c they are
+               -- already optimized
+               parallel.parent:send(parameters)
+               -- need to make sure we keep track of what was used to
+               -- compute these params along with the outputs
+               optimization_parameters['f_x'] = f_x/#inputs
+               parallel.parent:send(optimization_parameters)
+               -- force cleanup
+               collectgarbage()
+            end
+         end
+      end
+
+   local setup = function()
+                    -- (1) optional calibration
+                    if parallel.remotes then
+                       parallel.calibrate()
+                    end
+
+                    -- (2) startup all workers
+                    self.children = parallel.sfork(self.parallelize)
+                    self.children:exec(worker_code)
+
+                    -- (3) send them optional config code
+                    self.children:send(self.precode or '')
+
+                    -- (4) and send them the module + criterion architecture
+                    self.children:join()
+                    self.children:send(self.module)
+                    self.children:send(self.criterion)
+                    self.children:send(self.optimizer)
+                 end
+
+   local ok,err = pcall(setup)
+   if not ok then parallel.close() error(err) end
+end
diff --git a/init.lua b/init.lua
index 6bc3617..ef4b030 100644
--- a/init.lua
+++ b/init.lua
@@ -104,6 +104,7 @@ torch.include('nnx', 'Optimization.lua')
 torch.include('nnx', 'BatchOptimization.lua')
 torch.include('nnx', 'SGDOptimization.lua')
 torch.include('nnx', 'LBFGSOptimization.lua')
+torch.include('nnx', 'GeneticSGDOptimization.lua')
 
 -- trainers:
 torch.include('nnx', 'Trainer.lua')
@@ -212,7 +213,7 @@ function nnx.flattenParameters(parameters)
          if param:storage() == parameters[i]:storage() then
             offsets[k] = offsets[i]
             if storageOffsets[k] ~= storageOffsets[i] or elements[k] ~= elements[i] then
-               error('<nnx.flattenParameters> canot flatten shared weights with different structures')
+               error('<nnx.flattenParameters> cannot flatten shared weights with different structures')
             end
             isView = true
             break
diff --git a/nnx-1.0-1.rockspec b/nnx-1.0-1.rockspec
index e53e1cd..101d0cd 100644
--- a/nnx-1.0-1.rockspec
+++ b/nnx-1.0-1.rockspec
@@ -138,7 +138,7 @@ build = {
          install_files(/lua/nnx Optimization.lua)
          install_files(/lua/nnx LBFGSOptimization.lua)
          install_files(/lua/nnx SGDOptimization.lua)
-         install_files(/lua/nnx GenSGDOptimization.lua)
+         install_files(/lua/nnx GeneticSGDOptimization.lua)
          install_files(/lua/nnx BatchOptimization.lua)
          install_files(/lua/nnx BatchTrainer.lua)
          add_subdirectory (test)
author	Marco Scoffier <github@metm.org>	2011-09-28 01:20:52 +0400
committer	Marco Scoffier <github@metm.org>	2011-09-28 01:20:52 +0400
commit	31b97b114f144aa286493c9a83acfd2b8ed32ff7 (patch)
tree	c5bc888232e743616edad2f5b38ac41c65bbf9e4
parent	07a8194cf5d9c737af1b291d2b5b057a1f369437 (diff)
parent	2f609a3e56dfb05f3b5246690b341cb44c32ba9e (diff)