author     Marco Scoffier <github@metm.org>    2011-09-26 17:16:23 +0400
committer  Marco Scoffier <github@metm.org>    2011-09-26 17:16:23 +0400
commit     64a47626531b7b1dd93eb505166490ef0c55246e
tree       cf8e9cb8f5486944a7a43c6cbf592739f93c4ea9
parent     6a54a1d023cb7f5a421d8186f396f1d17fe53b66
code before testing
-rw-r--r--   GenSGDOptimization.lua   147
-rw-r--r--   init.lua                   1
-rw-r--r--   nnx-1.0-1.rockspec         1

3 files changed, 109 insertions(+), 40 deletions(-)
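The new GenSGDOptimization.lua sends the same batch to every worker, lets each worker run plain SGD with a per-worker randomized learning rate, and keeps the best result (see the patch below). As a reading aid, here is a minimal standalone sketch of the per-worker update rule that GenSGD:optimizer applies (momentum, weight decay, and learning-rate decay). The sgd_step name and the use of plain Lua arrays instead of torch tensors are illustrative assumptions, not code from the patch.

-- Hypothetical sketch of the per-worker update: momentum on the gradient,
-- weight decay on the weights, and a learning rate decayed by the number
-- of samples seen.  w, dw, and state are plain Lua tables, not tensors.
local function sgd_step(w, dw, state)
   -- momentum: dE/dW = dE/dW*(1-momentum) + prev(dE/dW)*momentum
   if state.momentum ~= 0 and state.prev_dw then
      for i = 1, #dw do
         dw[i] = dw[i] * (1 - state.momentum) + state.prev_dw[i] * state.momentum
      end
   end
   state.prev_dw = {}
   for i = 1, #dw do state.prev_dw[i] = dw[i] end

   -- weight decay: W = W - decay*W
   if state.weightDecay ~= 0 then
      for i = 1, #w do w[i] = w[i] - state.weightDecay * w[i] end
   end

   -- learning rate decay: lr_t = lr_0 / (1 + samplesSeen*lrDecay)
   local lr = state.learningRate / (1 + state.sampleCounter * state.learningRateDecay)

   -- gradient step: W = W - lr*dE/dW
   for i = 1, #w do w[i] = w[i] - lr * dw[i] end
   return lr
end

For example, sgd_step({0.5, -0.2}, {0.1, 0.3}, {momentum=0.9, weightDecay=0, learningRate=1e-2, learningRateDecay=0, sampleCounter=0}) takes one step at the full base rate, since no previous gradient is stored yet and the decay terms are zero.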
diff --git a/GenSGDOptimization.lua b/GenSGDOptimization.lua
index 61f7476..d2c1d7a 100644
--- a/GenSGDOptimization.lua
+++ b/GenSGDOptimization.lua
@@ -15,31 +15,105 @@ function GenSGD:__init(...)
       {arg='learningRate', type='number',
        help='learning rate (W = W - rate*dE/dW)', default=1e-2},
       {arg='learningRateDecay', type='number',
-       help='learning rate decay (lr_t = lr_0 / (1 + samplesSeen*lrDecay))', default=0},
+       help='learning rate decay (lr_t = lr_0 / (1 + samplesSeen*lrDecay))',
+       default=0},
       {arg='weightDecay', type='number',
        help='amount of weight decay (W = W - decay*W)', default=0},
       {arg='momentum', type='number',
-       help='amount of momentum on weights (dE/W = dE/dW*(1-momentum) + prev(dE/dW)*momentum)', default=0}
+       help='amount of momentum on weights (dE/W = dE/dW*(1-momentum) + prev(dE/dW)*momentum)', default=0},
+      {arg='sigma', type='number',
+       help='sigma of gaussian used to randomize learningRate',
+       default = 1e3}
    )
+   require 'lab'
    if self.parallelize < 2 then
       print('ERROR: GenSGD needs to work on several processors')
    end
    -- change the mapper to send the same batch to each worker
    self.copyBatch = true
-   self.currentLearningRate = learningRate
-   self.workerRates = torch.Tensor(self.P)
+   -- create default parameter set which will be randomized for each worker
+   self.baseParameters = { momentum          = self.momentum,
+                           weightDecay       = self.weightDecay,
+                           learningRate      = self.learningRate,
+                           learningRateDecay = self.learningRateDecay
+                         }
+   self.workerParameters = torch.Tensor(self.P)
 end
 
+-- we are changing the way we map and reduce.  It would be nice to
+-- change gradParametersPartial to ParametersPartial, as the logic is
+-- different for this kind of parallelization.
 function GenSGD:map_hook()
+   -- transmit new parameters to all workers
+   self.children:join()
+   self.children:send(self.parameters)
+   -- randomize learning rate (could randomize other bits)
+   local n = self.learningRate + (lab.randn(P) * self.sigma)
+   for i = 1,P do
+      self.baseParameters[learningRate] = n[i]
+      self.children[t]:join()
+      self.children[t]:send(self.baseParameters)
+   end
+
+   end
+   -- then wait for all workers to return their partial gradParameters + outputs
+   gradParametersPartial = self.children:receive()
+   outputsPartial = self.children:receive()
+   -- force cleanup
+   collectgarbage()
 end
 
 function GenSGD:reduce_hook()
+   local id = 0
+   local mx = 1e9
+   for t = 1,P do
+      if outputsPartial[t].f_x < mx then
+         id = t
+         mx = outputsPartial[t].f_x
+      end
+   end
+   if id == 0 then
+      print('ERROR: diverging')
+   else
+      self.baseParameters = outputsPartial[id]
+      self.output = self.currentParameters.f_x
+      -- in this case we get the parameters back directly
+      self.parameters:copy(gradParametersPartial[id])
+      print('Winner: output = '..self.output..
+            'learningRate = '..self.baseParameters['learningRate'])
+   end
 end
 
 function GenSGD:optimize()
    self.evaluate()
 end
 
+-- optimization (could do others in this mode)
+function GenSGD:optimizer(module,params)
+   -- apply momentum (store in the module)
+   if params.momentum ~= 0 then
+      if not module.currentGradParameters then
+         module.currentGradParameters =
+            torch.Tensor():resizeAs(module.gradParameters):copy(module.gradParameters)
+      else
+         module.currentGradParameters:mul(params.momentum):add(1-params.momentum, module.gradParameters)
+      end
+   else
+      module.currentGradParameters = module.gradParameters
+   end
+
+   -- weight decay
+   if params.weightDecay ~= 0 then
+      module.parameters:add(-params.weightDecay, module.parameters)
+   end
+
+   -- update parameters
+   local learningRate =
+      params.learningRate / (1 + params.sampleCounter*params.learningRateDecay)
+   module.parameters:add(-learningRate, module.currentGradParameters)
+   -- make keep track of final rate
+   params.learningRate = learningRate
+end
 
 function GenSGD:worker_code()
    -- require packages
@@ -47,12 +121,13 @@ function GenSGD:worker_code()
 
    -- retrieve module + criterion at startup
    parallel.yield()
+
    module = parallel.parent:receive()
    criterion = parallel.parent:receive()
    optimizer = parallel.parent:receive()
 
-   parameters = nnx.flattenParameters(nnx.getParameters(self.module))
-   gradParameters = nnx.flattenParameters(nnx.getGradParameters(self.module))
+   module.parameters = nnx.flattenParameters(nnx.getParameters(module))
+   module.gradParameters = nnx.flattenParameters(nnx.getGradParameters(module))
 
    -- outer loop: mini-batches
    while true do
@@ -71,27 +146,42 @@ function GenSGD:worker_code()
 
       -- receive new set of parameters
       parameters:copy(parallel.parent:receive())
-
-      -- f is the average of all criterions
-      local f_x = 0
-      -- evaluate gradients on inputs for this thread
+
+      -- receive the learning rate etc. parameters which are
+      -- tweaked for each thread
+      optimization_parameters = parallel.parent:receive()
+
+      -- evaluate gradients on inputs for this thread and perform
+      -- SGD on these inputs
+      -- reset gradients
+      gradParameters:zero()
       for i = 1,#inputs do
-         -- reset gradients
-         gradParameters:zero()
         -- estimate f
         local output = module:forward(inputs[i])
         local err = criterion:forward(output, targets[i])
-         f_x = f_x + err
         -- estimate df/dW
         local df_do = criterion:backward(output, targets[i])
         module:backward(inputs[i], df_do)
         module:accGradParameters(inputs[i], df_do)
-         optimizer
-
+         optimizer(module,optimization_parameters)
      end
-      -- now send back parameters b/c they are already optimized
+      -- we need the result averaged over all the samples _after_
+      -- the gradient steps so do one more loop to fprop through
+      -- the samples and collect the error _after_ the optimization
+      local f_x = 0
+      for i = 1,#inputs do
+         -- estimate f
+         local output = module:forward(inputs[i])
+         local err = criterion:forward(output, targets[i])
+         f_x = f_x + err
+      end
+      -- in this case send back parameters themselves b/c they are
+      -- already optimized
      parallel.parent:send(parameters)
-      parallel.parent:send(f_x)
+      -- need to make sure we keep track of what was used to
+      -- compute these params along with the outputs
+      optimization_parameters['f_x'] = f_x/#inputs
+      parallel.parent:send(optimization_parameters)
      -- force cleanup
      collectgarbage()
   end
@@ -115,26 +205,3 @@ function GenSGD:setup()
    self.children:send(self.optimizer)
 end
 
-function GenSGD:post_hook(module,options)
-   -- we do the SGD on the worker
-   -- apply momentum
-   if options.momentum ~= 0 then
-      if not module.currentGradParameters then
-         module.currentGradParameters = torch.Tensor():resizeAs(gradParameters):copy(gradParameters)
-      else
-         options.currentGradParameters:mul(options.momentum):add(1-options.momentum, gradParameters)
-      end
-   else
-      options.currentGradParameters = gradParameters
-   end
-
-   -- weight decay
-   if options.weightDecay ~= 0 then
-      options.parameters:add(-options.weightDecay, options.parameters)
-   end
-
-   -- update parameters
-   local learningRate = self.learningRate /
-      (1 + self.sampleCounter*self.learningRateDecay)
-   self.parameters:add(-learningRate, self.currentGradParameters)
-end
diff --git a/init.lua b/init.lua
--- a/init.lua
+++ b/init.lua
@@ -102,6 +102,7 @@
 torch.include('nnx', 'Optimization.lua')
 torch.include('nnx', 'BatchOptimization.lua')
 torch.include('nnx', 'SGDOptimization.lua')
 torch.include('nnx', 'LBFGSOptimization.lua')
+torch.include('nnx', 'GenSGDOptimization.lua')
 -- trainers:
 torch.include('nnx', 'Trainer.lua')
diff --git a/nnx-1.0-1.rockspec b/nnx-1.0-1.rockspec
index 2650547..0145271 100644
--- a/nnx-1.0-1.rockspec
+++ b/nnx-1.0-1.rockspec
@@ -138,6 +138,7 @@ build = {
          install_files(/lua/nnx Optimization.lua)
          install_files(/lua/nnx LBFGSOptimization.lua)
          install_files(/lua/nnx SGDOptimization.lua)
+         install_files(/lua/nnx GenSGDOptimization.lua)
          install_files(/lua/nnx BatchOptimization.lua)
          install_files(/lua/nnx BatchTrainer.lua)
          add_subdirectory (test)
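Taken together, the patch implements a simple generate-and-test scheme: the master broadcasts the same batch and parameters, each worker runs SGD with its own randomized learning rate, and the reduce step keeps the parameters of the worker with the lowest post-step error. The following is a minimal serial sketch of that idea, written as a reading aid rather than taken from nnx; the toy quadratic loss, the gauss helper, and the constants are hypothetical stand-ins for the module, criterion, and parallel machinery in the patch.

-- Hypothetical serial illustration (not nnx code): perturb the learning
-- rate per "worker", take one SGD step on the same problem, and keep the
-- parameters of the worker whose error is lowest after the step.

-- toy 1-D problem: minimize f(w) = (w - 3)^2
local function f(w)  return (w - 3)^2 end
local function df(w) return 2 * (w - 3) end

-- gaussian sample via Box-Muller (stands in for lab.randn in the patch)
local function gauss(mean, sigma)
   local u1, u2 = 1 - math.random(), math.random()
   return mean + sigma * math.sqrt(-2 * math.log(u1)) * math.cos(2 * math.pi * u2)
end

local w        = 0      -- shared parameters
local baseRate = 1e-2   -- plays the role of self.learningRate
local sigma    = 5e-3   -- plays the role of self.sigma
local nWorkers = 4      -- plays the role of self.P

for iter = 1, 200 do
   local bestW, bestF = w, math.huge
   -- "map": every worker starts from the same w, each with its own rate
   for p = 1, nWorkers do
      local rate = math.max(gauss(baseRate, sigma), 0)
      local wp   = w - rate * df(w)   -- one SGD step
      local fx   = f(wp)              -- error measured after the step
      -- "reduce": keep the winner, as GenSGD:reduce_hook() does
      if fx < bestF then bestW, bestF = wp, fx end
   end
   w = bestW
end

print(string.format('w = %.4f  f(w) = %.6f', w, f(w)))

The design choice worth noting is that the error sent back to the master is measured after the gradient step, which is why the worker code in the patch runs a second forward pass over the batch before reporting f_x.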