author     Clement Farabet <clement.farabet@gmail.com>  2011-09-01 02:05:59 +0400
committer  Clement Farabet <clement.farabet@gmail.com>  2011-09-01 02:05:59 +0400
commit     24219cba24be5a75d6666d1ca2a9a6ea0bde252a (patch)
tree       3de1331ff9124dc5fffac19223e1ff016b0979c3
parent     cec2df99480f81c14b9e782ef84cb6440fd5044e (diff)
Re-organized BatchOptimization modules.
-rw-r--r--  BatchOptimization.lua |  87
-rw-r--r--  LBFGSOptimization.lua | 277
-rw-r--r--  SGDOptimization.lua   |  99
-rw-r--r--  init.lua              |   2
4 files changed, 101 insertions, 364 deletions
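
Note: the net effect of the commit is a template-method split. nn.BatchOptimization keeps the batch machinery (the self.evaluate closure, evalCounter/sampleCounter, and the map/reduce plumbing), while nn.SGDOptimization and nn.LBFGSOptimization become subclasses that only supply an optimize() callback. A minimal Lua sketch of that subclass contract follows; the ToyOptimization class is hypothetical and not part of the commit:

   -- Sketch only: forward(), inherited from nn.BatchOptimization, builds
   -- self.evaluate for the current mini-batch and then calls self:optimize()
   -- if the subclass defines it.
   local Toy,parent = torch.class('nn.ToyOptimization', 'nn.BatchOptimization')

   function Toy:__init(...)
      -- parent sets module, criterion, parameters/gradParameters,
      -- evalCounter and sampleCounter
      parent.__init(self, ...)
   end

   function Toy:optimize()
      self.evaluate()                                  -- average f(X) + df/dX
      self.parameters:add(-0.01, self.gradParameters)  -- one fixed-rate step
   end
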
diff --git a/BatchOptimization.lua b/BatchOptimization.lua
index 0951ae4..4caa064 100644
--- a/BatchOptimization.lua
+++ b/BatchOptimization.lua
@@ -12,12 +12,6 @@ function Batch:__init(...)
       {arg='module', type='nn.Module', help='a module to train', req=true},
       {arg='criterion', type='nn.Criterion',
        help='a criterion to estimate the error', req=true},
-      {arg='maxIterations', type='number',
-       help='maximum nb of iterations per pass (0 = no max)', default=0},
-      {arg='maxLineSearch', type='number',
-       help='maximum nb of steps in line search', default=20},
-      {arg='sparsity', type='number',
-       help='sparsity coef (Orthantwise C)', default=0},
       {arg='parallelize', type='number',
        help='parallelize onto N cores (experimental!)', default=1},
       {arg='verbose', type='number',
@@ -25,10 +19,11 @@ function Batch:__init(...)
    )
    self.parameters = nnx.flattenParameters(nnx.getParameters(self.module))
    self.gradParameters = nnx.flattenParameters(nnx.getGradParameters(self.module))
+   self.evalCounter = 0
+   self.sampleCounter = 0
    if self.parallelize > 1 then
       self:setup_mapreduce()
    end
-   batch = {}
 end
 
 function Batch:forward(inputs, targets, options)
@@ -46,8 +41,13 @@ function Batch:forward_sequential(inputs, targets, options)
    --     + self.parameters contains the current X vector
    --     + self.gradParameters contains the estimated dF/dX vector
    --     + self.output contains the estimated (average) F(X)
-   batch.evaluate
+   self.evaluate
       = function()
+           -- verbose
+           if self.verbose >= 2 then
+              print('<BatchOptimization> evaluating f(X) + df/dX')
+           end
+           local _t_ = sys.clock()
           -- reset gradients
           self.gradParameters:zero()
           -- f is the average of all criterions
@@ -70,13 +70,28 @@ function Batch:forward_sequential(inputs, targets, options)
                 self.posthook(self, {inputs[i], targets[i], options[i]})
              end
           end
+          -- update evaluation counter
+          self.evalCounter = self.evalCounter + 1
           -- normalize gradients
           self.gradParameters:div(#inputs)
+          -- verbose
+          if self.verbose >= 2 then
+             print('<BatchOptimization> ' .. self.evalCounter .. 'th evaluation took ' .. (sys.clock() - _t_) .. ' sec')
+          end
           -- return average f(X)
-          return self.output/#inputs
+          self.output = self.output/#inputs
+          return self.output
        end
 
-   -- (3) return current output after optimization
+   -- (2) optimization callback
+   if self.optimize then
+      self:optimize()
+   end
+
+   -- (3) update sample counter
+   self.sampleCounter = self.sampleCounter + #inputs
+
+   -- (4) return current output after optimization
    return self.output
 end
 
@@ -90,8 +105,8 @@ function Batch:forward_mapreduce(inputs, targets, options)
       if type(self.prehook) == 'string' then
          parallel.children:send(self.prehook)
       else
-         print('\r<BatchOptimization> WARNING: when using para||el mode, hooks should be')
-         print('\r<BatchOptimization> WARNING: defined as strings. User prehook ignored.')
+         print('\r<BatchOptimization> WARNING: when using para||el mode,'..
+               ' hooks should be defined as strings. User prehook ignored.')
          parallel.children:send('')
       end
    else
@@ -101,8 +116,8 @@ function Batch:forward_mapreduce(inputs, targets, options)
       if type(self.posthook) == 'string' then
         parallel.children:send(self.posthook)
      else
-         print('\r<BatchOptimization> WARNING: when using para||el mode, hooks should be')
-         print('<\rBatchOptimization> WARNING: defined as strings. User posthook ignored.')
+         print('\r<BatchOptimization> WARNING: when using para||el mode,'..
+               ' hooks should be defined as strings. User posthook ignored.')
         parallel.children:send('')
      end
   else
@@ -142,30 +157,41 @@ function Batch:forward_mapreduce(inputs, targets, options)
    --     + self.parameters contains the current X vector
    --     + self.gradParameters contains the estimated dF/dX vector
    --     + self.output contains the estimated (average) F(X)
-   batch.evaluate
+   self.evaluate
       = function()
-           batch.evaluate_map()
-           return batch.evaluate_reduce()
+           -- verbose
+           if self.verbose >= 2 then
+              print('<BatchOptimization> evaluating f(X) + df/dX')
+           end
+           local _t_ = sys.clock()
+           -- do map/reduce
+           self.evaluate_map()
+           self.evaluate_reduce()
+           -- update evaluation counter
+           self.evalCounter = self.evalCounter + 1
+           -- verbose
+           if self.verbose >= 2 then
+              print('<BatchOptimization> ' .. self.evalCounter .. 'th evaluation took ' .. (sys.clock() - _t_) .. ' sec')
+           end
+           return self.output
        end
 
    -- (1a) the map part of the evaluation: compute partial gradients
    --      in separate threads
-   batch.evaluate_map
+   self.evaluate_map
       = function()
           -- transmit new parameters to all workers
           parallel.children:send(self.parameters)
           -- then wait for all workers to return their partial gradParameters + outputs
-          for t = 1,P do
-             gradParametersPartial[t] = parallel.children[t]:receive()
-             outputsPartial[t] = parallel.children[t]:receive()
-          end
+          gradParametersPartial = parallel.children:receive()
+          outputsPartial = parallel.children:receive()
           -- force cleanup
           collectgarbage()
        end
 
    -- (1b) the reduce part of the evaluation: accumulate all
    --      partial estimates of the gradients
-   batch.evaluate_reduce
+   self.evaluate_reduce
       = function()
           -- accumulate partial gradients, and average
           self.gradParameters:zero()
@@ -178,14 +204,22 @@ function Batch:forward_mapreduce(inputs, targets, options)
           for t = 1,P do
              self.output = self.output + outputsPartial[t]
           end
-          return self.output/#inputs
+          self.output = self.output/#inputs
        end
 
+   -- (2) optimization callback
+   if self.optimize then
+      self:optimize()
+   end
+
    -- (3) reset workers so they're ready for next mini-batch
    parallel.children:send('break')
 
-   -- (4) return current output after optimization
-   return self.output/#inputs
+   -- (4) update sample counter
+   self.sampleCounter = self.sampleCounter + #inputs
+
+   -- (5) return current output after optimization
+   return self.output
 end
 
@@ -194,7 +228,6 @@ function Batch:setup_mapreduce ()
       xerror('install parallel for Lua to enable parallel computing (luarocks install parallel)',
              'nn.BatchOptimization')
    end
-   parallel.setSharedSize(4*1024*1024)
    local P = self.parallelize
 
    -- (1) define code for workers
diff --git a/LBFGSOptimization.lua b/LBFGSOptimization.lua
index 976226d..720b7ba 100644
--- a/LBFGSOptimization.lua
+++ b/LBFGSOptimization.lua
@@ -1,13 +1,10 @@
-local LBFGS,parent = torch.class('nn.LBFGSOptimization', 'nn.Optimization')
+local LBFGS,parent = torch.class('nn.LBFGSOptimization', 'nn.BatchOptimization')
 
 function LBFGS:__init(...)
    require 'liblbfgs'
-   parent.__init(self)
+   parent.__init(self, ...)
    xlua.unpack_class(self, {...},
       'LBFGSOptimization', nil,
-      {arg='module', type='nn.Module', help='a module to train', req=true},
-      {arg='criterion', type='nn.Criterion',
-       help='a criterion to estimate the error', req=true},
       {arg='maxIterations', type='number',
        help='maximum nb of iterations per pass (0 = no max)', default=0},
       {arg='maxLineSearch', type='number',
@@ -15,277 +12,17 @@ function LBFGS:__init(...)
       {arg='sparsity', type='number',
        help='sparsity coef (Orthantwise C)', default=0},
       {arg='parallelize', type='number',
-       help='parallelize onto N cores (experimental!)', default=1},
-      {arg='verbose', type='number',
-       help='verbose level during training [0-2]', default=0}
+       help='parallelize onto N cores (experimental!)', default=1}
    )
    self.parameters = nnx.flattenParameters(nnx.getParameters(self.module))
    self.gradParameters = nnx.flattenParameters(nnx.getGradParameters(self.module))
-   if self.parallelize > 1 then
-      self:setup_mapreduce()
-   end
 end
 
-function LBFGS:forward(inputs, targets, options)
-   options = options or {}
-   if self.parallelize > 1 then
-      return self:forward_mapreduce(inputs, targets, options)
-   else
-      return self:forward_sequential(inputs, targets, options)
-   end
-end
-
-function LBFGS:forward_sequential(inputs, targets, options)
-   -- (1) construct a closure that compute f(inputs) + df/dW
-   --     after each call to that function:
-   --     + self.parameters contains the current X vector
-   --     + self.gradParameters contains the estimated dF/dX vector
-   --     + self.output contains the estimated (average) F(X)
-   lbfgs.evaluate
-      = function()
-           -- verbose
-           if self.verbose >= 2 then print('<LBFGSOptimization> evaluating f(X) + df/dX') end
-           local _t_ = sys.clock()
-           -- reset gradients
-           self.gradParameters:zero()
-           -- f is the average of all criterions
-           self.output = 0
-           -- given all inputs, evaluate gradients
-           for i = 1,#inputs do
-              -- user hook
-              if self.prehook then
-                 self.prehook(self, {inputs[i], targets[i], options[i]})
-              end
-              -- estimate f
-              local output = self.module:forward(inputs[i])
-              local err = self.criterion:forward(output, targets[i])
-              self.output = self.output + err
-              -- estimate df/dW
-              local df_do = self.criterion:backward(output, targets[i])
-              self.module:backward(inputs[i], df_do)
-              -- user hook
-              if self.posthook then
-                 self.posthook(self, {inputs[i], targets[i], options[i]})
-              end
-           end
-           -- normalize gradients
-           self.gradParameters:div(#inputs)
-           -- verbose
-           if self.verbose >= 2 then print('<LBFGSOptimization> f(X) + df/dX computed in ' .. (sys.clock() - _t_) .. ' sec') end
-           -- return average f(X)
-           return self.output/#inputs
-        end
-
-   -- (2) the magic function: will update the parameter vector
-   --     according to the l-BFGS method
-   self.output = lbfgs.run(self.parameters, self.gradParameters,
-                           self.maxIterations, self.maxLineSearch,
-                           self.sparsity, self.verbose)
-
-   -- (3) return current output after optimization
-   return self.output
-end
-
-function LBFGS:forward_mapreduce(inputs, targets, options)
-   -- parameters
-   local P = self.parallelize
-
-   -- transmit user hooks, if defined
-   if not self.hooksets then
-      if self.prehook then
-         if type(self.prehook) == 'string' then
-            parallel.children:send(self.prehook)
-         else
-            print('\r<LBFGSOptimization> WARNING: when using para||el mode, hooks should be defined as strings. User prehook ignored.')
-            parallel.children:send('')
-         end
-      else
-         parallel.children:send('')
-      end
-      if self.posthook then
-         if type(self.posthook) == 'string' then
-            parallel.children:send(self.posthook)
-         else
-            print('\r<LBFGSOptimization> WARNING: when using para||el mode, hooks should be defined as strings. User posthook ignored.')
-            parallel.children:send('')
-         end
-      else
-         parallel.children:send('')
-      end
-      self.hooksets = true
-   end
-
-   -- (0a) replicate output and gradParameters
-   local outputsPartial = {}
-   local gradParametersPartial = {}
-
-   -- (0b) divide input/target batch into N batches
-   local inputss = {}
-   local targetss = {}
-   local optionss = {}
-   for t = 1,P do
-      inputss[t] = {}
-      targetss[t] = {}
-      optionss[t] = {}
-      for i = t,#inputs,P do
-         table.insert(inputss[t], inputs[i])
-         table.insert(targetss[t], targets[i])
-         if options then table.insert(optionss[t], options[i]) end
-      end
-   end
-
-   -- (0c) send mini-batch to all workers
-   for t = 1,P do
-      parallel.children[t]:send(inputss[t])
-      parallel.children[t]:send(targetss[t])
-      parallel.children[t]:send(optionss[t])
-   end
-
-   -- (1) construct a closure that compute f(inputs) + df/dW
-   --     after each call to that function:
-   --     + self.parameters contains the current X vector
-   --     + self.gradParameters contains the estimated dF/dX vector
-   --     + self.output contains the estimated (average) F(X)
-   lbfgs.evaluate
-      = function()
-           lbfgs.evaluate_map()
-           return lbfgs.evaluate_reduce()
-        end
-
-   -- (1a) the map part of the evaluation: compute partial gradients
-   --      in separate threads
-   lbfgs.evaluate_map
-      = function()
-           -- verbose
-           if self.verbose >= 2 then print('<LBFGSOptimization> evaluating f(X) + df/dX') end
-           local _t_ = sys.clock()
-           -- transmit new parameters to all workers
-           parallel.children:send(self.parameters)
-           -- then wait for all workers to return their partial gradParameters + outputs
-           gradParametersPartial = parallel.children:receive()
-           outputsPartial = parallel.children:receive()
-           -- force cleanup
-           collectgarbage()
-           -- verbose
-           if self.verbose >= 2 then print('<LBFGSOptimization> f(X) + df/dX computed in ' .. (sys.clock() - _t_) .. ' sec') end
-        end
-
-   -- (1b) the reduce part of the evaluation: accumulate all
-   --      partial estimates of the gradients
-   lbfgs.evaluate_reduce
-      = function()
-           -- accumulate partial gradients, and average
-           self.gradParameters:zero()
-           for t = 1,P do
-              self.gradParameters:add(gradParametersPartial[t])
-           end
-           self.gradParameters:div(#inputs)
-           -- return average f(X)
-           self.output = 0
-           for t = 1,P do
-              self.output = self.output + outputsPartial[t]
-           end
-           return self.output/#inputs
-        end
-
-   -- (2) the magic function: will update the parameter vector
-   --     according to the l-BFGS method
+function LBFGS:optimize()
+   lbfgs.evaluate = self.evaluate
+   -- the magic function: will update the parameter vector
+   -- according to the l-BFGS method
    self.output = lbfgs.run(self.parameters, self.gradParameters,
                            self.maxIterations, self.maxLineSearch,
                            self.sparsity, self.verbose)
-
-   -- (3) reset workers so they're ready for next mini-batch
-   parallel.children:send('break')
-
-   -- (4) return current output after optimization
-   return self.output
-end
-
-function LBFGS:setup_mapreduce ()
-   -- (0) startup parallel package
-   if not xrequire 'parallel' then
-      xerror('install parallel for Lua to enable parallel computing (luarocks install parallel)',
-             'nn.LBFGSOptimization')
-   end
-   parallel.setSharedSize(4*1024*1024)
-   local P = self.parallelize
-
-   -- (1) define code for workers
-   local worker_code = [[
-         -- require packages
-         require 'nnx'
-
-         -- retrieve module + criterion at startup
-         module = parallel.parent:receive()
-         criterion = parallel.parent:receive()
-
-         -- create fake optimizer, for hooks
-         optimizer = {module=module, criterion=criterion}
-
-         -- retrieve optional prehook/posthook
-         prehook = parallel.parent:receive()
-         posthook = parallel.parent:receive()
-         if prehook ~= '' then loadstring(prehook)() else prehook = nil end
-         if posthook ~= '' then loadstring(posthook)() else posthook = nil end
-
-         -- get pointer to parameter and gradParameter vectors
-         parameters = nnx.flattenParameters(nnx.getParameters(module))
-         gradParameters = nnx.flattenParameters(nnx.getGradParameters(module))
-
-         -- outter loop: mini-batches
-         while true do
-            -- receive new mini-batch
-            inputs = parallel.parent:receive()
-            if type(inputs) == 'string' and inputs == 'break' then break end
-            targets = parallel.parent:receive()
-            options = parallel.parent:receive()
-
-            -- inner loop: evaluations
-            while true do
-               -- receive new set of parameters
-               newParameters = parallel.parent:receive()
-               if type(newParameters) == 'string' and newParameters == 'break' then break end
-               parameters:copy(newParameters)
-
-               -- reset gradients
-               gradParameters:zero()
-               -- f is the average of all criterions
-               local f_x = 0
-               -- evaluate gradients on inputs for this thread
-               for i = 1,#inputs do
-                  -- user hook
-                  if prehook then
-                     prehook(optimizer, {inputs[i], targets[i], options[i]})
-                  end
-                  -- estimate f
-                  local output = module:forward(inputs[i])
-                  local err = criterion:forward(output, targets[i])
-                  f_x = f_x + err
-                  -- estimate df/dW
-                  local df_do = criterion:backward(output, targets[i])
-                  module:backward(inputs[i], df_do)
-                  -- user hook
-                  if posthook then
-                     posthook(optimizer, {inputs[i], targets[i], options[i]})
-                  end
-               end
-
-               -- now send back gradParameters + partial output
-               parallel.parent:send(gradParameters)
-               parallel.parent:send(f_x)
-
-               -- force cleanup
-               collectgarbage()
-            end
-         end
-   ]]
-
-   -- (2) startup all workers
-   for t = 1,P do
-      parallel.run(worker_code)
-   end
-
-   -- (3) and send them the module + criterion architecture
-   parallel.children:send(self.module)
-   parallel.children:send(self.criterion)
 end
diff --git a/SGDOptimization.lua b/SGDOptimization.lua
index 88d63d0..ddbf220 100644
--- a/SGDOptimization.lua
+++ b/SGDOptimization.lua
@@ -1,81 +1,48 @@
-local SGD,parent = torch.class('nn.SGDOptimization', 'nn.Optimization')
+local SGD,parent = torch.class('nn.SGDOptimization', 'nn.BatchOptimization')
 
 function SGD:__init(...)
-   parent.__init(self)
+   parent.__init(self,...)
    xlua.unpack_class(self, {...},
       'SGDOptimization', nil,
-      {arg='module', type='nn.Module', help='a module to train', req=true},
-      {arg='criterion', type='nn.Criterion', help='a criterion to estimate the error', req=true},
-      {arg='learningRate', type='number', help='learning rate (W = W - rate*dE/dW)', default=1e-2},
-      {arg='learningRateDecay', type='number', help='learning rate decay (lr_t = lr_0 / (1 + samplesSeen*lrDecay))', default=0},
-      {arg='weightDecay', type='number', help='amount of weight decay (W = W - decay*W)', default=0},
-      {arg='momentum', type='number', help='amount of momentum on weights (dE/W = dE/dW*(1-momentum) + prev(dE/dW)*momentum)', default=0}
+      {arg='maxIterations', type='number',
+       help='maximum nb of iterations per pass', default=1},
+      {arg='learningRate', type='number',
+       help='learning rate (W = W - rate*dE/dW)', default=1e-2},
+      {arg='learningRateDecay', type='number',
+       help='learning rate decay (lr_t = lr_0 / (1 + samplesSeen*lrDecay))', default=0},
+      {arg='weightDecay', type='number',
+       help='amount of weight decay (W = W - decay*W)', default=0},
+      {arg='momentum', type='number',
+       help='amount of momentum on weights (dE/W = dE/dW*(1-momentum) + prev(dE/dW)*momentum)', default=0}
    )
    self.parameters = nnx.flattenParameters(nnx.getParameters(self.module))
    self.gradParameters = nnx.flattenParameters(nnx.getGradParameters(self.module))
-   self.samplesSeen = 0
 end
 
-function SGD:forward(inputs, targets, options)
-   options = options or {}
-
-   -- reset gradients
-   self.gradParameters:zero()
-
-   -- f is the average of all criterions
-   self.output = 0
-
-   -- given all inputs, evaluate gradients
-   for i = 1,#inputs do
-      -- user hook
-      if self.prehook then
-         self.prehook(self, {inputs[i], targets[i], options[i]})
-      end
-
-      -- estimate f
-      local output = self.module:forward(inputs[i])
-      local err = self.criterion:forward(output, targets[i])
-      self.output = self.output + err
-
-      -- estimate df/dW
-      local df_do = self.criterion:backward(output, targets[i])
-      self.module:backward(inputs[i], df_do)
-
-      -- user hook
-      if self.posthook then
-         self.posthook(self, {inputs[i], targets[i], options[i]})
+function SGD:optimize()
+   -- optimize N times
+   for i = 1,self.maxIterations do
+      -- evaluate f(X) + df/dX
+      self.evaluate()
+
+      -- apply momentum
+      if self.momentum ~= 0 then
+         if not self.currentGradParameters then
+            self.currentGradParameters = torch.Tensor():resizeAs(self.gradParameters):copy(self.gradParameters)
+         else
+            self.currentGradParameters:mul(self.momentum):add(1-self.momentum, self.gradParameters)
+         end
+      else
+         self.currentGradParameters = self.gradParameters
       end
-   end
-   -- update counter of samples seen
-   self.samplesSeen = self.samplesSeen + #inputs
-
-   -- renorm f
-   self.output = self.output / #inputs
-
-   -- normalize gradients
-   self.gradParameters:div(#inputs)
-
-   -- apply momentum
-   if self.momentum ~= 0 then
-      if not self.currentGradParameters then
-         self.currentGradParameters = torch.Tensor():resizeAs(self.gradParameters):copy(self.gradParameters)
-      else
-         self.currentGradParameters:mul(self.momentum):add(1-self.momentum, self.gradParameters)
+      -- weight decay
+      if self.weightDecay ~= 0 then
+         self.parameters:add(-self.weightDecay, self.parameters)
       end
-   else
-      self.currentGradParameters = self.gradParameters
-   end
-   -- weight decay
-   if self.weightDecay ~= 0 then
-      self.parameters:add(-self.weightDecay, self.parameters)
+      -- update parameters
+      local learningRate = self.learningRate / (1 + self.sampleCounter*self.learningRateDecay)
+      self.parameters:add(-learningRate, self.currentGradParameters)
    end
-
-   -- update parameters
-   local learningRate = self.learningRate / (1 + self.samplesSeen*self.learningRateDecay)
-   self.parameters:add(-learningRate, self.currentGradParameters)
-
-   -- return current output
-   return self.output
 end
diff --git a/init.lua b/init.lua
--- a/init.lua
+++ b/init.lua
@@ -100,9 +100,9 @@ torch.include('nnx', 'SpatialSparseCriterion.lua')
 
 -- optimizations:
 torch.include('nnx', 'Optimization.lua')
+torch.include('nnx', 'BatchOptimization.lua')
 torch.include('nnx', 'SGDOptimization.lua')
 torch.include('nnx', 'LBFGSOptimization.lua')
-torch.include('nnx', 'BatchOptimization.lua')
 
 -- trainers:
 torch.include('nnx', 'Trainer.lua')
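
For context, a hypothetical end-to-end use of the refactored classes. Nothing below ships with this commit; the model, criterion, and data are placeholders, and only the constructor arguments and the forward() signature are taken from the diff above:

   require 'nnx'

   -- toy model + criterion, for illustration only
   local mlp = nn.Sequential()
   mlp:add(nn.Linear(10,5))
   mlp:add(nn.Tanh())
   mlp:add(nn.Linear(5,1))
   local criterion = nn.MSECriterion()

   -- SGD now inherits module/criterion/parallelize/verbose handling
   -- from nn.BatchOptimization
   local optimizer = nn.SGDOptimization{module = mlp,
                                        criterion = criterion,
                                        learningRate = 1e-2,
                                        learningRateDecay = 1e-4,
                                        momentum = 0.9}

   -- BatchOptimization:forward() expects tables of samples, one tensor per entry
   local inputs, targets = {}, {}
   for i = 1,16 do
      inputs[i]  = torch.randn(10)
      targets[i] = torch.randn(1)
   end

   -- builds self.evaluate, runs SGD:optimize(), bumps sampleCounter
   -- (which drives lr_t = lr_0 / (1 + sampleCounter*lrDecay)), and
   -- returns the average f(X) over the batch
   local err = optimizer:forward(inputs, targets)
   print('average batch error: ' .. err)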