
github.com/clementfarabet/lua---nnx.git
author     Clement Farabet <clement.farabet@gmail.com>  2011-09-01 02:05:59 +0400
committer  Clement Farabet <clement.farabet@gmail.com>  2011-09-01 02:05:59 +0400
commit     24219cba24be5a75d6666d1ca2a9a6ea0bde252a (patch)
tree       3de1331ff9124dc5fffac19223e1ff016b0979c3
parent     cec2df99480f81c14b9e782ef84cb6440fd5044e (diff)
Re-organized BatchOptimization modules.
-rw-r--r--  BatchOptimization.lua  |  87
-rw-r--r--  LBFGSOptimization.lua  | 277
-rw-r--r--  SGDOptimization.lua    |  99
-rw-r--r--  init.lua               |   2
4 files changed, 101 insertions, 364 deletions
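
A minimal usage sketch of the reorganized classes, assuming a Torch7-era install of 'nnx' (and liblbfgs for the L-BFGS part); the model, criterion and mini-batch below are placeholders, not part of this commit. Both optimizers now inherit the batch-evaluation machinery from nn.BatchOptimization and only supply their own optimize() step:

-- sketch only: 'mlp', 'crit', 'inputs', 'targets' are illustrative names
require 'nnx'

local mlp  = nn.Sequential()
mlp:add(nn.Linear(10, 2))
local crit = nn.MSECriterion()

-- a tiny mini-batch of 4 samples
local inputs, targets = {}, {}
for i = 1, 4 do
   inputs[i]  = torch.randn(10)
   targets[i] = torch.randn(2)
end

-- SGD subclass: optimize() applies momentum, weight decay and lr decay
local sgd = nn.SGDOptimization{module = mlp, criterion = crit,
                               learningRate = 1e-2, momentum = 0.9}
print('avg error (SGD step): ' .. sgd:forward(inputs, targets))

-- L-BFGS subclass: optimize() hands the shared closure to lbfgs.run()
local bfgs = nn.LBFGSOptimization{module = mlp, criterion = crit,
                                  maxIterations = 20, maxLineSearch = 20}
print('avg error (L-BFGS pass): ' .. bfgs:forward(inputs, targets))
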
diff --git a/BatchOptimization.lua b/BatchOptimization.lua
index 0951ae4..4caa064 100644
--- a/BatchOptimization.lua
+++ b/BatchOptimization.lua
@@ -12,12 +12,6 @@ function Batch:__init(...)
{arg='module', type='nn.Module', help='a module to train', req=true},
{arg='criterion', type='nn.Criterion',
help='a criterion to estimate the error', req=true},
- {arg='maxIterations', type='number',
- help='maximum nb of iterations per pass (0 = no max)', default=0},
- {arg='maxLineSearch', type='number',
- help='maximum nb of steps in line search', default=20},
- {arg='sparsity', type='number',
- help='sparsity coef (Orthantwise C)', default=0},
{arg='parallelize', type='number',
help='parallelize onto N cores (experimental!)', default=1},
{arg='verbose', type='number',
@@ -25,10 +19,11 @@ function Batch:__init(...)
)
self.parameters = nnx.flattenParameters(nnx.getParameters(self.module))
self.gradParameters = nnx.flattenParameters(nnx.getGradParameters(self.module))
+ self.evalCounter = 0
+ self.sampleCounter = 0
if self.parallelize > 1 then
self:setup_mapreduce()
end
- batch = {}
end
function Batch:forward(inputs, targets, options)
@@ -46,8 +41,13 @@ function Batch:forward_sequential(inputs, targets, options)
-- + self.parameters contains the current X vector
-- + self.gradParameters contains the estimated dF/dX vector
-- + self.output contains the estimated (average) F(X)
- batch.evaluate
+ self.evaluate
= function()
+ -- verbose
+ if self.verbose >= 2 then
+ print('<BatchOptimization> evaluating f(X) + df/dX')
+ end
+ local _t_ = sys.clock()
-- reset gradients
self.gradParameters:zero()
-- f is the average of all criterions
@@ -70,13 +70,28 @@ function Batch:forward_sequential(inputs, targets, options)
self.posthook(self, {inputs[i], targets[i], options[i]})
end
end
+ -- update evaluation counter
+ self.evalCounter = self.evalCounter + 1
-- normalize gradients
self.gradParameters:div(#inputs)
+ -- verbose
+ if self.verbose >= 2 then
+ print('<BatchOptimization> ' .. self.evalCounter .. 'th evaluation took ' .. (sys.clock() - _t_) .. ' sec')
+ end
-- return average f(X)
- return self.output/#inputs
+ self.output = self.output/#inputs
+ return self.output
end
- -- (3) return current output after optimization
+ -- (2) optimization callback
+ if self.optimize then
+ self:optimize()
+ end
+
+ -- (3) update sample counter
+ self.sampleCounter = self.sampleCounter + #inputs
+
+ -- (4) return current output after optimization
return self.output
end
@@ -90,8 +105,8 @@ function Batch:forward_mapreduce(inputs, targets, options)
if type(self.prehook) == 'string' then
parallel.children:send(self.prehook)
else
- print('\r<BatchOptimization> WARNING: when using para||el mode, hooks should be')
- print('\r<BatchOptimization> WARNING: defined as strings. User prehook ignored.')
+ print('\r<BatchOptimization> WARNING: when using para||el mode,'..
+ ' hooks should be defined as strings. User prehook ignored.')
parallel.children:send('')
end
else
@@ -101,8 +116,8 @@ function Batch:forward_mapreduce(inputs, targets, options)
if type(self.posthook) == 'string' then
parallel.children:send(self.posthook)
else
- print('\r<BatchOptimization> WARNING: when using para||el mode, hooks should be')
- print('<\rBatchOptimization> WARNING: defined as strings. User posthook ignored.')
+ print('\r<BatchOptimization> WARNING: when using para||el mode,'..
+ ' hooks should be defined as strings. User posthook ignored.')
parallel.children:send('')
end
else
@@ -142,30 +157,41 @@ function Batch:forward_mapreduce(inputs, targets, options)
-- + self.parameters contains the current X vector
-- + self.gradParameters contains the estimated dF/dX vector
-- + self.output contains the estimated (average) F(X)
- batch.evaluate
+ self.evaluate
= function()
- batch.evaluate_map()
- return batch.evaluate_reduce()
+ -- verbose
+ if self.verbose >= 2 then
+ print('<BatchOptimization> evaluating f(X) + df/dX')
+ end
+ local _t_ = sys.clock()
+ -- do map/reduce
+ self.evaluate_map()
+ self.evaluate_reduce()
+ -- update evaluation counter
+ self.evalCounter = self.evalCounter + 1
+ -- verbose
+ if self.verbose >= 2 then
+ print('<BatchOptimization> ' .. self.evalCounter .. 'th evaluation took ' .. (sys.clock() - _t_) .. ' sec')
+ end
+ return self.output
end
-- (1a) the map part of the evaluation: compute partial gradients
-- in separate threads
- batch.evaluate_map
+ self.evaluate_map
= function()
-- transmit new parameters to all workers
parallel.children:send(self.parameters)
-- then wait for all workers to return their partial gradParameters + outputs
- for t = 1,P do
- gradParametersPartial[t] = parallel.children[t]:receive()
- outputsPartial[t] = parallel.children[t]:receive()
- end
+ gradParametersPartial = parallel.children:receive()
+ outputsPartial = parallel.children:receive()
-- force cleanup
collectgarbage()
end
-- (1b) the reduce part of the evaluation: accumulate all
-- partial estimates of the gradients
- batch.evaluate_reduce
+ self.evaluate_reduce
= function()
-- accumulate partial gradients, and average
self.gradParameters:zero()
@@ -178,14 +204,22 @@ function Batch:forward_mapreduce(inputs, targets, options)
for t = 1,P do
self.output = self.output + outputsPartial[t]
end
- return self.output/#inputs
+ self.output = self.output/#inputs
end
+ -- (2) optimization callback
+ if self.optimize then
+ self:optimize()
+ end
+
-- (3) reset workers so they're ready for next mini-batch
parallel.children:send('break')
- -- (4) return current output after optimization
- return self.output/#inputs
+ -- (4) update sample counter
+ self.sampleCounter = self.sampleCounter + #inputs
+
+ -- (5) return current output after optimization
+ return self.output
end
function Batch:setup_mapreduce ()
@@ -194,7 +228,6 @@ function Batch:setup_mapreduce ()
xerror('install parallel for Lua to enable parallel computing (luarocks install parallel)',
'nn.BatchOptimization')
end
- parallel.setSharedSize(4*1024*1024)
local P = self.parallelize
-- (1) define code for workers
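
For reference, a minimal sketch of the subclass contract this refactor introduces (the class name nn.MyOptimization and the plain gradient step are illustrative, not part of the commit): forward() builds self.evaluate and then calls self:optimize(), so a subclass only has to define that one method.

require 'nnx'

-- hypothetical subclass of the new base class
local MyOpt, parent = torch.class('nn.MyOptimization', 'nn.BatchOptimization')

function MyOpt:__init(...)
   -- the parent unpacks module, criterion, parallelize and verbose
   parent.__init(self, ...)
end

function MyOpt:optimize()
   -- self.evaluate() fills self.gradParameters and returns the average f(X)
   self.evaluate()
   -- plain gradient step, just to show where an update rule plugs in
   self.parameters:add(-1e-2, self.gradParameters)
end
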
diff --git a/LBFGSOptimization.lua b/LBFGSOptimization.lua
index 976226d..720b7ba 100644
--- a/LBFGSOptimization.lua
+++ b/LBFGSOptimization.lua
@@ -1,13 +1,10 @@
-local LBFGS,parent = torch.class('nn.LBFGSOptimization', 'nn.Optimization')
+local LBFGS,parent = torch.class('nn.LBFGSOptimization', 'nn.BatchOptimization')
function LBFGS:__init(...)
require 'liblbfgs'
- parent.__init(self)
+ parent.__init(self, ...)
xlua.unpack_class(self, {...},
'LBFGSOptimization', nil,
- {arg='module', type='nn.Module', help='a module to train', req=true},
- {arg='criterion', type='nn.Criterion',
- help='a criterion to estimate the error', req=true},
{arg='maxIterations', type='number',
help='maximum nb of iterations per pass (0 = no max)', default=0},
{arg='maxLineSearch', type='number',
@@ -15,277 +12,17 @@ function LBFGS:__init(...)
{arg='sparsity', type='number',
help='sparsity coef (Orthantwise C)', default=0},
{arg='parallelize', type='number',
- help='parallelize onto N cores (experimental!)', default=1},
- {arg='verbose', type='number',
- help='verbose level during training [0-2]', default=0}
+ help='parallelize onto N cores (experimental!)', default=1}
)
self.parameters = nnx.flattenParameters(nnx.getParameters(self.module))
self.gradParameters = nnx.flattenParameters(nnx.getGradParameters(self.module))
- if self.parallelize > 1 then
- self:setup_mapreduce()
- end
end
-function LBFGS:forward(inputs, targets, options)
- options = options or {}
- if self.parallelize > 1 then
- return self:forward_mapreduce(inputs, targets, options)
- else
- return self:forward_sequential(inputs, targets, options)
- end
-end
-
-function LBFGS:forward_sequential(inputs, targets, options)
- -- (1) construct a closure that compute f(inputs) + df/dW
- -- after each call to that function:
- -- + self.parameters contains the current X vector
- -- + self.gradParameters contains the estimated dF/dX vector
- -- + self.output contains the estimated (average) F(X)
- lbfgs.evaluate
- = function()
- -- verbose
- if self.verbose >= 2 then print('<LBFGSOptimization> evaluating f(X) + df/dX') end
- local _t_ = sys.clock()
- -- reset gradients
- self.gradParameters:zero()
- -- f is the average of all criterions
- self.output = 0
- -- given all inputs, evaluate gradients
- for i = 1,#inputs do
- -- user hook
- if self.prehook then
- self.prehook(self, {inputs[i], targets[i], options[i]})
- end
- -- estimate f
- local output = self.module:forward(inputs[i])
- local err = self.criterion:forward(output, targets[i])
- self.output = self.output + err
- -- estimate df/dW
- local df_do = self.criterion:backward(output, targets[i])
- self.module:backward(inputs[i], df_do)
- -- user hook
- if self.posthook then
- self.posthook(self, {inputs[i], targets[i], options[i]})
- end
- end
- -- normalize gradients
- self.gradParameters:div(#inputs)
- -- verbose
- if self.verbose >= 2 then print('<LBFGSOptimization> f(X) + df/dX computed in ' .. (sys.clock() - _t_) .. ' sec') end
- -- return average f(X)
- return self.output/#inputs
- end
-
- -- (2) the magic function: will update the parameter vector
- -- according to the l-BFGS method
- self.output = lbfgs.run(self.parameters, self.gradParameters,
- self.maxIterations, self.maxLineSearch,
- self.sparsity, self.verbose)
-
- -- (3) return current output after optimization
- return self.output
-end
-
-function LBFGS:forward_mapreduce(inputs, targets, options)
- -- parameters
- local P = self.parallelize
-
- -- transmit user hooks, if defined
- if not self.hooksets then
- if self.prehook then
- if type(self.prehook) == 'string' then
- parallel.children:send(self.prehook)
- else
- print('\r<LBFGSOptimization> WARNING: when using para||el mode, hooks should be defined as strings. User prehook ignored.')
- parallel.children:send('')
- end
- else
- parallel.children:send('')
- end
- if self.posthook then
- if type(self.posthook) == 'string' then
- parallel.children:send(self.posthook)
- else
- print('\r<LBFGSOptimization> WARNING: when using para||el mode, hooks should be defined as strings. User posthook ignored.')
- parallel.children:send('')
- end
- else
- parallel.children:send('')
- end
- self.hooksets = true
- end
-
- -- (0a) replicate output and gradParameters
- local outputsPartial = {}
- local gradParametersPartial = {}
-
- -- (0b) divide input/target batch into N batches
- local inputss = {}
- local targetss = {}
- local optionss = {}
- for t = 1,P do
- inputss[t] = {}
- targetss[t] = {}
- optionss[t] = {}
- for i = t,#inputs,P do
- table.insert(inputss[t], inputs[i])
- table.insert(targetss[t], targets[i])
- if options then table.insert(optionss[t], options[i]) end
- end
- end
-
- -- (0c) send mini-batch to all workers
- for t = 1,P do
- parallel.children[t]:send(inputss[t])
- parallel.children[t]:send(targetss[t])
- parallel.children[t]:send(optionss[t])
- end
-
- -- (1) construct a closure that compute f(inputs) + df/dW
- -- after each call to that function:
- -- + self.parameters contains the current X vector
- -- + self.gradParameters contains the estimated dF/dX vector
- -- + self.output contains the estimated (average) F(X)
- lbfgs.evaluate
- = function()
- lbfgs.evaluate_map()
- return lbfgs.evaluate_reduce()
- end
-
- -- (1a) the map part of the evaluation: compute partial gradients
- -- in separate threads
- lbfgs.evaluate_map
- = function()
- -- verbose
- if self.verbose >= 2 then print('<LBFGSOptimization> evaluating f(X) + df/dX') end
- local _t_ = sys.clock()
- -- transmit new parameters to all workers
- parallel.children:send(self.parameters)
- -- then wait for all workers to return their partial gradParameters + outputs
- gradParametersPartial = parallel.children:receive()
- outputsPartial = parallel.children:receive()
- -- force cleanup
- collectgarbage()
- -- verbose
- if self.verbose >= 2 then print('<LBFGSOptimization> f(X) + df/dX computed in ' .. (sys.clock() - _t_) .. ' sec') end
- end
-
- -- (1b) the reduce part of the evaluation: accumulate all
- -- partial estimates of the gradients
- lbfgs.evaluate_reduce
- = function()
- -- accumulate partial gradients, and average
- self.gradParameters:zero()
- for t = 1,P do
- self.gradParameters:add(gradParametersPartial[t])
- end
- self.gradParameters:div(#inputs)
- -- return average f(X)
- self.output = 0
- for t = 1,P do
- self.output = self.output + outputsPartial[t]
- end
- return self.output/#inputs
- end
-
- -- (2) the magic function: will update the parameter vector
- -- according to the l-BFGS method
+function LBFGS:optimize()
+ lbfgs.evaluate = self.evaluate
+ -- the magic function: will update the parameter vector
+ -- according to the l-BFGS method
self.output = lbfgs.run(self.parameters, self.gradParameters,
self.maxIterations, self.maxLineSearch,
self.sparsity, self.verbose)
-
- -- (3) reset workers so they're ready for next mini-batch
- parallel.children:send('break')
-
- -- (4) return current output after optimization
- return self.output
-end
-
-function LBFGS:setup_mapreduce ()
- -- (0) startup parallel package
- if not xrequire 'parallel' then
- xerror('install parallel for Lua to enable parallel computing (luarocks install parallel)',
- 'nn.LBFGSOptimization')
- end
- parallel.setSharedSize(4*1024*1024)
- local P = self.parallelize
-
- -- (1) define code for workers
- local worker_code = [[
- -- require packages
- require 'nnx'
-
- -- retrieve module + criterion at startup
- module = parallel.parent:receive()
- criterion = parallel.parent:receive()
-
- -- create fake optimizer, for hooks
- optimizer = {module=module, criterion=criterion}
-
- -- retrieve optional prehook/posthook
- prehook = parallel.parent:receive()
- posthook = parallel.parent:receive()
- if prehook ~= '' then loadstring(prehook)() else prehook = nil end
- if posthook ~= '' then loadstring(posthook)() else posthook = nil end
-
- -- get pointer to parameter and gradParameter vectors
- parameters = nnx.flattenParameters(nnx.getParameters(module))
- gradParameters = nnx.flattenParameters(nnx.getGradParameters(module))
-
- -- outter loop: mini-batches
- while true do
- -- receive new mini-batch
- inputs = parallel.parent:receive()
- if type(inputs) == 'string' and inputs == 'break' then break end
- targets = parallel.parent:receive()
- options = parallel.parent:receive()
-
- -- inner loop: evaluations
- while true do
- -- receive new set of parameters
- newParameters = parallel.parent:receive()
- if type(newParameters) == 'string' and newParameters == 'break' then break end
- parameters:copy(newParameters)
-
- -- reset gradients
- gradParameters:zero()
- -- f is the average of all criterions
- local f_x = 0
- -- evaluate gradients on inputs for this thread
- for i = 1,#inputs do
- -- user hook
- if prehook then
- prehook(optimizer, {inputs[i], targets[i], options[i]})
- end
- -- estimate f
- local output = module:forward(inputs[i])
- local err = criterion:forward(output, targets[i])
- f_x = f_x + err
- -- estimate df/dW
- local df_do = criterion:backward(output, targets[i])
- module:backward(inputs[i], df_do)
- -- user hook
- if posthook then
- posthook(optimizer, {inputs[i], targets[i], options[i]})
- end
- end
-
- -- now send back gradParameters + partial output
- parallel.parent:send(gradParameters)
- parallel.parent:send(f_x)
-
- -- force cleanup
- collectgarbage()
- end
- end
- ]]
-
- -- (2) startup all workers
- for t = 1,P do
- parallel.run(worker_code)
- end
-
- -- (3) and send them the module + criterion architecture
- parallel.children:send(self.module)
- parallel.children:send(self.criterion)
end
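
A small sketch of the two counters added in BatchOptimization.lua, assuming 'nnx' and liblbfgs are installed (the model, batch and hyper-parameters below are placeholders): sampleCounter grows by #inputs per forward() call, while evalCounter counts calls to the shared closure, which lbfgs.run() may invoke several times per batch during line searches.

require 'nnx'

local mlp = nn.Sequential()
mlp:add(nn.Linear(4, 1))
local opt = nn.LBFGSOptimization{module = mlp, criterion = nn.MSECriterion(),
                                 maxIterations = 10}

local inputs, targets = {}, {}
for i = 1, 8 do
   inputs[i]  = torch.randn(4)
   targets[i] = torch.randn(1)
end

local err = opt:forward(inputs, targets)
print(string.format('avg error %.4f, %d samples seen, %d evaluations',
                     err, opt.sampleCounter, opt.evalCounter))
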
diff --git a/SGDOptimization.lua b/SGDOptimization.lua
index 88d63d0..ddbf220 100644
--- a/SGDOptimization.lua
+++ b/SGDOptimization.lua
@@ -1,81 +1,48 @@
-local SGD,parent = torch.class('nn.SGDOptimization', 'nn.Optimization')
+local SGD,parent = torch.class('nn.SGDOptimization', 'nn.BatchOptimization')
function SGD:__init(...)
- parent.__init(self)
+ parent.__init(self,...)
xlua.unpack_class(self, {...},
'SGDOptimization', nil,
- {arg='module', type='nn.Module', help='a module to train', req=true},
- {arg='criterion', type='nn.Criterion', help='a criterion to estimate the error', req=true},
- {arg='learningRate', type='number', help='learning rate (W = W - rate*dE/dW)', default=1e-2},
- {arg='learningRateDecay', type='number', help='learning rate decay (lr_t = lr_0 / (1 + samplesSeen*lrDecay))', default=0},
- {arg='weightDecay', type='number', help='amount of weight decay (W = W - decay*W)', default=0},
- {arg='momentum', type='number', help='amount of momentum on weights (dE/W = dE/dW*(1-momentum) + prev(dE/dW)*momentum)', default=0}
+ {arg='maxIterations', type='number',
+ help='maximum nb of iterations per pass', default=1},
+ {arg='learningRate', type='number',
+ help='learning rate (W = W - rate*dE/dW)', default=1e-2},
+ {arg='learningRateDecay', type='number',
+ help='learning rate decay (lr_t = lr_0 / (1 + samplesSeen*lrDecay))', default=0},
+ {arg='weightDecay', type='number',
+ help='amount of weight decay (W = W - decay*W)', default=0},
+ {arg='momentum', type='number',
+ help='amount of momentum on weights (dE/W = dE/dW*(1-momentum) + prev(dE/dW)*momentum)', default=0}
)
self.parameters = nnx.flattenParameters(nnx.getParameters(self.module))
self.gradParameters = nnx.flattenParameters(nnx.getGradParameters(self.module))
- self.samplesSeen = 0
end
-function SGD:forward(inputs, targets, options)
- options = options or {}
-
- -- reset gradients
- self.gradParameters:zero()
-
- -- f is the average of all criterions
- self.output = 0
-
- -- given all inputs, evaluate gradients
- for i = 1,#inputs do
- -- user hook
- if self.prehook then
- self.prehook(self, {inputs[i], targets[i], options[i]})
- end
-
- -- estimate f
- local output = self.module:forward(inputs[i])
- local err = self.criterion:forward(output, targets[i])
- self.output = self.output + err
-
- -- estimate df/dW
- local df_do = self.criterion:backward(output, targets[i])
- self.module:backward(inputs[i], df_do)
-
- -- user hook
- if self.posthook then
- self.posthook(self, {inputs[i], targets[i], options[i]})
+function SGD:optimize()
+ -- optimize N times
+ for i = 1,self.maxIterations do
+ -- evaluate f(X) + df/dX
+ self.evaluate()
+
+ -- apply momentum
+ if self.momentum ~= 0 then
+ if not self.currentGradParameters then
+ self.currentGradParameters = torch.Tensor():resizeAs(self.gradParameters):copy(self.gradParameters)
+ else
+ self.currentGradParameters:mul(self.momentum):add(1-self.momentum, self.gradParameters)
+ end
+ else
+ self.currentGradParameters = self.gradParameters
end
- end
- -- update counter of samples seen
- self.samplesSeen = self.samplesSeen + #inputs
-
- -- renorm f
- self.output = self.output / #inputs
-
- -- normalize gradients
- self.gradParameters:div(#inputs)
-
- -- apply momentum
- if self.momentum ~= 0 then
- if not self.currentGradParameters then
- self.currentGradParameters = torch.Tensor():resizeAs(self.gradParameters):copy(self.gradParameters)
- else
- self.currentGradParameters:mul(self.momentum):add(1-self.momentum, self.gradParameters)
+ -- weight decay
+ if self.weightDecay ~= 0 then
+ self.parameters:add(-self.weightDecay, self.parameters)
end
- else
- self.currentGradParameters = self.gradParameters
- end
- -- weight decay
- if self.weightDecay ~= 0 then
- self.parameters:add(-self.weightDecay, self.parameters)
+ -- update parameters
+ local learningRate = self.learningRate / (1 + self.sampleCounter*self.learningRateDecay)
+ self.parameters:add(-learningRate, self.currentGradParameters)
end
-
- -- update parameters
- local learningRate = self.learningRate / (1 + self.samplesSeen*self.learningRateDecay)
- self.parameters:add(-learningRate, self.currentGradParameters)
-
- -- return current output
- return self.output
end
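
The update rule that SGD:optimize() now applies, restated as a standalone sketch on plain tensors (the sgd_step helper and its state table are illustrative names, not part of nnx):

require 'torch'

-- g_t = momentum * g_{t-1} + (1 - momentum) * dE/dW
-- W   = W - weightDecay * W
-- lr  = learningRate / (1 + samplesSeen * learningRateDecay)
-- W   = W - lr * g_t
local function sgd_step(W, dW, state)
   if state.momentum ~= 0 then
      if not state.g then
         state.g = torch.Tensor():resizeAs(dW):copy(dW)
      else
         state.g:mul(state.momentum):add(1 - state.momentum, dW)
      end
   else
      state.g = dW
   end
   if state.weightDecay ~= 0 then W:add(-state.weightDecay, W) end
   local lr = state.learningRate / (1 + state.samplesSeen * state.learningRateDecay)
   W:add(-lr, state.g)
end

-- example call (placeholder tensors and settings):
local W, dW = torch.randn(5), torch.randn(5)
sgd_step(W, dW, {momentum = 0.9, weightDecay = 0, learningRate = 1e-2,
                 learningRateDecay = 0, samplesSeen = 0})
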
diff --git a/init.lua b/init.lua
index 6b09afc..3519294 100644
--- a/init.lua
+++ b/init.lua
@@ -100,9 +100,9 @@ torch.include('nnx', 'SpatialSparseCriterion.lua')
-- optimizations:
torch.include('nnx', 'Optimization.lua')
+torch.include('nnx', 'BatchOptimization.lua')
torch.include('nnx', 'SGDOptimization.lua')
torch.include('nnx', 'LBFGSOptimization.lua')
-torch.include('nnx', 'BatchOptimization.lua')
-- trainers:
torch.include('nnx', 'Trainer.lua')
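
Note on the init.lua change: nn.SGDOptimization and nn.LBFGSOptimization now name nn.BatchOptimization as their parent in torch.class(), and the parent class must exist when the subclass files are loaded, so BatchOptimization.lua is moved above the two optimizers in the include order.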