Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/clementfarabet/lua---nnx.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorClement Farabet <clement.farabet@gmail.com>2011-08-29 22:43:10 +0400
committerClement Farabet <clement.farabet@gmail.com>2011-08-29 22:43:10 +0400
commit1c45fa457e45fd70696bd379603db1a4d893409e (patch)
treec51a289a63a96730b8c4b68c0970fd6b2da43f57
parentb4f55bff226cd285c04e51080417a1b22243a47c (diff)
Completely got rid of calls to 'flatten'.
Now parameters are flattened once and for all at startup, so that optimizations can have access to contiguous vectors (as was done in LUSH).
-rw-r--r--LBFGSOptimization.lua68
-rw-r--r--Optimization.lua49
-rw-r--r--SGDOptimization.lua10
3 files changed, 25 insertions, 102 deletions
diff --git a/LBFGSOptimization.lua b/LBFGSOptimization.lua
index b3f2f9b..73405ed 100644
--- a/LBFGSOptimization.lua
+++ b/LBFGSOptimization.lua
@@ -19,8 +19,8 @@ function LBFGS:__init(...)
{arg='verbose', type='number',
help='verbose level during training [0-2]', default=0}
)
- self.parametersT = nnx.getParameters(self.module)
- self.gradParametersT = nnx.getGradParameters(self.module)
+ self.parameters = nnx.flattenParameters(nnx.getParameters(self.module))
+ self.gradParameters = nnx.flattenParameters(nnx.getGradParameters(self.module))
if self.parallelize > 1 then
self:setup_mapreduce()
end
@@ -43,10 +43,8 @@ function LBFGS:forward_sequential(inputs, targets, options)
-- + self.output contains the estimated (average) F(X)
lbfgs.evaluate
= function()
- -- set parameters from current state
- self:unflatten(self.parametersT, self.gradParametersT)
-- reset gradients
- self.module:zeroGradParameters()
+ self.gradParameters:zero()
-- f is the average of all criterions
self.output = 0
-- given all inputs, evaluate gradients
@@ -67,27 +65,19 @@ function LBFGS:forward_sequential(inputs, targets, options)
self.posthook(self, {inputs[i], targets[i], options[i]})
end
end
- -- update state from computed parameters
- self:flatten(self.parametersT, self.gradParametersT)
-- normalize gradients
self.gradParameters:div(#inputs)
-- return average f(X)
return self.output/#inputs
end
- -- (2) store current parameters/gradParameters
- self:flatten(self.parametersT, self.gradParametersT)
-
- -- (3) the magic function: will update the parameter vector
+ -- (2) the magic function: will update the parameter vector
-- according to the l-BFGS method
self.output = lbfgs.run(self.parameters, self.gradParameters,
self.maxIterations, self.maxLineSearch,
self.sparsity, self.verbose)
- -- (4) last: read parameters back into the model
- self:unflatten(self.parametersT, self.gradParametersT)
-
- -- (5) return current output after optimization
+ -- (3) return current output after optimization
return self.output
end
@@ -123,8 +113,8 @@ function LBFGS:forward_mapreduce(inputs, targets, options)
end
-- (0a) replicate output and gradParameters
- local outputs = {}
- local gradParameters = {}
+ local outputsPartial = {}
+ local gradParametersPartial = {}
-- (0b) divide input/target batch into N batches
local inputss = {}
@@ -163,14 +153,12 @@ function LBFGS:forward_mapreduce(inputs, targets, options)
-- in separate threads
lbfgs.evaluate_map
= function()
- -- load parameters into current model
- self:unflatten(self.parametersT, self.gradParametersT)
-- transmit new parameters to all workers
- parallel.children:send(self.parametersT)
+ parallel.children:send(self.parameters)
-- then wait for all workers to return their partial gradParameters + outputs
for t = 1,P do
- gradParameters[t] = parallel.children[t]:receive()
- outputs[t] = parallel.children[t]:receive()
+ gradParametersPartial[t] = parallel.children[t]:receive()
+ outputsPartial[t] = parallel.children[t]:receive()
end
end
@@ -178,42 +166,30 @@ function LBFGS:forward_mapreduce(inputs, targets, options)
-- partial estimates of the gradients
lbfgs.evaluate_reduce
= function()
- -- temp vectors for accumulation
- self.gradParametersAcc = self.gradParametersAcc or torch.Tensor()
- self.gradParametersAcc:resizeAs(self.gradParameters):zero()
- -- update state from computed parameters
+ -- accumulate partial gradients, and average
+ self.gradParameters:zero()
for t = 1,P do
- self:flatten(self.parametersT, gradParameters[t])
- self.gradParametersAcc:add(self.gradParameters)
+ self.gradParameters:add(gradParametersPartial[t])
end
- self.gradParameters:copy(self.gradParametersAcc)
- -- normalize gradients
self.gradParameters:div(#inputs)
-- return average f(X)
self.output = 0
for t = 1,P do
- self.output = self.output + outputs[t]
+ self.output = self.output + outputsPartial[t]
end
- -- export parameters, again
return self.output/#inputs
end
- -- (2) store current parameters/gradParameters
- self:flatten(self.parametersT, self.gradParametersT)
-
- -- (3) the magic function: will update the parameter vector
+ -- (2) the magic function: will update the parameter vector
-- according to the l-BFGS method
self.output = lbfgs.run(self.parameters, self.gradParameters,
self.maxIterations, self.maxLineSearch,
self.sparsity, self.verbose)
- -- (4) last: read parameters back into the main (not parrallel) model
- self:unflatten(self.parametersT, self.gradParametersT)
-
- -- (6) reset workers so they're ready for next mini-batch
+ -- (3) reset workers so they're ready for next mini-batch
parallel.children:send('break')
- -- (5) return current output after optimization
+ -- (4) return current output after optimization
return self.output
end
@@ -245,8 +221,8 @@ function LBFGS:setup_mapreduce ()
if posthook ~= '' then loadstring(posthook)() else posthook = nil end
-- get pointer to parameter and gradParameter vectors
- parameters = nnx.getParameters(module)
- gradParameters = nnx.getGradParameters(module)
+ parameters = nnx.flattenParameters(nnx.getParameters(module))
+ gradParameters = nnx.flattenParameters(nnx.getGradParameters(module))
-- outter loop: mini-batches
while true do
@@ -261,12 +237,10 @@ function LBFGS:setup_mapreduce ()
-- receive new set of parameters
newParameters = parallel.parent:receive()
if type(newParameters) == 'string' and newParameters == 'break' then break end
- for i = 1,#newParameters do
- parameters[i]:copy(newParameters[i])
- end
+ parameters:copy(newParameters)
-- reset gradients
- module:zeroGradParameters()
+ gradParameters:zero()
-- f is the average of all criterions
local f_x = 0
-- evaluate gradients on inputs for this thread
diff --git a/Optimization.lua b/Optimization.lua
index f18c635..daf0a8d 100644
--- a/Optimization.lua
+++ b/Optimization.lua
@@ -1,56 +1,11 @@
local Optimization = torch.class('nn.Optimization')
function Optimization:__init()
+ self.output = 0
end
function Optimization:forward(inputs, targets)
- self:flatten(parameters, gradParameters)
self.output = 0
- self:unflatten(parameters, gradParameters)
+ print('<Optimization> WARNING: this is a virtual function, please overload !')
return self.output
end
-
-function Optimization:flatten(parameters, gradParameters)
- if type(parameters) == 'table' then
- -- create flat parameters
- self.parameters = self.parameters or torch.Tensor()
- self.gradParameters = self.gradParameters or torch.Tensor()
- -- assuming that the parameters won't change their size,
- -- we compute offsets once
- if not self.offsets then
- self.nParameters = 0
- self.offsets = {}
- for _,param in ipairs(parameters) do
- table.insert(self.offsets, self.nParameters+1)
- self.nParameters = self.nParameters + param:nElement()
- end
- self.parameters:resize(self.nParameters)
- self.gradParameters:resize(self.nParameters)
- end
- -- copy all params in flat array
- for i = 1,#parameters do
- local nElement = parameters[i]:nElement()
- self.parameters:narrow(1,self.offsets[i],nElement):copy(parameters[i])
- self.gradParameters:narrow(1,self.offsets[i],nElement):copy(gradParameters[i])
- end
- else
- self.parameters = parameters
- self.gradParameters = gradParameters
- end
-end
-
-function Optimization:unflatten(parameters, gradParameters)
- if type(parameters) == 'table' then
- -- copy all params into unflat arrays
- local offset = 1
- for i = 1,#parameters do
- local nElement = parameters[i]:nElement()
- parameters[i]:copy(self.parameters:narrow(1,offset,nElement))
- gradParameters[i]:copy(self.gradParameters:narrow(1,offset,nElement))
- offset = offset + nElement
- end
- else
- parameters = self.parameters
- gradParameters = self.gradParameters
- end
-end
diff --git a/SGDOptimization.lua b/SGDOptimization.lua
index 8bfe9a5..0ad5b28 100644
--- a/SGDOptimization.lua
+++ b/SGDOptimization.lua
@@ -10,8 +10,8 @@ function SGD:__init(...)
{arg='weightDecay', type='number', help='amount of weight decay (W = W - decay*W)', default=0},
{arg='momentum', type='number', help='amount of momentum on weights (dE/W = dE/dW*(1-momentum) + prev(dE/dW)*momentum)', default=0}
)
- self.parametersT = nnx.getParameters(self.module)
- self.gradParametersT = nnx.getGradParameters(self.module)
+ self.parameters = nnx.flattenParameters(nnx.getParameters(self.module))
+ self.gradParameters = nnx.flattenParameters(nnx.getGradParameters(self.module))
end
function SGD:forward(inputs, targets, options)
@@ -48,9 +48,6 @@ function SGD:forward(inputs, targets, options)
-- renorm f
self.output = self.output / #inputs
- -- update state from computed parameters
- self:flatten(self.parametersT, self.gradParametersT)
-
-- normalize gradients
self.gradParameters:div(#inputs)
@@ -73,9 +70,6 @@ function SGD:forward(inputs, targets, options)
-- update parameters
self.parameters:add(-self.learningRate, self.currentGradParameters)
- -- write compute parameters back in place
- self:unflatten(self.parametersT, self.gradParametersT)
-
-- return current output
return self.output
end