diff options
author | Clement Farabet <clement.farabet@gmail.com> | 2011-08-29 22:43:10 +0400 |
---|---|---|
committer | Clement Farabet <clement.farabet@gmail.com> | 2011-08-29 22:43:10 +0400 |
commit | 1c45fa457e45fd70696bd379603db1a4d893409e (patch) | |
tree | c51a289a63a96730b8c4b68c0970fd6b2da43f57 | |
parent | b4f55bff226cd285c04e51080417a1b22243a47c (diff) |
Completely got rid of calls to 'flatten'.
Now parameters are flattened once and for all at startup,
so that optimizations can have access to contiguous vectors
(as was done in LUSH).
-rw-r--r-- | LBFGSOptimization.lua | 68 | ||||
-rw-r--r-- | Optimization.lua | 49 | ||||
-rw-r--r-- | SGDOptimization.lua | 10 |
3 files changed, 25 insertions, 102 deletions
diff --git a/LBFGSOptimization.lua b/LBFGSOptimization.lua index b3f2f9b..73405ed 100644 --- a/LBFGSOptimization.lua +++ b/LBFGSOptimization.lua @@ -19,8 +19,8 @@ function LBFGS:__init(...) {arg='verbose', type='number', help='verbose level during training [0-2]', default=0} ) - self.parametersT = nnx.getParameters(self.module) - self.gradParametersT = nnx.getGradParameters(self.module) + self.parameters = nnx.flattenParameters(nnx.getParameters(self.module)) + self.gradParameters = nnx.flattenParameters(nnx.getGradParameters(self.module)) if self.parallelize > 1 then self:setup_mapreduce() end @@ -43,10 +43,8 @@ function LBFGS:forward_sequential(inputs, targets, options) -- + self.output contains the estimated (average) F(X) lbfgs.evaluate = function() - -- set parameters from current state - self:unflatten(self.parametersT, self.gradParametersT) -- reset gradients - self.module:zeroGradParameters() + self.gradParameters:zero() -- f is the average of all criterions self.output = 0 -- given all inputs, evaluate gradients @@ -67,27 +65,19 @@ function LBFGS:forward_sequential(inputs, targets, options) self.posthook(self, {inputs[i], targets[i], options[i]}) end end - -- update state from computed parameters - self:flatten(self.parametersT, self.gradParametersT) -- normalize gradients self.gradParameters:div(#inputs) -- return average f(X) return self.output/#inputs end - -- (2) store current parameters/gradParameters - self:flatten(self.parametersT, self.gradParametersT) - - -- (3) the magic function: will update the parameter vector + -- (2) the magic function: will update the parameter vector -- according to the l-BFGS method self.output = lbfgs.run(self.parameters, self.gradParameters, self.maxIterations, self.maxLineSearch, self.sparsity, self.verbose) - -- (4) last: read parameters back into the model - self:unflatten(self.parametersT, self.gradParametersT) - - -- (5) return current output after optimization + -- (3) return current output after optimization 
return self.output end @@ -123,8 +113,8 @@ function LBFGS:forward_mapreduce(inputs, targets, options) end -- (0a) replicate output and gradParameters - local outputs = {} - local gradParameters = {} + local outputsPartial = {} + local gradParametersPartial = {} -- (0b) divide input/target batch into N batches local inputss = {} @@ -163,14 +153,12 @@ function LBFGS:forward_mapreduce(inputs, targets, options) -- in separate threads lbfgs.evaluate_map = function() - -- load parameters into current model - self:unflatten(self.parametersT, self.gradParametersT) -- transmit new parameters to all workers - parallel.children:send(self.parametersT) + parallel.children:send(self.parameters) -- then wait for all workers to return their partial gradParameters + outputs for t = 1,P do - gradParameters[t] = parallel.children[t]:receive() - outputs[t] = parallel.children[t]:receive() + gradParametersPartial[t] = parallel.children[t]:receive() + outputsPartial[t] = parallel.children[t]:receive() end end @@ -178,42 +166,30 @@ function LBFGS:forward_mapreduce(inputs, targets, options) -- partial estimates of the gradients lbfgs.evaluate_reduce = function() - -- temp vectors for accumulation - self.gradParametersAcc = self.gradParametersAcc or torch.Tensor() - self.gradParametersAcc:resizeAs(self.gradParameters):zero() - -- update state from computed parameters + -- accumulate partial gradients, and average + self.gradParameters:zero() for t = 1,P do - self:flatten(self.parametersT, gradParameters[t]) - self.gradParametersAcc:add(self.gradParameters) + self.gradParameters:add(gradParametersPartial[t]) end - self.gradParameters:copy(self.gradParametersAcc) - -- normalize gradients self.gradParameters:div(#inputs) -- return average f(X) self.output = 0 for t = 1,P do - self.output = self.output + outputs[t] + self.output = self.output + outputsPartial[t] end - -- export parameters, again return self.output/#inputs end - -- (2) store current parameters/gradParameters - 
self:flatten(self.parametersT, self.gradParametersT) - - -- (3) the magic function: will update the parameter vector + -- (2) the magic function: will update the parameter vector -- according to the l-BFGS method self.output = lbfgs.run(self.parameters, self.gradParameters, self.maxIterations, self.maxLineSearch, self.sparsity, self.verbose) - -- (4) last: read parameters back into the main (not parrallel) model - self:unflatten(self.parametersT, self.gradParametersT) - - -- (6) reset workers so they're ready for next mini-batch + -- (3) reset workers so they're ready for next mini-batch parallel.children:send('break') - -- (5) return current output after optimization + -- (4) return current output after optimization return self.output end @@ -245,8 +221,8 @@ function LBFGS:setup_mapreduce () if posthook ~= '' then loadstring(posthook)() else posthook = nil end -- get pointer to parameter and gradParameter vectors - parameters = nnx.getParameters(module) - gradParameters = nnx.getGradParameters(module) + parameters = nnx.flattenParameters(nnx.getParameters(module)) + gradParameters = nnx.flattenParameters(nnx.getGradParameters(module)) -- outter loop: mini-batches while true do @@ -261,12 +237,10 @@ function LBFGS:setup_mapreduce () -- receive new set of parameters newParameters = parallel.parent:receive() if type(newParameters) == 'string' and newParameters == 'break' then break end - for i = 1,#newParameters do - parameters[i]:copy(newParameters[i]) - end + parameters:copy(newParameters) -- reset gradients - module:zeroGradParameters() + gradParameters:zero() -- f is the average of all criterions local f_x = 0 -- evaluate gradients on inputs for this thread diff --git a/Optimization.lua b/Optimization.lua index f18c635..daf0a8d 100644 --- a/Optimization.lua +++ b/Optimization.lua @@ -1,56 +1,11 @@ local Optimization = torch.class('nn.Optimization') function Optimization:__init() + self.output = 0 end function Optimization:forward(inputs, targets) - 
self:flatten(parameters, gradParameters) self.output = 0 - self:unflatten(parameters, gradParameters) + print('<Optimization> WARNING: this is a virtual function, please overload !') return self.output end - -function Optimization:flatten(parameters, gradParameters) - if type(parameters) == 'table' then - -- create flat parameters - self.parameters = self.parameters or torch.Tensor() - self.gradParameters = self.gradParameters or torch.Tensor() - -- assuming that the parameters won't change their size, - -- we compute offsets once - if not self.offsets then - self.nParameters = 0 - self.offsets = {} - for _,param in ipairs(parameters) do - table.insert(self.offsets, self.nParameters+1) - self.nParameters = self.nParameters + param:nElement() - end - self.parameters:resize(self.nParameters) - self.gradParameters:resize(self.nParameters) - end - -- copy all params in flat array - for i = 1,#parameters do - local nElement = parameters[i]:nElement() - self.parameters:narrow(1,self.offsets[i],nElement):copy(parameters[i]) - self.gradParameters:narrow(1,self.offsets[i],nElement):copy(gradParameters[i]) - end - else - self.parameters = parameters - self.gradParameters = gradParameters - end -end - -function Optimization:unflatten(parameters, gradParameters) - if type(parameters) == 'table' then - -- copy all params into unflat arrays - local offset = 1 - for i = 1,#parameters do - local nElement = parameters[i]:nElement() - parameters[i]:copy(self.parameters:narrow(1,offset,nElement)) - gradParameters[i]:copy(self.gradParameters:narrow(1,offset,nElement)) - offset = offset + nElement - end - else - parameters = self.parameters - gradParameters = self.gradParameters - end -end diff --git a/SGDOptimization.lua b/SGDOptimization.lua index 8bfe9a5..0ad5b28 100644 --- a/SGDOptimization.lua +++ b/SGDOptimization.lua @@ -10,8 +10,8 @@ function SGD:__init(...) 
{arg='weightDecay', type='number', help='amount of weight decay (W = W - decay*W)', default=0}, {arg='momentum', type='number', help='amount of momentum on weights (dE/W = dE/dW*(1-momentum) + prev(dE/dW)*momentum)', default=0} ) - self.parametersT = nnx.getParameters(self.module) - self.gradParametersT = nnx.getGradParameters(self.module) + self.parameters = nnx.flattenParameters(nnx.getParameters(self.module)) + self.gradParameters = nnx.flattenParameters(nnx.getGradParameters(self.module)) end function SGD:forward(inputs, targets, options) @@ -48,9 +48,6 @@ function SGD:forward(inputs, targets, options) -- renorm f self.output = self.output / #inputs - -- update state from computed parameters - self:flatten(self.parametersT, self.gradParametersT) - -- normalize gradients self.gradParameters:div(#inputs) @@ -73,9 +70,6 @@ function SGD:forward(inputs, targets, options) -- update parameters self.parameters:add(-self.learningRate, self.currentGradParameters) - -- write compute parameters back in place - self:unflatten(self.parametersT, self.gradParametersT) - -- return current output return self.output end |