author    | Clement Farabet <clement.farabet@gmail.com> | 2011-08-26 00:17:08 +0400
committer | Clement Farabet <clement.farabet@gmail.com> | 2011-08-26 00:17:08 +0400
commit    | 087d96271f60bd3fd80e8b3aaab72626cd0b6e53 (patch)
tree      | 701d54f2c87911bc5098d1f53fbd9c0682d24e04
parent    | a3f9ab6ef678d5f442f3f2d7e857e3110c0470c6 (diff)
Added map-reduce version of l-BFGS. In dev...
-rw-r--r-- | LBFGSOptimization.lua | 102
1 file changed, 102 insertions, 0 deletions
```diff
diff --git a/LBFGSOptimization.lua b/LBFGSOptimization.lua
index 83393d9..f0e130a 100644
--- a/LBFGSOptimization.lua
+++ b/LBFGSOptimization.lua
@@ -10,6 +10,7 @@ function LBFGS:__init(...)
       {arg='maxIterations', type='number', help='maximum nb of iterations per pass (0 = no max)', default=0},
       {arg='maxLineSearch', type='number', help='maximum nb of steps in line search', default=20},
       {arg='sparsity', type='number', help='sparsity coef (Orthantwise C)', default=0},
+      {arg='parallelize', type='number', help='parallelize onto N cores (experimental!)', default=1},
       {arg='verbose', type='number', help='verbose level during training [0-2]', default=0}
    )
    self.parametersT = nnx.getParameters(self.module)
@@ -19,6 +20,14 @@ end
 
 function LBFGS:forward(inputs, targets, options)
    options = options or {}
+   if self.parallelize > 1 then
+      return self:forward_mapreduce(inputs, targets, options)
+   else
+      return self:forward_sequential(inputs, targets, options)
+   end
+end
+
+function LBFGS:forward_sequential(inputs, targets, options)
    -- (1) construct a closure that compute f(inputs) + df/dW
    -- after each call to that function:
    --   + self.parameters contains the current X vector
@@ -73,3 +82,96 @@ function LBFGS:forward(inputs, targets, options)
    -- (5) return current output after optimization
    return self.output
 end
+
+function LBFGS:forward_mapreduce(inputs, targets, options)
+   -- (0) clone module+criterion for parallel evaluations
+   local modules = {}
+   local criterions = {}
+   local outputs = {}
+   self.parametersT = {}
+   self.gradParametersT = {}
+   for m = 1,self.parallelize do
+      if m == 1 then
+         modules[m] = self.module
+         criterions[m] = self.criterion
+      else
+         modules[m] = self.module:clone()
+         criterions[m] = self.criterion:clone()
+      end
+      self.parametersT[m] = nnx.getParameters(modules[m])
+      self.gradParametersT[m] = nnx.getGradParameters(modules[m])
+   end
+
+   -- (1) construct a closure that compute f(inputs) + df/dW
+   -- after each call to that function:
+   --   + self.parameters contains the current X vector
+   --   + self.gradParameters contains the estimated dF/dX vector
+   --   + self.output contains the estimated (average) F(X)
+   lbfgs.evaluate
+      = function()
+           for t = 1,self.parallelize do
+              lbfgs.evaluate_map(t)
+           end
+           return lbfgs.evaluate_reduce()
+        end
+
+   -- (1a) the map part of the evaluation: compute partial gradients
+   --      in separate threads
+   lbfgs.evaluate_map
+      = function(thread)
+           -- set parameters from current state
+           self:unflatten(self.parametersT[thread], self.gradParametersT[thread])
+           -- reset gradients
+           modules[thread]:zeroGradParameters()
+           -- f is the average of all criterions
+           outputs[thread] = 0
+           -- given all inputs, evaluate gradients
+           for i = thread,#inputs,thread do
+              -- estimate f
+              local output = modules[thread]:forward(inputs[i])
+              local err = criterions[thread]:forward(output, targets[i])
+              outputs[thread] = outputs[thread] + err
+              -- estimate df/dW
+              local df_do = criterions[thread]:backward(output, targets[i])
+              modules[thread]:backward(inputs[i], df_do)
+           end
+        end
+
+   -- (1b) the reduce part of the evaluation: accumulate all
+   --      partial estimates of the gradients
+   lbfgs.evaluate_reduce
+      = function()
+           -- temp vectors for accumulation
+           self.gradParametersAcc = self.gradParametersAcc or torch.Tensor()
+           self.gradParametersAcc:resizeAs(self.gradParameters):zero()
+           -- update state from computed parameters
+           for t = 1,self.parallelize do
+              self:flatten(self.parametersT[1], self.gradParametersT[t])
+              self.gradParametersAcc:copy(self.gradParameters)
+           end
+           self.gradParameters:copy(self.gradParametersAcc)
+           -- normalize gradients
+           self.gradParameters:div(#inputs)
+           -- return average f(X)
+           self.output = 0
+           for t = 1,self.parallelize do
+              self.output = self.output + outputs[t]
+           end
+           return self.output/#inputs
+        end
+
+   -- (2) store current parameters/gradParameters
+   self:flatten(self.parametersT[1], self.gradParametersT[1])
+
+   -- (3) the magic function: will update the parameter vector
+   --     according to the l-BFGS method
+   self.output = lbfgs.run(self.parameters, self.gradParameters,
+                           self.maxIterations, self.maxLineSearch,
+                           self.sparsity)
+
+   -- (4) last: read parameters back into the model
+   self:unflatten(self.parametersT[1], self.gradParametersT[1])
+
+   -- (5) return current output after optimization
+   return self.output
+end
```
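The map step above shards the inputs across cloned module/criterion pairs, and the reduce step folds the partial results back into one gradient vector. The sketch below is a minimal, self-contained illustration of that evaluate/map/reduce pattern in plain Lua, with scalars standing in for torch tensors; every name in it (`nWorkers`, `evaluate_map`, `evaluate_reduce`) is illustrative, not part of the nnx API. It also reflects two assumptions about the intent of this in-dev code: that each worker should stride by the number of workers (the diff strides by the worker index `thread`, so worker 1 revisits every input and the shards overlap), and that the reduce step should accumulate partial gradients with an add rather than a copy.

```lua
-- Minimal sketch of the map/reduce gradient evaluation pattern, in
-- plain Lua (no torch). Toy problem: f(x) = (1/N) * sum_i (x - i)^2.
local nWorkers = 4
local inputs = {}
for i = 1, 16 do inputs[i] = i end

local x = 0                    -- current parameter (a scalar here)
local partialF, partialG = {}, {}

-- map: each worker evaluates f and df/dx on its shard of the inputs
local function evaluate_map(worker)
   partialF[worker], partialG[worker] = 0, 0
   -- stride by nWorkers so the shards partition the dataset
   for i = worker, #inputs, nWorkers do
      local target = inputs[i]
      partialF[worker] = partialF[worker] + (x - target)^2
      partialG[worker] = partialG[worker] + 2 * (x - target)
   end
end

-- reduce: accumulate (not copy) the partial sums, then average
local function evaluate_reduce()
   local f, g = 0, 0
   for w = 1, nWorkers do
      f = f + partialF[w]
      g = g + partialG[w]
   end
   return f / #inputs, g / #inputs
end

-- one full evaluation, as lbfgs.evaluate triggers in the diff
for w = 1, nWorkers do evaluate_map(w) end
local f, g = evaluate_reduce()
print(string.format('f(x) = %.2f, df/dx = %.2f', f, g))
-- at x = 0: f = mean(i^2) = 93.50, df/dx = -2 * mean(i) = -17.00
```

Note that even with `parallelize > 1`, `lbfgs.evaluate` in this commit still invokes the map closures one after another in a single process; the option changes how the work is partitioned, not yet how it is scheduled, which is consistent with the "experimental!" help string and the "In dev..." commit message.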