
github.com/clementfarabet/lua---nnx.git
author     Clement Farabet <clement.farabet@gmail.com>    2011-10-06 07:29:48 +0400
committer  Clement Farabet <clement.farabet@gmail.com>    2011-10-06 07:29:48 +0400
commit     8d97e1432dbb4192fc32df9439dd92da549f049a
tree       1b7abce0d041c6e0ffadab156c3413ccc726b367
parent     dc2d93d8f39cd213831b671c822fefa22652f179
parent     a95ad9db06d6bf7eadfdc0cea34de7e8f566e0e7
Merge branch 'master' of github.com:clementfarabet/lua---nnx
-rw-r--r--   SGDOptimization.lua | 136
-rw-r--r--   lbfgs.h             |   4
2 files changed, 138 insertions(+), 2 deletions(-)
diff --git a/SGDOptimization.lua b/SGDOptimization.lua
index 60aab8a..29ace70 100644
--- a/SGDOptimization.lua
+++ b/SGDOptimization.lua
@@ -44,3 +44,139 @@ function SGD:optimize()
self.parameters:add(-learningRate, self.currentGradParameters)
end
end
+
+function SGD:condition(inputs, targets)
+
+ -- for now the only conditioning is Yann LeCun's optimal learning rate
+ -- estimate from Efficient BackProp (1998)
+ self.alpha = self.alpha or 1e-2 -- 1 / ||parameters|| ?
+ self.gamma = self.gamma or 0.95
+
+ if not self.phi then
+ -- make tensor in current default type
+ self.phi = torch.Tensor(self.gradParameters:size())
+ -- lab functions are not available for CudaTensors, so temporarily switch the default type
+ local old_type = torch.getdefaulttensortype()
+ if (old_type == 'torch.CudaTensor') then
+ torch.setdefaulttensortype('torch.FloatTensor')
+ end
+ local r = lab.randn(self.gradParameters:size())
+ r:div(r:norm()) -- norm 1
+ if (old_type == 'torch.CudaTensor') then
+ torch.setdefaulttensortype(old_type)
+ end
+ self.phi:copy(r)
+ end
+
+ -- scratch vectors which we don't want to re-allocate every time
+ self.param_bkup = self.param_bkup or torch.Tensor():resizeAs(self.parameters)
+ self.grad_bkup = self.grad_bkup or torch.Tensor():resizeAs(self.gradParameters)
+ -- single batch (not running average version)
+
+ if type(inputs) == 'table' then -- slow
+ print("<SGD conditioning> slow version ")
+ -- (1) compute dE/dw(w)
+ -- reset gradients
+ self.gradParameters:zero()
+ for i = 1,#inputs do
+ -- estimate f
+ local output = self.module:forward(inputs[i])
+ local err = self.criterion:forward(output, targets[i])
+ -- estimate df/dW
+ local df_do = self.criterion:backward(output, targets[i])
+ self.module:backward(inputs[i], df_do)
+ self.module:accGradParameters(inputs[i], df_do)
+ end
+ -- normalize gradients
+ self.gradParameters:div(#inputs)
+
+ -- backup gradient and weights
+ self.param_bkup:copy(self.parameters)
+ self.grad_bkup:copy(self.gradParameters)
+
+ -- (2) compute dE/dw(w + alpha * phi / || phi|| )
+ -- normalize + scale phi
+ print('norm phi before: ',self.phi:norm(),' alpha: ',self.alpha)
+ self.phi:div(self.phi:norm()):mul(self.alpha)
+ print('norm phi after: ',self.phi:norm())
+ -- perturb weights
+ print('norm param before: ',self.parameters:norm())
+ self.parameters:add(self.phi)
+ print('norm param after: ',self.parameters:norm())
+ -- reset gradients
+ self.gradParameters:zero()
+ --re-estimate f
+ for i = 1,#inputs do
+ -- estimate f
+ output = self.module:forward(inputs[i])
+ err = self.criterion:forward(output, targets[i])
+ -- estimate df/dW
+ df_do = self.criterion:backward(output, targets[i])
+ self.module:backward(inputs[i], df_do)
+ self.module:accGradParameters(inputs[i], df_do)
+ end
+ -- normalize gradients
+ self.gradParameters:div(#inputs)
+
+ -- (3) phi = 1/alpha * (dE/dw(w + alpha * oldphi/||oldphi||) - dE/dw(w))
+ -- compute new phi
+ self.phi:copy(self.grad_bkup):mul(-1):add(self.gradParameters):mul(1/self.alpha)
+ print('norm old_grad: ',self.grad_bkup:norm(),' norm cur_grad: ',self.gradParameters:norm(), ' norm phi: ',self.phi:norm())
+ -- (4) new learning rate eta = 1 / || phi ||
+ self.learningRate = 1 / self.phi:norm()
+ -- (5) reset parameters and zero gradients
+ self.parameters:copy(self.param_bkup)
+ self.gradParameters:zero()
+ else -- fast
+ -- (1) compute dE/dw(w)
+ -- reset gradients
+ self.gradParameters:zero()
+ -- estimate f
+ local output = self.module:forward(inputs)
+ local err = self.criterion:forward(output, targets)
+ -- estimate df/dW
+ local df_do = self.criterion:backward(output, targets)
+ self.module:backward(inputs, df_do)
+ self.module:accGradParameters(inputs, df_do)
+ -- backup gradient and weights
+ self.param_bkup:copy(self.parameters)
+ self.grad_bkup:copy(self.gradParameters)
+ -- divide by number of samples
+ -- self.grad_bkup:div(inputs:size(1))
+
+ -- (2) compute dE/dw(w + alpha * phi / || phi|| )
+ -- normalize + scale phi
+ print('norm phi before: ',self.phi:norm(),' alpha: ',self.alpha)
+ self.phi:div(self.phi:norm()):mul(self.alpha)
+ print('norm phi after: ',self.phi:norm())
+ -- perturb weights
+ print('norm param before: ',self.parameters:norm())
+ self.parameters:add(self.phi)
+ print('norm param after: ',self.parameters:norm())
+ -- reset gradients
+ self.gradParameters:zero()
+ --re-estimate f
+ output = self.module:forward(inputs)
+ self.output = self.criterion:forward(output, targets)
+ -- re-estimate df/dW
+ df_do = self.criterion:backward(output, targets)
+ self.module:backward(inputs, df_do)
+ self.module:accGradParameters(inputs, df_do)
+ -- self.gradParameters:div(inputs:size(1))
+
+ -- (3) phi = 1/alpha * (dE/dw(w + alpha * oldphi/||oldphi||) - dE/dw(w))
+ -- compute new phi
+ if true then
+ -- running average
+ self.phi:mul(self.gamma):add(self.grad_bkup):mul(-1):add(self.gradParameters):mul(1/self.alpha)
+ else
+ self.phi:copy(self.grad_bkup):mul(-1):add(self.gradParameters):mul(1/self.alpha)
+ end
+ print('norm old_grad: ',self.grad_bkup:norm(),' norm cur_grad: ',self.gradParameters:norm(), ' norm phi: ',self.phi:norm())
+ -- (4) new learning rate eta = 1 / || phi ||
+ self.learningRate = 1 / self.phi:norm()
+ -- (5) reset parameters and zero gradients
+ self.parameters:copy(self.param_bkup)
+ self.gradParameters:zero()
+ end
+end
\ No newline at end of file
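
The SGD:condition() added above estimates LeCun's "optimal" learning rate
(Efficient BackProp, 1998) with a finite-difference power iteration on the
Hessian: perturb the weights by alpha along a direction phi, take the
difference between the perturbed and original gradients, and divide by alpha.
The result approximates H * phi, so ||phi|| approaches the largest Hessian
eigenvalue and eta = 1/||phi|| becomes the learning rate. The snippet below
shows that iteration in isolation as a minimal sketch; it is not part of the
patch. The names estimate_learning_rate, grad, w, alpha and n_iter are assumed
for illustration, and grad(w) is assumed to return a fresh tensor holding
dE/dw evaluated at w.

  -- Illustrative sketch only (not from the patch); assumes torch and lab are
  -- loaded, as elsewhere in this repository. grad(w) is assumed to return a
  -- new tensor with dE/dw at w; w itself is never modified here.
  local function estimate_learning_rate(grad, w, alpha, n_iter)
     alpha  = alpha or 1e-2                 -- size of the weight perturbation
     n_iter = n_iter or 1                   -- the patch performs one such step per call
     local g0  = grad(w)                    -- gradient at the current weights
     local phi = lab.randn(w:size())        -- random starting direction, as in the patch
     for i = 1, n_iter do
        phi:div(phi:norm()):mul(alpha)      -- rescale phi to length alpha
        local w_pert = torch.Tensor(w:size()):copy(w):add(phi)
        local g1 = grad(w_pert)             -- gradient at the perturbed weights
        -- finite-difference estimate of H * phi, mirroring the patch's update
        phi:copy(g0):mul(-1):add(g1):mul(1/alpha)
     end
     -- ||phi|| approaches the largest Hessian eigenvalue, so 1/||phi|| is the
     -- "optimal" first-order learning rate
     return 1 / phi:norm()
  end

In the patch itself phi and the backup buffers live on the optimizer object,
so the estimate can be refreshed on every call to condition(), and the fast
(tensor-input) branch folds self.gamma into the update as a running-average
variant.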
diff --git a/lbfgs.h b/lbfgs.h
index ecae23f..f40a514 100644
--- a/lbfgs.h
+++ b/lbfgs.h
@@ -137,7 +137,7 @@ extern "C" {
LBFGSERR_MAXIMUMLINESEARCH,
/** The algorithm routine reaches the maximum number of iterations. */
LBFGSERR_MAXIMUMITERATION,
- /** The algorithm routine reaches the maximum number of iterations. */
+ /** The algorithm routine reaches the maximum number of function evaluations. */
LBFGSERR_MAXIMUMEVALUATION,
/** Relative width of the interval of uncertainty is at most
lbfgs_parameter_t::xtol. */
@@ -309,7 +309,7 @@ extern "C" {
}
if (lbfgs_err == LBFGSERR_MAXIMUMEVALUATION) {
printf("LBFGSERR_MAXIMUMEVALUATION\n");
- printf(" The algorithm routine reaches the maximum number of iterations.\n");
+ printf(" The algorithm routine reaches the maximum number of function evaluations.\n");
return;
}
if (lbfgs_err == LBFGSERR_WIDTHTOOSMALL) {