diff options
author | Alfredo Canziani <alfredo.canziani@gmail.com> | 2016-06-29 07:49:32 +0300 |
---|---|---|
committer | Alfredo Canziani <alfredo.canziani@gmail.com> | 2016-06-30 05:51:21 +0300 |
commit | 63994c78b2eef4266e62e88e0ae444ee0c37074d (patch) | |
tree | 75d14d1c1d098ee0c9d96f88be112425f966b08d | |
parent | c0c4bbfcc14fad7bc484358821563fddd0b9031e (diff) |
Fix bad alignment, trailing spaces and tabs
-rw-r--r-- | adam.lua | 70 | ||||
-rw-r--r-- | adamax.lua | 70 | ||||
-rw-r--r-- | asgd.lua | 4 | ||||
-rw-r--r-- | cmaes.lua | 62 | ||||
-rw-r--r-- | fista.lua | 14 | ||||
-rw-r--r-- | lbfgs.lua | 6 | ||||
-rw-r--r-- | nag.lua | 6 | ||||
-rw-r--r-- | rmsprop.lua | 50 | ||||
-rw-r--r-- | rprop.lua | 154 | ||||
-rw-r--r-- | sgd.lua | 2 |
10 files changed, 219 insertions, 219 deletions
@@ -21,47 +21,47 @@ RETURN: ]] function optim.adam(opfunc, x, config, state) - -- (0) get/update state - local config = config or {} - local state = state or config - local lr = config.learningRate or 0.001 + -- (0) get/update state + local config = config or {} + local state = state or config + local lr = config.learningRate or 0.001 - local beta1 = config.beta1 or 0.9 - local beta2 = config.beta2 or 0.999 - local epsilon = config.epsilon or 1e-8 - local wd = config.weightDecay or 0 + local beta1 = config.beta1 or 0.9 + local beta2 = config.beta2 or 0.999 + local epsilon = config.epsilon or 1e-8 + local wd = config.weightDecay or 0 - -- (1) evaluate f(x) and df/dx - local fx, dfdx = opfunc(x) + -- (1) evaluate f(x) and df/dx + local fx, dfdx = opfunc(x) - -- (2) weight decay - if wd ~= 0 then - dfdx:add(wd, x) - end + -- (2) weight decay + if wd ~= 0 then + dfdx:add(wd, x) + end - -- Initialization - state.t = state.t or 0 - -- Exponential moving average of gradient values - state.m = state.m or x.new(dfdx:size()):zero() - -- Exponential moving average of squared gradient values - state.v = state.v or x.new(dfdx:size()):zero() - -- A tmp tensor to hold the sqrt(v) + epsilon - state.denom = state.denom or x.new(dfdx:size()):zero() + -- Initialization + state.t = state.t or 0 + -- Exponential moving average of gradient values + state.m = state.m or x.new(dfdx:size()):zero() + -- Exponential moving average of squared gradient values + state.v = state.v or x.new(dfdx:size()):zero() + -- A tmp tensor to hold the sqrt(v) + epsilon + state.denom = state.denom or x.new(dfdx:size()):zero() - state.t = state.t + 1 - - -- Decay the first and second moment running average coefficient - state.m:mul(beta1):add(1-beta1, dfdx) - state.v:mul(beta2):addcmul(1-beta2, dfdx, dfdx) + state.t = state.t + 1 - state.denom:copy(state.v):sqrt():add(epsilon) + -- Decay the first and second moment running average coefficient + state.m:mul(beta1):add(1-beta1, dfdx) + state.v:mul(beta2):addcmul(1-beta2, dfdx, dfdx) - local biasCorrection1 = 1 - beta1^state.t - local biasCorrection2 = 1 - beta2^state.t - local stepSize = lr * math.sqrt(biasCorrection2)/biasCorrection1 - -- (3) update x - x:addcdiv(-stepSize, state.m, state.denom) + state.denom:copy(state.v):sqrt():add(epsilon) - -- return x*, f(x) before optimization - return x, {fx} + local biasCorrection1 = 1 - beta1^state.t + local biasCorrection2 = 1 - beta2^state.t + local stepSize = lr * math.sqrt(biasCorrection2)/biasCorrection1 + -- (3) update x + x:addcdiv(-stepSize, state.m, state.denom) + + -- return x*, f(x) before optimization + return x, {fx} end @@ -20,47 +20,47 @@ RETURN: ]] function optim.adamax(opfunc, x, config, state) - -- (0) get/update state - local config = config or {} - local state = state or config - local lr = config.learningRate or 0.002 + -- (0) get/update state + local config = config or {} + local state = state or config + local lr = config.learningRate or 0.002 - local beta1 = config.beta1 or 0.9 - local beta2 = config.beta2 or 0.999 - local epsilon = config.epsilon or 1e-38 - local wd = config.weightDecay or 0 + local beta1 = config.beta1 or 0.9 + local beta2 = config.beta2 or 0.999 + local epsilon = config.epsilon or 1e-38 + local wd = config.weightDecay or 0 - -- (1) evaluate f(x) and df/dx - local fx, dfdx = opfunc(x) + -- (1) evaluate f(x) and df/dx + local fx, dfdx = opfunc(x) - -- (2) weight decay - if wd ~= 0 then - dfdx:add(wd, x) - end + -- (2) weight decay + if wd ~= 0 then + dfdx:add(wd, x) + end - -- Initialization - state.t = state.t or 0 - -- Exponential moving average of gradient values - state.m = state.m or x.new(dfdx:size()):zero() - -- Exponential moving average of the infinity norm - state.u = state.u or x.new(dfdx:size()):zero() - -- A tmp tensor to hold the input to max() - state.max = state.max or x.new(2, unpack(dfdx:size():totable())):zero() + -- Initialization + state.t = state.t or 0 + -- Exponential moving average of gradient values + state.m = state.m or x.new(dfdx:size()):zero() + -- Exponential moving average of the infinity norm + state.u = state.u or x.new(dfdx:size()):zero() + -- A tmp tensor to hold the input to max() + state.max = state.max or x.new(2, unpack(dfdx:size():totable())):zero() - state.t = state.t + 1 + state.t = state.t + 1 - -- Update biased first moment estimate. - state.m:mul(beta1):add(1-beta1, dfdx) - -- Update the exponentially weighted infinity norm. - state.max[1]:copy(state.u):mul(beta2) - state.max[2]:copy(dfdx):abs():add(epsilon) - state.u:max(state.max, 1) + -- Update biased first moment estimate. + state.m:mul(beta1):add(1-beta1, dfdx) + -- Update the exponentially weighted infinity norm. + state.max[1]:copy(state.u):mul(beta2) + state.max[2]:copy(dfdx):abs():add(epsilon) + state.u:max(state.max, 1) - local biasCorrection1 = 1 - beta1^state.t - local stepSize = lr/biasCorrection1 - -- (2) update x - x:addcdiv(-stepSize, state.m, state.u) + local biasCorrection1 = 1 - beta1^state.t + local stepSize = lr/biasCorrection1 + -- (2) update x + x:addcdiv(-stepSize, state.m, state.u) - -- return x*, f(x) before optimization - return x, {fx} + -- return x*, f(x) before optimization + return x, {fx} end @@ -1,6 +1,6 @@ --[[ An implementation of ASGD -ASGD: +ASGD: x := (1 - lambda eta_t) x - eta_t df/dx(z,x) a := a + mu_t [ x - a ] @@ -12,7 +12,7 @@ implements ASGD algoritm as in L.Bottou's sgd-2.0 ARGS: -- `opfunc` : a function that takes a single input (X), the point of +- `opfunc` : a function that takes a single input (X), the point of evaluation, and returns f(X) and df/dX - `x` : the initial point - `state` : a table describing the state of the optimizer; after each @@ -1,16 +1,16 @@ require 'torch' require 'math' -local BestSolution = {} ---[[ An implementation of `CMAES` (Covariance Matrix Adaptation Evolution Strategy), +local BestSolution = {} +--[[ An implementation of `CMAES` (Covariance Matrix Adaptation Evolution Strategy), ported from https://www.lri.fr/~hansen/barecmaes2.html. - + Parameters ---------- ARGS: -- `opfunc` : a function that takes a single input (X), the point of - evaluation, and returns f(X) and df/dX. Note that df/dX is not used +- `opfunc` : a function that takes a single input (X), the point of + evaluation, and returns f(X) and df/dX. Note that df/dX is not used - `x` : the initial point - `state.sigma` float, initial step-size (standard deviation in each @@ -20,16 +20,16 @@ ARGS: - `state.ftarget` float, target function value - `state.popsize` - population size. If this is left empty, + population size. If this is left empty, 4 + int(3 * log(|x|)) will be used -- `state.ftarget` +- `state.ftarget` stop if fitness < ftarget - `state.verb_disp` int, display on console every verb_disp iteration, 0 for never RETURN: - `x*` : the new `x` vector, at the optimal point -- `f` : a table of all function values: +- `f` : a table of all function values: `f[1]` is the value of the function before any optimization and `f[#f]` is the final fully optimized value, at `x*` --]] @@ -50,13 +50,13 @@ function optim.cmaes(opfunc, x, config, state) local min_iterations = state.min_iterations or 1 local lambda = state.popsize -- population size, offspring number - -- Strategy parameter setting: Selection + -- Strategy parameter setting: Selection if state.popsize == nil then lambda = 4 + math.floor(3 * math.log(N)) end local mu = lambda / 2 -- number of parents/points for recombination - local weights = torch.range(0,mu-1):apply(function(i) + local weights = torch.range(0,mu-1):apply(function(i) return math.log(mu+0.5) - math.log(i+1) end) -- recombination weights weights:div(weights:sum()) -- normalize recombination weights array local mueff = weights:sum()^2 / torch.pow(weights,2):sum() -- variance-effectiveness of sum w_i x_i @@ -69,18 +69,18 @@ function optim.cmaes(opfunc, x, config, state) local cmu = math.min(1 - c1, 2 * (mueff - 2 + 1/mueff) / ((N + 2)^2 + mueff)) -- and for rank-mu update local damps = 2 * mueff/lambda + 0.3 + cs -- damping for sigma, usually close to 1 - -- Initialize dynamic (internal) state variables + -- Initialize dynamic (internal) state variables local pc = torch.Tensor(N):zero():typeAs(x) -- evolution paths for C local ps = torch.Tensor(N):zero():typeAs(x) -- evolution paths for sigma - local B = torch.eye(N):typeAs(x) -- B defines the coordinate system + local B = torch.eye(N):typeAs(x) -- B defines the coordinate system local D = torch.Tensor(N):fill(1):typeAs(x) -- diagonal D defines the scaling - local C = torch.eye(N):typeAs(x) -- covariance matrix + local C = torch.eye(N):typeAs(x) -- covariance matrix if not pcall(function () torch.symeig(C,'V') end) then -- if error occurs trying to use symeig - error('torch.symeig not available for ' .. x:type() .. + error('torch.symeig not available for ' .. x:type() .. " please use Float- or DoubleTensor for x") end local candidates = torch.Tensor(lambda,N):typeAs(x) - local invsqrtC = torch.eye(N):typeAs(x) -- C^-1/2 + local invsqrtC = torch.eye(N):typeAs(x) -- C^-1/2 local eigeneval = 0 -- tracking the update of B and D local counteval = 0 local f_hist = {[1]=opfunc(x)} -- for bookkeeping output and termination @@ -90,7 +90,7 @@ function optim.cmaes(opfunc, x, config, state) local function ask() - --[[return a list of lambda candidate solutions according to + --[[return a list of lambda candidate solutions according to m + sig * Normal(0,C) = m + sig * B * D * Normal(0,I) --]] -- Eigendecomposition: first update B, D and invsqrtC from C @@ -117,9 +117,9 @@ function optim.cmaes(opfunc, x, config, state) Parameters ---------- - `arx` + `arx` a list of solutions, presumably from `ask()` - `fitvals` + `fitvals` the corresponding objective function values --]] -- bookkeeping, preparation counteval = counteval + lambda -- slightly artificial to do here @@ -142,7 +142,7 @@ function optim.cmaes(opfunc, x, config, state) local c = (cs * (2-cs) * mueff)^0.5 / sigma ps = ps - ps * cs + z * c -- exponential decay on ps - local hsig = (torch.sum(torch.pow(ps,2)) / + local hsig = (torch.sum(torch.pow(ps,2)) / (1-(1-cs)^(2*counteval/lambda)) / N < 2 + 4./(N+1)) hsig = hsig and 1.0 or 0.0 --use binary numbers @@ -155,23 +155,23 @@ function optim.cmaes(opfunc, x, config, state) for i=1,N do for j=1,N do local r = torch.range(1,mu) - r:apply(function(k) + r:apply(function(k) return weights[k] * (arx[k][i]-xold[i]) * (arx[k][j]-xold[j]) end) local Cmuij = torch.sum(r) / sigma^2 -- rank-mu update - C[i][j] = C[i][j] + ((-c1a - cmu) * C[i][j] + + C[i][j] = C[i][j] + ((-c1a - cmu) * C[i][j] + c1 * pc[i]*pc[j] + cmu * Cmuij) end end -- Adapt step-size sigma with factor <= exp(0.6) \approx 1.82 - sigma = sigma * math.exp(math.min(0.6, + sigma = sigma * math.exp(math.min(0.6, (cs / damps) * (torch.sum(torch.pow(ps,2))/N - 1)/2)) end - local function stop() - --[[return satisfied termination conditions in a table like - {'termination reason':value, ...}, for example {'tolfun':1e-12}, - or the empty table {}--]] + local function stop() + --[[return satisfied termination conditions in a table like + {'termination reason':value, ...}, for example {'tolfun':1e-12}, + or the empty table {}--]] local res = {} if counteval > 0 then if counteval >= maxEval then @@ -184,7 +184,7 @@ function optim.cmaes(opfunc, x, config, state) res['condition'] = 1e7 end if fitvals:nElement() > 1 and fitvals[fitvals:nElement()] - fitvals[1] < 1e-12 then - res['tolfun'] = 1e-12 + res['tolfun'] = 1e-12 end if sigma * torch.max(D) < 1e-11 then -- remark: max(D) >= max(diag(C))^0.5 @@ -206,8 +206,8 @@ function optim.cmaes(opfunc, x, config, state) end if iteration <= 2 or iteration % verb_modulo == 0 then local max_std = math.sqrt(torch.max(torch.diag(C))) - print(tostring(counteval).. ': ' .. - string.format(' %6.1f %8.1e ', torch.max(D) / torch.min(D), sigma * max_std) + print(tostring(counteval).. ': ' .. + string.format(' %6.1f %8.1e ', torch.max(D) / torch.min(D), sigma * max_std) .. tostring(fitvals[1])) end @@ -224,7 +224,7 @@ function optim.cmaes(opfunc, x, config, state) fitvals[i] = objfunc(candidate) end - tell(X) + tell(X) disp(verb_disp) end @@ -245,7 +245,7 @@ end -BestSolution.__index = BestSolution +BestSolution.__index = BestSolution function BestSolution.new(x, f, evals) local self = setmetatable({}, BestSolution) self.x = x @@ -17,7 +17,7 @@ On output, `params` will contain these additional fields that can be reused. - `params.L` : last used L value will be written. -These are temporary storages needed by the algo and if the same params object is +These are temporary storages needed by the algo and if the same params object is passed a second time, these same storages will be used without new allocation. - `params.xkm` : previous iterarion point @@ -26,7 +26,7 @@ passed a second time, these same storages will be used without new allocation. Returns the solution x and history of {function evals, number of line search ,...} -Algorithm is published in +Algorithm is published in @article{beck-fista-09, Author = {Beck, Amir and Teboulle, Marc}, @@ -38,7 +38,7 @@ Algorithm is published in Year = {2009}} ]] function optim.FistaLS(f, g, pl, xinit, params) - + local params = params or {} local L = params.L or 0.1 local Lstep = params.Lstep or 1.5 @@ -46,7 +46,7 @@ function optim.FistaLS(f, g, pl, xinit, params) local maxline = params.maxline or 20 local errthres = params.errthres or 1e-4 local doFistaUpdate = params.doFistaUpdate - local verbose = params.verbose + local verbose = params.verbose -- temporary allocations params.xkm = params.xkm or torch.Tensor() @@ -77,11 +77,11 @@ function optim.FistaLS(f, g, pl, xinit, params) -- get derivatives from smooth function local fy,gfy = f(y,'dx') --local gfy = f(y) - + local fply = 0 local gply = 0 local Q = 0 - + ---------------------------------------------- -- do line search to find new current location starting from fista loc local nline = 0 @@ -98,7 +98,7 @@ function optim.FistaLS(f, g, pl, xinit, params) -- evaluate this point F(ply) fply = f(ply) - + -- ply - y ply:add(-1, y) -- <ply-y , \Grad(f(y))> @@ -27,7 +27,7 @@ ARGS: RETURN: - `x*` : the new `x` vector, at the optimal point -- `f` : a table of all function values: +- `f` : a table of all function values: `f[1]` is the value of the function before any optimization and `f[#f]` is the final fully optimized value, at `x*` @@ -46,7 +46,7 @@ function optim.lbfgs(opfunc, x, config, state) local lineSearchOpts = config.lineSearchOptions local learningRate = config.learningRate or 1 local isverbose = config.verbose or false - + state.funcEval = state.funcEval or 0 state.nIter = state.nIter or 0 @@ -142,7 +142,7 @@ function optim.lbfgs(opfunc, x, config, state) table.insert(state.stp_bufs, s) end - -- compute the approximate (L-BFGS) inverse Hessian + -- compute the approximate (L-BFGS) inverse Hessian -- multiplied by the gradient local k = #old_dirs @@ -1,11 +1,11 @@ ---------------------------------------------------------------------- --- An implementation of SGD adapted with features of Nesterov's +-- An implementation of SGD adapted with features of Nesterov's -- Accelerated Gradient method, based on the paper -- On the Importance of Initialization and Momentum in Deep Learning -- Sutsveker et. al., ICML 2013 -- -- ARGS: --- opfunc : a function that takes a single input (X), the point of +-- opfunc : a function that takes a single input (X), the point of -- evaluation, and returns f(X) and df/dX -- x : the initial point -- state : a table describing the state of the optimizer; after each @@ -44,7 +44,7 @@ function optim.nag(opfunc, x, config, state) -- first step in the direction of the momentum vector if state.dfdx then - x:add(mom, state.dfdx) + x:add(mom, state.dfdx) end -- then compute gradient at that point -- comment out the above line to get the original SGD diff --git a/rmsprop.lua b/rmsprop.lua index 038af21..1eb526d 100644 --- a/rmsprop.lua +++ b/rmsprop.lua @@ -22,36 +22,36 @@ RETURN: ]] function optim.rmsprop(opfunc, x, config, state) - -- (0) get/update state - local config = config or {} - local state = state or config - local lr = config.learningRate or 1e-2 - local alpha = config.alpha or 0.99 - local epsilon = config.epsilon or 1e-8 - local wd = config.weightDecay or 0 - - -- (1) evaluate f(x) and df/dx - local fx, dfdx = opfunc(x) - - -- (2) weight decay - if wd ~= 0 then + -- (0) get/update state + local config = config or {} + local state = state or config + local lr = config.learningRate or 1e-2 + local alpha = config.alpha or 0.99 + local epsilon = config.epsilon or 1e-8 + local wd = config.weightDecay or 0 + + -- (1) evaluate f(x) and df/dx + local fx, dfdx = opfunc(x) + + -- (2) weight decay + if wd ~= 0 then dfdx:add(wd, x) - end + end - -- (3) initialize mean square values and square gradient storage - if not state.m then + -- (3) initialize mean square values and square gradient storage + if not state.m then state.m = torch.Tensor():typeAs(x):resizeAs(dfdx):fill(1) state.tmp = torch.Tensor():typeAs(x):resizeAs(dfdx) - end + end - -- (4) calculate new (leaky) mean squared values - state.m:mul(alpha) - state.m:addcmul(1.0-alpha, dfdx, dfdx) + -- (4) calculate new (leaky) mean squared values + state.m:mul(alpha) + state.m:addcmul(1.0-alpha, dfdx, dfdx) - -- (5) perform update - state.tmp:sqrt(state.m):add(epsilon) - x:addcdiv(-lr, dfdx, state.tmp) + -- (5) perform update + state.tmp:sqrt(state.m):add(epsilon) + x:addcdiv(-lr, dfdx, state.tmp) - -- return x*, f(x) before optimization - return x, {fx} + -- return x*, f(x) before optimization + return x, {fx} end @@ -20,83 +20,83 @@ RETURN: (Martin Riedmiller, Koray Kavukcuoglu 2013) --]] function optim.rprop(opfunc, x, config, state) - if config == nil and state == nil then - print('no state table RPROP initializing') - end - -- (0) get/update state - local config = config or {} - local state = state or config - local stepsize = config.stepsize or 0.1 - local etaplus = config.etaplus or 1.2 - local etaminus = config.etaminus or 0.5 - local stepsizemax = config.stepsizemax or 50.0 - local stepsizemin = config.stepsizemin or 1E-06 - local niter = config.niter or 1 - - local hfx = {} - - for i=1,niter do - - -- (1) evaluate f(x) and df/dx - local fx,dfdx = opfunc(x) - - -- init temp storage - if not state.delta then - state.delta = dfdx.new(dfdx:size()):zero() - state.stepsize = dfdx.new(dfdx:size()):fill(stepsize) - state.sign = dfdx.new(dfdx:size()) - state.psign = torch.ByteTensor(dfdx:size()) - state.nsign = torch.ByteTensor(dfdx:size()) - state.zsign = torch.ByteTensor(dfdx:size()) - state.dminmax = torch.ByteTensor(dfdx:size()) - if torch.type(x)=='torch.CudaTensor' then - -- Push to GPU - state.psign = state.psign:cuda() - state.nsign = state.nsign:cuda() - state.zsign = state.zsign:cuda() - state.dminmax = state.dminmax:cuda() - end - end - - -- sign of derivative from last step to this one - torch.cmul(state.sign, dfdx, state.delta) - torch.sign(state.sign, state.sign) - - -- get indices of >0, <0 and ==0 entries - state.sign.gt(state.psign, state.sign, 0) - state.sign.lt(state.nsign, state.sign, 0) - state.sign.eq(state.zsign, state.sign, 0) - - -- get step size updates - state.sign[state.psign] = etaplus - state.sign[state.nsign] = etaminus - state.sign[state.zsign] = 1 - - -- update stepsizes with step size updates - state.stepsize:cmul(state.sign) - - -- threshold step sizes - -- >50 => 50 - state.stepsize.gt(state.dminmax, state.stepsize, stepsizemax) - state.stepsize[state.dminmax] = stepsizemax - -- <1e-6 ==> 1e-6 - state.stepsize.lt(state.dminmax, state.stepsize, stepsizemin) - state.stepsize[state.dminmax] = stepsizemin - - -- for dir<0, dfdx=0 - -- for dir>=0 dfdx=dfdx - dfdx[state.nsign] = 0 - -- state.sign = sign(dfdx) - torch.sign(state.sign,dfdx) - - -- update weights - x:addcmul(-1,state.sign,state.stepsize) - - -- update state.dfdx with current dfdx - state.delta:copy(dfdx) - - table.insert(hfx,fx) - end + if config == nil and state == nil then + print('no state table RPROP initializing') + end + -- (0) get/update state + local config = config or {} + local state = state or config + local stepsize = config.stepsize or 0.1 + local etaplus = config.etaplus or 1.2 + local etaminus = config.etaminus or 0.5 + local stepsizemax = config.stepsizemax or 50.0 + local stepsizemin = config.stepsizemin or 1E-06 + local niter = config.niter or 1 + + local hfx = {} + + for i=1,niter do + + -- (1) evaluate f(x) and df/dx + local fx,dfdx = opfunc(x) + + -- init temp storage + if not state.delta then + state.delta = dfdx.new(dfdx:size()):zero() + state.stepsize = dfdx.new(dfdx:size()):fill(stepsize) + state.sign = dfdx.new(dfdx:size()) + state.psign = torch.ByteTensor(dfdx:size()) + state.nsign = torch.ByteTensor(dfdx:size()) + state.zsign = torch.ByteTensor(dfdx:size()) + state.dminmax = torch.ByteTensor(dfdx:size()) + if torch.type(x)=='torch.CudaTensor' then + -- Push to GPU + state.psign = state.psign:cuda() + state.nsign = state.nsign:cuda() + state.zsign = state.zsign:cuda() + state.dminmax = state.dminmax:cuda() + end + end + + -- sign of derivative from last step to this one + torch.cmul(state.sign, dfdx, state.delta) + torch.sign(state.sign, state.sign) + + -- get indices of >0, <0 and ==0 entries + state.sign.gt(state.psign, state.sign, 0) + state.sign.lt(state.nsign, state.sign, 0) + state.sign.eq(state.zsign, state.sign, 0) + + -- get step size updates + state.sign[state.psign] = etaplus + state.sign[state.nsign] = etaminus + state.sign[state.zsign] = 1 + + -- update stepsizes with step size updates + state.stepsize:cmul(state.sign) + + -- threshold step sizes + -- >50 => 50 + state.stepsize.gt(state.dminmax, state.stepsize, stepsizemax) + state.stepsize[state.dminmax] = stepsizemax + -- <1e-6 ==> 1e-6 + state.stepsize.lt(state.dminmax, state.stepsize, stepsizemin) + state.stepsize[state.dminmax] = stepsizemin + + -- for dir<0, dfdx=0 + -- for dir>=0 dfdx=dfdx + dfdx[state.nsign] = 0 + -- state.sign = sign(dfdx) + torch.sign(state.sign,dfdx) + + -- update weights + x:addcmul(-1,state.sign,state.stepsize) + + -- update state.dfdx with current dfdx + state.delta:copy(dfdx) + + table.insert(hfx,fx) + end -- return x*, f(x) before optimization return x,hfx @@ -70,7 +70,7 @@ function optim.sgd(opfunc, x, config, state) -- (4) learning rate decay (annealing) local clr = lr / (1 + nevals*lrd) - + -- (5) parameter update with single or individual learning rates if lrs then if not state.deltaParameters then |