github.com/torch/optim.git
author    Alfredo Canziani <alfredo.canziani@gmail.com>  2016-06-29 07:49:32 +0300
committer Alfredo Canziani <alfredo.canziani@gmail.com>  2016-06-30 05:51:21 +0300
commit    63994c78b2eef4266e62e88e0ae444ee0c37074d (patch)
tree      75d14d1c1d098ee0c9d96f88be112425f966b08d
parent    c0c4bbfcc14fad7bc484358821563fddd0b9031e (diff)
Fix bad alignment, trailing spaces and tabs
-rw-r--r--  adam.lua     70
-rw-r--r--  adamax.lua   70
-rw-r--r--  asgd.lua      4
-rw-r--r--  cmaes.lua    62
-rw-r--r--  fista.lua    14
-rw-r--r--  lbfgs.lua     6
-rw-r--r--  nag.lua       6
-rw-r--r--  rmsprop.lua  50
-rw-r--r--  rprop.lua   154
-rw-r--r--  sgd.lua       2
10 files changed, 219 insertions(+), 219 deletions(-)
diff --git a/adam.lua b/adam.lua
index a6ad588..505a779 100644
--- a/adam.lua
+++ b/adam.lua
@@ -21,47 +21,47 @@ RETURN:
]]
function optim.adam(opfunc, x, config, state)
- -- (0) get/update state
- local config = config or {}
- local state = state or config
- local lr = config.learningRate or 0.001
+ -- (0) get/update state
+ local config = config or {}
+ local state = state or config
+ local lr = config.learningRate or 0.001
- local beta1 = config.beta1 or 0.9
- local beta2 = config.beta2 or 0.999
- local epsilon = config.epsilon or 1e-8
- local wd = config.weightDecay or 0
+ local beta1 = config.beta1 or 0.9
+ local beta2 = config.beta2 or 0.999
+ local epsilon = config.epsilon or 1e-8
+ local wd = config.weightDecay or 0
- -- (1) evaluate f(x) and df/dx
- local fx, dfdx = opfunc(x)
+ -- (1) evaluate f(x) and df/dx
+ local fx, dfdx = opfunc(x)
- -- (2) weight decay
- if wd ~= 0 then
- dfdx:add(wd, x)
- end
+ -- (2) weight decay
+ if wd ~= 0 then
+ dfdx:add(wd, x)
+ end
- -- Initialization
- state.t = state.t or 0
- -- Exponential moving average of gradient values
- state.m = state.m or x.new(dfdx:size()):zero()
- -- Exponential moving average of squared gradient values
- state.v = state.v or x.new(dfdx:size()):zero()
- -- A tmp tensor to hold the sqrt(v) + epsilon
- state.denom = state.denom or x.new(dfdx:size()):zero()
+ -- Initialization
+ state.t = state.t or 0
+ -- Exponential moving average of gradient values
+ state.m = state.m or x.new(dfdx:size()):zero()
+ -- Exponential moving average of squared gradient values
+ state.v = state.v or x.new(dfdx:size()):zero()
+ -- A tmp tensor to hold the sqrt(v) + epsilon
+ state.denom = state.denom or x.new(dfdx:size()):zero()
- state.t = state.t + 1
-
- -- Decay the first and second moment running average coefficient
- state.m:mul(beta1):add(1-beta1, dfdx)
- state.v:mul(beta2):addcmul(1-beta2, dfdx, dfdx)
+ state.t = state.t + 1
- state.denom:copy(state.v):sqrt():add(epsilon)
+ -- Decay the first and second moment running average coefficient
+ state.m:mul(beta1):add(1-beta1, dfdx)
+ state.v:mul(beta2):addcmul(1-beta2, dfdx, dfdx)
- local biasCorrection1 = 1 - beta1^state.t
- local biasCorrection2 = 1 - beta2^state.t
- local stepSize = lr * math.sqrt(biasCorrection2)/biasCorrection1
- -- (3) update x
- x:addcdiv(-stepSize, state.m, state.denom)
+ state.denom:copy(state.v):sqrt():add(epsilon)
- -- return x*, f(x) before optimization
- return x, {fx}
+ local biasCorrection1 = 1 - beta1^state.t
+ local biasCorrection2 = 1 - beta2^state.t
+ local stepSize = lr * math.sqrt(biasCorrection2)/biasCorrection1
+ -- (3) update x
+ x:addcdiv(-stepSize, state.m, state.denom)
+
+ -- return x*, f(x) before optimization
+ return x, {fx}
end
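
The adam.lua hunk above is whitespace-only; the update rule itself is untouched. For
reference, a minimal usage sketch of the optim.adam(opfunc, x, config, state) contract
shown above -- the closure name feval and the quadratic objective are illustrative
assumptions, not part of this commit:

   require 'torch'
   require 'optim'

   local x = torch.Tensor(10):fill(1)        -- parameter vector being optimized
   local config = {learningRate = 0.001}     -- defaults mirror the code above

   -- opfunc contract from adam.lua: take x, return f(x) and df/dx
   local function feval(x)
      local fx = 0.5 * x:dot(x)              -- illustrative objective: 0.5*||x||^2
      local dfdx = x:clone()                 -- its gradient: x
      return fx, dfdx
   end

   for i = 1, 100 do
      optim.adam(feval, x, config)           -- state defaults to config, as above
   end
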
diff --git a/adamax.lua b/adamax.lua
index c06fddd..2b64877 100644
--- a/adamax.lua
+++ b/adamax.lua
@@ -20,47 +20,47 @@ RETURN:
]]
function optim.adamax(opfunc, x, config, state)
- -- (0) get/update state
- local config = config or {}
- local state = state or config
- local lr = config.learningRate or 0.002
+ -- (0) get/update state
+ local config = config or {}
+ local state = state or config
+ local lr = config.learningRate or 0.002
- local beta1 = config.beta1 or 0.9
- local beta2 = config.beta2 or 0.999
- local epsilon = config.epsilon or 1e-38
- local wd = config.weightDecay or 0
+ local beta1 = config.beta1 or 0.9
+ local beta2 = config.beta2 or 0.999
+ local epsilon = config.epsilon or 1e-38
+ local wd = config.weightDecay or 0
- -- (1) evaluate f(x) and df/dx
- local fx, dfdx = opfunc(x)
+ -- (1) evaluate f(x) and df/dx
+ local fx, dfdx = opfunc(x)
- -- (2) weight decay
- if wd ~= 0 then
- dfdx:add(wd, x)
- end
+ -- (2) weight decay
+ if wd ~= 0 then
+ dfdx:add(wd, x)
+ end
- -- Initialization
- state.t = state.t or 0
- -- Exponential moving average of gradient values
- state.m = state.m or x.new(dfdx:size()):zero()
- -- Exponential moving average of the infinity norm
- state.u = state.u or x.new(dfdx:size()):zero()
- -- A tmp tensor to hold the input to max()
- state.max = state.max or x.new(2, unpack(dfdx:size():totable())):zero()
+ -- Initialization
+ state.t = state.t or 0
+ -- Exponential moving average of gradient values
+ state.m = state.m or x.new(dfdx:size()):zero()
+ -- Exponential moving average of the infinity norm
+ state.u = state.u or x.new(dfdx:size()):zero()
+ -- A tmp tensor to hold the input to max()
+ state.max = state.max or x.new(2, unpack(dfdx:size():totable())):zero()
- state.t = state.t + 1
+ state.t = state.t + 1
- -- Update biased first moment estimate.
- state.m:mul(beta1):add(1-beta1, dfdx)
- -- Update the exponentially weighted infinity norm.
- state.max[1]:copy(state.u):mul(beta2)
- state.max[2]:copy(dfdx):abs():add(epsilon)
- state.u:max(state.max, 1)
+ -- Update biased first moment estimate.
+ state.m:mul(beta1):add(1-beta1, dfdx)
+ -- Update the exponentially weighted infinity norm.
+ state.max[1]:copy(state.u):mul(beta2)
+ state.max[2]:copy(dfdx):abs():add(epsilon)
+ state.u:max(state.max, 1)
- local biasCorrection1 = 1 - beta1^state.t
- local stepSize = lr/biasCorrection1
- -- (2) update x
- x:addcdiv(-stepSize, state.m, state.u)
+ local biasCorrection1 = 1 - beta1^state.t
+ local stepSize = lr/biasCorrection1
+ -- (2) update x
+ x:addcdiv(-stepSize, state.m, state.u)
- -- return x*, f(x) before optimization
- return x, {fx}
+ -- return x*, f(x) before optimization
+ return x, {fx}
end
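
As the re-indented body above shows, adamax replaces Adam's second raw moment with an
exponentially weighted infinity norm. Restating those added lines in LaTeX, with \eta
the learningRate and g_t the gradient dfdx:

   m_t = \beta_1 m_{t-1} + (1 - \beta_1)\, g_t
   u_t = \max\big(\beta_2\, u_{t-1},\ |g_t| + \epsilon\big)
   x_t = x_{t-1} - \frac{\eta}{1 - \beta_1^t} \cdot \frac{m_t}{u_t}
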
diff --git a/asgd.lua b/asgd.lua
index 659db22..cc1c459 100644
--- a/asgd.lua
+++ b/asgd.lua
@@ -1,6 +1,6 @@
--[[ An implementation of ASGD
-ASGD:
+ASGD:
x := (1 - lambda eta_t) x - eta_t df/dx(z,x)
a := a + mu_t [ x - a ]
@@ -12,7 +12,7 @@ implements ASGD algoritm as in L.Bottou's sgd-2.0
ARGS:
-- `opfunc` : a function that takes a single input (X), the point of
+- `opfunc` : a function that takes a single input (X), the point of
evaluation, and returns f(X) and df/dX
- `x` : the initial point
- `state` : a table describing the state of the optimizer; after each
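
For readability, the ASGD update quoted in the comment block above, in LaTeX (same
symbols: \eta_t the step size, \lambda the decay, a the averaged iterate):

   x_{t+1} = (1 - \lambda\,\eta_t)\, x_t - \eta_t\, \frac{\partial f}{\partial x}(z_t, x_t)
   a_{t+1} = a_t + \mu_t\,\big(x_{t+1} - a_t\big)
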
diff --git a/cmaes.lua b/cmaes.lua
index 1045a48..74cd58a 100644
--- a/cmaes.lua
+++ b/cmaes.lua
@@ -1,16 +1,16 @@
require 'torch'
require 'math'
-local BestSolution = {}
---[[ An implementation of `CMAES` (Covariance Matrix Adaptation Evolution Strategy),
+local BestSolution = {}
+--[[ An implementation of `CMAES` (Covariance Matrix Adaptation Evolution Strategy),
ported from https://www.lri.fr/~hansen/barecmaes2.html.
-
+
Parameters
----------
ARGS:
-- `opfunc` : a function that takes a single input (X), the point of
- evaluation, and returns f(X) and df/dX. Note that df/dX is not used
+- `opfunc` : a function that takes a single input (X), the point of
+ evaluation, and returns f(X) and df/dX. Note that df/dX is not used
- `x` : the initial point
- `state.sigma`
float, initial step-size (standard deviation in each
@@ -20,16 +20,16 @@ ARGS:
- `state.ftarget`
float, target function value
- `state.popsize`
- population size. If this is left empty,
+ population size. If this is left empty,
4 + int(3 * log(|x|)) will be used
-- `state.ftarget`
+- `state.ftarget`
stop if fitness < ftarget
- `state.verb_disp`
int, display on console every verb_disp iteration, 0 for never
RETURN:
- `x*` : the new `x` vector, at the optimal point
-- `f` : a table of all function values:
+- `f` : a table of all function values:
`f[1]` is the value of the function before any optimization and
`f[#f]` is the final fully optimized value, at `x*`
--]]
@@ -50,13 +50,13 @@ function optim.cmaes(opfunc, x, config, state)
local min_iterations = state.min_iterations or 1
local lambda = state.popsize -- population size, offspring number
- -- Strategy parameter setting: Selection
+ -- Strategy parameter setting: Selection
if state.popsize == nil then
lambda = 4 + math.floor(3 * math.log(N))
end
local mu = lambda / 2 -- number of parents/points for recombination
- local weights = torch.range(0,mu-1):apply(function(i)
+ local weights = torch.range(0,mu-1):apply(function(i)
return math.log(mu+0.5) - math.log(i+1) end) -- recombination weights
weights:div(weights:sum()) -- normalize recombination weights array
local mueff = weights:sum()^2 / torch.pow(weights,2):sum() -- variance-effectiveness of sum w_i x_i
@@ -69,18 +69,18 @@ function optim.cmaes(opfunc, x, config, state)
local cmu = math.min(1 - c1, 2 * (mueff - 2 + 1/mueff) / ((N + 2)^2 + mueff)) -- and for rank-mu update
local damps = 2 * mueff/lambda + 0.3 + cs -- damping for sigma, usually close to 1
- -- Initialize dynamic (internal) state variables
+ -- Initialize dynamic (internal) state variables
local pc = torch.Tensor(N):zero():typeAs(x) -- evolution paths for C
local ps = torch.Tensor(N):zero():typeAs(x) -- evolution paths for sigma
- local B = torch.eye(N):typeAs(x) -- B defines the coordinate system
+ local B = torch.eye(N):typeAs(x) -- B defines the coordinate system
local D = torch.Tensor(N):fill(1):typeAs(x) -- diagonal D defines the scaling
- local C = torch.eye(N):typeAs(x) -- covariance matrix
+ local C = torch.eye(N):typeAs(x) -- covariance matrix
if not pcall(function () torch.symeig(C,'V') end) then -- if error occurs trying to use symeig
- error('torch.symeig not available for ' .. x:type() ..
+ error('torch.symeig not available for ' .. x:type() ..
" please use Float- or DoubleTensor for x")
end
local candidates = torch.Tensor(lambda,N):typeAs(x)
- local invsqrtC = torch.eye(N):typeAs(x) -- C^-1/2
+ local invsqrtC = torch.eye(N):typeAs(x) -- C^-1/2
local eigeneval = 0 -- tracking the update of B and D
local counteval = 0
local f_hist = {[1]=opfunc(x)} -- for bookkeeping output and termination
@@ -90,7 +90,7 @@ function optim.cmaes(opfunc, x, config, state)
local function ask()
- --[[return a list of lambda candidate solutions according to
+ --[[return a list of lambda candidate solutions according to
m + sig * Normal(0,C) = m + sig * B * D * Normal(0,I)
--]]
-- Eigendecomposition: first update B, D and invsqrtC from C
@@ -117,9 +117,9 @@ function optim.cmaes(opfunc, x, config, state)
Parameters
----------
- `arx`
+ `arx`
a list of solutions, presumably from `ask()`
- `fitvals`
+ `fitvals`
the corresponding objective function values --]]
-- bookkeeping, preparation
counteval = counteval + lambda -- slightly artificial to do here
@@ -142,7 +142,7 @@ function optim.cmaes(opfunc, x, config, state)
local c = (cs * (2-cs) * mueff)^0.5 / sigma
ps = ps - ps * cs + z * c -- exponential decay on ps
- local hsig = (torch.sum(torch.pow(ps,2)) /
+ local hsig = (torch.sum(torch.pow(ps,2)) /
(1-(1-cs)^(2*counteval/lambda)) / N < 2 + 4./(N+1))
hsig = hsig and 1.0 or 0.0 --use binary numbers
@@ -155,23 +155,23 @@ function optim.cmaes(opfunc, x, config, state)
for i=1,N do
for j=1,N do
local r = torch.range(1,mu)
- r:apply(function(k)
+ r:apply(function(k)
return weights[k] * (arx[k][i]-xold[i]) * (arx[k][j]-xold[j]) end)
local Cmuij = torch.sum(r) / sigma^2 -- rank-mu update
- C[i][j] = C[i][j] + ((-c1a - cmu) * C[i][j] +
+ C[i][j] = C[i][j] + ((-c1a - cmu) * C[i][j] +
c1 * pc[i]*pc[j] + cmu * Cmuij)
end
end
-- Adapt step-size sigma with factor <= exp(0.6) \approx 1.82
- sigma = sigma * math.exp(math.min(0.6,
+ sigma = sigma * math.exp(math.min(0.6,
(cs / damps) * (torch.sum(torch.pow(ps,2))/N - 1)/2))
end
- local function stop()
- --[[return satisfied termination conditions in a table like
- {'termination reason':value, ...}, for example {'tolfun':1e-12},
- or the empty table {}--]]
+ local function stop()
+ --[[return satisfied termination conditions in a table like
+ {'termination reason':value, ...}, for example {'tolfun':1e-12},
+ or the empty table {}--]]
local res = {}
if counteval > 0 then
if counteval >= maxEval then
@@ -184,7 +184,7 @@ function optim.cmaes(opfunc, x, config, state)
res['condition'] = 1e7
end
if fitvals:nElement() > 1 and fitvals[fitvals:nElement()] - fitvals[1] < 1e-12 then
- res['tolfun'] = 1e-12
+ res['tolfun'] = 1e-12
end
if sigma * torch.max(D) < 1e-11 then
-- remark: max(D) >= max(diag(C))^0.5
@@ -206,8 +206,8 @@ function optim.cmaes(opfunc, x, config, state)
end
if iteration <= 2 or iteration % verb_modulo == 0 then
local max_std = math.sqrt(torch.max(torch.diag(C)))
- print(tostring(counteval).. ': ' ..
- string.format(' %6.1f %8.1e ', torch.max(D) / torch.min(D), sigma * max_std)
+ print(tostring(counteval).. ': ' ..
+ string.format(' %6.1f %8.1e ', torch.max(D) / torch.min(D), sigma * max_std)
.. tostring(fitvals[1]))
end
@@ -224,7 +224,7 @@ function optim.cmaes(opfunc, x, config, state)
fitvals[i] = objfunc(candidate)
end
- tell(X)
+ tell(X)
disp(verb_disp)
end
@@ -245,7 +245,7 @@ end
-BestSolution.__index = BestSolution
+BestSolution.__index = BestSolution
function BestSolution.new(x, f, evals)
local self = setmetatable({}, BestSolution)
self.x = x
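
Among the re-wrapped cmaes.lua lines above, the step-size adaptation is the easiest to
lose track of; restated in LaTeX from that code (c_s = cs, d_s = damps, p_s = ps, N the
problem dimension), with the same exp(0.6) cap noted in the comment:

   \sigma \leftarrow \sigma \cdot \exp\!\Big(\min\big(0.6,\ \tfrac{c_s}{d_s}\cdot\tfrac{\|p_s\|^2/N - 1}{2}\big)\Big)
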
diff --git a/fista.lua b/fista.lua
index 7fba128..c8c6f5e 100644
--- a/fista.lua
+++ b/fista.lua
@@ -17,7 +17,7 @@ On output, `params` will contain these additional fields that can be reused.
- `params.L` : last used L value will be written.
-These are temporary storages needed by the algo and if the same params object is
+These are temporary storages needed by the algo and if the same params object is
passed a second time, these same storages will be used without new allocation.
- `params.xkm` : previous iterarion point
@@ -26,7 +26,7 @@ passed a second time, these same storages will be used without new allocation.
Returns the solution x and history of {function evals, number of line search ,...}
-Algorithm is published in
+Algorithm is published in
@article{beck-fista-09,
Author = {Beck, Amir and Teboulle, Marc},
@@ -38,7 +38,7 @@ Algorithm is published in
Year = {2009}}
]]
function optim.FistaLS(f, g, pl, xinit, params)
-
+
local params = params or {}
local L = params.L or 0.1
local Lstep = params.Lstep or 1.5
@@ -46,7 +46,7 @@ function optim.FistaLS(f, g, pl, xinit, params)
local maxline = params.maxline or 20
local errthres = params.errthres or 1e-4
local doFistaUpdate = params.doFistaUpdate
- local verbose = params.verbose
+ local verbose = params.verbose
-- temporary allocations
params.xkm = params.xkm or torch.Tensor()
@@ -77,11 +77,11 @@ function optim.FistaLS(f, g, pl, xinit, params)
-- get derivatives from smooth function
local fy,gfy = f(y,'dx')
--local gfy = f(y)
-
+
local fply = 0
local gply = 0
local Q = 0
-
+
----------------------------------------------
-- do line search to find new current location starting from fista loc
local nline = 0
@@ -98,7 +98,7 @@ function optim.FistaLS(f, g, pl, xinit, params)
-- evaluate this point F(ply)
fply = f(ply)
-
+
-- ply - y
ply:add(-1, y)
-- <ply-y , \Grad(f(y))>
diff --git a/lbfgs.lua b/lbfgs.lua
index 4c7a0b8..d850fcb 100644
--- a/lbfgs.lua
+++ b/lbfgs.lua
@@ -27,7 +27,7 @@ ARGS:
RETURN:
- `x*` : the new `x` vector, at the optimal point
-- `f` : a table of all function values:
+- `f` : a table of all function values:
`f[1]` is the value of the function before any optimization and
`f[#f]` is the final fully optimized value, at `x*`
@@ -46,7 +46,7 @@ function optim.lbfgs(opfunc, x, config, state)
local lineSearchOpts = config.lineSearchOptions
local learningRate = config.learningRate or 1
local isverbose = config.verbose or false
-
+
state.funcEval = state.funcEval or 0
state.nIter = state.nIter or 0
@@ -142,7 +142,7 @@ function optim.lbfgs(opfunc, x, config, state)
table.insert(state.stp_bufs, s)
end
- -- compute the approximate (L-BFGS) inverse Hessian
+ -- compute the approximate (L-BFGS) inverse Hessian
-- multiplied by the gradient
local k = #old_dirs
diff --git a/nag.lua b/nag.lua
index fd4210d..875d81e 100644
--- a/nag.lua
+++ b/nag.lua
@@ -1,11 +1,11 @@
----------------------------------------------------------------------
--- An implementation of SGD adapted with features of Nesterov's
+-- An implementation of SGD adapted with features of Nesterov's
-- Accelerated Gradient method, based on the paper
-- On the Importance of Initialization and Momentum in Deep Learning
-- Sutsveker et. al., ICML 2013
--
-- ARGS:
--- opfunc : a function that takes a single input (X), the point of
+-- opfunc : a function that takes a single input (X), the point of
-- evaluation, and returns f(X) and df/dX
-- x : the initial point
-- state : a table describing the state of the optimizer; after each
@@ -44,7 +44,7 @@ function optim.nag(opfunc, x, config, state)
-- first step in the direction of the momentum vector
if state.dfdx then
- x:add(mom, state.dfdx)
+ x:add(mom, state.dfdx)
end
-- then compute gradient at that point
-- comment out the above line to get the original SGD
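
The nag.lua hunk only re-indents the lookahead step, but that step is what distinguishes
NAG from plain SGD with momentum: the parameters are first shifted along the stored
momentum vector and the gradient is then evaluated at the shifted point. In LaTeX, with
\mu the momentum coefficient (mom) and v_{t-1} the stored direction (state.dfdx):

   x_{\text{lookahead}} = x_{t-1} + \mu\, v_{t-1}, \qquad g_t = \nabla f(x_{\text{lookahead}})
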
diff --git a/rmsprop.lua b/rmsprop.lua
index 038af21..1eb526d 100644
--- a/rmsprop.lua
+++ b/rmsprop.lua
@@ -22,36 +22,36 @@ RETURN:
]]
function optim.rmsprop(opfunc, x, config, state)
- -- (0) get/update state
- local config = config or {}
- local state = state or config
- local lr = config.learningRate or 1e-2
- local alpha = config.alpha or 0.99
- local epsilon = config.epsilon or 1e-8
- local wd = config.weightDecay or 0
-
- -- (1) evaluate f(x) and df/dx
- local fx, dfdx = opfunc(x)
-
- -- (2) weight decay
- if wd ~= 0 then
+ -- (0) get/update state
+ local config = config or {}
+ local state = state or config
+ local lr = config.learningRate or 1e-2
+ local alpha = config.alpha or 0.99
+ local epsilon = config.epsilon or 1e-8
+ local wd = config.weightDecay or 0
+
+ -- (1) evaluate f(x) and df/dx
+ local fx, dfdx = opfunc(x)
+
+ -- (2) weight decay
+ if wd ~= 0 then
dfdx:add(wd, x)
- end
+ end
- -- (3) initialize mean square values and square gradient storage
- if not state.m then
+ -- (3) initialize mean square values and square gradient storage
+ if not state.m then
state.m = torch.Tensor():typeAs(x):resizeAs(dfdx):fill(1)
state.tmp = torch.Tensor():typeAs(x):resizeAs(dfdx)
- end
+ end
- -- (4) calculate new (leaky) mean squared values
- state.m:mul(alpha)
- state.m:addcmul(1.0-alpha, dfdx, dfdx)
+ -- (4) calculate new (leaky) mean squared values
+ state.m:mul(alpha)
+ state.m:addcmul(1.0-alpha, dfdx, dfdx)
- -- (5) perform update
- state.tmp:sqrt(state.m):add(epsilon)
- x:addcdiv(-lr, dfdx, state.tmp)
+ -- (5) perform update
+ state.tmp:sqrt(state.m):add(epsilon)
+ x:addcdiv(-lr, dfdx, state.tmp)
- -- return x*, f(x) before optimization
- return x, {fx}
+ -- return x*, f(x) before optimization
+ return x, {fx}
end
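
Restating the re-indented rmsprop.lua update in LaTeX, with \alpha the leaky-average
coefficient, \eta the learningRate and g_t the gradient dfdx:

   m_t = \alpha\, m_{t-1} + (1 - \alpha)\, g_t^2
   x_t = x_{t-1} - \eta\, \frac{g_t}{\sqrt{m_t} + \epsilon}
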
diff --git a/rprop.lua b/rprop.lua
index d6c9579..d7af164 100644
--- a/rprop.lua
+++ b/rprop.lua
@@ -20,83 +20,83 @@ RETURN:
(Martin Riedmiller, Koray Kavukcuoglu 2013)
--]]
function optim.rprop(opfunc, x, config, state)
- if config == nil and state == nil then
- print('no state table RPROP initializing')
- end
- -- (0) get/update state
- local config = config or {}
- local state = state or config
- local stepsize = config.stepsize or 0.1
- local etaplus = config.etaplus or 1.2
- local etaminus = config.etaminus or 0.5
- local stepsizemax = config.stepsizemax or 50.0
- local stepsizemin = config.stepsizemin or 1E-06
- local niter = config.niter or 1
-
- local hfx = {}
-
- for i=1,niter do
-
- -- (1) evaluate f(x) and df/dx
- local fx,dfdx = opfunc(x)
-
- -- init temp storage
- if not state.delta then
- state.delta = dfdx.new(dfdx:size()):zero()
- state.stepsize = dfdx.new(dfdx:size()):fill(stepsize)
- state.sign = dfdx.new(dfdx:size())
- state.psign = torch.ByteTensor(dfdx:size())
- state.nsign = torch.ByteTensor(dfdx:size())
- state.zsign = torch.ByteTensor(dfdx:size())
- state.dminmax = torch.ByteTensor(dfdx:size())
- if torch.type(x)=='torch.CudaTensor' then
- -- Push to GPU
- state.psign = state.psign:cuda()
- state.nsign = state.nsign:cuda()
- state.zsign = state.zsign:cuda()
- state.dminmax = state.dminmax:cuda()
- end
- end
-
- -- sign of derivative from last step to this one
- torch.cmul(state.sign, dfdx, state.delta)
- torch.sign(state.sign, state.sign)
-
- -- get indices of >0, <0 and ==0 entries
- state.sign.gt(state.psign, state.sign, 0)
- state.sign.lt(state.nsign, state.sign, 0)
- state.sign.eq(state.zsign, state.sign, 0)
-
- -- get step size updates
- state.sign[state.psign] = etaplus
- state.sign[state.nsign] = etaminus
- state.sign[state.zsign] = 1
-
- -- update stepsizes with step size updates
- state.stepsize:cmul(state.sign)
-
- -- threshold step sizes
- -- >50 => 50
- state.stepsize.gt(state.dminmax, state.stepsize, stepsizemax)
- state.stepsize[state.dminmax] = stepsizemax
- -- <1e-6 ==> 1e-6
- state.stepsize.lt(state.dminmax, state.stepsize, stepsizemin)
- state.stepsize[state.dminmax] = stepsizemin
-
- -- for dir<0, dfdx=0
- -- for dir>=0 dfdx=dfdx
- dfdx[state.nsign] = 0
- -- state.sign = sign(dfdx)
- torch.sign(state.sign,dfdx)
-
- -- update weights
- x:addcmul(-1,state.sign,state.stepsize)
-
- -- update state.dfdx with current dfdx
- state.delta:copy(dfdx)
-
- table.insert(hfx,fx)
- end
+ if config == nil and state == nil then
+ print('no state table RPROP initializing')
+ end
+ -- (0) get/update state
+ local config = config or {}
+ local state = state or config
+ local stepsize = config.stepsize or 0.1
+ local etaplus = config.etaplus or 1.2
+ local etaminus = config.etaminus or 0.5
+ local stepsizemax = config.stepsizemax or 50.0
+ local stepsizemin = config.stepsizemin or 1E-06
+ local niter = config.niter or 1
+
+ local hfx = {}
+
+ for i=1,niter do
+
+ -- (1) evaluate f(x) and df/dx
+ local fx,dfdx = opfunc(x)
+
+ -- init temp storage
+ if not state.delta then
+ state.delta = dfdx.new(dfdx:size()):zero()
+ state.stepsize = dfdx.new(dfdx:size()):fill(stepsize)
+ state.sign = dfdx.new(dfdx:size())
+ state.psign = torch.ByteTensor(dfdx:size())
+ state.nsign = torch.ByteTensor(dfdx:size())
+ state.zsign = torch.ByteTensor(dfdx:size())
+ state.dminmax = torch.ByteTensor(dfdx:size())
+ if torch.type(x)=='torch.CudaTensor' then
+ -- Push to GPU
+ state.psign = state.psign:cuda()
+ state.nsign = state.nsign:cuda()
+ state.zsign = state.zsign:cuda()
+ state.dminmax = state.dminmax:cuda()
+ end
+ end
+
+ -- sign of derivative from last step to this one
+ torch.cmul(state.sign, dfdx, state.delta)
+ torch.sign(state.sign, state.sign)
+
+ -- get indices of >0, <0 and ==0 entries
+ state.sign.gt(state.psign, state.sign, 0)
+ state.sign.lt(state.nsign, state.sign, 0)
+ state.sign.eq(state.zsign, state.sign, 0)
+
+ -- get step size updates
+ state.sign[state.psign] = etaplus
+ state.sign[state.nsign] = etaminus
+ state.sign[state.zsign] = 1
+
+ -- update stepsizes with step size updates
+ state.stepsize:cmul(state.sign)
+
+ -- threshold step sizes
+ -- >50 => 50
+ state.stepsize.gt(state.dminmax, state.stepsize, stepsizemax)
+ state.stepsize[state.dminmax] = stepsizemax
+ -- <1e-6 ==> 1e-6
+ state.stepsize.lt(state.dminmax, state.stepsize, stepsizemin)
+ state.stepsize[state.dminmax] = stepsizemin
+
+ -- for dir<0, dfdx=0
+ -- for dir>=0 dfdx=dfdx
+ dfdx[state.nsign] = 0
+ -- state.sign = sign(dfdx)
+ torch.sign(state.sign,dfdx)
+
+ -- update weights
+ x:addcmul(-1,state.sign,state.stepsize)
+
+ -- update state.dfdx with current dfdx
+ state.delta:copy(dfdx)
+
+ table.insert(hfx,fx)
+ end
-- return x*, f(x) before optimization
return x,hfx
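
A minimal usage sketch for the re-indented optim.rprop above -- the config field names
are the ones read at the top of the function, while the objective and closure are
illustrative assumptions, not part of this commit:

   require 'torch'
   require 'optim'

   local x = torch.Tensor(10):uniform(-1, 1)
   local config = {
      stepsize = 0.1,     -- initial per-weight step size
      etaplus  = 1.2,     -- growth factor when the gradient keeps its sign
      etaminus = 0.5,     -- shrink factor when the gradient flips sign
      niter    = 10,      -- inner iterations per call, so #fhist == 10 below
   }

   local function feval(x)
      return 0.5 * x:dot(x), x:clone()       -- illustrative f(x) and df/dx
   end

   local xstar, fhist = optim.rprop(feval, x, config)
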
diff --git a/sgd.lua b/sgd.lua
index ea13c55..e21c696 100644
--- a/sgd.lua
+++ b/sgd.lua
@@ -70,7 +70,7 @@ function optim.sgd(opfunc, x, config, state)
-- (4) learning rate decay (annealing)
local clr = lr / (1 + nevals*lrd)
-
+
-- (5) parameter update with single or individual learning rates
if lrs then
if not state.deltaParameters then
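
The sgd.lua hunk touches only whitespace around the annealing step; that step means the
effective rate after nevals evaluations is clr = lr / (1 + nevals*lrd). A tiny
illustrative check, with assumed values not taken from the patch:

   local lr, lrd = 0.1, 1e-3
   for nevals = 0, 3000, 1000 do
      print(nevals, lr / (1 + nevals * lrd))   -- 0.1, 0.05, 0.0333..., 0.025
   end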