author     karpathy <andrej.karpathy@gmail.com>   2015-04-16 22:39:08 +0300
committer  karpathy <andrej.karpathy@gmail.com>   2015-04-16 22:39:08 +0300
commit     30f6bd3deb8e3b1977dd904a14bac27f7fe71f41
tree       df137b773dc12e8df9554adbbf3edaf727b09fc7
parent     3d1d03d5bc22f7113c22e5dfdb12e3fa06a1fb18
adding some docs, cleanup to make things more consistent with existing code
-rw-r--r--  rmsprop.lua | 23 +++++++++++++----------
1 file changed, 13 insertions(+), 10 deletions(-)
```diff
diff --git a/rmsprop.lua b/rmsprop.lua
index 6a84416..f0e6310 100644
--- a/rmsprop.lua
+++ b/rmsprop.lua
@@ -9,8 +9,10 @@ ARGS:
 - 'config.learningRate' : learning rate
 - 'config.alpha' : smoothing constant
 - 'config.epsilon' : value with which to inistialise m
-- 'state = {m, dfdx_sq}' : a table describing the state of the optimizer; after each
-  call the state is modified
+- 'state' : a table describing the state of the optimizer;
+  after each call the state is modified
+- 'state.m' : leaky sum of squares of parameter gradients,
+- 'state.tmp' : and the square root (with epsilon smoothing)
 
 RETURN:
 - `x` : the new x vector
@@ -23,25 +25,26 @@ function optim.rmsprop(opfunc, x, config, state)
    local config = config or {}
    local state = state or config
    local lr = config.learningRate or 1e-2
-   local alpha = config.alpha or 0.95
+   local alpha = config.alpha or 0.99
    local epsilon = config.epsilon or 1e-8
 
    -- (1) evaluate f(x) and df/dx
    local fx, dfdx = opfunc(x)
 
    -- (2) initialize mean square values and square gradient storage
-   state.m = state.m or torch.Tensor():typeAs(dfdx):resizeAs(dfdx):zero()
-   state.tmp = state.tmp or x.new(dfdx:size()):zero()
+   if not state.m then
+      state.m = torch.Tensor():typeAs(x):resizeAs(dfdx):zero()
+      state.tmp = torch.Tensor():typeAs(x):resizeAs(dfdx)
+   end
 
    -- (3) calculate new (leaky) mean squared values
    state.m:mul(alpha)
-   state.m:addcmul(1.0-alpha,dfdx,dfdx)
+   state.m:addcmul(1.0-alpha, dfdx, dfdx)
 
    -- (4) perform update
-   state.tmp:sqrt(state.m)
-   x:addcdiv(-lr, dfdx, state.tmp:add(epsilon))
+   state.tmp:sqrt(state.m):add(epsilon)
+   x:addcdiv(-lr, dfdx, state.tmp)
 
    -- return x*, f(x) before optimization
-   return x, {fx}, state.tmp
+   return x, {fx}
 end
-
```
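For reference, the patched code carries out the standard RMSProp update: a leaky average of squared gradients, then a gradient step scaled by the epsilon-smoothed square root of that average. A sketch of the math (not part of the commit), with `state.m` as m, `config.learningRate` as eta, alpha defaulting to 0.99 after this change, and epsilon to 1e-8:

```latex
% m_t     : state.m, the leaky sum of squared gradients
% g_t     : dfdx, the gradient returned by opfunc
% \odot   : elementwise product (addcmul)
m_t     = \alpha \, m_{t-1} + (1 - \alpha) \, g_t \odot g_t
x_{t+1} = x_t - \eta \, \frac{g_t}{\sqrt{m_t} + \epsilon}
```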
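And a minimal usage sketch of the updated function, assuming Torch7 with the optim package on the load path; the toy objective `feval`, the starting point `x0`, and the loop length are illustrative assumptions, not from the commit:

```lua
require 'torch'
require 'optim'

-- hypothetical toy objective: f(x) = 0.5 * x^T x, so df/dx = x
local function feval(x)
   return 0.5 * x:dot(x), x:clone()
end

local x0 = torch.randn(10)                          -- parameter vector, updated in place
local config = {learningRate = 1e-2, alpha = 0.99}  -- 0.99 is the new default anyway
local state = {}                                    -- carries state.m / state.tmp across calls

for i = 1, 100 do
   -- after this commit, rmsprop returns only x and {f(x)}, not state.tmp
   local _, fs = optim.rmsprop(feval, x0, config, state)
   if i % 25 == 0 then print(string.format('step %d  f(x) = %.6f', i, fs[1])) end
end
```

Passing the same `state` table on every call is what makes the leaky average persistent; if `state` is omitted, `config` doubles as the state table (`state = state or config`).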