| author | gcheron <gcheron@gpu01.cm.cluster> | 2016-06-10 18:24:02 +0300 |
|---|---|---|
| committer | gcheron <gcheron@gpu01.cm.cluster> | 2016-06-10 18:24:02 +0300 |
| commit | 6534bfd77a7ef5f9dd116d03d35c6dbf9ab0bce4 (patch) | |
| tree | cc243c5229b162290296a5461eb3717efb8920a8 | |
| parent | 016dca94bed88cdfd23a6ddddc5a48e9bdb114e5 (diff) | |
add weight decay support to adam
| -rw-r--r-- | adam.lua | 9 |

1 file changed, 8 insertions(+), 1 deletion(-)
@@ -10,6 +10,7 @@ ARGS:
 - 'config.beta1' : first moment coefficient
 - 'config.beta2' : second moment coefficient
 - 'config.epsilon' : for numerical stability
+- 'config.weightDecay' : weight decay
 - 'state' : a table describing the state of the optimizer; after each call the state is modified
@@ -28,10 +29,16 @@ function optim.adam(opfunc, x, config, state)
    local beta1 = config.beta1 or 0.9
    local beta2 = config.beta2 or 0.999
    local epsilon = config.epsilon or 1e-8
+   local wd = config.weightDecay or 0

    -- (1) evaluate f(x) and df/dx
    local fx, dfdx = opfunc(x)

+   -- (2) weight decay
+   if wd ~= 0 then
+      dfdx:add(wd, x)
+   end
+
    -- Initialization
    state.t = state.t or 0
    -- Exponential moving average of gradient values
@@ -52,7 +59,7 @@ function optim.adam(opfunc, x, config, state)
    local biasCorrection1 = 1 - beta1^state.t
    local biasCorrection2 = 1 - beta2^state.t
    local stepSize = lr * math.sqrt(biasCorrection2)/biasCorrection1
-   -- (2) update x
+   -- (3) update x
    x:addcdiv(-stepSize, state.m, state.denom)

    -- return x*, f(x) before optimization
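The new option plugs into the existing `config` table. Below is a minimal usage sketch, assuming Torch7 with the `nn` and `optim` packages installed; the toy model, input/target tensors, and hyperparameter values are illustrative and not part of this commit.

```lua
-- Minimal sketch: training one step with optim.adam and the new
-- weightDecay option. Model and data here are hypothetical examples.
require 'nn'
require 'optim'

local model = nn.Linear(10, 1)            -- toy model
local criterion = nn.MSECriterion()
local params, gradParams = model:getParameters()

local config = {
   learningRate = 1e-3,
   beta1 = 0.9,
   beta2 = 0.999,
   epsilon = 1e-8,
   weightDecay = 1e-4,                    -- the option this commit adds
}
local state = {}

local input = torch.randn(10)
local target = torch.randn(1)

-- opfunc evaluates the loss and gradient at x; x shares storage with
-- params (the flattened view from getParameters), so it can be ignored here
local function opfunc(x)
   gradParams:zero()                      -- backward() accumulates gradients
   local output = model:forward(input)
   local loss = criterion:forward(output, target)
   model:backward(input, criterion:backward(output, target))
   return loss, gradParams
end

optim.adam(opfunc, params, config, state)
```

Note that `dfdx:add(wd, x)` folds `wd * x` into the gradient before the moment estimates are updated, so this is classic L2 regularization (equivalent to minimizing `f(x) + wd/2 * ||x||^2`) rather than a decoupled weight-decay step applied directly to `x`.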