author    | Koray Kavukcuoglu <koray@kavukcuoglu.org> | 2011-11-09 18:57:25 +0400
committer | Koray Kavukcuoglu <koray@kavukcuoglu.org> | 2011-11-09 18:57:25 +0400
commit    | 8eb46a42b238a14161fd78a38edcbd525924ac46
tree      | 460a135c67d4c277cba51a62d069636e87cb74ee
parent    | d2a8a948595192a962d486df36ed182656e389d6
parent    | f542c2f46d3f3d41df1919cd9dd76cb53b189f53
Merge branch 'master' of github.com:clementfarabet/lua---nnx
-rw-r--r-- | ASGDOptimization.lua      |  23
-rw-r--r-- | BatchOptimization.lua     |  15
-rw-r--r-- | CMakeLists.txt            |   1
-rw-r--r-- | DataSet.lua               |   4
-rw-r--r-- | DiagHessian.lua           |   2
-rw-r--r-- | LBFGSOptimization.lua     |  12
-rw-r--r-- | SGDOptimization.lua       |  37
-rw-r--r-- | SpatialColorTransform.lua |   8
-rw-r--r-- | init.lua                  |  21
-rw-r--r-- | newCGOptimization.lua     | 194
10 files changed, 303 insertions, 14 deletions
diff --git a/ASGDOptimization.lua b/ASGDOptimization.lua
index 28e1131..03a2058 100644
--- a/ASGDOptimization.lua
+++ b/ASGDOptimization.lua
@@ -46,6 +46,7 @@ function ASGD:optimize()
       self.parameters:add(-self.eta_t, self.gradParameters)
    end
    -- (3) Average part
+   -- a := a + mu_t [ w - a ]
    self.a = self.a or self.parameters.new():resizeAs(self.parameters):zero()
    if self.mu_t ~= 1 then
       self.tmp = self.tmp or self.a.new():resizeAs(self.a)
@@ -63,4 +64,24 @@ function ASGD:optimize()
    -- (4c) update mu_t
    -- mu_t = 1/max(1,t-t0)
    self.mu_t = 1 / math.max(1,self.t - self.t0)
-end
\ No newline at end of file
+end
+
+-- in ASGD we keep a copy of the parameters which is an averaged
+-- version of the current parameters. This function is to test with
+-- those averaged parameters. Best to run on batches because we have
+-- to copy the full parameter vector
+
+function ASGD:test(_inputs, _targets) -- function test
+   -- (0) make a backup of the online parameters
+   self.backup = self.backup or
+      self.parameters.new():resizeAs(self.parameters)
+   self.backup:copy(self.parameters)
+   -- (1) copy average parameters into the model
+   self.parameters:copy(self.a)
+   -- (2) do the test with the average parameters
+   self.output = self.module:forward(_inputs)
+   self.error = self.criterion:forward(self.output, _targets)
+   -- (3) copy back the online parameters to continue training
+   self.parameters:copy(self.backup)
+   return self.error
+end
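The comment added at step (3) spells out the running average, a := a + mu_t (w - a), that ASGD maintains next to the online weights, and the new ASGD:test evaluates with that average instead of w. A hedged sketch of how it might slot into a training loop, assuming the constructor conventions of the other nnx *Optimization classes; every other name here (model, criterion, the data tables) is a placeholder, not part of the patch:

```lua
-- Hypothetical loop: train online, then measure error against the
-- running average `a` rather than the current weights `w`.
local optimizer = nn.ASGDOptimization{module = model, criterion = criterion}
for epoch = 1, 10 do
   for _, batch in ipairs(trainBatches) do
      optimizer:forward(batch.inputs, batch.targets)  -- online ASGD steps
   end
   -- evaluate with the averaged parameters; the online weights are
   -- backed up and restored inside ASGD:test
   local avgErr = optimizer:test(testInputs, testTargets)
   print('epoch ' .. epoch .. ': averaged-parameter error = ' .. avgErr)
end
```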
diff --git a/BatchOptimization.lua b/BatchOptimization.lua
index 1e83bbb..b0c2a12 100644
--- a/BatchOptimization.lua
+++ b/BatchOptimization.lua
@@ -17,7 +17,13 @@ function Batch:__init(...)
       {arg='precode', type='function',
        help='optional code to be run by each parallel worker at their init'},
       {arg='verbose', type='number',
-       help='verbose level during training [0-2]', default=0}
+       help='verbose level during training [0-2]', default=0},
+      {arg='allreduce', type='boolean', help='use allreduce', default=false},
+      {arg='allreduceSyncTime', type='boolean', help='sync period', default=1},
+      {arg='allreduceMaster', type='string', help='master address', default='localhost'},
+      {arg='allreduceUniqueId', type='boolean', help='job unique id', default=0},
+      {arg='allreduceNbNodes', type='boolean', help='number of nodes', default=1},
+      {arg='allreduceNodeId', type='boolean', help='this node\'s id', default=1}
    )
    self.parameters = nnx.flattenParameters(nnx.getParameters(self.module))
    self.gradParameters = nnx.flattenParameters(nnx.getGradParameters(self.module))
@@ -30,6 +36,13 @@ function Batch:__init(...)
       self:setup_mapreduce()
    end
    self.P = self.parallelize
+
+   if self.allreduce then
+      xrequire 'allreduce'
+      allreduce.init(self.allreduceMaster, self.allreduceUniqueId,
+                     self.allreduceNbNodes, self.allreduceNodeId)
+      self.accError = 0
+   end
 end
 
 function Batch:forward(inputs, targets, options)
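The SGDOptimization hunk further down reads self.allreduce, so these options evidently reach subclasses through this shared constructor. A hedged sketch of a two-node setup; the address, job id, sync period, and model/criterion names are illustrative placeholders, and the `allreduce` package must be installed for xrequire to find it:

```lua
-- Hypothetical two-node job; run with allreduceNodeId = 1 on one
-- machine and allreduceNodeId = 2 on the other.
local optimizer = nn.SGDOptimization{
   module            = model,
   criterion         = criterion,
   allreduce         = true,
   allreduceSyncTime = 10,              -- sync every 10 samples
   allreduceMaster   = '192.168.1.10',  -- must be reachable from every node
   allreduceUniqueId = 42,              -- same job id on all nodes
   allreduceNbNodes  = 2,
   allreduceNodeId   = 1                -- use 2 on the second node
}
```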
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 70481dc..3bfb4fc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -128,6 +128,7 @@ install_files(${INSTALL_PREFIX} SpatialRecursiveFovea.lua)
 install_files(${INSTALL_PREFIX} Optimization.lua)
 install_files(${INSTALL_PREFIX} LBFGSOptimization.lua)
 install_files(${INSTALL_PREFIX} CGOptimization.lua)
+install_files(${INSTALL_PREFIX} newCGOptimization.lua)
 install_files(${INSTALL_PREFIX} SGDOptimization.lua)
 install_files(${INSTALL_PREFIX} ASGDOptimization.lua)
 install_files(${INSTALL_PREFIX} GeneticSGDOptimization.lua)
diff --git a/DataSet.lua b/DataSet.lua
index 9ed7cbe..4efb85d 100644
--- a/DataSet.lua
+++ b/DataSet.lua
@@ -58,6 +58,7 @@ function lDataSet:load(...)
       -- File found
       print('<DataSet> Loading samples from cached file ' .. fileName)
       f = torch.DiskFile(fileName, 'rw')
+      f:binary()
       self:read(f)
       f.close(f)
       datasetLoadedFromFile = true
@@ -73,6 +74,7 @@ function lDataSet:load(...)
      if (fileName ~= nil) then
         print('<DataSet> Dumping dataset to cache file ' .. fileName ..
               ' for fast retrieval')
         f = torch.DiskFile(fileName, 'rw')
+        f:binary()
         self:write(f)
         f.close(f)
      end
@@ -301,6 +303,7 @@ function lDataSet:save(fileName)
    self.fileName = fileName
    print('<DataSet> Saving DataSet to:',fileName)
    local file = torch.DiskFile(fileName, 'w')
+   file:binary()
    self:write(file)
    file:close()
 end
@@ -310,6 +313,7 @@ function lDataSet:open(fileName)
    self.fileName = fileName
    print('<DataSet> Loading DataSet from File:',fileName)
    local file = torch.DiskFile(fileName, 'r')
+   file:binary()
    self:read(file)
    file:close()
    print('<DataSet> '..self.nbSamples..' samples loaded')
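All four `f:binary()` additions address the same issue: torch.DiskFile defaults to ASCII serialization, so the writer and the reader of a cached dataset must agree on the mode. A minimal standalone illustration (the cache file name is arbitrary):

```lua
-- Switching both ends of the round trip to binary mode keeps the cache
-- compact and ensures readObject sees the same format writeObject produced.
local t = torch.Tensor(4):fill(1)

local f = torch.DiskFile('cache.t7', 'w')
f:binary()                -- must be set before writeObject
f:writeObject(t)
f:close()

f = torch.DiskFile('cache.t7', 'r')
f:binary()                -- must match the mode used when writing
local t2 = f:readObject()
f:close()
```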
diff --git a/DiagHessian.lua b/DiagHessian.lua
index dfcdcaf..26aed3e 100644
--- a/DiagHessian.lua
+++ b/DiagHessian.lua
@@ -1,7 +1,7 @@
 -- Module
 function nn.Module.backwardDiagHessian(self, input, diagHessianOutput)
-   self.diagHessianInput = self.diagHessianInput or input
+   self.diagHessianInput = self.diagHessianInput or diagHessianOutput
    return self.diagHessianInput
 end
 
diff --git a/LBFGSOptimization.lua b/LBFGSOptimization.lua
index 53dfe70..fce9ba5 100644
--- a/LBFGSOptimization.lua
+++ b/LBFGSOptimization.lua
@@ -37,7 +37,17 @@ end
 
 function LBFGS:optimize()
    -- callback for lBFGS
-   lbfgs.evaluate = self.evaluate
+   lbfgs.evaluate = function()
+      local loss = self.evaluate()
+      if self.allreduce then
+         local losst = torch.Tensor(1):fill(loss)
+         allreduce.accumulate(losst)
+         allreduce.accumulate(self.gradParameters)
+         loss = losst[1]
+      end
+      return loss
+   end
+
    -- the magic function: will update the parameter vector according to the l-BFGS method
    self.output = lbfgs.run()
 end
diff --git a/SGDOptimization.lua b/SGDOptimization.lua
index e26c6ed..1309ae9 100644
--- a/SGDOptimization.lua
+++ b/SGDOptimization.lua
@@ -54,6 +54,16 @@ function SGD:optimize()
          -- normal single learningRate parameter update
         self.parameters:add(-learningRate, self.currentGradParameters)
      end
+
+      -- (5) allreduce sync
+      if self.allreduce then
+         if (self.sampleCounter % self.allreduceSyncTime) == self.allreduceSyncTime-1 then
+            allreduce.best(self.parameters, self.accError)
+            self.accError = 0
+         else
+            self.accError = self.accError + self.output
+         end
+      end
    end -- for loop on maxIterations
 end
@@ -78,9 +88,11 @@ end
 
 function SGD:diagHessian(inputs, targets)
    if not self.learningRates then
+      print('<SGD> creating learningRates, initDiagHessian')
       -- do initialization
-      self.diagHessianEpsilon = self.diagHessianEpslion or 1e-3
+      self.diagHessianEpsilon = self.diagHessianEpsilon or 1e-2
       self.learningRates = torch.Tensor():typeAs(self.parameters):resizeAs(self.parameters):fill(1)
+      -- we can call this multiple times as it will only create the tensors once.
       self.module:initDiagHessianParameters()
       self.diagHessianParameters = nnx.flattenParameters(nnx.getDiagHessianParameters(self.module))
@@ -109,23 +121,30 @@ function SGD:diagHessian(inputs, targets)
       module:accDiagHessianParameters(inputs, critDiagHessian)
       self.diagHessianParameters:div(inputs:size(1))
    end
-   -- protect diag hessian (the proper way of doing it is the commented code,
-   -- but for speed reasons, the uncommented code just works)
+   print('<diagHessian>')
+   print(' + before max ')
+   print(' + epsilon: '..self.diagHessianEpsilon)
+   print(' + norm of dhP: '..self.diagHessianParameters:norm())
+   print(' + max dhP : '..self.diagHessianParameters:max())
+   print(' + min dhp: '..self.diagHessianParameters:min())
+   -- protect diag hessian
    self.diagHessianParameters:apply(
-      function(x)
-         return math.max(x, self.diagHessianEpsilon)
+      function(x)
+         local out = math.max(math.abs(x), self.diagHessianEpsilon)
+         if (x < 0) then out = -out end
+         return out
       end)
-   --self.diagHessianParameters:add(self.diagHessianEpsilon)
    -- now learning rates are obtained like this:
    self.learningRates:cdiv(self.diagHessianParameters)
-   print('<diagHessian>')
+   -- test
+   print(' + after max')
    print(' + norm of dhP: '..self.diagHessianParameters:norm()..
         ' norm of LR: '..self.learningRates:norm())
    print(' + max dhP : '..self.diagHessianParameters:max() ..
-         ' max LR: '..self.learningRates:max())
-   print(' + min dhp: '.. self.diagHessianParameters:min() ..
          ' min LR: '..self.learningRates:min())
+   print(' + min dhp: '.. self.diagHessianParameters:min() ..
+         ' max LR: '..self.learningRates:max())
    -- self.learningRates:div(self.learningRates:norm())
 end
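The new clamp in SGD:diagHessian is worth isolating. The old code computed math.max(x, eps), which silently mapped every negative curvature estimate to +eps; the replacement bounds the magnitude away from zero while preserving the sign. A standalone version with the patch's new default eps of 1e-2:

```lua
-- Sign-preserving magnitude floor, extracted from the SGD hunk above.
local eps = 1e-2   -- the patch's new default for diagHessianEpsilon

local function protect(x)
   local out = math.max(math.abs(x), eps)
   if x < 0 then out = -out end
   return out
end

print(protect( 0.0001))  --  0.01  (tiny magnitude raised to eps)
print(protect(-0.0001))  -- -0.01  (sign kept; the old code returned 0.01)
print(protect( 0.5))     --  0.5   (large values pass through unchanged)
```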
diff --git a/SpatialColorTransform.lua b/SpatialColorTransform.lua
index 2179b0c..a06efd4 100644
--- a/SpatialColorTransform.lua
+++ b/SpatialColorTransform.lua
@@ -122,6 +122,13 @@ function SpatialColorTransform:backward(input, gradOutput)
    return self.gradInput
 end
 
+function SpatialColorTransform:type(type)
+   parent.type(self,type)
+   if self.islinear then
+      self.linear:type(type)
+   end
+end
+
 function SpatialColorTransform:write(file)
    parent.write(self, file)
    file:writeObject(self.transform)
@@ -138,4 +145,5 @@ function SpatialColorTransform:read(file)
    if self.islinear then
       self.linear = file:readObject()
    end
+   self.type = nil
 end
diff --git a/init.lua b/init.lua
--- a/init.lua
+++ b/init.lua
@@ -108,6 +108,7 @@ torch.include('nnx', 'SGDOptimization.lua')
 torch.include('nnx', 'ASGDOptimization.lua')
 torch.include('nnx', 'LBFGSOptimization.lua')
 torch.include('nnx', 'CGOptimization.lua')
+torch.include('nnx', 'newCGOptimization.lua')
 torch.include('nnx', 'GeneticSGDOptimization.lua')
 torch.include('nnx', 'DiagHessian.lua')
@@ -124,7 +125,25 @@ torch.include('nnx', 'DataSet.lua')
 torch.include('nnx', 'DataList.lua')
 torch.include('nnx', 'DataSetLabelMe.lua')
 
--- helpers:
+-- torch helpers (should not be here):
+function torch.save(filename, object, mode)
+   mode = mode or 'binary'
+   local file = torch.DiskFile(filename, 'w')
+   file[mode](file)
+   file:writeObject(object)
+   file:close()
+end
+
+function torch.load(filename, mode)
+   mode = mode or 'binary'
+   local file = torch.DiskFile(filename, 'r')
+   file[mode](file)
+   local object = file:readObject()
+   file:close()
+   return object
+end
+
+-- nn helpers:
 function nnx.empty(module)
    if module.modules then
       -- find submodules in classic containers 'modules'
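In the new init.lua helpers, `file[mode](file)` dispatches to DiskFile:binary() or DiskFile:ascii(), so the mode string must name one of those two methods. A short usage sketch; the file names are arbitrary and the saved object stands in for any serializable value:

```lua
-- Round-trip an object with the new helpers; 'binary' is the default.
local model = nn.Linear(10, 2)

torch.save('model.net', model)                -- binary mode by default
local reloaded = torch.load('model.net')

torch.save('model.ascii.net', model, 'ascii') -- human-readable mode
local reloaded2 = torch.load('model.ascii.net', 'ascii')
```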
diff --git a/newCGOptimization.lua b/newCGOptimization.lua
new file mode 100644
index 0000000..71b1c6e
--- /dev/null
+++ b/newCGOptimization.lua
@@ -0,0 +1,194 @@
+local CG,parent = torch.class('nn.newCGOptimization', 'nn.BatchOptimization')
+--
+-- wrapper around Koray's cg function which implements rasmussen's
+-- matlab cg in pure lua.
+-- Author: Marco Scoffier
+--
+function CG:__init(...)
+   parent.__init(self, ...)
+   xlua.unpack_class(self, {...},
+      'cgOptimization', nil,
+      {arg='rho', type='number', default=0.1},
+      {arg='sig', type='number', default=0.5},
+      {arg='int', type='number', default=0.1},
+      {arg='ext', type='number', default=3.0},
+      {arg='max', type='number', default=20},
+      {arg='ratio', type='number', default=100},
+      {arg='length', type='number', default=25},
+      {arg='red', type='number', default=1},
+      {arg='verbose', type='number', default=0}
+   )
+
+   -- we need three points for the interpolation/extrapolation stuff
+   self.df1, self.df2, self.df3 = torch.Tensor(),torch.Tensor(),torch.Tensor()
+   self.df1:resizeAs(self.parameters)
+   self.df2:resizeAs(self.parameters)
+   self.df3:resizeAs(self.parameters)
+
+   -- search direction
+   self.s = torch.Tensor():resizeAs(self.parameters)
+
+   -- we need a temp storage for X
+   self.x0 = torch.Tensor():resizeAs(self.parameters)
+   self.df0 = torch.Tensor():resizeAs(self.parameters)
+end
+
+function CG:optimize()
+   local rho = self.rho
+   local sig = self.sig
+   local int = self.int
+   local ext = self.ext
+   local max = self.max
+   local ratio = self.ratio
+   local length = self.length
+   local red = self.red
+   local verbose = self.verbose
+
+   local i = 0
+   local ls_failed = 0
+   local fx = {}
+
+   -- we need three points for the interpolation/extrapolation stuff
+   local z1, z2, z3 = 0,0,0
+   local d1, d2, d3 = 0,0,0
+   local f1, f2, f3 = 0,0,0
+   local df1,df2,df3 = self.df1, self.df2, self.df3
+
+   local x = self.parameters
+   local s = self.s
+   local x0 = self.x0
+   local f0 = 0
+   local df0 = self.df0
+
+   -- the magic function: will update the parameter vector using CG
+   -- evaluate at initial point
+   f1 = self.evaluate()
+   df1:copy(self.gradParameters)
+   i=i+1
+
+   -- initial search direction
+   s:copy(df1):mul(-1)
+
+   d1 = -s:dot(s)    -- slope
+   z1 = red/(1-d1)   -- initial step
+
+   while i < math.abs(length) do
+      x0:copy(x)
+      f0 = f1
+      df0:copy(df1)
+      x:add(z1,s)
+      f2 = self.evaluate()
+      df2:copy(self.gradParameters)
+      i=i+1
+      d2 = df2:dot(s)
+      f3,d3,z3 = f1,d1,-z1   -- init point 3 equal to point 1
+      local m = math.min(max,length-i)
+      local success = 0
+      local limit = -1
+
+      while true do
+         while (f2 > f1+z1*rho*d1 or d2 > -sig*d1) and m > 0 do
+            limit = z1
+            if f2 > f1 then
+               z2 = z3 - (0.5*d3*z3*z3)/(d3*z3+f2-f3)
+            else
+               local A = 6*(f2-f3)/z3+3*(d2+d3)
+               local B = 3*(f3-f2)-z3*(d3+2*d2)
+               z2 = (math.sqrt(B*B-A*d2*z3*z3)-B)/A
+            end
+            if z2 ~= z2 or z2 == math.huge or z2 == -math.huge then
+               z2 = z3/2;
+            end
+            z2 = math.max(math.min(z2, int*z3),(1-int)*z3);
+            z1 = z1 + z2;
+            x:add(z2,s)
+            f2 = self.evaluate()
+            df2:copy(self.gradParameters)
+            i=i+1
+            m = m - 1
+            d2 = df2:dot(s)
+            z3 = z3-z2;
+         end
+         if f2 > f1+z1*rho*d1 or d2 > -sig*d1 then
+            break
+         elseif d2 > sig*d1 then
+            success = 1;
+            break;
+         elseif m == 0 then
+            break;
+         end
+         local A = 6*(f2-f3)/z3+3*(d2+d3);
+         local B = 3*(f3-f2)-z3*(d3+2*d2);
+         z2 = -d2*z3*z3/(B+math.sqrt(B*B-A*d2*z3*z3))
+
+         if z2 ~= z2 or z2 == math.huge or z2 == -math.huge or z2 < 0 then
+            if limit < -0.5 then
+               z2 = z1 * (ext -1)
+            else
+               z2 = (limit-z1)/2
+            end
+         elseif (limit > -0.5) and (z2+z1) > limit then
+            z2 = (limit-z1)/2
+         elseif limit < -0.5 and (z2+z1) > z1*ext then
+            z2 = z1*(ext-1)
+         elseif z2 < -z3*int then
+            z2 = -z3*int
+         elseif limit > -0.5 and z2 < (limit-z1)*(1-int) then
+            z2 = (limit-z1)*(1-int)
+         end
+         f3=f2; d3=d2; z3=-z2;
+         z1 = z1+z2;
+
+         x:add(z2,s)
+
+         f2 = self.evaluate()
+         df2:copy(self.gradParameters)
+         i=i+1
+         m = m - 1
+         d2 = df2:dot(s)
+      end
+      if success == 1 then
+         f1 = f2
+         fx[#fx+1] = f1;
+         local ss = (df2:dot(df2)-df2:dot(df1)) / df1:dot(df1)
+         s:mul(ss)
+         s:add(-1,df2)
+         local tmp = df1:clone()
+         df1:copy(df2)
+         df2:copy(tmp)
+         d2 = df1:dot(s)
+         if d2 > 0 then
+            s:copy(df1)
+            s:mul(-1)
+            d2 = -s:dot(s)
+         end
+
+         z1 = z1 * math.min(ratio, d1/(d2-1e-320))
+         d1 = d2
+         ls_failed = 0
+      else
+         x:copy(x0)
+         f1 = f0
+         df1:copy(df0)
+         if ls_failed or i > length then
+            break
+         end
+         local tmp = df1:clone()
+         df1:copy(df2)
+         df2:copy(tmp)
+         s:copy(df1)
+         s:mul(-1)
+         d1 = -s:dot(s)
+         z1 = 1/(1-d1)
+         ls_failed = 1
+      end
+   end
+   self.output = f1 -- self.evaluate(x)
+   collectgarbage()
+end
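The new class chains its __init to nn.BatchOptimization and evaluates through self.evaluate, so constructing and stepping it should look like the existing CG and L-BFGS wrappers. A hedged sketch; model, criterion, inputs, and targets are placeholders, and only `length` and `verbose` below come from the new file:

```lua
-- Hypothetical full-batch step with the Rasmussen-style CG wrapper.
local optimizer = nn.newCGOptimization{
   module    = model,      -- placeholder network
   criterion = criterion,  -- placeholder loss
   length    = 25,         -- max line searches per :optimize() call
   verbose   = 1
}
-- BatchOptimization:forward() evaluates the batch, then :optimize()
-- runs the CG line searches and returns the final loss in self.output
local err = optimizer:forward(inputs, targets)
```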