author    | Koray Kavukcuoglu <koray@kavukcuoglu.org> | 2011-11-09 18:57:25 +0400
committer | Koray Kavukcuoglu <koray@kavukcuoglu.org> | 2011-11-09 18:57:25 +0400
commit    | 8eb46a42b238a14161fd78a38edcbd525924ac46
tree      | 460a135c67d4c277cba51a62d069636e87cb74ee
parent    | d2a8a948595192a962d486df36ed182656e389d6
parent    | f542c2f46d3f3d41df1919cd9dd76cb53b189f53
Merge branch 'master' of github.com:clementfarabet/lua---nnx
-rw-r--r-- | ASGDOptimization.lua      |  23
-rw-r--r-- | BatchOptimization.lua     |  15
-rw-r--r-- | CMakeLists.txt            |   1
-rw-r--r-- | DataSet.lua               |   4
-rw-r--r-- | DiagHessian.lua           |   2
-rw-r--r-- | LBFGSOptimization.lua     |  12
-rw-r--r-- | SGDOptimization.lua       |  37
-rw-r--r-- | SpatialColorTransform.lua |   8
-rw-r--r-- | init.lua                  |  21
-rw-r--r-- | newCGOptimization.lua     | 194
10 files changed, 303 insertions, 14 deletions
diff --git a/ASGDOptimization.lua b/ASGDOptimization.lua
index 28e1131..03a2058 100644
--- a/ASGDOptimization.lua
+++ b/ASGDOptimization.lua
@@ -46,6 +46,7 @@ function ASGD:optimize()
       self.parameters:add(-self.eta_t, self.gradParameters)
    end
    -- (3) Average part
+   -- a := a + mu_t [ w - a ]
    self.a = self.a or self.parameters.new():resizeAs(self.parameters):zero()
    if self.mu_t ~= 1 then
       self.tmp = self.tmp or self.a.new():resizeAs(self.a)
@@ -63,4 +64,24 @@ function ASGD:optimize()
    -- (4c) update mu_t
    -- mu_t = 1/max(1,t-t0)
    self.mu_t = 1 / math.max(1,self.t - self.t0)
-end
\ No newline at end of file
+end
+
+-- in ASGD we keep a copy of the parameters which is an averaged
+-- version of the current parameters. This function is to test with
+-- those averaged parameters. Best to run on batches because we have
+-- to copy the full parameter vector
+
+function ASGD:test(_inputs, _targets) -- function test
+   -- (0) make a backup of the online parameters
+   self.backup = self.backup or
+      self.parameters.new():resizeAs(self.parameters)
+   self.backup:copy(self.parameters)
+   -- (1) copy average parameters into the model
+   self.parameters:copy(self.a)
+   -- (2) do the test with the average parameters
+   self.output = self.module:forward(_inputs)
+   self.error = self.criterion:forward(self.output, _targets)
+   -- (3) copy back the online parameters to continue training
+   self.parameters:copy(self.backup)
+   return self.error
+end
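The comment added at step (3) spells out the running average, a := a + mu_t (w - a), that ASGD maintains next to the online weights, and the new ASGD:test evaluates with that average instead of w. A hedged sketch of how it might slot into a training loop, assuming the constructor conventions of the other nnx *Optimization classes; every other name here (model, criterion, the data tables) is a placeholder, not part of the patch:

```lua
-- Hypothetical loop: train online, then measure error against the
-- running average `a` rather than the current weights `w`.
local optimizer = nn.ASGDOptimization{module = model, criterion = criterion}
for epoch = 1, 10 do
   for _, batch in ipairs(trainBatches) do
      optimizer:forward(batch.inputs, batch.targets)  -- online ASGD steps
   end
   -- evaluate with the averaged parameters; the online weights are
   -- backed up and restored inside ASGD:test
   local avgErr = optimizer:test(testInputs, testTargets)
   print('epoch ' .. epoch .. ': averaged-parameter error = ' .. avgErr)
end
```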
diff --git a/BatchOptimization.lua b/BatchOptimization.lua
index 1e83bbb..b0c2a12 100644
--- a/BatchOptimization.lua
+++ b/BatchOptimization.lua
@@ -17,7 +17,13 @@ function Batch:__init(...)
       {arg='precode', type='function',
        help='optional code to be run by each parallel worker at their init'},
       {arg='verbose', type='number',
-       help='verbose level during training [0-2]', default=0}
+       help='verbose level during training [0-2]', default=0},
+      {arg='allreduce', type='boolean', help='use allreduce', default=false},
+      {arg='allreduceSyncTime', type='boolean', help='sync period', default=1},
+      {arg='allreduceMaster', type='string', help='master address', default='localhost'},
+      {arg='allreduceUniqueId', type='boolean', help='job unique id', default=0},
+      {arg='allreduceNbNodes', type='boolean', help='number of nodes', default=1},
+      {arg='allreduceNodeId', type='boolean', help='this node\'s id', default=1}
    )
    self.parameters = nnx.flattenParameters(nnx.getParameters(self.module))
    self.gradParameters = nnx.flattenParameters(nnx.getGradParameters(self.module))
@@ -30,6 +36,13 @@ function Batch:__init(...)
       self:setup_mapreduce()
    end
    self.P = self.parallelize
+
+   if self.allreduce then
+      xrequire 'allreduce'
+      allreduce.init(self.allreduceMaster, self.allreduceUniqueId,
+                     self.allreduceNbNodes, self.allreduceNodeId)
+      self.accError = 0
+   end
 end
 
 function Batch:forward(inputs, targets, options)
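The SGDOptimization hunk further down reads self.allreduce, so these options evidently reach subclasses through this shared constructor. A hedged sketch of a two-node setup; the address, job id, sync period, and model/criterion names are illustrative placeholders, and the `allreduce` package must be installed for xrequire to find it:

```lua
-- Hypothetical two-node job; run with allreduceNodeId = 1 on one
-- machine and allreduceNodeId = 2 on the other.
local optimizer = nn.SGDOptimization{
   module            = model,
   criterion         = criterion,
   allreduce         = true,
   allreduceSyncTime = 10,              -- sync every 10 samples
   allreduceMaster   = '192.168.1.10',  -- must be reachable from every node
   allreduceUniqueId = 42,              -- same job id on all nodes
   allreduceNbNodes  = 2,
   allreduceNodeId   = 1                -- use 2 on the second node
}
```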
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 70481dc..3bfb4fc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -128,6 +128,7 @@ install_files(${INSTALL_PREFIX} SpatialRecursiveFovea.lua)
 install_files(${INSTALL_PREFIX} Optimization.lua)
 install_files(${INSTALL_PREFIX} LBFGSOptimization.lua)
 install_files(${INSTALL_PREFIX} CGOptimization.lua)
+install_files(${INSTALL_PREFIX} newCGOptimization.lua)
 install_files(${INSTALL_PREFIX} SGDOptimization.lua)
 install_files(${INSTALL_PREFIX} ASGDOptimization.lua)
 install_files(${INSTALL_PREFIX} GeneticSGDOptimization.lua)
diff --git a/DataSet.lua b/DataSet.lua
index 9ed7cbe..4efb85d 100644
--- a/DataSet.lua
+++ b/DataSet.lua
@@ -58,6 +58,7 @@ function lDataSet:load(...)
       -- File found
       print('<DataSet> Loading samples from cached file ' .. fileName)
       f = torch.DiskFile(fileName, 'rw')
+      f:binary()
       self:read(f)
       f.close(f)
       datasetLoadedFromFile = true
@@ -73,6 +74,7 @@ function lDataSet:load(...)
      if (fileName ~= nil) then
         print('<DataSet> Dumping dataset to cache file ' .. fileName ..
               ' for fast retrieval')
         f = torch.DiskFile(fileName, 'rw')
+        f:binary()
         self:write(f)
         f.close(f)
      end
@@ -301,6 +303,7 @@ function lDataSet:save(fileName)
    self.fileName = fileName
    print('<DataSet> Saving DataSet to:',fileName)
    local file = torch.DiskFile(fileName, 'w')
+   file:binary()
    self:write(file)
    file:close()
 end
@@ -310,6 +313,7 @@ function lDataSet:open(fileName)
    self.fileName = fileName
    print('<DataSet> Loading DataSet from File:',fileName)
    local file = torch.DiskFile(fileName, 'r')
+   file:binary()
    self:read(file)
    file:close()
    print('<DataSet> '..self.nbSamples..' samples loaded')
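All four `f:binary()` additions address the same issue: torch.DiskFile defaults to ASCII serialization, so the writer and the reader of a cached dataset must agree on the mode. A minimal standalone illustration (the cache file name is arbitrary):

```lua
-- Switching both ends of the round trip to binary mode keeps the cache
-- compact and ensures readObject sees the same format writeObject produced.
local t = torch.Tensor(4):fill(1)

local f = torch.DiskFile('cache.t7', 'w')
f:binary()                -- must be set before writeObject
f:writeObject(t)
f:close()

f = torch.DiskFile('cache.t7', 'r')
f:binary()                -- must match the mode used when writing
local t2 = f:readObject()
f:close()
```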
diff --git a/DiagHessian.lua b/DiagHessian.lua
index dfcdcaf..26aed3e 100644
--- a/DiagHessian.lua
+++ b/DiagHessian.lua
@@ -1,7 +1,7 @@
 -- Module
 function nn.Module.backwardDiagHessian(self, input, diagHessianOutput)
-   self.diagHessianInput = self.diagHessianInput or input
+   self.diagHessianInput = self.diagHessianInput or diagHessianOutput
    return self.diagHessianInput
 end
 
diff --git a/LBFGSOptimization.lua b/LBFGSOptimization.lua
index 53dfe70..fce9ba5 100644
--- a/LBFGSOptimization.lua
+++ b/LBFGSOptimization.lua
@@ -37,7 +37,17 @@ end
 
 function LBFGS:optimize()
    -- callback for lBFGS
-   lbfgs.evaluate = self.evaluate
+   lbfgs.evaluate = function()
+      local loss = self.evaluate()
+      if self.allreduce then
+         local losst = torch.Tensor(1):fill(loss)
+         allreduce.accumulate(losst)
+         allreduce.accumulate(self.gradParameters)
+         loss = losst[1]
+      end
+      return loss
+   end
+
    -- the magic function: will update the parameter vector according to the l-BFGS method
    self.output = lbfgs.run()
 end
diff --git a/SGDOptimization.lua b/SGDOptimization.lua
index e26c6ed..1309ae9 100644
--- a/SGDOptimization.lua
+++ b/SGDOptimization.lua
@@ -54,6 +54,16 @@ function SGD:optimize()
          -- normal single learningRate parameter update
         self.parameters:add(-learningRate, self.currentGradParameters)
      end
+
+      -- (5) allreduce sync
+      if self.allreduce then
+         if (self.sampleCounter % self.allreduceSyncTime) == self.allreduceSyncTime-1 then
+            allreduce.best(self.parameters, self.accError)
+            self.accError = 0
+         else
+            self.accError = self.accError + self.output
+         end
+      end
    end -- for loop on maxIterations
 end
@@ -78,9 +88,11 @@ end
 
 function SGD:diagHessian(inputs, targets)
    if not self.learningRates then
+      print('<SGD> creating learningRates, initDiagHessian')
       -- do initialization
-      self.diagHessianEpsilon = self.diagHessianEpslion or 1e-3
+      self.diagHessianEpsilon = self.diagHessianEpsilon or 1e-2
       self.learningRates = torch.Tensor():typeAs(self.parameters):resizeAs(self.parameters):fill(1)
+      -- we can call this multiple times as it will only create the tensors once.
       self.module:initDiagHessianParameters()
       self.diagHessianParameters = nnx.flattenParameters(nnx.getDiagHessianParameters(self.module))
@@ -109,23 +121,30 @@ function SGD:diagHessian(inputs, targets)
       module:accDiagHessianParameters(inputs, critDiagHessian)
       self.diagHessianParameters:div(inputs:size(1))
    end
-   -- protect diag hessian (the proper way of doing it is the commented code,
-   -- but for speed reasons, the uncommented code just works)
+   print('<diagHessian>')
+   print(' + before max ')
+   print(' + epsilon: '..self.diagHessianEpsilon)
+   print(' + norm of dhP: '..self.diagHessianParameters:norm())
+   print(' + max dhP : '..self.diagHessianParameters:max())
+   print(' + min dhp: '..self.diagHessianParameters:min())
+   -- protect diag hessian
    self.diagHessianParameters:apply(
-      function(x)
-         return math.max(x, self.diagHessianEpsilon)
+      function(x)
+         local out = math.max(math.abs(x), self.diagHessianEpsilon)
+         if (x < 0) then out = -out end
+         return out
       end)
-   --self.diagHessianParameters:add(self.diagHessianEpsilon)
    -- now learning rates are obtained like this:
    self.learningRates:cdiv(self.diagHessianParameters)
-   print('<diagHessian>')
+   -- test
+   print(' + after max')
    print(' + norm of dhP: '..self.diagHessianParameters:norm()..
         ' norm of LR: '..self.learningRates:norm())
    print(' + max dhP : '..self.diagHessianParameters:max() ..
-         ' max LR: '..self.learningRates:max())
-   print(' + min dhp: '.. self.diagHessianParameters:min() ..
          ' min LR: '..self.learningRates:min())
+   print(' + min dhp: '.. self.diagHessianParameters:min() ..
+         ' max LR: '..self.learningRates:max())
    -- self.learningRates:div(self.learningRates:norm())
 end
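The new clamp in SGD:diagHessian is worth isolating. The old code computed math.max(x, eps), which silently mapped every negative curvature estimate to +eps; the replacement bounds the magnitude away from zero while preserving the sign. A standalone version with the patch's new default eps of 1e-2:

```lua
-- Sign-preserving magnitude floor, extracted from the SGD hunk above.
local eps = 1e-2   -- the patch's new default for diagHessianEpsilon

local function protect(x)
   local out = math.max(math.abs(x), eps)
   if x < 0 then out = -out end
   return out
end

print(protect( 0.0001))  --  0.01  (tiny magnitude raised to eps)
print(protect(-0.0001))  -- -0.01  (sign kept; the old code returned 0.01)
print(protect( 0.5))     --  0.5   (large values pass through unchanged)
```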
diff --git a/SpatialColorTransform.lua b/SpatialColorTransform.lua
index 2179b0c..a06efd4 100644
--- a/SpatialColorTransform.lua
+++ b/SpatialColorTransform.lua
@@ -122,6 +122,13 @@ function SpatialColorTransform:backward(input, gradOutput)
    return self.gradInput
 end
 
+function SpatialColorTransform:type(type)
+   parent.type(self,type)
+   if self.islinear then
+      self.linear:type(type)
+   end
+end
+
 function SpatialColorTransform:write(file)
    parent.write(self, file)
    file:writeObject(self.transform)
@@ -138,4 +145,5 @@ function SpatialColorTransform:read(file)
    if self.islinear then
       self.linear = file:readObject()
    end
+   self.type = nil
 end
diff --git a/init.lua b/init.lua
--- a/init.lua
+++ b/init.lua
@@ -108,6 +108,7 @@ torch.include('nnx', 'SGDOptimization.lua')
 torch.include('nnx', 'ASGDOptimization.lua')
 torch.include('nnx', 'LBFGSOptimization.lua')
 torch.include('nnx', 'CGOptimization.lua')
+torch.include('nnx', 'newCGOptimization.lua')
 torch.include('nnx', 'GeneticSGDOptimization.lua')
 torch.include('nnx', 'DiagHessian.lua')
@@ -124,7 +125,25 @@ torch.include('nnx', 'DataSet.lua')
 torch.include('nnx', 'DataList.lua')
 torch.include('nnx', 'DataSetLabelMe.lua')
 
--- helpers:
+-- torch helpers (should not be here):
+function torch.save(filename, object, mode)
+   mode = mode or 'binary'
+   local file = torch.DiskFile(filename, 'w')
+   file[mode](file)
+   file:writeObject(object)
+   file:close()
+end
+
+function torch.load(filename, mode)
+   mode = mode or 'binary'
+   local file = torch.DiskFile(filename, 'r')
+   file[mode](file)
+   local object = file:readObject()
+   file:close()
+   return object
+end
+
+-- nn helpers:
 function nnx.empty(module)
    if module.modules then
       -- find submodules in classic containers 'modules'
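In the new init.lua helpers, `file[mode](file)` dispatches to DiskFile:binary() or DiskFile:ascii(), so the mode string must name one of those two methods. A short usage sketch; the file names are arbitrary and the saved object stands in for any serializable value:

```lua
-- Round-trip an object with the new helpers; 'binary' is the default.
local model = nn.Linear(10, 2)

torch.save('model.net', model)                -- binary mode by default
local reloaded = torch.load('model.net')

torch.save('model.ascii.net', model, 'ascii') -- human-readable mode
local reloaded2 = torch.load('model.ascii.net', 'ascii')
```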
diff --git a/newCGOptimization.lua b/newCGOptimization.lua
new file mode 100644
index 0000000..71b1c6e
--- /dev/null
+++ b/newCGOptimization.lua
@@ -0,0 +1,194 @@
+local CG,parent = torch.class('nn.newCGOptimization', 'nn.BatchOptimization')
+--
+-- wrapper around Koray's cg function which implements rasmussen's
+-- matlab cg in pure lua.
+-- Author: Marco Scoffier
+--
+function CG:__init(...)
+   parent.__init(self, ...)
+   xlua.unpack_class(self, {...},
+      'cgOptimization', nil,
+      {arg='rho', type='number', default=0.1},
+      {arg='sig', type='number', default=0.5},
+      {arg='int', type='number', default=0.1},
+      {arg='ext', type='number', default=3.0},
+      {arg='max', type='number', default=20},
+      {arg='ratio', type='number', default=100},
+      {arg='length', type='number', default=25},
+      {arg='red', type='number', default=1},
+      {arg='verbose', type='number', default=0}
+   )
+
+   -- we need three points for the interpolation/extrapolation stuff
+   self.df1, self.df2, self.df3 = torch.Tensor(),torch.Tensor(),torch.Tensor()
+   self.df1:resizeAs(self.parameters)
+   self.df2:resizeAs(self.parameters)
+   self.df3:resizeAs(self.parameters)
+
+   -- search direction
+   self.s = torch.Tensor():resizeAs(self.parameters)
+
+   -- we need a temp storage for X
+   self.x0 = torch.Tensor():resizeAs(self.parameters)
+   self.df0 = torch.Tensor():resizeAs(self.parameters)
+end
+
+function CG:optimize()
+   local rho = self.rho
+   local sig = self.sig
+   local int = self.int
+   local ext = self.ext
+   local max = self.max
+   local ratio = self.ratio
+   local length = self.length
+   local red = self.red
+   local verbose = self.verbose
+
+   local i = 0
+   local ls_failed = 0
+   local fx = {}
+
+   -- we need three points for the interpolation/extrapolation stuff
+   local z1, z2, z3 = 0,0,0
+   local d1, d2, d3 = 0,0,0
+   local f1, f2, f3 = 0,0,0
+   local df1,df2,df3 = self.df1, self.df2, self.df3
+
+   local x = self.parameters
+   local s = self.s
+   local x0 = self.x0
+   local f0 = 0
+   local df0 = self.df0
+
+   -- the magic function: will update the parameter vector using CG
+   -- evaluate at initial point
+   f1 = self.evaluate()
+   df1:copy(self.gradParameters)
+   i=i+1
+
+   -- initial search direction
+   s:copy(df1):mul(-1)
+
+   d1 = -s:dot(s)    -- slope
+   z1 = red/(1-d1)   -- initial step
+
+   while i < math.abs(length) do
+      x0:copy(x)
+      f0 = f1
+      df0:copy(df1)
+      x:add(z1,s)
+      f2 = self.evaluate()
+      df2:copy(self.gradParameters)
+      i=i+1
+      d2 = df2:dot(s)
+      f3,d3,z3 = f1,d1,-z1   -- init point 3 equal to point 1
+      local m = math.min(max,length-i)
+      local success = 0
+      local limit = -1
+
+      while true do
+         while (f2 > f1+z1*rho*d1 or d2 > -sig*d1) and m > 0 do
+            limit = z1
+            if f2 > f1 then
+               z2 = z3 - (0.5*d3*z3*z3)/(d3*z3+f2-f3)
+            else
+               local A = 6*(f2-f3)/z3+3*(d2+d3)
+               local B = 3*(f3-f2)-z3*(d3+2*d2)
+               z2 = (math.sqrt(B*B-A*d2*z3*z3)-B)/A
+            end
+            if z2 ~= z2 or z2 == math.huge or z2 == -math.huge then
+               z2 = z3/2;
+            end
+            z2 = math.max(math.min(z2, int*z3),(1-int)*z3);
+            z1 = z1 + z2;
+            x:add(z2,s)
+            f2 = self.evaluate()
+            df2:copy(self.gradParameters)
+            i=i+1
+            m = m - 1
+            d2 = df2:dot(s)
+            z3 = z3-z2;
+         end
+         if f2 > f1+z1*rho*d1 or d2 > -sig*d1 then
+            break
+         elseif d2 > sig*d1 then
+            success = 1;
+            break;
+         elseif m == 0 then
+            break;
+         end
+         local A = 6*(f2-f3)/z3+3*(d2+d3);
+         local B = 3*(f3-f2)-z3*(d3+2*d2);
+         z2 = -d2*z3*z3/(B+math.sqrt(B*B-A*d2*z3*z3))
+
+         if z2 ~= z2 or z2 == math.huge or z2 == -math.huge or z2 < 0 then
+            if limit < -0.5 then
+               z2 = z1 * (ext -1)
+            else
+               z2 = (limit-z1)/2
+            end
+         elseif (limit > -0.5) and (z2+z1) > limit then
+            z2 = (limit-z1)/2
+         elseif limit < -0.5 and (z2+z1) > z1*ext then
+            z2 = z1*(ext-1)
+         elseif z2 < -z3*int then
+            z2 = -z3*int
+         elseif limit > -0.5 and z2 < (limit-z1)*(1-int) then
+            z2 = (limit-z1)*(1-int)
+         end
+         f3=f2; d3=d2; z3=-z2;
+         z1 = z1+z2;
+
+         x:add(z2,s)
+
+         f2 = self.evaluate()
+         df2:copy(self.gradParameters)
+         i=i+1
+         m = m - 1
+         d2 = df2:dot(s)
+      end
+      if success == 1 then
+         f1 = f2
+         fx[#fx+1] = f1;
+         local ss = (df2:dot(df2)-df2:dot(df1)) / df1:dot(df1)
+         s:mul(ss)
+         s:add(-1,df2)
+         local tmp = df1:clone()
+         df1:copy(df2)
+         df2:copy(tmp)
+         d2 = df1:dot(s)
+         if d2 > 0 then
+            s:copy(df1)
+            s:mul(-1)
+            d2 = -s:dot(s)
+         end
+
+         z1 = z1 * math.min(ratio, d1/(d2-1e-320))
+         d1 = d2
+         ls_failed = 0
+      else
+         x:copy(x0)
+         f1 = f0
+         df1:copy(df0)
+         if ls_failed or i > length then
+            break
+         end
+         local tmp = df1:clone()
+         df1:copy(df2)
+         df2:copy(tmp)
+         s:copy(df1)
+         s:mul(-1)
+         d1 = -s:dot(s)
+         z1 = 1/(1-d1)
+         ls_failed = 1
+      end
+   end
+   self.output = f1 -- self.evaluate(x)
+   collectgarbage()
+end
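The new class chains its __init to nn.BatchOptimization and evaluates through self.evaluate, so constructing and stepping it should look like the existing CG and L-BFGS wrappers. A hedged sketch; model, criterion, inputs, and targets are placeholders, and only `length` and `verbose` below come from the new file:

```lua
-- Hypothetical full-batch step with the Rasmussen-style CG wrapper.
local optimizer = nn.newCGOptimization{
   module    = model,      -- placeholder network
   criterion = criterion,  -- placeholder loss
   length    = 25,         -- max line searches per :optimize() call
   verbose   = 1
}
-- BatchOptimization:forward() evaluates the batch, then :optimize()
-- runs the CG line searches and returns the final loss in self.output
local err = optimizer:forward(inputs, targets)
```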