preliminary Batch modules for matlab optimization

Conflicts: OnlineTrainer.lua
author: Marc Scoffier <mps309@cassio.cs.nyu.edu> 2011-08-31 06:35:25 +0400
committer: Marc Scoffier <mps309@cassio.cs.nyu.edu> 2011-08-31 06:35:25 +0400
commit: 879c7f87c2e2727b5479c9e97a97aed59edfab13 (patch)
tree: be565e772c374a6d9b9317b226f1e346692f1fbf
parent: 470b00f1bbcf4b8d4ef671b703b8e6fa3209344a (diff)
parent: 987894db868ed9b5ad0cd746a8c3569985acd71d (diff)
7 files changed, 406 insertions, 42 deletions
diff --git a/BatchOptimization.lua b/BatchOptimization.lua
new file mode 100644
index 0000000..93cbd2a
--- /dev/null
+++ b/BatchOptimization.lua
@@ -0,0 +1,277 @@
+local Batch,parent = torch.class('nn.BatchOptimization', 'nn.Optimization')
+
+-- This is a generic class for any batch optimization, modeled after
+-- the LBFGS optimization.  It simply provides a batch.evaluate() closure
+-- which evaluates self.module at self.parameters and fills
+-- self.gradParameters
+
+function Batch:__init(...)
+   -- constructor: unpacks the named arguments into self, flattens the
+   -- module's parameters and gradients into self.parameters and
+   -- self.gradParameters, and sets up the workers when parallelize > 1
+   parent.__init(self)
+   xlua.unpack_class(self, {...}, 'BatchOptimization', nil,
+      {arg='module', type='nn.Module', help='a module to train', req=true},
+      {arg='criterion', type='nn.Criterion',
+       help='a criterion to estimate the error', req=true},
+      {arg='maxIterations', type='number',
+       help='maximum nb of iterations per pass (0 = no max)', default=0},
+      {arg='maxLineSearch', type='number',
+       help='maximum nb of steps in line search', default=20},
+      {arg='sparsity', type='number',
+       help='sparsity coef (Orthantwise C)', default=0},
+      {arg='parallelize', type='number',
+       help='parallelize onto N cores (experimental!)', default=1},
+      {arg='verbose', type='number',
+       help='verbose level during training [0-2]', default=0}
+   )
+   self.parameters = nnx.flattenParameters(nnx.getParameters(self.module))
+   self.gradParameters = nnx.flattenParameters(nnx.getGradParameters(self.module))
+   if self.parallelize > 1 then self:setup_mapreduce() end
+end
+
+function Batch:forward(inputs, targets, options)
+   -- dispatch to the map-reduce path only when more than one core is asked for
+   options = options or {}
+   if not (self.parallelize > 1) then
+      return self:forward_sequential(inputs, targets, options)
+   end
+   return self:forward_mapreduce(inputs, targets, options)
+end
+
+function Batch:forward_sequential(inputs, targets, options)
+   -- (1) build the closure batch.evaluate() over this mini-batch; each call:
+   --       + zeroes self.gradParameters, runs forward/backward on every
+   --         sample, then divides self.gradParameters by #inputs
+   --       + leaves the summed criterion error in self.output and returns
+   --         the average; NOTE(review): 'batch' is not defined here - confirm
+   batch.evaluate
+      = function()
+           -- reset gradients
+           self.gradParameters:zero()
+           -- f is the average of all criterions
+           self.output = 0
+           -- given all inputs, evaluate gradients
+           for i = 1,#inputs do
+              -- user hook
+              if self.prehook then
+                 self.prehook(self, {inputs[i], targets[i], options[i]})
+              end
+              -- estimate f
+              local output = self.module:forward(inputs[i])
+              local err = self.criterion:forward(output, targets[i])
+              self.output = self.output + err
+              -- estimate df/dW
+              local df_do = self.criterion:backward(output, targets[i])
+              self.module:backward(inputs[i], df_do)
+              -- user hook
+              if self.posthook then
+                 self.posthook(self, {inputs[i], targets[i], options[i]})
+              end
+           end
+           -- normalize gradients
+           self.gradParameters:div(#inputs)
+           -- return average f(X)
+           return self.output/#inputs
+        end
+
+   -- (2) return self.output as left by the last batch.evaluate() call, if any
+   return self.output
+end
+
+function Batch:forward_mapreduce(inputs, targets, options)
+   -- splits the mini-batch over P workers and defines the batch.evaluate*() closures
+   local P = self.parallelize
+
+   -- transmit user hooks, once (workers always expect two strings, '' = no hook)
+   if not self.hooksets then
+      if self.prehook then
+         if type(self.prehook) == 'string' then
+            parallel.children:send(self.prehook)
+         else
+            print('\r<BatchOptimization> WARNING: when using para||el mode, hooks should be')
+            print('\r<BatchOptimization> WARNING: defined as strings. User prehook ignored.')
+            parallel.children:send('')
+         end
+      else
+         parallel.children:send('')
+      end
+      if self.posthook then
+         if type(self.posthook) == 'string' then
+            parallel.children:send(self.posthook)
+         else
+            print('\r<BatchOptimization> WARNING: when using para||el mode, hooks should be')
+            print('\r<BatchOptimization> WARNING: defined as strings. User posthook ignored.')
+            parallel.children:send('')
+         end
+      else
+         parallel.children:send('')
+      end
+      self.hooksets = true
+   end
+
+   -- (0a) per-worker partial outputs and gradParameters, filled by evaluate_map()
+   local outputsPartial = {}
+   local gradParametersPartial = {}
+
+   -- (0b) divide input/target batch into P batches (sample i goes to worker t
+   --      when i = t, t+P, t+2P, ...)
+   local inputss = {}
+   local targetss = {}
+   local optionss = {}
+   for t = 1,P do
+      inputss[t] = {}
+      targetss[t] = {}
+      optionss[t] = {}
+      for i = t,#inputs,P do
+         table.insert(inputss[t], inputs[i])
+         table.insert(targetss[t], targets[i])
+         if options then table.insert(optionss[t], options[i]) end
+      end
+   end
+
+   -- (0c) send mini-batch to all workers
+   for t = 1,P do
+      parallel.children[t]:send(inputss[t])
+      parallel.children[t]:send(targetss[t])
+      parallel.children[t]:send(optionss[t])
+   end
+
+   -- (1) construct a closure that computes f(inputs) + df/dW
+   --     (NOTE(review): 'batch' is not defined in this file - confirm);
+   --     after each call to that function:
+   --       + self.gradParameters contains the estimated dF/dX vector
+   --       + self.output contains the summed F(X); the average is returned
+   batch.evaluate
+      = function()
+           batch.evaluate_map()
+           return batch.evaluate_reduce()
+        end
+
+   -- (1a) the map part of the evaluation: compute partial gradients
+   --      in separate threads
+   batch.evaluate_map
+      = function()
+           -- transmit new parameters to all workers
+           parallel.children:send(self.parameters)
+           -- then wait for all workers to return their partial gradParameters + outputs
+           for t = 1,P do
+              gradParametersPartial[t] = parallel.children[t]:receive()
+              outputsPartial[t] = parallel.children[t]:receive()
+           end
+           -- force cleanup
+           collectgarbage()
+        end
+
+   -- (1b) the reduce part of the evaluation: accumulate all
+   --      partial estimates of the gradients
+   batch.evaluate_reduce
+      = function()
+           -- accumulate partial gradients, and average
+           self.gradParameters:zero()
+           for t = 1,P do
+              self.gradParameters:add(gradParametersPartial[t])
+           end
+           self.gradParameters:div(#inputs)
+           -- return average f(X)
+           self.output = 0
+           for t = 1,P do
+              self.output = self.output + outputsPartial[t]
+           end
+           return self.output/#inputs
+        end
+
+   -- (2) reset workers for next mini-batch; NOTE(review): sent before evaluate() runs
+   parallel.children:send('break')
+
+   -- (3) return current output, divided by the size of the mini-batch
+   return self.output/#inputs
+end
+
+function Batch:setup_mapreduce ()
+   -- starts self.parallelize workers and sends them the model; (0) load 'parallel'
+   if not xrequire 'parallel' then
+      xerror('install parallel for Lua to enable parallel computing (luarocks install parallel)',
+             'nn.BatchOptimization')
+   end
+   parallel.setSharedSize(4*1024*1024)
+   local P = self.parallelize
+
+   -- (1) code run by each worker: receive model and hooks, then loop over mini-batches
+   local worker_code = [[
+         -- require packages
+         require 'nnx'
+
+         -- retrieve module + criterion at startup
+         module = parallel.parent:receive()
+         criterion = parallel.parent:receive()
+
+         -- create fake optimizer, for hooks
+         optimizer = {module=module, criterion=criterion}
+
+         -- retrieve optional prehook/posthook
+         prehook = parallel.parent:receive()
+         posthook = parallel.parent:receive()
+         if prehook ~= '' then loadstring(prehook)() else prehook = nil end
+         if posthook ~= '' then loadstring(posthook)() else posthook = nil end
+
+         -- get pointer to parameter and gradParameter vectors
+         parameters = nnx.flattenParameters(nnx.getParameters(module))
+         gradParameters = nnx.flattenParameters(nnx.getGradParameters(module))
+
+         -- outter loop: mini-batches
+         while true do
+            -- receive new mini-batch
+            inputs = parallel.parent:receive()
+            if type(inputs) == 'string' and inputs == 'break' then break end
+            targets = parallel.parent:receive()
+            options = parallel.parent:receive()
+
+            -- inner loop: evaluations
+            while true do
+               -- receive new set of parameters
+               newParameters = parallel.parent:receive()
+               if type(newParameters) == 'string' and newParameters == 'break' then break end
+               parameters:copy(newParameters)
+
+               -- reset gradients
+               gradParameters:zero()
+               -- f is the average of all criterions
+               local f_x = 0
+               -- evaluate gradients on inputs for this thread
+               for i = 1,#inputs do
+                  -- user hook
+                  if prehook then
+                     prehook(optimizer, {inputs[i], targets[i], options[i]})
+                  end
+                  -- estimate f
+                  local output = module:forward(inputs[i])
+                  local err = criterion:forward(output, targets[i])
+                  f_x = f_x + err
+                  -- estimate df/dW
+                  local df_do = criterion:backward(output, targets[i])
+                  module:backward(inputs[i], df_do)
+                  -- user hook
+                  if posthook then
+                     posthook(optimizer, {inputs[i], targets[i], options[i]})
+                  end
+               end
+
+               -- now send back gradParameters + partial output
+               parallel.parent:send(gradParameters)
+               parallel.parent:send(f_x)
+
+               -- force cleanup
+               collectgarbage()
+            end
+         end
+   ]]
+
+   -- (2) start the P workers, each one running worker_code
+   for t = 1,P do
+      parallel.run(worker_code)
+   end
+
+   -- (3) and send them the module + criterion (the first two receives in worker_code)
+   parallel.children:send(self.module)
+   parallel.children:send(self.criterion)
+end
diff --git a/BatchTrainer.lua b/BatchTrainer.lua
new file mode 100644
index 0000000..b774a9b
--- /dev/null
+++ b/BatchTrainer.lua
@@ -0,0 +1,120 @@
+local BatchTrainer, parent = torch.class('nn.BatchTrainer', 'nn.OnlineTrainer')
+
+-- Essentially similar to the OnlineTrainer, but only uses the parts
+-- of the code which prepare the data and the tester. train() has been
+-- replaced by next_batch() which moves the trainer one batch further
+-- in the data.  When the first epoch is finished then the batches are
+-- reused.  Each call to optimizer.forward() in next_batch() creates a
+-- closure with the current batch as input.
+
+function BatchTrainer:__init(...)
+   -- parent.__init(self)  -- NOTE(review): parent constructor call is disabled - confirm
+   -- unpack the named args into self (fields are listed below)
+   xlua.unpack_class(self, {...},
+      'BatchTrainer', 
+
+      'A general-purpose online trainer class.\n'
+         .. 'Provides 4 user hooks to perform extra work after each sample, or each epoch:\n'
+         .. '> trainer = nn.BatchTrainer(...) \n'
+         .. '> trainer.hookTrainSample = function(trainer, sample) ... end \n'
+         .. '> trainer.hookTrainEpoch = function(trainer) ... end \n'
+         .. '> trainer.hookTestSample = function(trainer, sample) ... end \n'
+         .. '> trainer.hookTestEpoch = function(trainer) ... end \n'
+         .. '> ',
+
+      {arg='trainset', type='nn.DataList',
+       help='dataset to split into batches for closures',req=true},
+      {arg='module', type='nn.Module', help='a module to train', req=true},
+      {arg='criterion', type='nn.Criterion', 
+       help='a criterion to estimate the error'},
+      {arg='preprocessor', type='nn.Module', 
+       help='a preprocessor to prime the data before the module'},
+      {arg='optimizer', type='nn.Optimization', 
+       help='an optimization method'}, 
+      {arg='batchSize', type='number', 
+       help='[mini] batch size', default=1},
+      {arg='maxEpoch', type='number', 
+       help='maximum number of epochs', default=50},
+      {arg='dispProgress', type='boolean', 
+       help='display a progress bar during training/testing', default=true},
+      {arg='save', type='string', 
+       help='path to save networks and log training'},
+      {arg='timestamp', type='boolean', 
+       help='if true, appends a timestamp to each network saved', default=false}
+   )
+   -- private params: trainOffset starts one batch below 0, so the first next() yields 0
+   self.trainOffset = -self.batchSize
+   self.testOffset = 0
+
+   -- counters: current epoch, and number of the current batch (advanced by next())
+   self.epoch = 1
+   self.batch = 0
+end
+
+-- Advance the counters by one batch; wraps to a new epoch at the end of the
+-- training set.  self.trainOffset is the 0-based offset of the current batch.
+function BatchTrainer:next()
+   local size = self.trainset:size()
+   self.batch = self.batch + 1
+   self.trainOffset = self.trainOffset + self.batchSize
+   if self.trainOffset > size-1 then
+      -- restart at offset 0 (was 1, which shifted every later epoch by a sample)
+      self.trainOffset = 0
+      self.epoch = self.epoch + 1
+      self.batch = 1
+   end
+   -- display progress, against self.trainset rather than a bare global 'trainset'
+   if self.dispProgress then xlua.progress(self.trainOffset, size) end
+end
+
+-- This function is called train() in the online trainer.  It seems to
+-- make more sense to call it next_batch() here, as the training is
+-- done outside of this code.
+
+function BatchTrainer:next_batch()
+   -- Move to the next batch, build (or fetch from the cache) its inputs
+   -- and targets, and hand them to self.optimizer:forward().
+   -- next() is a method and reads self, so it must be called with ':'
+   self:next()
+   local t = self.trainOffset
+   local ds = self.trainset:size()
+   local bs = self.batchSize
+
+   print('<trainer> on training set:')
+   print("<trainer> online epoch # " .. self.epoch .. ' batch # '..self.batch.. '[batchSize = ' .. self.batchSize .. ']')
+
+   -- mini-batches are cached per batch number, so that later epochs
+   -- reuse them instead of reloading and preprocessing the samples
+   self.inputs = self.inputs or {}
+   self.targets = self.targets or {}
+   local inputs = self.inputs[self.batch]
+   local targets = self.targets[self.batch]
+   if not inputs then
+      -- first visit of this batch: create it and register it in the cache
+      inputs, targets = {}, {}
+      self.inputs[self.batch] = inputs
+      self.targets[self.batch] = targets
+
+      -- trainOffset is 0-based (next() starts it at 0), whereas samples
+      -- are indexed from 1: this batch covers samples t+1 .. t+bs,
+      -- clipped to the size of the training set
+      for i = t+1,math.min(t+bs,ds) do
+         -- load new sample from self.trainset (not a global 'trainset')
+         local sample = self.trainset[i]
+         local input = sample[1]
+         local target = sample[2]
+
+         -- optional preprocess (no learning is done for that guy)
+         if self.preprocessor then input = self.preprocessor:forward(input) end
+
+         -- store input/target
+         table.insert(inputs, input)
+         table.insert(targets, target)
+      end
+   end
+
+   -- set up closure batch.evaluate() for optimizer
+   self.optimizer:forward(inputs, targets)
+end
+
+
diff --git a/FindTorch.cmake b/FindTorch.cmake
index 8ada8cc..6658d42 100644
--- a/FindTorch.cmake
+++ b/FindTorch.cmake
@@ -13,9 +13,9 @@ if (TORCH_EXECUTABLE)
   get_filename_component (TORCH_BIN_DIR ${TORCH_EXECUTABLE} PATH)
 endif (TORCH_EXECUTABLE)
 
-find_library (TORCH_TH TH ${TORCH_BIN_DIR}/../lib)
-find_library (TORCH_luaT luaT ${TORCH_BIN_DIR}/../lib)
-find_library (TORCH_lua lua ${TORCH_BIN_DIR}/../lib)
+find_library (TORCH_TH TH ${TORCH_BIN_DIR}/../lib NO_DEFAULT_PATH)
+find_library (TORCH_luaT luaT ${TORCH_BIN_DIR}/../lib NO_DEFAULT_PATH)
+find_library (TORCH_lua lua ${TORCH_BIN_DIR}/../lib NO_DEFAULT_PATH)
 
 set (TORCH_LIBRARIES ${TORCH_TH} ${TORCH_luaT} ${TORCH_lua})
 
diff --git a/OnlineTrainer.lua b/OnlineTrainer.lua
index f213924..dc6e860 100644
--- a/OnlineTrainer.lua
+++ b/OnlineTrainer.lua
@@ -33,9 +33,6 @@ function OnlineTrainer:__init(...)
       {arg='timestamp', type='boolean', 
        help='if true, appends a timestamp to each network saved', default=false}
    )
-   -- private params
-   self.trainOffset = 0
-   self.testOffset = 0
 end
 
 function OnlineTrainer:log()
@@ -63,15 +60,6 @@ function OnlineTrainer:train(dataset)
    local criterion = self.criterion
    self.trainset = dataset
 
-   local shuffledIndices = {}
-   if not self.shuffleIndices then
-      for t = 1,dataset:size() do
-         shuffledIndices[t] = t
-      end
-   else
-      shuffledIndices = lab.randperm(dataset:size())
-   end
-
    while true do
       print('<trainer> on training set:')
       print("<trainer> online epoch # " .. self.epoch .. ' [batchSize = ' .. self.batchSize .. ']')
@@ -89,7 +77,7 @@ function OnlineTrainer:train(dataset)
          local targets = {}
          for i = t,math.min(t+self.batchSize-1,dataset:size()) do
             -- load new sample
-            local sample = dataset[shuffledIndices[self.trainOffset + i]]
+            local sample = dataset[i]
             local input = sample[1]
             local target = sample[2]
 
@@ -128,10 +116,6 @@ function OnlineTrainer:train(dataset)
 
       self.epoch = self.epoch + 1
 
-      if dataset.infiniteSet then
-         self.trainOffset = self.trainOffset + dataset:size()
-      end
-
       if self.maxEpoch > 0 and self.epoch > self.maxEpoch then
          print("<trainer> you have reached the maximum number of epochs")
          break
@@ -144,20 +128,10 @@ function OnlineTrainer:test(dataset)
    print('<trainer> on testing Set:')
 
    local module = self.module
-   local shuffledIndices = {}
    local criterion = self.criterion
    self.currentError = 0
    self.testset = dataset
 
-   local shuffledIndices = {}
-   if not self.shuffleIndices then
-      for t = 1,dataset:size() do
-         shuffledIndices[t] = t
-      end
-   else
-      shuffledIndices = lab.randperm(dataset:size())
-   end
-   
    self.time = sys.clock()
    for t = 1,dataset:size() do
       -- disp progress
@@ -166,7 +140,7 @@ function OnlineTrainer:test(dataset)
       end
 
       -- get new sample
-      local sample = dataset[shuffledIndices[self.testOffset + t]]
+      local sample = dataset[t]
       local input = sample[1]
       local target = sample[2]
       
@@ -197,10 +171,6 @@ function OnlineTrainer:test(dataset)
       self.hookTestEpoch(self)
    end
 
-   if dataset.infiniteSet then
-      self.testOffset = self.testOffset + dataset:size()
-   end
-
    return self.currentError
 end
 
diff --git a/Trainer.lua b/Trainer.lua
index 3388ef7..b7da770 100644
--- a/Trainer.lua
+++ b/Trainer.lua
@@ -4,7 +4,6 @@ function Trainer:__init()
    self.learningRate = 0.01
    self.learningRateDecay = 0
    self.maxIteration = 25
-   self.shuffleIndices = true
 end
 
 function Trainer:train(dataset)
@@ -14,14 +13,12 @@ function Trainer:write(file)
    file:writeDouble(self.learningRate)
    file:writeDouble(self.learningRateDecay)
    file:writeInt(self.maxIteration)
-   file:writeBool(self.shuffleIndices)
 end
 
 function Trainer:read(file)
    self.learningRate = file:readDouble()
    self.learningRateDecay = file:readDouble()
    self.maxIteration = file:readInt()
-   self.shuffleIndices = file:readBool()
 end
 
 function Trainer:share(mlp, ...)
@@ -30,10 +27,6 @@ function Trainer:share(mlp, ...)
    end
 end
 
-function Trainer:setShuffle(bool)
-      self.shuffleIndices = bool
-end
-
 function Trainer:clone(...)
    local f = torch.MemoryFile("rw"):binary()
    f:writeObject(self)
diff --git a/init.lua b/init.lua
index ea44de0..6b09afc 100644
--- a/init.lua
+++ b/init.lua
@@ -102,10 +102,12 @@ torch.include('nnx', 'SpatialSparseCriterion.lua')
 torch.include('nnx', 'Optimization.lua')
 torch.include('nnx', 'SGDOptimization.lua')
 torch.include('nnx', 'LBFGSOptimization.lua')
+torch.include('nnx', 'BatchOptimization.lua')
 
 -- trainers:
 torch.include('nnx', 'Trainer.lua')
 torch.include('nnx', 'OnlineTrainer.lua')
+torch.include('nnx', 'BatchTrainer.lua')
 
 -- datasets:
 torch.include('nnx', 'DataSet.lua')
diff --git a/nnx-1.0-1.rockspec b/nnx-1.0-1.rockspec
index 3af08d0..dcb8d1b 100644
--- a/nnx-1.0-1.rockspec
+++ b/nnx-1.0-1.rockspec
@@ -103,6 +103,8 @@ build = {
          install_files(/lua/nnx Optimization.lua)
          install_files(/lua/nnx LBFGSOptimization.lua)
          install_files(/lua/nnx SGDOptimization.lua)
+         install_files(/lua/nnx BatchOptimization.lua)
+         install_files(/lua/nnx BatchTrainer.lua)
          add_subdirectory (test)
          install_targets(/lib nnx)
    ]],
author	Marc Scoffier <mps309@cassio.cs.nyu.edu>	2011-08-31 06:35:25 +0400
committer	Marc Scoffier <mps309@cassio.cs.nyu.edu>	2011-08-31 06:35:25 +0400
commit	879c7f87c2e2727b5479c9e97a97aed59edfab13 (patch)
tree	be565e772c374a6d9b9317b226f1e346692f1fbf
parent	470b00f1bbcf4b8d4ef671b703b8e6fa3209344a (diff)
parent	987894db868ed9b5ad0cd746a8c3569985acd71d (diff)