----------------------------------------------------------------------
-- hessian.lua: this file appends extra methods to modules in nn,
-- to estimate diagonal elements of the Hessian. This is useful
-- to condition learning rates individually.
----------------------------------------------------------------------
-- Namespace table for the diagonal-Hessian extension; nn.hessian.enable()
-- stores its helper functions in it.
nn.hessian = {}

----------------------------------------------------------------------
-- Hessian code is still experimental,
-- and deactivated by default
----------------------------------------------------------------------
function nn.hessian.enable()

   -- Accumulates diagonal-Hessian estimates for the parameters of `module`
   -- by reusing the module's own gradient code.
   --
   -- The gradient fields named in `gw` are temporarily swapped with the
   -- Hessian fields named in `hw`; the module is then forwarded on the
   -- element-wise squared input and module.accGradParameters() is called
   -- with `diagHessianOutput`, so whatever it writes to the "gradient"
   -- fields ends up in the Hessian buffers. Fields and module.output are
   -- restored before returning.
   --
   --   module            : module whose fields are swapped
   --   input             : input tensor (squared into module.inputSq)
   --   diagHessianOutput : tensor passed to accGradParameters as gradOutput
   --   gw, hw            : equal-length lists of field names
   --                       (gradient name -> Hessian name)
   --
   -- Raises an error if #gw ~= #hw. Pairs whose gradient field is nil
   -- (e.g. a module built without a bias) are skipped.
   local function accDiagHessianParameters(module, input, diagHessianOutput, gw, hw)
      if #gw ~= #hw then
         error('Number of gradients is not equal to number of hessians')
      end
      module.inputSq = module.inputSq or input.new()
      module.inputSq:resizeAs(input)
      torch.cmul(module.inputSq, input, input)
      -- replace gradients with hessian, remembering which pairs were swapped
      local swapped = {}
      for i=1,#gw do
         local gwname = gw[i]
         local hwname = hw[i]
         local gwval = module[gwname]
         if gwval ~= nil then
            local hwval = module[hwname]
            if hwval == nil then
               -- a fresh buffer must start at zero: it is accumulated into
               hwval = gwval.new():resizeAs(gwval):zero()
            end
            module[gwname] = hwval
            module[hwname] = gwval
            swapped[#swapped+1] = i
         end
      end
      -- forward on a scratch output so the real module.output survives
      local oldOutput = module.output
      module.output = oldOutput.new():resizeAs(oldOutput)
      module.forward(module, module.inputSq)
      module.accGradParameters(module, module.inputSq, diagHessianOutput, 1)
      -- put back gradients
      for _,i in ipairs(swapped) do
         local gwname = gw[i]
         local hwname = hw[i]
         local gwval = module[gwname]
         local hwval = module[hwname]
         module[gwname] = hwval
         module[hwname] = gwval
      end
      module.output = oldOutput
   end
   nn.hessian.accDiagHessianParameters = accDiagHessianParameters

   -- Fills module.diagHessianInput by calling the module's own
   -- updateGradInput() while (a) module.gradInput points at
   -- module.diagHessianInput and (b) every field named in `w` is replaced
   -- by its element-wise square (kept in the matching field named in `wsq`).
   -- All swapped fields and module.gradInput are restored afterwards.
   -- Raises an error when `w` and `wsq` differ in length.
   local function updateDiagHessianInput(module, input, diagHessianOutput, w, wsq)
      if #w ~= #wsq then
         error('Number of weights is not equal to number of weights squares')
      end

      local dhi = module.diagHessianInput or input.new()
      module.diagHessianInput = dhi
      dhi:resizeAs(input):zero()

      -- redirect the gradient output into the Hessian buffer
      local savedGradInput = module.gradInput
      module.gradInput = dhi

      -- install squared parameters under the regular parameter names
      for k = 1, #w do
         local name, sqName = w[k], wsq[k]
         local value = module[name]
         local squared = module[sqName]
         if squared == nil then
            squared = value.new()
            module[sqName] = squared
         end
         squared:resizeAs(value)
         torch.cmul(squared, value, value)
         module[sqName] = value
         module[name] = squared
      end

      module.updateGradInput(module, input, diagHessianOutput)

      -- undo the swap: original parameters back under their own names
      for k = 1, #w do
         local name, sqName = w[k], wsq[k]
         local current = module[name]
         local stashed = module[sqName]
         module[name] = stashed
         module[sqName] = current
      end
      module.gradInput = savedGradInput
   end
   nn.hessian.updateDiagHessianInput = updateDiagHessianInput

   -- Point-wise variant: runs updateDiagHessianInput() with a tensor of
   -- ones in place of diagHessianOutput and no parameter swapping, then
   -- squares the result in place and multiplies it by diagHessianOutput.
   local function updateDiagHessianInputPointWise(module, input, diagHessianOutput)
      local ones = diagHessianOutput.new()
      ones:resizeAs(diagHessianOutput)
      ones:fill(1)
      updateDiagHessianInput(module, input, ones, {}, {})
      local dhi = module.diagHessianInput
      dhi:cmul(dhi)
      dhi:cmul(diagHessianOutput)
   end
   nn.hessian.updateDiagHessianInputPointWise = updateDiagHessianInputPointWise

   -- Makes sure `module` owns a diagHessianInput tensor and, for every
   -- gradient field named in `gw`, a same-sized Hessian field named by the
   -- corresponding entry of `hw`. Existing Hessian fields are kept as-is.
   --
   --   module : module to prepare (must have a gradInput tensor)
   --   gw, hw : parallel lists of field names (gradient -> Hessian)
   --
   -- Newly created buffers are zero-filled so that later accumulation does
   -- not start from uninitialized memory. Pairs whose gradient field is nil
   -- (e.g. a module built without a bias) are skipped.
   local function initDiagHessianParameters(module,gw,hw)
      module.diagHessianInput = module.diagHessianInput or module.gradInput.new()
      for i=1,#gw do
         local gwval = module[gw[i]]
         if gwval ~= nil and not module[hw[i]] then
            module[hw[i]] = gwval.new():resizeAs(gwval):zero()
         end
      end
   end
   nn.hessian.initDiagHessianParameters = initDiagHessianParameters

   ----------------------------------------------------------------------
   -- Module
   ----------------------------------------------------------------------
   -- Default implementation: always raises, naming the concrete class
   -- that did not provide its own updateDiagHessianInput().
   function nn.Module:updateDiagHessianInput(input, diagHessianOutput)
      local className = torch.typename(self)
      error(className .. ':updateDiagHessianInput() is undefined')
   end

   -- Default implementation: does nothing and returns nothing.
   function nn.Module:accDiagHessianParameters(input, diagHessianOutput)
      return
   end

   -- Default implementation: does nothing and returns nothing.
   function nn.Module.initDiagHessianParameters()
      return
   end

   ----------------------------------------------------------------------
   -- Sequential
   ----------------------------------------------------------------------
   -- Forwards initDiagHessianParameters() to every contained module,
   -- in order.
   function nn.Sequential:initDiagHessianParameters()
      for _, child in ipairs(self.modules) do
         child:initDiagHessianParameters()
      end
   end

   -- Propagates diagHessianOutput backwards through the contained modules,
   -- last to first. Each module receives the previous module's output as
   -- its input; the first module receives `input`. The final result is
   -- stored in self.diagHessianInput and returned.
   function nn.Sequential:updateDiagHessianInput(input, diagHessianOutput)
      local children = self.modules
      local dh = diagHessianOutput
      for idx = #children, 2, -1 do
         local below = children[idx - 1]
         dh = children[idx]:updateDiagHessianInput(below.output, dh)
      end
      dh = children[1]:updateDiagHessianInput(input, dh)
      self.diagHessianInput = dh
      return dh
   end

   -- Calls accDiagHessianParameters() on every contained module, last to
   -- first. Each module is given the previous module's output as input;
   -- after a module is processed, its diagHessianInput field becomes the
   -- diagHessianOutput handed to the module before it.
   function nn.Sequential:accDiagHessianParameters(input, diagHessianOutput)
      local children = self.modules
      local dh = diagHessianOutput
      for idx = #children, 2, -1 do
         local layer = children[idx]
         layer:accDiagHessianParameters(children[idx - 1].output, dh)
         dh = layer.diagHessianInput
      end
      children[1]:accDiagHessianParameters(input, dh)
   end

   ----------------------------------------------------------------------
   -- Criterion
   ----------------------------------------------------------------------
   -- Default implementation for criterions: always raises, naming the
   -- concrete class that lacks its own updateDiagHessianInput().
   function nn.Criterion:updateDiagHessianInput(input, diagHessianOutput)
      local className = torch.typename(self)
      error(className .. ':updateDiagHessianInput() is undefined')
   end

   ----------------------------------------------------------------------
   -- MSECriterion
   ----------------------------------------------------------------------
   -- Returns self.diagHessianInput, sized like `input` and filled with a
   -- constant: 2, or 2 / input:nElement() when self.sizeAverage is set.
   -- `target` is not used.
   function nn.MSECriterion:updateDiagHessianInput(input, target)
      local dhi = self.diagHessianInput or input.new()
      self.diagHessianInput = dhi
      local scale
      if self.sizeAverage then
         scale = 2 / input:nElement()
      else
         scale = 2
      end
      dhi:resizeAs(input):fill(scale)
      return dhi
   end

   ----------------------------------------------------------------------
   -- WeightedMSECriterion
   ----------------------------------------------------------------------
   -- Delegates unchanged to the MSECriterion implementation.
   function nn.WeightedMSECriterion:updateDiagHessianInput(input, target)
      local plainMSE = nn.MSECriterion.updateDiagHessianInput
      return plainMSE(self, input, target)
   end

   ----------------------------------------------------------------------
   -- L1Cost
   ----------------------------------------------------------------------
   -- Returns self.diagHessianInput, sized like `input`: 1 everywhere
   -- except at positions where `input` equals 0, which are set to 0.
   function nn.L1Cost:updateDiagHessianInput(input)
      local dhi = self.diagHessianInput or input.new()
      self.diagHessianInput = dhi
      dhi:resizeAs(input)
      dhi:fill(1)
      local zeroMask = torch.eq(input, 0)
      dhi[zeroMask] = 0
      return dhi
   end

   ----------------------------------------------------------------------
   -- Linear
   ----------------------------------------------------------------------
   -- Runs the shared helper with 'weight' squared into 'weightSq' and
   -- returns the resulting self.diagHessianInput.
   function nn.Linear:updateDiagHessianInput(input, diagHessianOutput)
      local paramNames, squareNames = {'weight'}, {'weightSq'}
      updateDiagHessianInput(self, input, diagHessianOutput, paramNames, squareNames)
      return self.diagHessianInput
   end

   -- Runs the shared helper, pairing gradWeight/gradBias with
   -- diagHessianWeight/diagHessianBias.
   function nn.Linear:accDiagHessianParameters(input, diagHessianOutput)
      local gradNames = {'gradWeight','gradBias'}
      local hessianNames = {'diagHessianWeight','diagHessianBias'}
      accDiagHessianParameters(self, input, diagHessianOutput, gradNames, hessianNames)
   end

   -- Runs the shared init helper for the weight and bias Hessian fields.
   function nn.Linear:initDiagHessianParameters()
      local gradNames = {'gradWeight','gradBias'}
      local hessianNames = {'diagHessianWeight','diagHessianBias'}
      initDiagHessianParameters(self, gradNames, hessianNames)
   end

   ----------------------------------------------------------------------
   -- SpatialConvolution
   ----------------------------------------------------------------------
   -- Runs the shared helper with 'weight' squared into 'weightSq' and
   -- returns the resulting self.diagHessianInput.
   function nn.SpatialConvolution:updateDiagHessianInput(input, diagHessianOutput)
      local paramNames, squareNames = {'weight'}, {'weightSq'}
      updateDiagHessianInput(self, input, diagHessianOutput, paramNames, squareNames)
      return self.diagHessianInput
   end

   -- Runs the shared helper, pairing gradWeight/gradBias with
   -- diagHessianWeight/diagHessianBias.
   function nn.SpatialConvolution:accDiagHessianParameters(input, diagHessianOutput)
      local gradNames = {'gradWeight','gradBias'}
      local hessianNames = {'diagHessianWeight','diagHessianBias'}
      accDiagHessianParameters(self, input, diagHessianOutput, gradNames, hessianNames)
   end

   -- Runs the shared init helper for the weight and bias Hessian fields.
   function nn.SpatialConvolution:initDiagHessianParameters()
      local gradNames = {'gradWeight','gradBias'}
      local hessianNames = {'diagHessianWeight','diagHessianBias'}
      initDiagHessianParameters(self, gradNames, hessianNames)
   end

   ----------------------------------------------------------------------
   -- SpatialConvolutionLocal
   ----------------------------------------------------------------------
   -- Runs the shared helper with 'weight' squared into 'weightSq' and
   -- returns the resulting self.diagHessianInput.
   function nn.SpatialConvolutionLocal:updateDiagHessianInput(input, diagHessianOutput)
      local paramNames, squareNames = {'weight'}, {'weightSq'}
      updateDiagHessianInput(self, input, diagHessianOutput, paramNames, squareNames)
      return self.diagHessianInput
   end

   -- Runs the shared helper, pairing gradWeight/gradBias with
   -- diagHessianWeight/diagHessianBias.
   function nn.SpatialConvolutionLocal:accDiagHessianParameters(input, diagHessianOutput)
      local gradNames = {'gradWeight','gradBias'}
      local hessianNames = {'diagHessianWeight','diagHessianBias'}
      accDiagHessianParameters(self, input, diagHessianOutput, gradNames, hessianNames)
   end

   -- Runs the shared init helper for the weight and bias Hessian fields.
   function nn.SpatialConvolutionLocal:initDiagHessianParameters()
      local gradNames = {'gradWeight','gradBias'}
      local hessianNames = {'diagHessianWeight','diagHessianBias'}
      initDiagHessianParameters(self, gradNames, hessianNames)
   end

   ----------------------------------------------------------------------
   -- SpatialFullConvolution
   ----------------------------------------------------------------------
   -- Runs the shared helper with 'weight' squared into 'weightSq' and
   -- returns the resulting self.diagHessianInput.
   function nn.SpatialFullConvolution:updateDiagHessianInput(input, diagHessianOutput)
      local paramNames, squareNames = {'weight'}, {'weightSq'}
      updateDiagHessianInput(self, input, diagHessianOutput, paramNames, squareNames)
      return self.diagHessianInput
   end

   -- Runs the shared helper, pairing gradWeight/gradBias with
   -- diagHessianWeight/diagHessianBias.
   function nn.SpatialFullConvolution:accDiagHessianParameters(input, diagHessianOutput)
      local gradNames = {'gradWeight','gradBias'}
      local hessianNames = {'diagHessianWeight','diagHessianBias'}
      accDiagHessianParameters(self, input, diagHessianOutput, gradNames, hessianNames)
   end

   -- Runs the shared init helper for the weight and bias Hessian fields.
   function nn.SpatialFullConvolution:initDiagHessianParameters()
      local gradNames = {'gradWeight','gradBias'}
      local hessianNames = {'diagHessianWeight','diagHessianBias'}
      initDiagHessianParameters(self, gradNames, hessianNames)
   end

   ----------------------------------------------------------------------
   -- SpatialConvolutionMap
   ----------------------------------------------------------------------
   -- Runs the shared helper with both 'weight' and 'bias' squared (into
   -- 'weightSq' and 'biasSq') and returns self.diagHessianInput.
   function nn.SpatialConvolutionMap:updateDiagHessianInput(input, diagHessianOutput)
      local paramNames = {'weight','bias'}
      local squareNames = {'weightSq','biasSq'}
      updateDiagHessianInput(self, input, diagHessianOutput, paramNames, squareNames)
      return self.diagHessianInput
   end

   -- Runs the shared helper, pairing gradWeight/gradBias with
   -- diagHessianWeight/diagHessianBias.
   function nn.SpatialConvolutionMap:accDiagHessianParameters(input, diagHessianOutput)
      local gradNames = {'gradWeight','gradBias'}
      local hessianNames = {'diagHessianWeight','diagHessianBias'}
      accDiagHessianParameters(self, input, diagHessianOutput, gradNames, hessianNames)
   end

   -- Runs the shared init helper for the weight and bias Hessian fields.
   function nn.SpatialConvolutionMap:initDiagHessianParameters()
      local gradNames = {'gradWeight','gradBias'}
      local hessianNames = {'diagHessianWeight','diagHessianBias'}
      initDiagHessianParameters(self, gradNames, hessianNames)
   end

   ----------------------------------------------------------------------
   -- SpatialFullConvolutionMap
   ----------------------------------------------------------------------
   -- Runs the shared helper with 'weight' squared into 'weightSq' and
   -- returns the resulting self.diagHessianInput.
   function nn.SpatialFullConvolutionMap:updateDiagHessianInput(input, diagHessianOutput)
      local paramNames, squareNames = {'weight'}, {'weightSq'}
      updateDiagHessianInput(self, input, diagHessianOutput, paramNames, squareNames)
      return self.diagHessianInput
   end

   -- Runs the shared helper, pairing gradWeight/gradBias with
   -- diagHessianWeight/diagHessianBias.
   function nn.SpatialFullConvolutionMap:accDiagHessianParameters(input, diagHessianOutput)
      local gradNames = {'gradWeight','gradBias'}
      local hessianNames = {'diagHessianWeight','diagHessianBias'}
      accDiagHessianParameters(self, input, diagHessianOutput, gradNames, hessianNames)
   end

   -- Runs the shared init helper for the weight and bias Hessian fields.
   function nn.SpatialFullConvolutionMap:initDiagHessianParameters()
      local gradNames = {'gradWeight','gradBias'}
      local hessianNames = {'diagHessianWeight','diagHessianBias'}
      initDiagHessianParameters(self, gradNames, hessianNames)
   end

   ----------------------------------------------------------------------
   -- Tanh
   ----------------------------------------------------------------------
   -- Uses the point-wise helper on this module and returns the
   -- self.diagHessianInput it filled in.
   function nn.Tanh:updateDiagHessianInput(input, diagHessianOutput)
      updateDiagHessianInputPointWise(self, input, diagHessianOutput)
      local result = self.diagHessianInput
      return result
   end

   ----------------------------------------------------------------------
   -- TanhShrink
   ----------------------------------------------------------------------
   -- Runs the point-wise helper on the inner self.tanh module, then stores
   -- (self.tanh.diagHessianInput + diagHessianOutput) in
   -- self.diagHessianInput and returns it.
   -- NOTE(review): confirm that this sum is the intended approximation for
   -- the shrink function's second-order term.
   function nn.TanhShrink:updateDiagHessianInput(input, diagHessianOutput)
      local inner = self.tanh
      updateDiagHessianInputPointWise(inner, input, diagHessianOutput)
      if not self.diagHessianInput then
         self.diagHessianInput = input.new():resizeAs(input)
      end
      torch.add(self.diagHessianInput, inner.diagHessianInput, diagHessianOutput)
      return self.diagHessianInput
   end

   ----------------------------------------------------------------------
   -- Square
   ----------------------------------------------------------------------
   -- Uses the point-wise helper on this module and returns the
   -- self.diagHessianInput it filled in.
   function nn.Square:updateDiagHessianInput(input, diagHessianOutput)
      updateDiagHessianInputPointWise(self, input, diagHessianOutput)
      local result = self.diagHessianInput
      return result
   end

   ----------------------------------------------------------------------
   -- Sqrt
   ----------------------------------------------------------------------
   -- Uses the point-wise helper on this module and returns the
   -- self.diagHessianInput it filled in.
   function nn.Sqrt:updateDiagHessianInput(input, diagHessianOutput)
      updateDiagHessianInputPointWise(self, input, diagHessianOutput)
      local result = self.diagHessianInput
      return result
   end

   ----------------------------------------------------------------------
   -- Reshape
   ----------------------------------------------------------------------
   -- Points self.diagHessianInput at a contiguous version of
   -- diagHessianOutput (via set) and resizes it to the size of `input`.
   function nn.Reshape:updateDiagHessianInput(input, diagHessianOutput)
      if not self.diagHessianInput then
         self.diagHessianInput = input.new()
      end
      local dense = diagHessianOutput:contiguous()
      local dhi = self.diagHessianInput
      dhi:set(dense)
      dhi:resizeAs(input)
      return self.diagHessianInput
   end

   ----------------------------------------------------------------------
   -- Parameters manipulation:
   -- we modify these functions such that they return hessian coefficients
   ----------------------------------------------------------------------
   -- Returns three parallel lists: parameters, their gradients, and their
   -- diagonal-Hessian buffers, built from whichever of self.weight and
   -- self.bias exist. Returns nothing when the module has neither.
   function nn.Module:parameters()
      local weight, bias = self.weight, self.bias
      if weight and bias then
         return {weight, bias}, {self.gradWeight, self.gradBias}, {self.diagHessianWeight, self.diagHessianBias}
      end
      if weight then
         return {weight}, {self.gradWeight}, {self.diagHessianWeight}
      end
      if bias then
         return {bias}, {self.gradBias}, {self.diagHessianBias}
      end
   end

   -- Collects the lists from self:parameters() and passes each one through
   -- nn.Module.flatten, running the garbage collector after every call.
   -- The Hessian list is only flattened when it exists and has a first
   -- entry; otherwise the third return value is nil.
   -- Returns: flat parameters, flat gradients, flat Hessians (or nil).
   function nn.Module:getParameters()
      local flatten = nn.Module.flatten
      local params, grads, hessians = self:parameters()

      local flatParams = flatten(params)
      collectgarbage()
      local flatGrads = flatten(grads)
      collectgarbage()

      local flatHessians = nil
      local hasHessians = hessians and hessians[1]
      if hasHessians then
         flatHessians = flatten(hessians)
         collectgarbage()
      end

      return flatParams, flatGrads, flatHessians
   end

   -- Gathers the parameter, gradient and Hessian lists of every contained
   -- module into three flat lists (nested tables are expanded
   -- recursively). Modules whose parameters() returns nothing are skipped.
   function nn.Sequential:parameters()
      -- append `src` to `dst`, descending into plain tables
      local function appendFlat(dst, src)
         if torch.type(src) ~= 'table' then
            table.insert(dst, src)
            return
         end
         for k = 1, #src do
            appendFlat(dst, src[k])
         end
      end

      local weights, grads, hessians = {}, {}, {}
      for _, child in ipairs(self.modules) do
         local cw, cg, ch = child:parameters()
         if cw then
            appendFlat(weights, cw)
            appendFlat(grads, cg)
            appendFlat(hessians, ch)
         end
      end
      return weights, grads, hessians
   end

   ----------------------------------------------------------------------
   -- Avoid multiple calls to enable()
   ----------------------------------------------------------------------
   -- Once the methods above are installed, replace enable() with a no-op
   -- so that calling it again does nothing.
   nn.hessian.enable = function()
   end
end