local WeightedEuclidean, parent = torch.class('nn.WeightedEuclidean', 'nn.Module')

function WeightedEuclidean:__init(inputSize,outputSize)
   parent.__init(self)

   self.weight = torch.Tensor(inputSize,outputSize)
   self.gradWeight = torch.Tensor(inputSize,outputSize)

   -- each template (output dim) has its own diagonal covariance matrix
   self.diagCov = torch.Tensor(inputSize,outputSize)
   self.gradDiagCov = torch.Tensor(inputSize,outputSize)

   self:reset()
end

function WeightedEuclidean:reset(stdv)
   if stdv then
      stdv = stdv * math.sqrt(3)
   else
      stdv = 1./math.sqrt(self.weight:size(1))
   end
   self.weight:uniform(-stdv, stdv)
   self.diagCov:fill(1)
end

-- view src as the given sizes when contiguous, otherwise fall back to a reshape (copy)
local function view(res, src, ...)
   local args = {...}
   if src:isContiguous() then
      res:view(src, table.unpack(args))
   else
      res:reshape(src, table.unpack(args))
   end
end

function WeightedEuclidean:updateOutput(input)
   -- lazy-initialize
   self._diagCov = self._diagCov or self.output.new()

   self._input = self._input or input.new()
   self._weight = self._weight or self.weight.new()
   self._expand = self._expand or self.output.new()
   self._expand2 = self._expand2 or self.output.new()
   self._expand3 = self._expand3 or self.output.new()
   self._repeat = self._repeat or self.output.new()
   self._repeat2 = self._repeat2 or self.output.new()
   self._repeat3 = self._repeat3 or self.output.new()

   local inputSize, outputSize = self.weight:size(1), self.weight:size(2)

   -- y_j = || c_j * (w_j - x) ||
   if input:dim() == 1 then
      view(self._input, input, inputSize, 1)
      self._expand:expandAs(self._input, self.weight)
      self._repeat:resizeAs(self._expand):copy(self._expand)
      self._repeat:add(-1, self.weight)
      self._repeat:cmul(self.diagCov)
      self.output:norm(self._repeat, 2, 1)
      self.output:resize(outputSize)
   elseif input:dim() == 2 then
      local batchSize = input:size(1)

      view(self._input, input, batchSize, inputSize, 1)
      self._expand:expand(self._input, batchSize, inputSize, outputSize)

      -- make the expanded tensor contiguous (requires lots of memory)
      self._repeat:resizeAs(self._expand):copy(self._expand)

      self._weight:view(self.weight, 1, inputSize, outputSize)
      self._expand2:expandAs(self._weight, self._repeat)

      self._diagCov:view(self.diagCov, 1, inputSize, outputSize)
      self._expand3:expandAs(self._diagCov, self._repeat)
      if torch.type(input) == 'torch.CudaTensor' then
         -- requires lots of memory, but minimizes cudaMallocs and loops
         self._repeat2:resizeAs(self._expand2):copy(self._expand2)
         self._repeat:add(-1, self._repeat2)
         self._repeat3:resizeAs(self._expand3):copy(self._expand3)
         self._repeat:cmul(self._repeat3)
      else
         self._repeat:add(-1, self._expand2)
         self._repeat:cmul(self._expand3)
      end

      self.output:norm(self._repeat, 2, 2)
      self.output:resize(batchSize, outputSize)
   else
      error"1D or 2D input expected"
   end

   return self.output
end

function WeightedEuclidean:updateGradInput(input, gradOutput)
   if not self.gradInput then
      return
   end

   self._div = self._div or input.new()
   self._output = self._output or self.output.new()
   self._expand4 = self._expand4 or input.new()
   self._gradOutput = self._gradOutput or input.new()

   if not self.fastBackward then
      self:updateOutput(input)
   end

   local inputSize, outputSize = self.weight:size(1), self.weight:size(2)

   --[[
   dy_j   -2 * c_j * c_j * (w_j - x)   c_j * c_j * (x - w_j)
   ---- = -------------------------- = ---------------------
    dx      2 || c_j * (w_j - x) ||            y_j
   --]]

   -- to prevent div by zero (NaN) bugs
   self._output:resizeAs(self.output):copy(self.output):add(0.0000001)
   view(self._gradOutput, gradOutput, gradOutput:size())
   self._div:cdiv(gradOutput, self._output)
   if input:dim() == 1 then
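      -- non-batch mode: gradInput = sum_j c_j * c_j * (x - w_j) * gradOutput_j / y_j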
      self._div:resize(1, outputSize)
      self._expand4:expandAs(self._div, self.weight)

      if torch.type(input) == 'torch.CudaTensor' then
         self._repeat2:resizeAs(self._expand4):copy(self._expand4)
         self._repeat2:cmul(self._repeat)
      else
         self._repeat2:cmul(self._repeat, self._expand4)
      end

      self._repeat2:cmul(self.diagCov)
      self.gradInput:sum(self._repeat2, 2)
      self.gradInput:resizeAs(input)
   elseif input:dim() == 2 then
      local batchSize = input:size(1)

      self._div:resize(batchSize, 1, outputSize)
      self._expand4:expand(self._div, batchSize, inputSize, outputSize)

      if torch.type(input) == 'torch.CudaTensor' then
         self._repeat2:resizeAs(self._expand4):copy(self._expand4)
         self._repeat2:cmul(self._repeat)
         self._repeat2:cmul(self._repeat3)
      else
         self._repeat2:cmul(self._repeat, self._expand4)
         self._repeat2:cmul(self._expand3)
      end

      self.gradInput:sum(self._repeat2, 3)
      self.gradInput:resizeAs(input)
   else
      error"1D or 2D input expected"
   end

   return self.gradInput
end

function WeightedEuclidean:accGradParameters(input, gradOutput, scale)
   local inputSize, outputSize = self.weight:size(1), self.weight:size(2)
   scale = scale or 1

   --[[
   dy_j    2 * c_j * c_j * (w_j - x)   c_j * c_j * (w_j - x)
   ---- = -------------------------- = ---------------------
   dw_j     2 || c_j * (w_j - x) ||            y_j

   dy_j      2 * c_j * (w_j - x)^2    c_j * (w_j - x)^2
   ---- = -------------------------- = -----------------
   dc_j     2 || c_j * (w_j - x) ||          y_j
   --]]
   -- assumes a preceding call to updateGradInput
   if input:dim() == 1 then
      self.gradWeight:add(-scale, self._repeat2)

      self._repeat:cdiv(self.diagCov)
      self._repeat:cmul(self._repeat)
      self._repeat:cmul(self.diagCov)

      if torch.type(input) == 'torch.CudaTensor' then
         self._repeat2:resizeAs(self._expand4):copy(self._expand4)
         self._repeat2:cmul(self._repeat)
      else
         self._repeat2:cmul(self._repeat, self._expand4)
      end

      self.gradDiagCov:add(self._repeat2)
   elseif input:dim() == 2 then
      self._sum = self._sum or input.new()
      self._sum:sum(self._repeat2, 1)
      self._sum:resize(inputSize, outputSize)
      self.gradWeight:add(-scale, self._sum)

      if torch.type(input) == 'torch.CudaTensor' then
         -- requires lots of memory, but minimizes cudaMallocs and loops
         self._repeat:cdiv(self._repeat3)
         self._repeat:cmul(self._repeat)
         self._repeat:cmul(self._repeat3)
         self._repeat2:resizeAs(self._expand4):copy(self._expand4)
         self._repeat:cmul(self._repeat2)
      else
         self._repeat:cdiv(self._expand3)
         self._repeat:cmul(self._repeat)
         self._repeat:cmul(self._expand3)
         self._repeat:cmul(self._expand4)
      end

      self._sum:sum(self._repeat, 1)
      self._sum:resize(inputSize, outputSize)
      self.gradDiagCov:add(scale, self._sum)
   else
      error"1D or 2D input expected"
   end
end

function WeightedEuclidean:type(type, tensorCache)
   if type then
      -- prevent premature memory allocations
      self._input = nil
      self._output = nil
      self._gradOutput = nil
      self._weight = nil
      self._div = nil
      self._sum = nil
      self._expand = nil
      self._expand2 = nil
      self._expand3 = nil
      self._expand4 = nil
      self._repeat = nil
      self._repeat2 = nil
      self._repeat3 = nil
   end
   return parent.type(self, type, tensorCache)
end

function WeightedEuclidean:parameters()
   return {self.weight, self.diagCov}, {self.gradWeight, self.gradDiagCov}
end

function WeightedEuclidean:accUpdateGradParameters(input, gradOutput, lr)
   local gradWeight = self.gradWeight
   local gradDiagCov = self.gradDiagCov
   self.gradWeight = self.weight
   self.gradDiagCov = self.diagCov
   self:accGradParameters(input, gradOutput, -lr)
   self.gradWeight = gradWeight
   self.gradDiagCov = gradDiagCov
end
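
--[[
Example usage (a minimal sketch, not part of the module; the dimensions below
are arbitrary and chosen only for illustration):

   require 'nn'

   local inputSize, outputSize, batchSize = 5, 3, 4
   local m = nn.WeightedEuclidean(inputSize, outputSize)

   -- 1D input: output holds the outputSize weighted distances y_j
   local x = torch.randn(inputSize)
   local y = m:forward(x)                 -- size: outputSize

   -- 2D (batched) input: output is batchSize x outputSize
   local xb = torch.randn(batchSize, inputSize)
   local yb = m:forward(xb)               -- size: batchSize x outputSize

   -- backward accumulates into gradWeight and gradDiagCov
   m:zeroGradParameters()
   local gradInput = m:backward(xb, torch.ones(batchSize, outputSize))
--]]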