-- nn.Normalize
-- Normalizes the input to have unit L_p norm: output = input / ||input||_p,
-- with a small eps added to avoid division by zero. Accepts a single vector
-- (1D tensor) or a batch of vectors (2D tensor, one vector per row).
local Normalize, parent = torch.class('nn.Normalize', 'nn.Module')

function Normalize:__init(p, eps)
  parent.__init(self)
  assert(p, 'p-norm not provided')
  assert(p > 0, p..'-norm not supported')
  self.p = p
  self.eps = eps or 1e-10
end

function Normalize:updateOutput(input)
  assert(input:dim() <= 2, 'only 1d layer supported')
  local input_size = input:size()
  if input:dim() == 1 then
    input = input:view(1, -1)
  end

  self._output = self._output or input.new()
  self.norm = self.norm or input.new()
  self.buffer = self.buffer or input.new()

  self._output:resizeAs(input)

  if self.p == math.huge then
    -- specialization for the infinity norm
    if not self._indices then
      if torch.typename(self.output):find('torch%.Cuda.*Tensor') then
        self._indices = torch.CudaLongTensor and torch.CudaLongTensor() or torch.CudaTensor()
      else
        self._indices = torch.LongTensor()
      end
    end

    self.buffer:abs(input)
    torch.max(self.norm, self._indices, self.buffer, 2)
    self.norm:add(self.eps)
  else
    self.normp = self.normp or input.new()
    if self.p % 2 ~= 0 then
      self.buffer:abs(input):pow(self.p)
    else
      self.buffer:pow(input, self.p)
    end
    self.normp:sum(self.buffer, 2):add(self.eps)
    self.norm:pow(self.normp, 1/self.p)
  end
  self._output:cdiv(input, self.norm:view(-1, 1):expandAs(input))

  self.output:view(self._output, input_size)
  return self.output
end

function Normalize:updateGradInput(input, gradOutput)
  assert(input:dim() <= 2, 'only 1d layer supported')
  assert(gradOutput:dim() <= 2, 'only 1d layer supported')
  local input_size = input:size()
  if input:dim() == 1 then
    input = input:view(1, -1)
  end

  local n = input:size(1) -- batch size
  local d = input:size(2) -- dimensionality of vectors

  self._gradInput = self._gradInput or input.new()
  self.cross = self.cross or input.new()

  -- For out = x / ||x||_p with finite p, the gradient per row is
  --   gradInput = (normp * gradOutput - x .* |x|^(p-2) * <x, gradOutput>) / (normp * norm)
  -- where norm = ||x||_p and normp = ||x||_p^p. It is assembled below as a
  -- diagonal term minus a cross term, then divided by the normalization factor;
  -- the infinity-norm case follows the same structure with normp replaced by norm.

  -- compute diagonal term with gradOutput
  self._gradInput:resize(n, d)
  if self.p == math.huge then
    -- specialization for the inf case
    self._gradInput:cmul(self.norm:view(n, 1, 1):expand(n, d, 1), gradOutput)
    self.buffer:resizeAs(input):zero()
    self.cross:resize(n, 1)
    self.cross:gather(input, 2, self._indices)
    self.cross:cdiv(self.norm)
    self.buffer:scatter(2, self._indices, self.cross)
  else
    self._gradInput:cmul(self.normp:view(n, 1):expand(n, d), gradOutput)
    -- small optimizations for different p
    -- buffer = input*|input|^(p-2)
    if self.p % 2 ~= 0 then
      -- for non-even p, need to add absolute value
      if self.p < 2 then
        -- add eps to avoid possible division by 0
        self.buffer:abs(input):add(self.eps):pow(self.p-2):cmul(input)
      else
        self.buffer:abs(input):pow(self.p-2):cmul(input)
      end
    elseif self.p == 2 then
      -- special case for p == 2, pow(x,0) = 1
      self.buffer:copy(input)
    else
      -- p is even and > 2, pow(x,p) is always positive
      self.buffer:pow(input, self.p-2):cmul(input)
    end
  end

  -- compute cross term in two steps
  self.cross:resize(n, 1)

  -- instead of having a huge temporary matrix (b1*b2),
  -- do the computations as b1*(b2*gradOutput). This avoids redundant
  -- computation and also a huge buffer of size n*d^2
  self.buffer2 = self.buffer2 or input.new() -- nxd
  self.buffer2:cmul(input, gradOutput)
  self.cross:sum(self.buffer2, 2)

  self.buffer:cmul(self.cross:expandAs(self.buffer))
  self._gradInput:add(-1, self.buffer)

  -- reuse cross buffer for normalization
  if self.p == math.huge then
    self.cross:cmul(self.norm, self.norm)
  else
    self.cross:cmul(self.normp, self.norm)
  end
  self._gradInput:cdiv(self.cross:expand(n, d))

  self.gradInput:view(self._gradInput, input_size)
  return self.gradInput
end

function Normalize:__tostring__()
  local s
  -- different prints if the norm is integer
  if self.p % 1 == 0 then
    s = '%s(%d)'
  else
    s = '%s(%f)'
  end
  return string.format(s, torch.type(self), self.p)
end

function Normalize:type(type, tensorCache)
  self._indices = nil
  parent.type(self, type, tensorCache)
  return self
end

function Normalize:clearState()
  nn.utils.clear(self, {
    '_output',
    '_indices',
    '_gradInput',
    'buffer',
    'norm',
    'normp',
    'cross',
  })
  return parent.clearState(self)
end
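
--[[
Usage sketch (illustrative only, not part of the module): normalizes each row
of a batch to unit L2 norm, and shows the infinity-norm specialization. The
tensor sizes and values below are arbitrary examples, not taken from the
original file.

  require 'nn'

  local m = nn.Normalize(2)             -- unit L2 norm per row; eps defaults to 1e-10
  local x = torch.randn(8, 5)           -- batch of 8 five-dimensional vectors
  local y = m:forward(x)                -- each row of y has (approximately) unit L2 norm
  local gx = m:backward(x, torch.randn(8, 5))

  local minf = nn.Normalize(math.huge)  -- divide each row by its largest absolute entry
  local yinf = minf:forward(x)
]]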