local CAdd, parent = torch.class("nn.CAdd", "nn.Module")

function CAdd:__init(...)
   parent.__init(self)

   local arg = {...}

   self.size = torch.LongStorage()
   local n = #arg
   if n == 1 and torch.type(arg[1]) == 'torch.LongStorage' then
      self.size:resize(#arg[1]):copy(arg[1])
   else
      self.size:resize(n)
      for i=1,n do
         self.size[i] = arg[i]
      end
   end

   self.bias = torch.Tensor(self.size)
   self.gradBias = torch.Tensor(self.size)

   self.output:resize(self.size)

   self:reset()
end

function CAdd:reset(stdv)
   if stdv then
      --std of uniform distribution on interval [-a,a] = a/sqrt(3)
      stdv = stdv * math.sqrt(3)
   else
      stdv = 1.0/math.sqrt(self.bias:nElement())
   end
   self.bias:uniform(-stdv,stdv)
end

function CAdd:updateOutput(input)
   self._output = self._output or input.new()
   self._bias = self._bias or input.new()
   self._expand = self._expand or input.new()
   self._repeat = self._repeat or input.new()

   self.output:resizeAs(input):copy(input)
   if input:nElement() == self.bias:nElement() then
      self.output:add(self.bias)
   else
      if self.bias:dim() == input:dim() then
         self._output:set(self.output)
         self._bias:set(self.bias)
      else
         local batchSize = input:size(1)
         self._output:view(self.output, batchSize, -1)
         self._bias:view(self.bias, 1, -1)
      end

      self._expand:expandAs(self._bias, self._output)

      --expandAs uses stride 0 and self._expand is not contiguous
      --cuda ops may assume contiguous input
      if torch.type(input) == 'torch.CudaTensor' then
         self._repeat:resizeAs(self._expand):copy(self._expand)
         self._output:add(self._repeat)
      else
         self._output:add(self._expand)
      end
   end

   return self.output
end

function CAdd:updateGradInput(input, gradOutput)
   self.gradInput = self.gradInput or input.new()
   self.gradInput:resizeAs(gradOutput):copy(gradOutput)

   return self.gradInput
end

function CAdd:accGradParameters(input, gradOutput, scale)
   scale = scale or 1

   self._gradBias = self._gradBias or gradOutput.new()
   self._gradOutput = self._gradOutput or gradOutput.new()
   self._repeat = self._repeat or gradOutput.new()

   if self.bias:nElement() == gradOutput:nElement() then
      self.gradBias:add(scale, gradOutput)
   else
      if self.bias:dim() == gradOutput:dim() then
         self._gradBias:set(self.gradBias)
         self._gradOutput:set(gradOutput)
      else
         local batchSize = input:size(1)
         self._gradBias:view(self.gradBias, 1, -1)
         self._gradOutput:view(gradOutput, batchSize, -1)
      end

      self._gradBias:expandAs(self._gradBias, self._gradOutput)

      --expandAs uses stride 0 and self._gradBias is not contiguous
      --cuda ops may assume contiguous input
      if torch.type(self._gradBias) == 'torch.CudaTensor' then
         self._repeat:resizeAs(self._gradBias):copy(self._gradBias)
         self._repeat:add(scale, self._gradOutput)
         self._gradBias:copy(self._repeat)
      else
         self._gradBias:add(scale, self._gradOutput)
      end
   end
end

function CAdd:type(type, tensorCache)
   if type then
      self:clearState()
   end
   return parent.type(self, type, tensorCache)
end

function CAdd:clearState()
   nn.utils.clear(self, {
      '_gradBias',
      '_expand',
      '_output',
      '_bias',
      '_repeat'
   })
   return parent.clearState(self)
end
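
--[[
Usage sketch (illustrative only, not part of the module): nn.CAdd learns a
bias tensor of the given size and adds it element-wise to the input; when the
sizes differ, singleton dimensions of the bias are expanded so one bias can be
broadcast over a batch. The example below assumes the standard nn Module API.

   local m = nn.CAdd(5, 1)                         -- learnable bias of size 5x1
   local y1 = m:forward(torch.rand(5, 4))          -- bias expanded along dim 2 to 5x4
   local y2 = m:forward(torch.rand(8, 5, 1))       -- batch case: bias viewed as 1x5 and
                                                   -- expanded over the batch of 8
--]]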