#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SpatialMaxPooling.c"
#else

// Forward pass of spatial max pooling.
//
// Lua arguments:
//   1: module table; the fields kW, kH (kernel width/height), dW, dH
//      (horizontal/vertical step), indices and output are read from it
//   2: input tensor, 3D (nslices x height x width) or
//      4D (nbatch x nslices x height x width)
//
// Effects:
//   - output is resized to [nbatch x] nslices x oheight x owidth and filled
//     with the maximum of each kH x kW window
//   - indices is resized to 2 x [nbatch x] nslices x oheight x owidth; the
//     first half stores the row and the second half the column of each
//     maximum, both 1-based and relative to the window's top-left corner
//
// Raises a Lua argument error when the input is not 3D/4D, when any of
// kW/kH/dW/dH is not positive, or when the input is smaller than the kernel.
//
// NOTE(review): the field lookups are kept in their original order on purpose.
// `return 1` hands back whatever is on top of the Lua stack, presumably the
// `output` field fetched last -- confirm against luaT before reordering.
static int nn_(SpatialMaxPooling_updateOutput)(lua_State *L)
{
  THTensor *input = luaT_checkudata(L, 2, torch_Tensor);
  int kW = luaT_getfieldcheckint(L, 1, "kW");
  int kH = luaT_getfieldcheckint(L, 1, "kH");
  int dW = luaT_getfieldcheckint(L, 1, "dW");
  int dH = luaT_getfieldcheckint(L, 1, "dH");
  THTensor *indices = luaT_getfieldcheckudata(L, 1, "indices", torch_Tensor);
  THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor);

  luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4 , 2, "3D or 4D (batch mode) tensor expected");
  // dH/dW are used as divisors for the output size and kW as divisor/modulus
  // when the max location is decoded, so none of them may be zero or negative
  luaL_argcheck(L, kW > 0 && kH > 0 && dW > 0 && dH > 0, 1, "kW, kH, dW and dH must be positive");

  // in batch mode every spatial dimension is shifted by one
  int dimw = 2;
  int dimh = 1;
  long nbatch = 1;
  if (input->nDimension == 4)
  {
    nbatch = input->size[0];
    dimw++;
    dimh++;
  }
  luaL_argcheck(L, input->size[dimw] >= kW && input->size[dimh] >= kH, 2, "input image smaller than kernel size");

  // sizes
  long nslices = input->size[dimh-1];
  long iheight = input->size[dimh];
  long iwidth = input->size[dimw];
  long oheight = (iheight - kH) / dH + 1;
  long owidth = (iwidth - kW) / dW + 1;

  // get contiguous input (released at the end of the function; all argument
  // checks happen above so that an error cannot leak this reference)
  input = THTensor_(newContiguous)(input);

  // resize output
  if (input->nDimension == 3)
  {
    THTensor_(resize3d)(output, nslices, oheight, owidth);
    // indices will contain i,j locations for each output point
    THTensor_(resize4d)(indices, 2, nslices, oheight, owidth);
  }
  else
  {
    THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
    // indices will contain i,j locations for each output point
    THTensor_(resize5d)(indices, 2, nbatch, nslices, oheight, owidth);
  }

  // get raw pointers; everything below addresses the three buffers as
  // densely packed row-major arrays
  real *input_data = THTensor_(data)(input);
  real *output_data = THTensor_(data)(output);
  real *indices_data = THTensor_(data)(indices);

  // compute max pooling for each input slice; slices are independent, so the
  // outer loop is the one handed to OpenMP
  long k;
#pragma omp parallel for private(k)
  for (k = 0; k < nslices; k++)
  {
    long p;
    for (p = 0; p < nbatch; p++)
    {
      // pointers to slices
      real *input_p = input_data + p*nslices*iwidth*iheight + k*iwidth*iheight;
      real *output_p = output_data + p*nslices*owidth*oheight + k*owidth*oheight;
      // row locations live in the first half of indices, column locations in
      // the second half (offset by nbatch whole batches)
      real *indy_p = indices_data + p*nslices*owidth*oheight + k*owidth*oheight;
      real *indx_p = indices_data + (p+nbatch)*nslices*owidth*oheight + k*owidth*oheight;

      // loop over output
      long i, j;
      for (i = 0; i < oheight; i++) {
        for (j = 0; j < owidth; j++) {
          // local pointers
          real *ip = input_p + i*iwidth*dH + j*dW;
          real *op = output_p + i*owidth + j;
          real *indyp = indy_p + i*owidth + j;
          real *indxp = indx_p + i*owidth + j;

          // compute local max.
          // maxindex starts at 0 (top-left of the window) rather than -1:
          // when no element compares greater than -inf (window is all -inf
          // or NaN) the stored location must still lie inside the window,
          // otherwise the backward pass would scatter outside the plane.
          long maxindex = 0;
          real maxval = -THInf;
          long tcntr = 0;
          int x, y;
          for (y = 0; y < kH; y++) {
            for (x = 0; x < kW; x++) {
              real val = *(ip + y*iwidth + x);
              if (val > maxval) {
                maxval = val;
                maxindex = tcntr;
              }
              tcntr++;
            }
          }

          // set output to local max
          *op = maxval;

          // store location of max (x,y), 1-based within the window
          *indyp = (int)(maxindex / kW)+1;
          *indxp = (maxindex % kW) +1;
        }
      }
    }
  }

  // cleanup
  THTensor_(free)(input);

  return 1;
}

// Backward pass of spatial max pooling.
//
// Lua arguments:
//   1: module table; the fields dW, dH (horizontal/vertical step), indices
//      and gradInput are read from it
//   2: input tensor, 3D (nslices x height x width) or
//      4D (nbatch x nslices x height x width)
//   3: gradOutput tensor with the same number of dimensions as input
//
// Effects: gradInput is resized like input, zeroed, and every gradOutput
// value is added at the input location recorded in indices (rows in the
// first half, columns in the second half, 1-based within each window).
//
// Raises a Lua argument error when input is not 3D/4D or when gradOutput has
// a different number of dimensions.
//
// NOTE(review): the field lookups are kept in their original order on purpose.
// `return 1` hands back whatever is on top of the Lua stack, presumably the
// `gradInput` field fetched last -- confirm against luaT before reordering.
// NOTE(review): indices and gradInput are addressed through raw pointers as
// densely packed arrays; nothing here verifies their layout or that indices
// has the element count matching gradOutput.
static int nn_(SpatialMaxPooling_updateGradInput)(lua_State *L)
{
  THTensor *input = luaT_checkudata(L, 2, torch_Tensor);
  THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor);
  int dW = luaT_getfieldcheckint(L, 1, "dW");
  int dH = luaT_getfieldcheckint(L, 1, "dH");
  THTensor *indices = luaT_getfieldcheckudata(L, 1, "indices", torch_Tensor);
  THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor);

  // validate ranks before size[dimh]/size[dimw] are read and before the
  // contiguous copy is taken, so a raised error cannot leak that reference
  luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected");
  luaL_argcheck(L, gradOutput->nDimension == input->nDimension, 3, "gradOutput must have the same number of dimensions as input");

  // get contiguous gradOutput
  gradOutput = THTensor_(newContiguous)(gradOutput);

  // resize
  THTensor_(resizeAs)(gradInput, input);
  THTensor_(zero)(gradInput);

  // in batch mode every spatial dimension is shifted by one
  int dimw = 2;
  int dimh = 1;
  long nbatch = 1;
  if (input->nDimension == 4) {
    nbatch = input->size[0];
    dimw++;
    dimh++;
  }

  // sizes (long, matching the forward pass, so offset products are not
  // computed in int)
  long nslices = input->size[dimh-1];
  long iheight = input->size[dimh];
  long iwidth = input->size[dimw];
  long oheight = gradOutput->size[dimh];
  long owidth = gradOutput->size[dimw];

  // get raw pointers
  real *gradInput_data = THTensor_(data)(gradInput);
  real *gradOutput_data = THTensor_(data)(gradOutput);
  real *indices_data = THTensor_(data)(indices);

  // backprop; each slice only touches its own gradInput plane, so the outer
  // loop is the one handed to OpenMP
  long k;
#pragma omp parallel for private(k)
  for (k = 0; k < nslices; k++)
  {
    long p;
    for (p = 0; p < nbatch; p++)
    {
      // pointers to slices
      real *gradOutput_p = gradOutput_data + p*nslices*owidth*oheight + k*owidth*oheight;
      real *gradInput_p = gradInput_data + p*nslices*iwidth*iheight + k*iwidth*iheight;
      real *indy_p = indices_data + p*nslices*owidth*oheight + k*owidth*oheight;
      real *indx_p = indices_data + (p+nbatch)*nslices*owidth*oheight + k*owidth*oheight;

      // calculate max points
      long i, j;
      for (i = 0; i < oheight; i++) {
        for (j = 0; j < owidth; j++) {
          // retrieve position of max: stored value is 1-based inside the
          // window, the window itself starts at (i*dH, j*dW)
          long maxi = *(indy_p + i*owidth + j) - 1 + i*dH;
          long maxj = *(indx_p + i*owidth + j) - 1 + j*dW;

          // a recorded location outside the input plane (stale or invalid
          // indices) is skipped instead of written out of bounds; an error
          // cannot be raised from inside the parallel region
          if (maxi < 0 || maxi >= iheight || maxj < 0 || maxj >= iwidth)
            continue;

          // update gradient; accumulate because overlapping windows may
          // select the same input element
          *(gradInput_p + maxi*iwidth + maxj) += *(gradOutput_p + i*owidth + j);
        }
      }
    }
  }

  // cleanup
  THTensor_(free)(gradOutput);

  return 1;
}

// Registration table: each entry pairs the name under which a function is
// registered with its C implementation; the {NULL, NULL} entry ends the list.
static const struct luaL_Reg nn_(SpatialMaxPooling__) [] = {
  {
    "SpatialMaxPooling_updateOutput",
    nn_(SpatialMaxPooling_updateOutput)
  },
  {
    "SpatialMaxPooling_updateGradInput",
    nn_(SpatialMaxPooling_updateGradInput)
  },
  { NULL, NULL }
};

// Registers the SpatialMaxPooling functions for the current tensor type.
// Leaves the Lua stack balanced: one value is pushed and one is popped.
static void nn_(SpatialMaxPooling_init)(lua_State *L)
{
  // push the metatable associated with torch_Tensor
  luaT_pushmetatable(L, torch_Tensor);

  // register the function table under the name "nn"
  // NOTE(review): presumably this attaches the functions to an "nn" sub-table
  // of the metatable on top of the stack -- confirm against luaT
  luaT_registeratname(L, nn_(SpatialMaxPooling__), "nn");

  // drop the one stack slot left over from the push above
  lua_pop(L, 1);
}

#endif