#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/TemporalMaxPooling.cu"
#else

// Forward pass of temporal (1D) max pooling.
//
// input is either 2D (time x features) or 3D (batch x time x features);
// output and indices are resized here to the pooled width
//   output_w = (input_w - kW) / dW + 1.
// indices records, per output element, which input position won the max
// (written by the kernel; consumed by updateGradInput below).
//
// Host-side launch layout: blockIdx.x indexes the batch; the pooled time
// axis is covered by threadIdx.x, spilling into blockIdx.y whenever
// output_w needs more than TEMPORAL_MAX_POOLING_THREADS threads.
void THNN_(TemporalMaxPooling_updateOutput)(
    THCState *state,
    THCTensor *input,
    THCTensor *output,
    THCIndexTensor *indices,
    int kW, int dW) {

  int dimT = 0; // Temporal dimension
  int dimF = 1; // Feature dimension

  int batch = 1;
  int input_w;
  int input_n;
  int output_w;
  int nthreads;

  real *input_data;
  real *output_data;
  THCIndex_t *indices_data;

  THCUNN_assertSameGPU_generic(state, 3, input, output, indices);
  THArgCheck(input->nDimension == 2 || input->nDimension == 3, 2,
             "2D or 3D(batch mode) tensor expected");
  // FIX: reject non-positive kernel size / stride up front. Without this,
  // dW == 0 divides by zero in the output_w computation below, and
  // negative values yield a bogus output_w fed to resize and the launch.
  THArgCheck(kW > 0, 5, "kernel size should be greater than zero");
  THArgCheck(dW > 0, 6, "stride should be greater than zero");

  if (input->nDimension == 3) {
    dimT = 1;
    dimF = 2;
    batch = input->size[0];
  }
  THArgCheck(input->size[dimT] >= kW, 2,
             "input sequence smaller than kernel size");

  // Kernel indexes raw data pointers, so force a contiguous layout.
  input = THCTensor_(newContiguous)(state, input);

  input_w = input->size[dimT];
  input_n = input->size[dimF];
  output_w = (input_w - kW) / dW + 1;

  if (input->nDimension == 2) {
    THCTensor_(resize2d)(state, output, output_w, input->size[dimF]);
    THCIndexTensor_(resize2d)(state, indices, output_w, input->size[dimF]);
  } else {
    THCTensor_(resize3d)(state, output, batch, output_w, input->size[dimF]);
    THCIndexTensor_(resize3d)(state, indices, batch, output_w, input->size[dimF]);
  }

  input_data = THCTensor_(data)(state, input);
  output_data = THCTensor_(data)(state, output);
  indices_data = THCIndexTensor_(data)(state, indices);

  dim3 blocks(batch);
  // Round the per-block thread count up to a multiple of 32 (the warp size)...
  nthreads = (output_w / 32) * 32;
  if (output_w % 32 > 0) {
    nthreads += 32;
  }
  // ...and split the remainder across blockIdx.y so no block exceeds
  // TEMPORAL_MAX_POOLING_THREADS threads.
  if (nthreads > TEMPORAL_MAX_POOLING_THREADS) {
    blocks.y = nthreads / TEMPORAL_MAX_POOLING_THREADS;
    if (nthreads % TEMPORAL_MAX_POOLING_THREADS > 0) {
      blocks.y += 1;
    }
    nthreads = TEMPORAL_MAX_POOLING_THREADS;
  }
  dim3 threads(nthreads);

  cunn_TemporalMaxPooling_updateOutputKernel<<<blocks, threads, 0,
      THCState_getCurrentStream(state)>>>(
      input_data, output_data, indices_data,
      input_w, input_n, output_w, kW, dW);
  // Launch-configuration errors surface here; execution errors at next sync.
  THCudaCheck(cudaGetLastError());

  // Drop the reference (or free the copy) taken by newContiguous above.
  THCTensor_(free)(state, input);

}

void
THNN_(TemporalMaxPooling_updateGradInput)(
    THCState *state,
    THCTensor *input,
    THCTensor *gradOutput,
    THCTensor *gradInput,
    THCIndexTensor *indices,
    int kW, int dW) {
  // Backward pass of temporal (1D) max pooling: routes each gradOutput
  // element back to the input position recorded in indices by the
  // forward pass. gradInput is resized to input's shape and zeroed here.

  int dimT = 0; // Temporal dimension
  int dimF = 1; // Feature dimension

  int batch = 1;
  int input_w;
  int input_n;
  int output_w;
  int nthreads;

  real *gradInput_data;
  real *gradOutput_data;
  THCIndex_t *indices_data;

  THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, gradInput, indices);
  THArgCheck(input->nDimension == 2 || input->nDimension == 3, 2,
             "2D or 3D(batch mode) tensor expected");
  // FIX: same guard as updateOutput — dW == 0 would divide by zero in the
  // output_w computation below; negative kW/dW would corrupt the launch.
  THArgCheck(kW > 0, 6, "kernel size should be greater than zero");
  THArgCheck(dW > 0, 7, "stride should be greater than zero");

  THCTensor_(resizeAs)(state, gradInput, input);
  THCTensor_(zero)(state, gradInput);

  if (input->nDimension == 3) {
    dimT = 1;
    dimF = 2;
    batch = input->size[0];
  }
  THArgCheck(input->size[dimT] >= kW, 2,
             "input sequence smaller than kernel size");

  // Kernel indexes raw data pointers, so force a contiguous layout.
  // NOTE(review): indices is read by the kernel but is NOT made contiguous
  // here — presumably it is always the tensor produced (contiguous) by
  // updateOutput; confirm against callers.
  gradOutput = THCTensor_(newContiguous)(state, gradOutput);

  input_w = input->size[dimT];
  input_n = input->size[dimF];
  output_w = (input_w - kW) / dW + 1;

  gradInput_data = THCTensor_(data)(state, gradInput);
  gradOutput_data = THCTensor_(data)(state, gradOutput);
  indices_data = THCIndexTensor_(data)(state, indices);

  // Same launch layout as updateOutput: blockIdx.x = batch; pooled time
  // axis split over threadIdx.x and blockIdx.y, capped at
  // TEMPORAL_MAX_POOLING_THREADS threads per block, rounded to warps of 32.
  dim3 blocks(batch);
  nthreads = (output_w / 32) * 32;
  if (output_w % 32 > 0) {
    nthreads += 32;
  }
  if (nthreads > TEMPORAL_MAX_POOLING_THREADS) {
    blocks.y = nthreads / TEMPORAL_MAX_POOLING_THREADS;
    if (nthreads % TEMPORAL_MAX_POOLING_THREADS > 0) {
      blocks.y += 1;
    }
    nthreads = TEMPORAL_MAX_POOLING_THREADS;
  }
  dim3 threads(nthreads);

  if (kW <= dW) {
    // Non-overlapping windows: each input position receives at most one
    // gradient contribution, so plain stores suffice.
    cunn_TemporalMaxPooling_updateGradInputKernel<<<blocks, threads, 0,
        THCState_getCurrentStream(state)>>>(
        gradInput_data, gradOutput_data, indices_data,
        input_w, input_n, output_w, kW, dW);
  } else {
    // Overlapping windows (kW > dW): multiple outputs may map to the same
    // input position, so use the atomic-accumulation variant.
    cunn_TemporalMaxPooling_updateGradInputKernelAtomic<<<blocks, threads, 0,
        THCState_getCurrentStream(state)>>>(
        gradInput_data, gradOutput_data, indices_data,
        input_w, input_n, output_w, kW, dW);
  }
  // Launch-configuration errors surface here; execution errors at next sync.
  THCudaCheck(cudaGetLastError());

  // Drop the reference (or free the copy) taken by newContiguous above.
  THCTensor_(free)(state, gradOutput);

}

#endif