Merge pull request #346 from gchanan/pointwiseGeneric

Add generic support for pointwise modules
author: Soumith Chintala <soumith@gmail.com> 2016-11-09 02:18:01 +0300
committer: GitHub <noreply@github.com> 2016-11-09 02:18:01 +0300
commit: 8d5027feb6aac1ce2ee4e6a346ddcb1b7e95d6fa (patch)
tree: f0a89adfb00e7a49031ca32a9badcf016bc599cb
parent: aa256bc7def4336dd9e6fa43b66bf00758b4922e (diff)
parent: 27479c372040b8cab4e53e9338e8ce840bdb67dd (diff)
125 files changed, 14077 insertions, 12901 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3ce811f..501a2de 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,6 +14,11 @@ FILE(WRITE THCUNN_h.lua "return [[")
 FILE(APPEND THCUNN_h.lua ${THCUNN_headers})
 FILE(APPEND THCUNN_h.lua "]]")
 
+FILE(STRINGS lib/THCUNN/generic/THCUNN.h THCUNN_generic_headers NEWLINE_CONSUME)
+FILE(WRITE THCUNN_generic_h.lua "return [[")
+FILE(APPEND THCUNN_generic_h.lua ${THCUNN_generic_headers})
+FILE(APPEND THCUNN_generic_h.lua "]]")
+
 FILE(GLOB luasrc *.lua)
 
 ADD_SUBDIRECTORY(lib)
diff --git a/THCUNN.lua b/THCUNN.lua
index 771cd0c..490cd5c 100644
--- a/THCUNN.lua
+++ b/THCUNN.lua
@@ -12,46 +12,145 @@ function THCUNN.getState()
    return THCState_ptr(cutorch.getState());
 end
 
-local THCUNN_h = require 'cunn.THCUNN_h'
+local THCUNN_generic_h = require 'cunn.THCUNN_generic_h'
 -- strip all lines starting with #
 -- to remove preprocessor directives originally present
 -- in THNN.h
-THCUNN_h = THCUNN_h:gsub("\n#[^\n]*", "")
-THCUNN_h = THCUNN_h:gsub("^#[^\n]*\n", "")
+THCUNN_generic_h = THCUNN_generic_h:gsub("\n#[^\n]*", "")
+THCUNN_generic_h = THCUNN_generic_h:gsub("^#[^\n]*\n", "")
 
-local preprocessed = string.gsub(THCUNN_h, 'TH_API ', '')
+local preprocessed_generic = string.gsub(THCUNN_generic_h, 'TH_API void THNN_%(([%a%d_]+)%)', 'void THNN_TYPE%1')
 
 local replacements =
 {
    {
       ['THTensor'] = 'THCudaTensor',
-      ['THIndexTensor'] = 'THCudaLongTensor',
+      ['THCIndexTensor'] = 'THCudaLongTensor',
       ['THIndex_t'] = 'long',
       ['THInteger_t'] = 'float'
    }
 }
 
-for i=1,#replacements do
-   local r = replacements[i]
-   local s = preprocessed
-   for k,v in pairs(r) do
-      s = string.gsub(s, k, v)
+local cct2lt = {
+   ['THCudaFloatTensor'] = 'torch.CudaTensor',
+   ['THCudaDoubleTensor'] = 'torch.CudaDoubleTensor',
+}
+
+local replacements_generic =
+{
+  {
+    ['THCTensor'] = 'THCudaTensor',
+    ['THCIndexTensor'] = 'THCudaLongTensor',
+    ['TYPE'] = 'Cuda',
+    ['real'] = 'float'
+  },
+  {
+    ['THCTensor'] = 'THCudaDoubleTensor',
+    ['THCIndexTensor'] = 'THCudaLongTensor',
+    ['TYPE'] = 'CudaDouble',
+    ['real'] = 'double',
+   }
+}
+
+if cutorch.hasHalf then
+  ffi.cdef("half THC_float2half(float a);")
+  ffi.cdef("float THC_half2float(half a);")
+  cct2lt['THCudaHalfTensor'] = 'torch.CudaHalfTensor'
+  -- extra substitution set so the CudaHalf declarations get generated as well
+  replacements_generic[#replacements_generic+1] = {
+    ['THCTensor'] = 'THCudaHalfTensor',
+    ['THCIndexTensor'] = 'THCudaLongTensor',
+    ['TYPE'] = 'CudaHalf',
+    ['real'] = 'half'
+  }
+end
+
+-- declare one copy of the generic header per entry of replacements_generic
+for _, r in ipairs(replacements_generic) do
+    local decls = preprocessed_generic
+    for from, to in pairs(r) do
+        decls = decls:gsub(from, to)
+    end
+    ffi.cdef(decls)
+end
+
+local function extract_function_names_generic(s) -- lists the names declared in s
+   local t = {}
+   for n in string.gmatch(s, 'TH_API void THNN_%(([%a%d_]+)%)') do -- n is the text captured inside THNN_( )
+       t[#t+1] = n
    end
-   ffi.cdef(s)
+   return t
 end
 
-local function extract_function_names(s)
+-- Returns the start index of each successive match of pattern p in s;
+-- every search resumes right after the end of the previous match.
+local function find_positions(s, p)
+   local found = {}
+   local first, last = string.find(s, p, 0)
+   while first ~= nil do
+      found[#found+1] = first
+      first, last = string.find(s, p, last + 1)
+   end
+   return found
+end
+
+-- For each 'TH_API ...;' declaration in s, records under the function name the
+-- 1-based indices of the parameters whose text contains 'real'; the index is
+-- found by counting the commas that precede each occurrence of 'real'.
+local function extract_function_names_and_real_args(s)
    local t = {}
-   for n in string.gmatch(s, 'TH_API void THNN_Cuda([%a%d_]+)') do
-      t[#t+1] = n
+   for n in string.gmatch(s, 'TH_API ([^;]+)') do
+      local func_name = string.match(n, 'void THNN_%(([%a%d_]+)%)')
+      local param_positions = find_positions(n, ',')
+      local positions = {}
+      for _,y in ipairs(find_positions(n, 'real')) do
+          -- default: no comma follows the match, so it is the last param
+          local index = #param_positions + 1
+          for cn,cp in ipairs(param_positions) do
+              -- the first comma after the match closes parameter number cn
+              if cp > y then index = cn; break end
+          end
+          positions[#positions+1] = index
+      end
+      -- skip declarations without a 'void THNN_(name)' name (a nil table key would raise)
+      if func_name then t[func_name] = positions end
    end
    return t
 end
 
+local real_args = extract_function_names_and_real_args(THCUNN_generic_h)
+
 -- build function table
-local function_names = extract_function_names(THCUNN_h)
+local function_names_generic = extract_function_names_generic(THCUNN_generic_h)
 
-THNN.kernels['torch.CudaTensor'] = THNN.bind(THCUNN.C, function_names, 'Cuda', THCUNN.getState)
+THNN.kernels['torch.CudaTensor'] = THNN.bind(THCUNN.C, function_names_generic, 'Cuda', THCUNN.getState)
 torch.getmetatable('torch.CudaTensor').THNN = THNN.kernels['torch.CudaTensor']
 
+THNN.kernels['torch.CudaDoubleTensor'] = THNN.bind(THCUNN.C, function_names_generic, 'CudaDouble', THCUNN.getState)
+torch.getmetatable('torch.CudaDoubleTensor').THNN = THNN.kernels['torch.CudaDoubleTensor']
+
+-- in order to call 'half' functions from lua, convert the arguments listed in
+-- real_args[func_name] from float to half since there is no other defined conversion
+local transform_reals_to_half = function(func_name, real_args, ...)
+    local t = {} -- must be local: a bare 't' would be a global shared by every call
+    -- this select logic is necessary to deal with nil arguments
+    for i = 1, select('#', ...) do
+        t[i] = select(i, ...)
+    end
+    for _,v in ipairs(real_args[func_name]) do
+        -- first argument (THCState) is added implicitly by bind
+        t[v-1] = ffi.C.THC_float2half(t[v-1])
+    end
+    return t -- may contain nil holes; callers unpack it with explicit bounds
+end
+
+if cutorch.hasHalf then -- the half cdefs and CudaHalf declarations are only made under this flag
+    local raw_half_functions = THNN.bind(THCUNN.C, function_names_generic, 'CudaHalf', THCUNN.getState)
+    for k,v in pairs(raw_half_functions) do -- explicit unpack bounds keep trailing nils
+        raw_half_functions[k] = function(...) v(unpack(transform_reals_to_half(k, real_args, ...), 1, select("#",...))) end
+    end
+    THNN.kernels['torch.CudaHalfTensor'] = raw_half_functions
+    torch.getmetatable('torch.CudaHalfTensor').THNN = THNN.kernels['torch.CudaHalfTensor']
+end
+
 return THCUNN
diff --git a/lib/THCUNN/Abs.cu b/lib/THCUNN/Abs.cu
index 81b3297..0abefd9 100644
--- a/lib/THCUNN/Abs.cu
+++ b/lib/THCUNN/Abs.cu
@@ -1,32 +1,24 @@
 #include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
+template <typename T>
 struct absupdateOutput_functor
 {
-  __device__ void operator()(float* output, const float* input) const
+  __device__ void operator()(T* output, const T* input) const
   {
     *output = abs(*input);
   }
 };
 
-void THNN_CudaAbs_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output)
-{
-  THCUNN_assertSameGPU(state, 2, input, output);
-  THCudaTensor_resizeAs(state, output, input);
-  THC_pointwiseApply2(state, output, input, absupdateOutput_functor());
-}
-
+template <typename T>
 struct absupdateGradInput_functor
 {
-  __device__ void operator()(float* gradInput, const float* input, const float* gradOutput) const
+  __device__ void operator()(T* gradInput, const T* input, const T* gradOutput) const
   {
     *gradInput = *input < 0 ? - *gradOutput : *gradOutput;
   }
 };
 
-void THNN_CudaAbs_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput)
-{
-  THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
-  THCudaTensor_resizeAs(state, gradInput, input);
-  THC_pointwiseApply3(state, gradInput, input, gradOutput, absupdateGradInput_functor());
-}
+#include "generic/Abs.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/AbsCriterion.cu b/lib/THCUNN/AbsCriterion.cu
index fd19cae..057c971 100644
--- a/lib/THCUNN/AbsCriterion.cu
+++ b/lib/THCUNN/AbsCriterion.cu
@@ -1,5 +1,7 @@
 #include "THCUNN.h"
 #include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
 #include <thrust/fill.h>
 #include <thrust/functional.h>
@@ -7,69 +9,30 @@
 #include <thrust/reduce.h>
 #include <thrust/inner_product.h>
 
+template <typename Dtype, typename Acctype>
 struct abs_functor
 {
-  __host__ __device__ float operator()(const float& x, const float& y) const
+  __host__ __device__ Acctype operator()(const Dtype& x, const Dtype& y) const
   {
-    float z = x-y;
-    return z >= 0 ? z : -z;
+    Dtype z = x-y;
+    return ScalarConvert<Dtype, Acctype>::to(z >= 0 ? z : -z);
   }
 };
 
-void THNN_CudaAbsCriterion_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *output, bool sizeAverage)
-{
-  THCUNN_assertSameGPU(state, 2, input, target);
-
-  long size = THCudaTensor_nElement(state, input);
-
-  input = THCudaTensor_newContiguous(state, input);
-  target = THCudaTensor_newContiguous(state, target);
-
-  thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
-  thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
-  float sum = thrust::inner_product(input_data, input_data+size, target_data, (float) 0, thrust::plus<float>(), abs_functor());
-
-  if (sizeAverage)
-    sum /= size;
-
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, target);
-
-  THCudaTensor_set1d(state, output, 0, sum);
-}
-
+template <typename Dtype>
 struct abs_updateGradInput_functor
 {
-  const float norm;
+  const Dtype norm;
 
-  abs_updateGradInput_functor(float norm_)
+  abs_updateGradInput_functor(Dtype norm_)
     : norm(norm_)
   {}
 
-  __host__ __device__ float operator()(const float& x, const float& y) const
+  __host__ __device__ Dtype operator()(const Dtype& x, const Dtype& y) const
   {
     return (x - y) >= 0 ? norm : -norm;
   }
 };
 
-void THNN_CudaAbsCriterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage)
-{
-  THCUNN_assertSameGPU(state, 3, input, target, gradInput);
-
-  long size = THCudaTensor_nElement(state, input);
-  float norm = (sizeAverage ? 1./size : 1.);
-
-  input = THCudaTensor_newContiguous(state, input);
-  target = THCudaTensor_newContiguous(state, target);
-
-  THCudaTensor_resizeAs(state, gradInput, input);
-
-  thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
-  thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
-  thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput));
-
-  thrust::transform(input_data, input_data+size, target_data, gradInput_data, abs_updateGradInput_functor(norm));
-
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, target);
-}
+#include "generic/AbsCriterion.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/BCECriterion.cu b/lib/THCUNN/BCECriterion.cu
index c8edcfe..3653fc8 100644
--- a/lib/THCUNN/BCECriterion.cu
+++ b/lib/THCUNN/BCECriterion.cu
@@ -1,5 +1,7 @@
 #include "THCUNN.h"
 #include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
 #include <thrust/functional.h>
 #include <thrust/device_ptr.h>
@@ -7,148 +9,80 @@
 #include <thrust/transform.h>
 #include <thrust/transform_reduce.h>
 
-const float eps = 1e-12f;
+template <typename T>
+inline __device__ T eps();
 
+template <>
+inline __device__ float eps() { return 1e-12f; }
+
+template <>
+inline __device__ double eps() { return 1e-12; }
+
+template <typename Dtype, typename Acctype>
 struct bce_functor
 {
   template <class Tuple>
   __host__ __device__
-  float operator()(Tuple x)
+  Acctype operator()(Tuple x)
   {
-    float o = thrust::get<0>(x);
-    float t = thrust::get<1>(x);
-    return - (t * logf(o + eps) + (1.f - t) * logf(1.f - o + eps));
+    Dtype o = thrust::get<0>(x);
+    Dtype t = thrust::get<1>(x);
+    return - (t * THCNumerics<Acctype>::log(o + eps<Acctype>()) + (Acctype(1)- t) * THCNumerics<Acctype>::log(Acctype(1) - o + eps<Acctype>()));
   }
 };
 
+template <typename Dtype, typename Acctype>
 struct bce_functor_weights
 {
   template <class Tuple>
   __host__ __device__
-  float operator()(Tuple x)
+  Acctype operator()(Tuple x)
   {
-    float o = thrust::get<0>(x);
-    float t = thrust::get<1>(x);
-    float w = thrust::get<2>(x);
-    return - w * (t * logf(o + eps) + (1.f - t) * logf(1.f - o + eps));
+    Dtype o = thrust::get<0>(x);
+    Dtype t = thrust::get<1>(x);
+    Dtype w = thrust::get<2>(x);
+    return - w * (t * THCNumerics<Acctype>::log(o + eps<Acctype>()) + (Acctype(1) - t) * THCNumerics<Acctype>::log(Acctype(1) - o + eps<Acctype>()));
   }
 };
 
-void THNN_CudaBCECriterion_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *output, bool sizeAverage, THCudaTensor *weights)
-{
-  THCUNN_assertSameGPU(state, 3, input, target, weights);
-
-  long size = THCudaTensor_nElement(state, input);
-
-  input = THCudaTensor_newContiguous(state, input);
-  target = THCudaTensor_newContiguous(state, target);
-
-  thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
-  thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
-
-  float sum;
-  if (weights) {
-    weights = THCudaTensor_newContiguous(state, weights);
-    thrust::device_ptr<float> weights_data(THCudaTensor_data(state, weights));
-    sum = thrust::transform_reduce(
-      thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data, weights_data)),
-      thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size, weights_data+size)),
-      bce_functor_weights(),
-      (float) 0.f,
-      thrust::plus<float>()
-    );
-    THCudaTensor_free(state, weights);
-  } else {
-    sum = thrust::transform_reduce(
-      thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data)),
-      thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size)),
-      bce_functor(),
-      (float) 0.f,
-      thrust::plus<float>()
-    );
-  }
-
-  if (sizeAverage)
-    sum /= size;
-
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, target);
-
-  THCudaTensor_set1d(state, output, 0, sum);
-}
-
+template <typename Dtype, typename Acctype>
 struct bce_updateGradInput_functor
 {
-  const float norm;
+  const Dtype norm;
 
-  bce_updateGradInput_functor(float norm_)
+  bce_updateGradInput_functor(Dtype norm_)
     : norm(norm_)
   {}
 
   template <class Tuple>
   __host__ __device__
-  float operator()(Tuple x)
+  Dtype operator()(Tuple x)
   {
-    float o = thrust::get<0>(x);
-    float t = thrust::get<1>(x);
-    return - (t - o) / ((1 - o + eps) * (o + eps)) * norm;
+    Dtype o = thrust::get<0>(x);
+    Dtype t = thrust::get<1>(x);
+    return ScalarConvert<Acctype,Dtype>::to(- (t - o) / ((Acctype(1) - o + eps<Acctype>()) * (o + eps<Acctype>())) * norm);
   }
 };
 
+template <typename Dtype, typename Acctype>
 struct bce_updateGradInput_functor_weights
 {
-  const float norm;
+  const Dtype norm;
 
-  bce_updateGradInput_functor_weights(float norm_)
+  bce_updateGradInput_functor_weights(Dtype norm_)
     : norm(norm_)
   {}
 
   template <class Tuple>
   __host__ __device__
-  float operator()(Tuple x)
+  Dtype operator()(Tuple x)
   {
-    float o = thrust::get<0>(x);
-    float t = thrust::get<1>(x);
-    float w = thrust::get<2>(x);
-    return - (t - o) / ((1 - o + eps) * (o + eps)) * norm * w;
+    Dtype o = thrust::get<0>(x);
+    Dtype t = thrust::get<1>(x);
+    Dtype w = thrust::get<2>(x);
+    return ScalarConvert<Acctype, Dtype>::to(- (t - o) / ((Acctype(1) - o + eps<Acctype>()) * (o + eps<Acctype>())) * norm * w);
   }
 };
 
-void THNN_CudaBCECriterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage, THCudaTensor *weights)
-{
-  THCUNN_assertSameGPU(state, 4, input, target, gradInput, weights);
-
-  long size = THCudaTensor_nElement(state, input);
-  float norm = (sizeAverage ? 1./size : 1.);
-
-  input = THCudaTensor_newContiguous(state, input);
-  target = THCudaTensor_newContiguous(state, target);
-
-  THCudaTensor_resizeAs(state, gradInput, input);
-
-  thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
-  thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
-  thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput));
-
-  if (weights) {
-    weights = THCudaTensor_newContiguous(state, weights);
-    thrust::device_ptr<float> weights_data(THCudaTensor_data(state, weights));
-    thrust::transform(
-      thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data, weights_data)),
-      thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size, weights_data+size)),
-      gradInput_data,
-      bce_updateGradInput_functor_weights(norm)
-    );
-    THCudaTensor_free(state, weights);
-  } else {
-    thrust::transform(
-      thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data)),
-      thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size)),
-      gradInput_data,
-      bce_updateGradInput_functor(norm)
-    );
-  }
-
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, target);
-}
+#include "generic/BCECriterion.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/BatchNormalization.cu b/lib/THCUNN/BatchNormalization.cu
index ca2cd8c..125e3ff 100644
--- a/lib/THCUNN/BatchNormalization.cu
+++ b/lib/THCUNN/BatchNormalization.cu
@@ -1,12 +1,12 @@
 #include "THCUNN.h"
 #include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
 #include "THCDeviceTensor.cuh"
 #include "THCDeviceTensorUtils.cuh"
 
 const int WARP_SIZE = 32;
-typedef THCDeviceTensor<float, 3> DeviceTensor3;
-typedef THCDeviceTensor<float, 1> DeviceTensor1;
 
 // The maximum number of threads in a block
 const int MAX_BLOCK_SIZE = 512;
@@ -27,11 +27,13 @@ __device__ __forceinline__ int getMSB(int val) {
   return 31 - __clz(val);
 }
 
+template <typename Dtype, typename Acctype>
 struct Float2 {
-  float v1, v2;
+  Acctype v1, v2;
   __device__ Float2() {}
-  __device__ Float2(float v1, float v2) : v1(v1), v2(v2) {}
-  __device__ Float2(float v) : v1(v), v2(v) {}
+  __device__ Float2(Dtype v1, Dtype v2) : v1(ScalarConvert<Dtype, Acctype>::to(v1)), v2(ScalarConvert<Dtype, Acctype>::to(v2)) {}
+  __device__ Float2(Dtype v) : v1(ScalarConvert<Dtype, Acctype>::to(v)), v2(ScalarConvert<Dtype, Acctype>::to(v)) {}
+  __device__ Float2(int v) : v1(ScalarConvert<int, Acctype>::to(v)), v2(ScalarConvert<int, Acctype>::to(v)) {}
   __device__ Float2& operator+=(const Float2& a) {
     v1 += a.v1;
     v2 += a.v2;
@@ -39,45 +41,49 @@ struct Float2 {
   }
 };
 
+template <typename Dtype, typename Acctype, typename DeviceTensor3>
 struct SumOp {
   __device__ SumOp(const DeviceTensor3 t) : tensor(t) {}
-  __device__ __forceinline__ float operator()(int batch, int plane, int n) {
-    return tensor[batch][plane][n];
+  __device__ __forceinline__ Acctype operator()(int batch, int plane, int n) {
+    return ScalarConvert<Dtype, Acctype>::to(tensor[batch][plane][n]);
   }
   const DeviceTensor3 tensor;
 };
 
+template <typename Dtype, typename Acctype, typename DeviceTensor3>
 struct VarOp {
-  __device__ VarOp(float m, const DeviceTensor3 t) : mean(m), tensor(t) {}
-  __device__ __forceinline__ float operator()(int batch, int plane, int n) {
-    float val = tensor[batch][plane][n];
+  __device__ VarOp(Acctype m, const DeviceTensor3 t) : mean(m), tensor(t) {}
+  __device__ __forceinline__ Acctype operator()(int batch, int plane, int n) {
+    Dtype val = tensor[batch][plane][n];
     return (val - mean) * (val - mean);
   }
-  const float mean;
+  const Acctype mean;
   const DeviceTensor3 tensor;
 };
 
+template <typename Dtype, typename Acctype, typename DeviceTensor3>
 struct GradOp {
-  __device__ GradOp(float m, const DeviceTensor3 i, const DeviceTensor3 g)
+  __device__ GradOp(Acctype m, const DeviceTensor3 i, const DeviceTensor3 g)
     : mean(m), input(i), gradOutput(g) {}
-  __device__ __forceinline__ Float2 operator()(int batch, int plane, int n) {
-    float g = gradOutput[batch][plane][n];
-    float c = input[batch][plane][n] - mean;
-    return Float2(g, g * c);
+  __device__ __forceinline__ Float2<Dtype, Acctype> operator()(int batch, int plane, int n) {
+    Dtype g = gradOutput[batch][plane][n];
+    Dtype c = ScalarConvert<Acctype, Dtype>::to(input[batch][plane][n] - mean);
+    return Float2<Dtype, Acctype>(g, g * c);
   }
-  const float mean;
+  const Acctype mean;
   const DeviceTensor3 input;
   const DeviceTensor3 gradOutput;
 };
 
 // Sum across all threads within a warp
-static __device__ __forceinline__ float warpSum(float val) {
+template <typename T>
+static __device__ __forceinline__ T warpSum(T val) {
 #if __CUDA_ARCH__ >= 300
   for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
     val += __shfl_xor(val, 1 << i, WARP_SIZE);
   }
 #else
-  __shared__ float values[MAX_BLOCK_SIZE];
+  __shared__ T values[MAX_BLOCK_SIZE];
   values[threadIdx.x] = val;
   __threadfence_block();
   const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
@@ -88,14 +94,15 @@ static __device__ __forceinline__ float warpSum(float val) {
   return val;
 }
 
-static __device__ __forceinline__ Float2 warpSum(Float2 value) {
+template <typename Dtype, typename Acctype>
+static __device__ __forceinline__ Float2<Dtype, Acctype> warpSum(Float2<Dtype, Acctype> value) {
   value.v1 = warpSum(value.v1);
   value.v2 = warpSum(value.v2);
   return value;
 }
 
 // Sum across (batch, x/y/z) applying Op() pointwise
-template<typename T, typename Op>
+template<typename T, typename Op, typename DeviceTensor3>
 __device__ T reduce(Op op, DeviceTensor3 tensor, int plane) {
   T sum = (T)0;
   for (int batch = 0; batch < tensor.getSize(0); ++batch) {
@@ -130,32 +137,7 @@ __device__ T reduce(Op op, DeviceTensor3 tensor, int plane) {
   return shared[0];
 }
 
-template <int Dim>
-static THCDeviceTensor<float, Dim> devicetensor(THCState *state, THCudaTensor *t) {
-  if (!t) {
-    return THCDeviceTensor<float, Dim>();
-  }
-
-  int inDim = THCudaTensor_nDimension(state, t);
-  if (inDim == Dim) {
-    return toDeviceTensor<float, Dim>(state, t);
-  }
-
-  // View in which the last dimensions are collapsed or expanded as needed
-  THAssert(THCudaTensor_isContiguous(state, t));
-  int size[Dim];
-  for (int i = 0; i < Dim || i < inDim; ++i) {
-    if (i < Dim && i < inDim) {
-      size[i] = t->size[i];
-    } else if (i < Dim) {
-      size[i] = 1;
-    } else {
-      size[Dim - 1] *= t->size[i];
-    }
-  }
-  return THCDeviceTensor<float, Dim>(THCudaTensor_data(state, t), size);
-}
-
+template <typename Dtype, typename Acctype, typename DeviceTensor1, typename DeviceTensor3>
 __global__ void BatchNormalizationUpdateOutputInference_kernel(
     const DeviceTensor3 input,
     DeviceTensor3 output,
@@ -163,31 +145,32 @@ __global__ void BatchNormalizationUpdateOutputInference_kernel(
     DeviceTensor1 runningVar,
     const DeviceTensor1 weight,
     const DeviceTensor1 bias,
-    float epsilon) {
+    Acctype epsilon) {
 
   int plane = blockIdx.x;
 
-  float invstd = 1.0f / sqrt(runningVar[plane].ldg() + epsilon);
-  float mean = runningMean[plane].ldg();
-  float gamma = weight.numElements() > 0 ? weight[plane].ldg() : 1.0f;
-  float beta = bias.numElements() > 0 ? bias[plane].ldg() : 0.0f;
+  Acctype invstd = Acctype(1) / sqrt(runningVar[plane].ldg() + epsilon);
+  Acctype mean = ScalarConvert<Dtype, Acctype>::to(runningMean[plane].ldg());
+  Acctype gamma = weight.numElements() > 0 ? ScalarConvert<Dtype, Acctype>::to(weight[plane].ldg()) : Acctype(1);
+  Acctype beta = bias.numElements() > 0 ? ScalarConvert<Dtype, Acctype>::to(bias[plane].ldg()) : Acctype(0);
 
   // Write normalized and update the output
   for (int batch = 0; batch < input.getSize(0); batch++) {
     for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) {
-      float inp = input[batch][plane][x].ldg();
-      output[batch][plane][x] = gamma * (inp - mean) * invstd + beta;
+      Dtype inp = input[batch][plane][x].ldg();
+      output[batch][plane][x] = ScalarConvert<Acctype, Dtype>::to(gamma * (inp - mean) * invstd + beta);
     }
   }
 }
 
+template <typename Dtype, typename Acctype, typename DeviceTensor1, typename DeviceTensor3>
 __global__ void BatchNormalizationUpdateOutput_kernel(
     const DeviceTensor3 input,
     DeviceTensor3 output,
     const DeviceTensor1 weight,
     const DeviceTensor1 bias,
-    const float epsilon,
-    const float momentum,
+    const Acctype epsilon,
+    const Acctype momentum,
     DeviceTensor1 runningMean,
     DeviceTensor1 runningVar,
     DeviceTensor1 saveMean,
@@ -196,73 +179,39 @@ __global__ void BatchNormalizationUpdateOutput_kernel(
   int plane = blockIdx.x;
   int N = input.getSize(0) * input.getSize(2);
 
-  float norm = 1.0f / N;
+  Acctype norm = Acctype(1) / N;
 
   // Compute the mean and variance across (batch, x/y/z)
-  float mean = reduce<float>(SumOp(input), input, plane) * norm;
+  Acctype mean = reduce<Acctype>(SumOp<Dtype, Acctype, DeviceTensor3>(input), input, plane) * norm;
   __syncthreads();
-  float varN = reduce<float>(VarOp(mean, input), input, plane);
-  float invStd = 0.0f;
-  if (varN != 0.0f || epsilon != 0.0f) {
+  Acctype varN = reduce<Acctype>(VarOp<Dtype, Acctype, DeviceTensor3>(mean, input), input, plane);
+  Acctype invStd = 0;
+  if (varN != Acctype(0) || epsilon != Acctype(0)) {
     invStd = 1 / sqrt(varN * norm + epsilon);
   }
 
   // Save the mean, variance, and moving averages
   if (threadIdx.x == 0) {
     // Momentum based writeback
-    float unbiasedVar = varN / (N - 1);
-    saveMean[plane] = mean;
-    saveStd[plane] = invStd;
-    runningMean[plane] = (1 - momentum) * runningMean[plane] + momentum * mean;
-    runningVar[plane] = (1 - momentum) * runningVar[plane] + momentum * unbiasedVar;
+    Acctype unbiasedVar = varN / (N - 1);
+    saveMean[plane] = ScalarConvert<Acctype, Dtype>::to(mean);
+    saveStd[plane] = ScalarConvert<Acctype, Dtype>::to(invStd);
+    runningMean[plane] = ScalarConvert<Acctype, Dtype>::to((1 - momentum) * runningMean[plane] + momentum * mean);
+    runningVar[plane] = ScalarConvert<Acctype, Dtype>::to((1 - momentum) * runningVar[plane] + momentum * unbiasedVar);
   }
 
   // Write normalized and update the output
-  float gamma = weight.numElements() > 0 ? weight[plane] : 1.0f;
-  float beta = bias.numElements() > 0 ? bias[plane] : 0.0f;
+  Acctype gamma = weight.numElements() > 0 ? ScalarConvert<Dtype, Acctype>::to(weight[plane]) : ScalarConvert<int, Acctype>::to(1);
+  Acctype beta = bias.numElements() > 0 ? ScalarConvert<Dtype, Acctype>::to(bias[plane]) : ScalarConvert<int, Acctype>::to(0);
   for (int batch = 0; batch < input.getSize(0); ++batch) {
     for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) {
-      float inp = input[batch][plane][x].ldg();
-      output[batch][plane][x] = gamma * (inp - mean) * invStd + beta;
+      Dtype inp = input[batch][plane][x].ldg();
+      output[batch][plane][x] = ScalarConvert<Acctype, Dtype>::to(gamma * (inp - mean) * invStd + beta);
     }
   }
 }
 
-void THNN_CudaBatchNormalization_updateOutput(
-  THCState *state, THCudaTensor *input_, THCudaTensor *output_,
-  THCudaTensor *weight_, THCudaTensor *bias_, THCudaTensor *runningMean_,
-  THCudaTensor *runningVar_, THCudaTensor *saveMean_, THCudaTensor *saveStd_,
-  bool train, double momentum, double eps) {
-
-  THCUNN_assertSameGPU(state, 8, input_, output_, weight_, bias_, runningMean_,
-    runningVar_, saveMean_, saveStd_);
-  DeviceTensor3 input = devicetensor<3>(state, input_);
-  DeviceTensor3 output = devicetensor<3>(state, output_);
-  DeviceTensor1 weight = devicetensor<1>(state, weight_);
-  DeviceTensor1 bias = devicetensor<1>(state, bias_);
-  DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_);
-  DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_);
-  DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_);
-  DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_);
-
-  cudaStream_t s = THCState_getCurrentStream(state);
-  cudaDeviceProp *prop = THCState_getCurrentDeviceProperties(state);
-
-  if (!train) {
-    dim3 blocks(input.getSize(1));
-    dim3 threads(getNumThreads(input.getSize(2)));
-    BatchNormalizationUpdateOutputInference_kernel<<<blocks, threads, 0, s>>>(
-      input, output, runningMean, runningVar, weight, bias, eps);
-  } else {
-    dim3 blocks(input.getSize(1));
-    dim3 threads(getNumThreads(input.getSize(2)));
-    BatchNormalizationUpdateOutput_kernel<<<blocks, threads, 0, s>>>(
-      input, output, weight, bias, eps, momentum, runningMean, runningVar,
-      saveMean, saveStd);
-  }
-  THCudaCheck(cudaGetLastError());
-}
-
+template <typename Dtype, typename Acctype, typename DeviceTensor1, typename DeviceTensor3>
 __global__ void BatchNormalizationBackward_kernel(
     const DeviceTensor3 input,
     const DeviceTensor3 gradOutput,
@@ -275,45 +224,46 @@ __global__ void BatchNormalizationBackward_kernel(
     const DeviceTensor1 saveMean,
     const DeviceTensor1 saveStd,
     bool train,
-    float scale,
+    Acctype scale,
     double eps) {
 
   int plane = blockIdx.x;
   int N = gradOutput.getSize(0) * gradOutput.getSize(2);
 
-  float mean, stdVal;
+  Acctype mean, stdVal;
   if (train) {
-    mean = saveMean[plane];
-    stdVal = saveStd[plane];
+    mean = ScalarConvert<Dtype, Acctype>::to(saveMean[plane]);
+    stdVal = ScalarConvert<Dtype, Acctype>::to(saveStd[plane]);
   } else {
-    mean = runningMean[plane];
+    mean = ScalarConvert<Dtype, Acctype>::to(runningMean[plane]);
     stdVal = 1 / sqrt(runningVar[plane] + eps);
   }
 
-  float weightVal = weight.numElements() > 0 ? weight[plane] : 1.0f;
-  float norm = 1.0f / N;
+  Acctype weightVal = weight.numElements() > 0 ? ScalarConvert<Dtype, Acctype>::to(weight[plane]) : Acctype(1);
+  Acctype norm = Acctype(1) / N;
 
   // Compute two values across (batch, x/y/z) in one pass:
   // 1. Sum(gradOutput)
   // 2. DotProduct(input - mean, gradOutput)
-  Float2 res = reduce<Float2>(GradOp(mean, input, gradOutput), gradOutput, plane);
-  float gradOutputSum = res.v1;
-  float dotP = res.v2;
+  GradOp<Dtype, Acctype, DeviceTensor3> g(mean, input, gradOutput);
+  Float2<Dtype, Acctype> res = reduce<Float2<Dtype, Acctype>, GradOp<Dtype, Acctype, DeviceTensor3>, DeviceTensor3>(g, gradOutput, plane);
+  Acctype gradOutputSum = res.v1;
+  Acctype dotP = res.v2;
 
-  float gradMean = gradOutputSum * norm;
-  float projScale = dotP * norm * stdVal * stdVal;
-  float gradScale = stdVal * weightVal;
+  Acctype gradMean = gradOutputSum * norm;
+  Acctype projScale = dotP * norm * stdVal * stdVal;
+  Acctype gradScale = stdVal * weightVal;
 
   if (gradInput.numElements() > 0) {
     for (int batch = 0; batch < gradOutput.getSize(0); ++batch) {
       for (int x = threadIdx.x; x < gradOutput.getSize(2); x += blockDim.x) {
-        float gradOut = gradOutput[batch][plane][x];
+        Dtype gradOut = gradOutput[batch][plane][x];
         if (train) {
-          float inp = input[batch][plane][x];
-          float proj = (inp - mean) * projScale;
-          gradInput[batch][plane][x] = (gradOut - proj - gradMean) * gradScale;
+          Dtype inp = input[batch][plane][x];
+          Acctype proj = (inp - mean) * projScale;
+          gradInput[batch][plane][x] = ScalarConvert<Acctype, Dtype>::to((gradOut - proj - gradMean) * gradScale);
         } else {
-          gradInput[batch][plane][x] = gradOut * gradScale;
+          gradInput[batch][plane][x] = ScalarConvert<Acctype, Dtype>::to(gradOut * gradScale);
         }
       }
     }
@@ -321,42 +271,16 @@ __global__ void BatchNormalizationBackward_kernel(
 
   if (gradWeight.numElements() > 0) {
     if (threadIdx.x == 0) {
-      gradWeight[plane] += scale * dotP * stdVal;
+      gradWeight[plane] += ScalarConvert<Acctype, Dtype>::to(scale * dotP * stdVal);
     }
   }
 
   if (gradBias.numElements() > 0) {
     if (threadIdx.x == 0) {
-      gradBias[plane] += scale * gradOutputSum;
+      gradBias[plane] += ScalarConvert<Acctype, Dtype>::to(scale * gradOutputSum);
     }
   }
 }
 
-void THNN_CudaBatchNormalization_backward(
-  THCState *state, THCudaTensor *input_, THCudaTensor *gradOutput_,
-  THCudaTensor *gradInput_, THCudaTensor *gradWeight_, THCudaTensor *gradBias_,
-  THCudaTensor *weight_, THCudaTensor *runningMean_, THCudaTensor *runningVar_,
-  THCudaTensor *saveMean_, THCudaTensor *saveStd_, bool train, float scale, double eps) {
-
-  THCUNN_assertSameGPU(state, 10, input_, gradOutput_, gradInput_, gradWeight_,
-    gradBias_, weight_, runningMean_, runningVar_, saveMean_, saveStd_);
-  DeviceTensor3 input = devicetensor<3>(state, input_);
-  DeviceTensor3 gradOutput = devicetensor<3>(state, gradOutput_);
-  DeviceTensor3 gradInput = devicetensor<3>(state, gradInput_);
-  DeviceTensor1 gradWeight = devicetensor<1>(state, gradWeight_);
-  DeviceTensor1 gradBias = devicetensor<1>(state, gradBias_);
-  DeviceTensor1 weight = devicetensor<1>(state, weight_);
-  DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_);
-  DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_);
-  DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_);
-  DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_);
-
-  cudaStream_t s = THCState_getCurrentStream(state);
-
-  dim3 blocks(gradOutput.getSize(1));
-  dim3 threads(getNumThreads(gradOutput.getSize(2)));
-  BatchNormalizationBackward_kernel<<<blocks, threads, 0, s>>>(
-    input, gradOutput, gradInput, gradWeight, gradBias, weight, runningMean, runningVar,
-    saveMean, saveStd, train, scale, eps);
-  THCudaCheck(cudaGetLastError());
-}
+#include "generic/BatchNormalization.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/CMakeLists.txt b/lib/THCUNN/CMakeLists.txt
index 8492503..5dd8712 100644
--- a/lib/THCUNN/CMakeLists.txt
+++ b/lib/THCUNN/CMakeLists.txt
@@ -31,6 +31,7 @@ ENDIF()
 
 FILE(GLOB src-cuda *.cu)
 
+CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
 CUDA_ADD_LIBRARY(THCUNN MODULE ${src-cuda})
 
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/lib/THCUNN/ClassNLLCriterion.cu b/lib/THCUNN/ClassNLLCriterion.cu
index 2b3ed8b..58684f4 100644
--- a/lib/THCUNN/ClassNLLCriterion.cu
+++ b/lib/THCUNN/ClassNLLCriterion.cu
@@ -1,16 +1,19 @@
 #include "THCUNN.h"
 #include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
 #include <stdio.h>
 #include <assert.h>
 
 static const int NTHREADS = 32;
 
-__global__ void cunn_ClassNLLCriterion_updateOutput_kernel1(float *output,
-                                                           float *total_weight,
-                                                           float *input,
-                                                           long  *target,
-                                                           float *weights,
+template <typename Dtype>
+__global__ void cunn_ClassNLLCriterion_updateOutput_kernel1(Dtype *output,
+                                                           Dtype *total_weight,
+                                                           Dtype *input,
+                                                           THCIndex_t  *target,
+                                                           Dtype *weights,
                                                            int size_average,
                                                            int n_classes) {
   assert(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0);
@@ -20,7 +23,7 @@ __global__ void cunn_ClassNLLCriterion_updateOutput_kernel1(float *output,
 
   int t = (int)*target - TH_INDEX_BASE;
   assert(t >= 0 && t < n_classes);
-  float cur_weight = weights ? weights[t] : 1.0f;
+  Dtype cur_weight = weights ? weights[t] : ScalarConvert<int, Dtype>::to(1);
   *output = -cur_weight * input[t];
   *total_weight = cur_weight;
   if (size_average && *total_weight > 0) {
@@ -28,25 +31,26 @@ __global__ void cunn_ClassNLLCriterion_updateOutput_kernel1(float *output,
   }
 }
 
-__global__ void cunn_ClassNLLCriterion_updateOutput_kernel(float *output,
-                                                           float *total_weight,
-                                                           float *input,
-                                                           long *target,
-                                                           float *weights,
+template <typename Dtype, typename Acctype>
+__global__ void cunn_ClassNLLCriterion_updateOutput_kernel(Dtype *output,
+                                                           Dtype *total_weight,
+                                                           Dtype *input,
+                                                           THCIndex_t *target,
+                                                           Dtype *weights,
                                                            int size_average,
                                                            int nframe,
                                                            int ndim,
                                                            int n_classes) {
-  __shared__ float shInputs[NTHREADS], acc_weight[NTHREADS];
+  __shared__ Acctype shInputs[NTHREADS], acc_weight[NTHREADS];
   int i, t;
-  float cur_weight;
+  Dtype cur_weight;
 
-  shInputs[threadIdx.x] = 0.0f;
-  acc_weight[threadIdx.x] = 0.0f;
+  shInputs[threadIdx.x] = ScalarConvert<int, Acctype>::to(0);
+  acc_weight[threadIdx.x] = ScalarConvert<int, Acctype>::to(0);
   for (i = threadIdx.x; i < nframe; i += NTHREADS) {
       t = target[i] - TH_INDEX_BASE;
       assert(t >= 0 && t < n_classes);
-      cur_weight = weights ? weights[t] : 1.0f;
+      cur_weight = weights ? weights[t] : ScalarConvert<int, Dtype>::to(1);
       shInputs[threadIdx.x] -= input[i * ndim + t] * cur_weight;
       acc_weight[threadIdx.x] += cur_weight;
   }
@@ -56,39 +60,47 @@ __global__ void cunn_ClassNLLCriterion_updateOutput_kernel(float *output,
   // updateOutput_kernel
 
   if (threadIdx.x == 0) {
-    *output = *total_weight = 0;
+    *output = *total_weight = ScalarConvert<int, Dtype>::to(0);
+    Acctype outputAcc = 0;
+    Acctype total_weightAcc = 0;
     for (i = 0; i < NTHREADS; ++i){
-      *output += shInputs[i];
-      *total_weight += acc_weight[i];
+      // FIXME should we do something here
+      outputAcc += shInputs[i];
+      total_weightAcc += acc_weight[i];
     }
+    *total_weight = ScalarConvert<Acctype, Dtype>::to(total_weightAcc);
+    *output = ScalarConvert<Acctype, Dtype>::to(outputAcc);
     if (size_average && *total_weight > 0) {
-      *output /= *total_weight;
+      *output = ScalarConvert<Acctype, Dtype>::to(outputAcc / total_weightAcc);
     }
+
   }
 }
 
+template <typename Dtype>  // Dtype: element type of gradInput, weights and total_weight
 __global__ void cunn_ClassNLLCriterion_updateGradInput_kernel1(
-  float* gradInput,
-  float* weights,
-  long* target,
-  float* total_weight,
+  Dtype* gradInput,  // output; only the entry at the target index is written
+  Dtype* weights,  // may be null, in which case a weight of 1 is used
+  THCIndex_t* target,  // single class index, offset by TH_INDEX_BASE
+  Dtype* total_weight,  // only read here; the kernel returns early when it is <= 0
   int size_average,
   int n_classes)
 {
   if (*total_weight <= 0) {
     return;
   }
-  float norm = size_average ? (1.0f / *total_weight) : 1.0f;
+  Dtype norm = size_average ? (ScalarConvert<int, Dtype>::to(1) / *total_weight) : ScalarConvert<int, Dtype>::to(1);  // 1 / total_weight when averaging, else 1
   int t = (int)*target - TH_INDEX_BASE;
   assert(t >= 0 && t < n_classes);
-  gradInput[t] = -(weights ? weights[t] : 1.0f) * norm;
+  gradInput[t] = -(weights ? weights[t] : ScalarConvert<int, Dtype>::to(1)) * norm;  // gradient is -(class weight) * norm
 }
 
+template <typename Dtype>
 __global__ void cunn_ClassNLLCriterion_updateGradInput_kernel(
-  float *gradInput,
-  long *target,
-  float *weights,
-  float *total_weight,
+  Dtype *gradInput,
+  THCIndex_t *target,
+  Dtype *weights,
+  Dtype *total_weight,
   int size_average,
   int nframe,
   int ndim,
@@ -98,148 +110,14 @@ __global__ void cunn_ClassNLLCriterion_updateGradInput_kernel(
     return;
   }
   int i, t;
-  float norm = size_average ? (1.0f / *total_weight) : 1.0f;
+  Dtype norm = size_average ? (ScalarConvert<int, Dtype>::to(1) / *total_weight) : ScalarConvert<int, Dtype>::to(1);
 
   for (i = threadIdx.x; i < nframe; i += NTHREADS) {
     t = (int)target[i] - TH_INDEX_BASE;
     assert(t >= 0 && t < n_classes);
-    gradInput[i * ndim + t] = -(weights ? weights[t] : 1.0f) * norm;
-  }
-}
-
-void THNN_CudaClassNLLCriterion_updateOutput(THCState *state, THCudaTensor *input, THCudaLongTensor *target, THCudaTensor *output, bool sizeAverage, THCudaTensor *weights, THCudaTensor *total_weight) {
-  if (THCudaLongTensor_nDimension(state, target) > 1) {
-    THError("multi-target not supported");
-  }
-
-  int n_dims = THCudaTensor_nDimension(state, input);
-  int n_classes = THCudaTensor_size(state, input, n_dims - 1);
-
-  if (weights) {
-    THCUNN_assertSameGPU(
-      state, 5, input, target, weights, output, total_weight
-    );
-  } else {
-    THCUNN_assertSameGPU(
-      state, 4, input, target, output, total_weight
-    );
-  }
-
-  if (THCudaTensor_nDimension(state, input) > 2) {
-    THArgCheck(0, 2, "vector or matrix expected");
-  }
-  if (weights && THCudaTensor_nElement(state, weights) != n_classes) {
-    THError("weight tensor should be defined either for all or no classes");
-  }
-
-  input = THCudaTensor_newContiguous(state, input);
-  weights = weights ? THCudaTensor_newContiguous(state, weights) : NULL;
-  target = THCudaLongTensor_newContiguous(state, target);
-
-  float *input_data = THCudaTensor_data(state, input);
-  float *weights_data = weights ? THCudaTensor_data(state, weights) : NULL;
-  long  *target_data = THCudaLongTensor_data(state, target);
-  float *output_data = THCudaTensor_data(state, output);
-  float *total_weight_data = THCudaTensor_data(state, total_weight);
-
-  if (THCudaTensor_nDimension(state, input) == 1) {
-    cunn_ClassNLLCriterion_updateOutput_kernel1
-      <<<1, 1, 0, THCState_getCurrentStream(state)>>>(
-        output_data,
-        total_weight_data,
-        input_data,
-        target_data,
-        weights_data,
-        sizeAverage,
-        n_classes
-    );
-
-  } else if (THCudaTensor_nDimension(state, input) == 2) {
-    cunn_ClassNLLCriterion_updateOutput_kernel
-      <<<1, NTHREADS, 0, THCState_getCurrentStream(state)>>>(
-        output_data,
-        total_weight_data,
-        input_data,
-        target_data,
-        weights_data,
-        sizeAverage,
-        THCudaTensor_size(state, input, 0),
-        THCudaTensor_size(state, input, 1),
-        n_classes
-    );
-  }
-  THCudaCheck(cudaGetLastError());
-
-  if (weights) {
-    THCudaTensor_free(state, weights);
+    gradInput[i * ndim + t] = -(weights ? weights[t] : ScalarConvert<int, Dtype>::to(1)) * norm;
   }
-  THCudaLongTensor_free(state, target);
-  THCudaTensor_free(state, input);
 }
 
-void THNN_CudaClassNLLCriterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaLongTensor *target, THCudaTensor *gradInput, bool sizeAverage, THCudaTensor *weights, THCudaTensor *total_weight) {
-  if (THCudaLongTensor_nDimension(state, target) > 1) {
-    THError("multi-target not supported");
-  }
-
-  int n_dims = THCudaTensor_nDimension(state, input);
-  int n_classes = THCudaTensor_size(state, input, n_dims - 1);
-
-  THArgCheck(THCudaTensor_isContiguous(state, gradInput), 4, "gradInput must be contiguous");
-
-  if (weights) {
-    THCUNN_assertSameGPU(
-      state, 5, weights, input, target, gradInput, total_weight
-    );
-  }
-  else {
-    THCUNN_assertSameGPU(
-      state, 4, input, target, gradInput, total_weight
-    );
-  }
-
-  if (THCudaTensor_nDimension(state, input) > 2) {
-    THArgCheck(0, 2, "vector or matrix expected");
-  }
-  if (weights && THCudaTensor_nElement(state, weights) != n_classes) {
-    THError("weight tensor should be defined either for all or no classes");
-  }
-
-  weights = weights ? THCudaTensor_newContiguous(state, weights) : NULL;
-  target = THCudaLongTensor_newContiguous(state, target);
-
-  float *weights_data = weights ? THCudaTensor_data(state, weights) : NULL;
-  float *gradInput_data = THCudaTensor_data(state, gradInput);
-  long  *target_data = THCudaLongTensor_data(state, target);
-  float *total_weight_data = THCudaTensor_data(state, total_weight);
-
-  if (THCudaTensor_nDimension(state, input) == 1) {
-    cunn_ClassNLLCriterion_updateGradInput_kernel1
-      <<<1, 1, 0, THCState_getCurrentStream(state)>>>(
-        gradInput_data,
-        weights_data,
-        target_data,
-        total_weight_data,
-        sizeAverage,
-        n_classes
-    );
-  } else {
-    cunn_ClassNLLCriterion_updateGradInput_kernel
-      <<<1, NTHREADS, 0, THCState_getCurrentStream(state)>>>(
-        gradInput_data,
-        target_data,
-        weights_data,
-        total_weight_data,
-        sizeAverage,
-        THCudaTensor_size(state, input, 0),
-        THCudaTensor_size(state, input, 1),
-        n_classes
-    );
-  }
-  THCudaCheck(cudaGetLastError());
-
-  if (weights) {
-    THCudaTensor_free(state, weights);
-  }
-  THCudaLongTensor_free(state, target);
-}
+#include "generic/ClassNLLCriterion.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/DistKLDivCriterion.cu b/lib/THCUNN/DistKLDivCriterion.cu
index 2984672..aa453f8 100644
--- a/lib/THCUNN/DistKLDivCriterion.cu
+++ b/lib/THCUNN/DistKLDivCriterion.cu
@@ -1,5 +1,7 @@
 #include "THCUNN.h"
 #include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
 #include <thrust/fill.h>
 #include <thrust/functional.h>
@@ -7,76 +9,30 @@
 #include <thrust/reduce.h>
 #include <thrust/inner_product.h>
 
+template <typename Dtype, typename Acctype>  // Dtype: element type of the operands; Acctype: type of the returned term
 struct kl_functor
 {
-  __host__ __device__ float operator()(const float& x, const float& y) const
+  __host__ __device__ Acctype operator()(const Dtype& x, const Dtype& y) const  // per-element term: y * (log(y) - x) for y > 0, else 0
   {
-      return y > 0 ? y * (log(y) - x) : 0;
+      Acctype yAcc = ScalarConvert<Dtype, Acctype>::to(y);  // convert y to Acctype before taking the log
+      return y > 0 ? yAcc * (THCNumerics<Acctype>::log(yAcc) - x) : Acctype(0);  // the y > 0 guard keeps log() away from non-positive y
   }
 };
 
-void THNN_CudaDistKLDivCriterion_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *output, bool sizeAverage)
-{
-  THCUNN_assertSameGPU(state, 2, input, target);
-
-  THArgCheck(THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2,
-             "input and target need to have the same number of elements");
-
-  float sum;
-
-  long size = THCudaTensor_nElement(state, input);
-
-  input = THCudaTensor_newContiguous(state, input);
-  target = THCudaTensor_newContiguous(state, target);
-
-  thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
-  thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
-  sum = thrust::inner_product(input_data, input_data+size, target_data, (float) 0, thrust::plus<float>(), kl_functor());
-
-  if (sizeAverage)
-    sum /= size;
-
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, target);
-
-  THCudaTensor_set1d(state, output, 0, sum);
-}
-
+template <typename Dtype>  // Dtype: element type of the operands and of the result
 struct kl_updateGradInput_functor
 {
-  const float norm;
+  const Dtype norm;  // scale factor applied to -y
 
-  kl_updateGradInput_functor(float norm_)
+  kl_updateGradInput_functor(Dtype norm_)  // stores the scale factor
     : norm(norm_)
   {}
 
-  __host__ __device__ float operator()(const float& x, const float& y) const
+  __host__ __device__ Dtype operator()(const Dtype& x, const Dtype& y) const  // x is unused; the result depends only on y
   {
-      return y > 0 ? norm * (-y) : 0;
+      return y > 0 ? norm * (-y) : ScalarConvert<int, Dtype>::to(0);  // -norm * y where y > 0, zero elsewhere
   }
 };
 
-void THNN_CudaDistKLDivCriterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage)
-{
-  THCUNN_assertSameGPU(state, 3, input, target, gradInput);
-
-  THArgCheck(THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2,
-             "input and target need to have the same number of elements");
-
-  long size = THCudaTensor_nElement(state, input);
-  float norm = (sizeAverage ? 1./size : 1.);
-
-  input = THCudaTensor_newContiguous(state, input);
-  target = THCudaTensor_newContiguous(state, target);
-
-  THCudaTensor_resizeAs(state, gradInput, input);
-
-  thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
-  thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
-  thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput));
-
-  thrust::transform(input_data, input_data+size, target_data, gradInput_data, kl_updateGradInput_functor(norm));
-
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, target);
-}
+#include "generic/DistKLDivCriterion.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/ELU.cu b/lib/THCUNN/ELU.cu
index e4a05bc..ced993c 100644
--- a/lib/THCUNN/ELU.cu
+++ b/lib/THCUNN/ELU.cu
@@ -1,93 +1,67 @@
 #include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
+template <typename T>  // T: element type of input and output
 struct ELUupdateOutput_functor
 {
-  const float alpha_;
+  const T alpha_;  // multiplier for the (exp(x) - 1) branch
 
-  ELUupdateOutput_functor(float alpha)
+  ELUupdateOutput_functor(T alpha)  // stores alpha
     : alpha_(alpha)
   {}
 
-  __device__ void operator()(float *output, const float *input) const
+  __device__ void operator()(T *output, const T *input) const  // output = alpha_ * (exp(x) - 1) for x <= 0, x otherwise
   {
     *output = *input <= 0 ? (exp(*input) - 1) * alpha_ : *input;
   }
 };
 
 // in-place variant
+template <typename T>  // T: element type of the value updated in place
 struct ELUupdateOutputIP_functor
 {
-  const float alpha_;
+  const T alpha_;  // multiplier for the (exp(x) - 1) branch
 
-  ELUupdateOutputIP_functor(float alpha)
+  ELUupdateOutputIP_functor(T alpha)  // stores alpha
     : alpha_(alpha)
   {}
 
-  __device__ void operator()(float *x) const
+  __device__ void operator()(T *x) const  // overwrites *x with alpha_ * (exp(x) - 1) for x <= 0, leaves it unchanged otherwise
   {
     *x = *x <= 0 ? (exp(*x) - 1) * alpha_ : *x;
   }
 };
 
-void THNN_CudaELU_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output,
-  float alpha, bool inplace)
-{
-  THCUNN_assertSameGPU(state, 2, input, output);
-
-  if (inplace)
-  {
-    THC_pointwiseApply1(state, input, ELUupdateOutputIP_functor(alpha));
-    THCudaTensor_set(state, output, input);
-  }
-  else
-  {
-    THCudaTensor_resizeAs(state, output, input);
-    THC_pointwiseApply2(state, output, input, ELUupdateOutput_functor(alpha));
-  }
-}
-
+template <typename T>  // T: element type of gradInput, output and gradOutput
 struct ELUupdateGradInput_functor
 {
-  const float alpha_;
+  const T alpha_;  // added to output in the output <= 0 branch
 
-  ELUupdateGradInput_functor(float alpha)
+  ELUupdateGradInput_functor(T alpha)  // stores alpha
     : alpha_(alpha)
   {}
 
-  __device__ void operator()(float *gradInput, const float *output, const float *gradOutput) const
+  __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const  // gradInput = gradOutput * (output + alpha_) where output <= 0, gradOutput otherwise
   {
     *gradInput = (*output) <= 0 ? (*gradOutput * (*output + alpha_)) : (*gradOutput);
   }
 };
 
+template <typename T>  // T: element type of gradOutput and output
 struct ELUupdateGradInputIP_functor
 {
-  const float alpha_;
+  const T alpha_;  // added to output in the output <= 0 branch
 
-  ELUupdateGradInputIP_functor(float alpha)
+  ELUupdateGradInputIP_functor(T alpha)  // stores alpha
     : alpha_(alpha)
   {}
 
-  __device__ void operator()(float *gradOutput, const float *output) const
+  __device__ void operator()(T *gradOutput, const T *output) const  // overwrites gradOutput: scaled by (output + alpha_) where output <= 0, unchanged otherwise
   {
     *gradOutput = (*output) <= 0 ? (*gradOutput * (*output + alpha_)) : (*gradOutput);
   }
 };
 
-void THNN_CudaELU_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput,
-  THCudaTensor *gradInput, THCudaTensor *output, float alpha, bool inplace)
-{
-  THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
-
-  if (inplace)
-  {
-    THC_pointwiseApply2(state, gradOutput, output, ELUupdateGradInputIP_functor(alpha));
-    THCudaTensor_set(state, gradInput, gradOutput);
-  }
-  else
-  {
-    THCudaTensor_resizeAs(state, gradInput, output);
-    THC_pointwiseApply3(state, gradInput, output, gradOutput, ELUupdateGradInput_functor(alpha));
-  }
-}
+#include "generic/ELU.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/HardTanh.cu b/lib/THCUNN/HardTanh.cu
index c7df1dd..b400beb 100644
--- a/lib/THCUNN/HardTanh.cu
+++ b/lib/THCUNN/HardTanh.cu
@@ -1,17 +1,19 @@
 #include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
+template <typename T>
 struct hardtanhupdateOutput_functor
 {
-  const float max_val_;
-  const float min_val_;
+  const T max_val_;
+  const T min_val_;
 
-  hardtanhupdateOutput_functor(float min_val, float max_val)
+  hardtanhupdateOutput_functor(T min_val, T max_val)
     : min_val_(min_val)
     , max_val_(max_val)
   {}
 
-  __device__ void operator()(float *output, const float *input) const
+  __device__ void operator()(T *output, const T *input) const
   {
     if (*input < min_val_)
       *output = min_val_;
@@ -21,7 +23,7 @@ struct hardtanhupdateOutput_functor
       *output = max_val_;
   }
 
-  __device__ void operator()(float *input) const
+  __device__ void operator()(T *input) const
   {
     if (*input < min_val_)
       *input = min_val_;
@@ -30,74 +32,31 @@ struct hardtanhupdateOutput_functor
   }
 };
 
-void THNN_CudaHardTanh_updateOutput(
-      THCState *state,
-      THCudaTensor *input,
-      THCudaTensor *output,
-      float min_val,
-      float max_val,
-      bool inplace)
-{
-  THCUNN_assertSameGPU(state, 2, input, output);
-  if(inplace)
-  {
-    THCudaTensor_set(state, output, input);
-    THC_pointwiseApply1(state, output, hardtanhupdateOutput_functor(min_val, max_val));
-  }
-  else
-  {
-    THCudaTensor_resizeAs(state, output, input);
-    THC_pointwiseApply2(state, output, input,
-                               hardtanhupdateOutput_functor(min_val, max_val));
-  }
-}
-
+template <typename T>  // T: element type of the gradients, the input and the bounds
 struct hardtanhupdateGradInput_functor
 {
-  const float max_val_;
-  const float min_val_;
+  const T max_val_;  // upper bound compared against the input
+  const T min_val_;  // lower bound compared against the input
 
-  hardtanhupdateGradInput_functor(float min_val, float max_val)
+  hardtanhupdateGradInput_functor(T min_val, T max_val)  // NOTE: members initialize in declaration order (max_val_ first), not in the order listed below
     : min_val_(min_val)
     , max_val_(max_val)
   {}
 
-  __device__ void operator()(float *gradInput, const float *input, const float *gradOutput) const
+  __device__ void operator()(T *gradInput, const T *input, const T *gradOutput) const  // three-pointer form: zero where input is strictly outside the bounds, else copy gradOutput
   {
     if (*input < min_val_ || *input > max_val_)
-      *gradInput = 0;
+      *gradInput = ScalarConvert<int, T>::to(0);  // zero expressed in T
     else
       *gradInput = *gradOutput;
   }
 
-  __device__ void operator()(float *gradInput, const float *input) const
+  __device__ void operator()(T *gradInput, const T *input) const  // two-pointer form; NOTE(review): tests <=/>= while the three-pointer form tests </> — confirm the boundary difference is intended
   {
     if (*input <= min_val_ || *input >= max_val_)
-      *gradInput = 0;
+      *gradInput = ScalarConvert<int, T>::to(0);  // gradInput is left untouched when input lies strictly inside the bounds
   }
 };
 
-void THNN_CudaHardTanh_updateGradInput(
-    THCState *state,
-    THCudaTensor *input,
-    THCudaTensor *gradOutput,
-    THCudaTensor *gradInput,
-    float min_val,
-    float max_val,
-    bool inplace)
-{
-  THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
-
-  if (inplace)
-  {
-    THCudaTensor_set(state, gradInput, gradOutput);
-    THC_pointwiseApply2(state, gradInput, input,
-                                 hardtanhupdateGradInput_functor(min_val, max_val));
-  }
-  else
-  {
-    THCudaTensor_resizeAs(state, gradInput, input);
-    THC_pointwiseApply3(state, gradInput, input, gradOutput,
-                                 hardtanhupdateGradInput_functor(min_val, max_val));
-  }
-}
+#include "generic/HardTanh.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/L1Cost.cu b/lib/THCUNN/L1Cost.cu
index 7dd6bb2..eda58c1 100644
--- a/lib/THCUNN/L1Cost.cu
+++ b/lib/THCUNN/L1Cost.cu
@@ -1,57 +1,34 @@
 #include "THCUNN.h"
 #include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
 #include <thrust/device_ptr.h>
 #include <thrust/reduce.h>
-#include <thrust/transform.h>
+#include <thrust/transform_reduce.h>
 
+template <typename Dtype, typename Acctype>  // Dtype: element type of the argument; Acctype: type of the returned magnitude
 struct l1cost_functor
 {
-  __host__ __device__ float operator()(float x, float y) const
+  __host__ __device__ Acctype operator()(Dtype x) const  // unary: maps one element to its absolute value (the removed version summed two magnitudes)
   {
-    return abs(x) + abs(y);
+    return THCNumerics<Acctype>::abs(ScalarConvert<Dtype, Acctype>::to(x));  // convert to Acctype first, then take abs
   }
 };
 
-void THNN_CudaL1Cost_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output)
-{
-  THCUNN_assertSameGPU(state, 1, input);
-  float sum;
-  long size = THCudaTensor_nElement(state, input);
-  input = THCudaTensor_newContiguous(state, input);
-  thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
-  sum = thrust::reduce(input_data, input_data+size, (float) 0, l1cost_functor());
-
-  THCudaTensor_free(state, input);
-
-  THCudaTensor_set1d(state, output, 0, sum);
-}
-
+template <typename Dtype>  // Dtype: element type of the argument and of the result
 struct l1cost_updateGradInput_functor
 {
-  __host__ __device__ float operator()(float x) const
+  __host__ __device__ Dtype operator()(Dtype x) const  // returns the sign of x (1, -1 or 0) expressed in Dtype
   {
     if (x > 0)
-      return 1;
+      return ScalarConvert<int, Dtype>::to(1);  // positive input
     else if (x < 0)
-      return -1;
+      return ScalarConvert<int, Dtype>::to(-1);  // negative input
     else
-      return 0;
+      return ScalarConvert<int, Dtype>::to(0);  // neither greater nor less than zero
   }
 };
 
-void THNN_CudaL1Cost_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput)
-{
-  THCUNN_assertSameGPU(state, 2, input, gradInput);
-  long size = THCudaTensor_nElement(state, input);
-
-  input = THCudaTensor_newContiguous(state, input);
-  THCudaTensor_resizeAs(state, gradInput, input);
-
-  thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
-  thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput));
-
-  thrust::transform(input_data, input_data+size, gradInput_data, l1cost_updateGradInput_functor());
-
-  THCudaTensor_free(state, input);
-}
+#include "generic/L1Cost.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/LeakyReLU.cu b/lib/THCUNN/LeakyReLU.cu
index a641821..8c0e6f8 100644
--- a/lib/THCUNN/LeakyReLU.cu
+++ b/lib/THCUNN/LeakyReLU.cu
@@ -1,103 +1,73 @@
 #include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
+template <typename T>  // T: element type of in and out
 struct LeakyReLUUpdateOutput
 {
-  const float negval_;
+  const T negval_;  // multiplier applied to non-positive inputs
 
-  LeakyReLUUpdateOutput(float negval)
+  LeakyReLUUpdateOutput(T negval)  // stores negval
     : negval_(negval)
   {}
 
-  __device__ __forceinline__ void operator()(float *out, float *in)
+  __device__ __forceinline__ void operator()(T *out, T *in)  // out = x for x > 0, x * negval_ otherwise
   {
-    float x = *in;
+    T x = *in;  // read the input once
     *out = (x > 0) ? x : x * negval_;
   }
 };
 
 // in-place variant
+template <typename T>  // T: element type of the value updated in place
 struct LeakyReLUUpdateOutputIP
 {
-  const float negval_;
+  const T negval_;  // multiplier applied to non-positive values
 
-  LeakyReLUUpdateOutputIP(float negval)
+  LeakyReLUUpdateOutputIP(T negval)  // stores negval
     : negval_(negval)
   {}
 
-  __device__ __forceinline__ void operator()(float *x)
+  __device__ __forceinline__ void operator()(T *x)  // overwrites *x: unchanged for x > 0, negval_ * x otherwise
   {
     *x = (*x > 0) ? *x : negval_ * (*x);
   }
 };
 
-void THNN_CudaLeakyReLU_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output,
-  double negval, bool inplace)
-{
-  THCUNN_assertSameGPU(state, 2, input, output);
-
-  if (inplace)
-  {
-    THC_pointwiseApply1(state, input, LeakyReLUUpdateOutputIP(negval));
-    THCudaTensor_set(state, output, input);
-  }
-  else
-  {
-    THCudaTensor_resizeAs(state, output, input);
-    THC_pointwiseApply2(state, output, input, LeakyReLUUpdateOutput(negval));
-  }
-
-  THCudaCheck(cudaGetLastError());
-}
-
+template <typename T>  // T: element type of gradInput, input and gradOutput
 struct LeakyReLUUpdateGradInput
 {
-  const float negval_;
+  const T negval_;  // multiplier applied to the gradient where input is not positive
 
-  LeakyReLUUpdateGradInput(float negval)
+  LeakyReLUUpdateGradInput(T negval)  // stores negval
     : negval_(negval)
   {}
 
   __device__ __forceinline__ void operator()(
-    float* gradInput,
-    float* input,
-    float* gradOutput) const
+    T* gradInput,  // written: gradOutput, or gradOutput * negval_
+    T* input,  // only its sign test (> 0) is used
+    T* gradOutput) const  // incoming gradient, read only
   {
     *gradInput = (*input > 0) ? *gradOutput : (*gradOutput) * negval_;
   }
 };
 
+template <typename T>  // T: element type of gradOutput and input
 struct LeakyReLUUpdateGradInputIP
 {
-  const float negval_;
+  const T negval_;  // multiplier applied to the gradient where input is not positive
 
-  LeakyReLUUpdateGradInputIP(float negval)
+  LeakyReLUUpdateGradInputIP(T negval)  // stores negval
     : negval_(negval)
   {}
 
   __device__ __forceinline__ void operator()(
-    float* gradOutput,
-    float* input) const
+    T* gradOutput,  // updated in place: unchanged where input > 0, scaled by negval_ otherwise
+    T* input) const  // only its sign test (> 0) is used
   {
     *gradOutput = (*input > 0) ? *gradOutput : (*gradOutput) * negval_;
   }
 };
 
-void THNN_CudaLeakyReLU_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput,
-  THCudaTensor *gradInput, double negval, bool inplace)
-{
-  THCUNN_assertSameGPU(state, 3, input, gradInput, gradOutput);
-
-  if (inplace)
-  {
-    THC_pointwiseApply2(state, gradOutput, input, LeakyReLUUpdateGradInputIP(negval));
-    THCudaTensor_set(state, gradInput, gradOutput);
-  }
-  else
-  {
-    THCudaTensor_resizeAs(state, gradInput, input);
-    THC_pointwiseApply3(state, gradInput, input, gradOutput, LeakyReLUUpdateGradInput(negval));
-  }
-
-  THCudaCheck(cudaGetLastError());
-}
+#include "generic/LeakyReLU.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/LogSigmoid.cu b/lib/THCUNN/LogSigmoid.cu
index 2f56081..f008b63 100644
--- a/lib/THCUNN/LogSigmoid.cu
+++ b/lib/THCUNN/LogSigmoid.cu
@@ -1,35 +1,26 @@
 #include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
+template <typename T>  // T: element type of input and output
 struct logSigmoid_updateOutput_functor
 {
-  __device__ void operator()(float *output, const float *input) const
+  __device__ void operator()(T *output, const T *input) const  // output = -log(1 + exp(-x))
   {
-    float z = exp(-*input);
-    *output = -log(1. + z);
+    T z = exp(-*input);  // z = exp(-x); NOTE(review): may overflow for very negative x — confirm acceptable
+    *output = ScalarConvert<double, T>::to(-log(1. + z));  // the expression is converted from double back to T
   }
 };
 
-void THNN_CudaLogSigmoid_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *buffer)
-{
-  THCUNN_assertSameGPU(state, 2, input, output);
-  THCudaTensor_resizeAs(state, output, input);
-  THC_pointwiseApply2(state, output, input, logSigmoid_updateOutput_functor());
-}
-
+template <typename T>  // T: element type of gradInput, input and gradOutput
 struct logSigmoid_updateGradInput_functor
 {
-  __device__ void operator()(float *gradInput, const float *input, const float *gradOutput) const
+  __device__ void operator()(T *gradInput, const T *input, const T *gradOutput) const  // gradInput = gradOutput * z / (1 + z) with z = exp(-x)
   {
-    float z = exp(-*input);
-    *gradInput = *gradOutput * z / (1. + z);
+    T z = exp(-*input);  // z = exp(-x)
+    *gradInput = ScalarConvert<double, T>::to(*gradOutput * z / (1. + z));  // the expression is converted from double back to T
   }
 };
 
-void THNN_CudaLogSigmoid_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput,
-  THCudaTensor *gradInput , THCudaTensor *buffer)
-{
-  THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
-  THCudaTensor_resizeAs(state, gradInput, input);
-  THC_pointwiseApply3(state, gradInput, input, gradOutput, logSigmoid_updateGradInput_functor());
-}
+#include "generic/LogSigmoid.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/LogSoftMax.cu b/lib/THCUNN/LogSoftMax.cu
index 70cab80..4d7973e 100644
--- a/lib/THCUNN/LogSoftMax.cu
+++ b/lib/THCUNN/LogSoftMax.cu
@@ -1,7 +1,10 @@
 #include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "SharedMem.cuh"
 
-__global__ void cunn_SpatialLogSoftMax_updateOutput_kernel(float *output, float *input, int classSize, int height, int width)
+template <typename T, typename AccumT>
+__global__ void cunn_SpatialLogSoftMax_updateOutput_kernel(T *output, T *input, int classSize, int height, int width)
 {
   int batchIndex = blockIdx.x;
   int index = threadIdx.x;
@@ -18,11 +21,11 @@ __global__ void cunn_SpatialLogSoftMax_updateOutput_kernel(float *output, float
       (width*classSize)*y +
       (classSize)*x;
 
-    float sum = 0;
+    AccumT sum = 0;
     for (int i = 0; i < classSize; i++) {
-      sum += __expf(input[inputStartIndex + i]);
+      sum += THCNumerics<T>::exp(input[inputStartIndex + i]);
     }
-    sum = 1.0f / sum;
+    sum = AccumT(1) / sum;
 
     for (int i = 0; i < classSize; i++) {
       // calculate output index in torch layout (B x C x H x W)
@@ -31,13 +34,15 @@ __global__ void cunn_SpatialLogSoftMax_updateOutput_kernel(float *output, float
         (height*width)*i +
         (width)*y +
         x;
-      output[outputIndex] = logf(sum * __expf(input[inputStartIndex + i]));
+      output[outputIndex] = ScalarConvert<AccumT, T>::to(
+        THCNumerics<AccumT>::log(sum * THCNumerics<T>::exp(input[inputStartIndex + i])));
     }
     index += blockDim.x;
   }
 }
 
-__global__ void cunn_SpatialLogSoftMax_updateGradInput_kernel(float *gradInput, float *output, float *gradOutput, int classSize, int height, int width)
+template <typename T, typename AccumT>
+__global__ void cunn_SpatialLogSoftMax_updateGradInput_kernel(T *gradInput, T *output, T *gradOutput, int classSize, int height, int width)
 {
   int batchIndex = blockIdx.x;
   int index = threadIdx.x;
@@ -54,7 +59,7 @@ __global__ void cunn_SpatialLogSoftMax_updateGradInput_kernel(float *gradInput,
       (width*classSize)*y +
       (classSize)*x;
 
-    float sum = 0;
+    AccumT sum = 0;
     for (int i = 0; i < classSize; i++) {
       sum += gradOutput[outputStartIndex + i];
     }
@@ -66,70 +71,76 @@ __global__ void cunn_SpatialLogSoftMax_updateGradInput_kernel(float *gradInput,
         (height*width)*i +
         (width)*y +
         x;
-      gradInput[inputIndex] = gradOutput[outputStartIndex + i] - __expf(output[outputStartIndex + i]) * sum;
+      gradInput[inputIndex] = ScalarConvert<AccumT, T>::to(
+        gradOutput[outputStartIndex + i] - THCNumerics<T>::exp(output[outputStartIndex + i]) * sum);
     }
     index += blockDim.x;
   }
 }
 
+template <typename T, typename AccumT>
 struct MaxFloat
 {
-  __device__ __forceinline__ float operator()(float max, float v) const
+  __device__ __forceinline__ AccumT operator()(AccumT max, T v) const
   {
-    return fmaxf(max, v);
+    return fmaxType(max, v);
   }
 };
 
+template<typename T, typename AccumT>
 struct SumFloat
 {
-  __device__ __forceinline__ float operator()(float sum, float v) const
+  __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const
   {
     return sum + v;
   }
 };
 
+template<typename T, typename AccumT>
 struct SumExpFloat
 {
-  __device__ __forceinline__ SumExpFloat(float v)
+  __device__ __forceinline__ SumExpFloat(T v)
     : max_k(v)
   {}
 
-  __device__ __forceinline__ float operator()(float sum, float v) const
+  __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const
   {
-    return sum + expf(v - max_k);
+    return sum + THCNumerics<T>::exp(v - max_k);
   }
 
-  const float max_k;
+  const T max_k;
 };
 
+template<typename AccumT>
 struct NoFinal
 {
-  __device__ __forceinline__ float operator()(float v) const
+  __device__ __forceinline__ AccumT operator()(AccumT v) const
   {
     return v;
   }
 };
 
+template<typename AccumT>
 struct LSMFinal
 {
-  __device__ __forceinline__ LSMFinal(float m)
+  __device__ __forceinline__ LSMFinal(AccumT m)
     : max_k(m)
   {}
 
-  __device__ __forceinline__ float operator()(float v) const
+  __device__ __forceinline__ AccumT operator()(AccumT v) const
   {
-    return max_k + logf(v);
+    return max_k + THCNumerics<AccumT>::log(v);
   }
 
-  const float max_k;
+  const AccumT max_k;
 };
 
-template <typename Reduction, typename Finalize>
-__device__ __forceinline__ float
-blockReduce(float* smem, float val,
-            const Reduction& r,
-            float defaultVal,
-            const Finalize& f)
+template <template<typename, typename> class Reduction, template<typename> class Finalize, typename AccumT>
+__device__ __forceinline__ AccumT
+blockReduce(AccumT* smem, AccumT val,
+            const Reduction<AccumT, AccumT>& r,
+            AccumT defaultVal,
+            const Finalize<AccumT>& f)
 {
   // To avoid RaW races from chaining blockReduce calls together, we
   // need a sync here
@@ -139,7 +150,7 @@ blockReduce(float* smem, float val,
 
   __syncthreads();
 
-  float warpVal = defaultVal;
+  AccumT warpVal = defaultVal;
 
   // First warp will perform per-warp reductions for the remaining warps
   if ((threadIdx.x / 32) == 0) // only threads in warp1 go into this (if)
@@ -162,7 +173,7 @@ blockReduce(float* smem, float val,
   __syncthreads();
 
   // First thread will perform a reduction of the above per-warp reductions
-  float blockVal = defaultVal;
+  AccumT blockVal = defaultVal;
 
   if (threadIdx.x == 0)
   {
@@ -179,23 +190,23 @@ blockReduce(float* smem, float val,
   return smem[0];
 }
 
-template <typename Reduction>
-__device__ __forceinline__ float
-blockReduce(float* smem, float val,
-            const Reduction& r,
-            float defaultVal)
+template <template<typename, typename> class Reduction, typename AccumT>
+__device__ __forceinline__ AccumT
+blockReduce(AccumT* smem, AccumT val,
+            const Reduction<AccumT, AccumT>& r,
+            AccumT defaultVal)
 {
-  return blockReduce<Reduction, NoFinal>(smem, val, r, defaultVal, NoFinal());
+  return blockReduce<Reduction, NoFinal, AccumT>(smem, val, r, defaultVal, NoFinal<AccumT>());
 }
 
-template <typename Reduction, int ILP>
-__device__ __forceinline__ float
-ilpReduce(float* data,
+template <template<typename, typename> class Reduction, int ILP, typename T, typename AccumT>
+__device__ __forceinline__ AccumT
+ilpReduce(T* data,
           int size,
-          const Reduction& r,
-          float defaultVal)
+          const Reduction<T, AccumT>& r,
+          AccumT defaultVal)
 {
-  float threadVal = defaultVal;
+  AccumT threadVal = defaultVal;
   int offset = threadIdx.x;
 
   int last = size % (ILP * blockDim.x);
@@ -203,7 +214,7 @@ ilpReduce(float* data,
   // Body (unroll by ILP times)
   for (; offset < size - last; offset += blockDim.x * ILP)
   {
-    float tmp[ILP];
+    T tmp[ILP];
 
 #pragma unroll
     for (int j = 0; j < ILP; ++j)
@@ -227,28 +238,30 @@ ilpReduce(float* data,
   return threadVal;
 }
 
-template <int ILP>
+template <int ILP, typename T, typename AccumT>
 __global__ void
-cunn_LogSoftMax_updateOutput_kernel(float *output, float *input, int classes)
+cunn_LogSoftMax_updateOutput_kernel(T *output, T *input, int classes)
 {
-  extern __shared__ float buffer[];
+  SharedMem<AccumT> smem;
+  AccumT *buffer = smem.getPointer();
   // forward pointers to batch[blockIdx.x]
   // each block handles a sample in the mini-batch
   input += blockIdx.x * classes;
   output += blockIdx.x * classes;
 
   // find the max of the batch
-  float threadMax =
-    ilpReduce<MaxFloat, ILP>(input, classes, MaxFloat(), -FLT_MAX);
+  AccumT threadMax = ilpReduce<MaxFloat, ILP, T, AccumT>(
+      input, classes, MaxFloat<T, AccumT>(), -THCNumerics<AccumT>::max());
   // find the max over all batches
-  float max_k =
-    blockReduce<MaxFloat>(buffer, threadMax, MaxFloat(), -FLT_MAX);
+  AccumT max_k = blockReduce<MaxFloat, AccumT>(
+      buffer, threadMax, MaxFloat<AccumT, AccumT>(), -THCNumerics<AccumT>::max());
+  T max_k_non_accum = ScalarConvert<AccumT, T>::to(max_k);
 
-  float threadExp =
-    ilpReduce<SumExpFloat, ILP>(input, classes, SumExpFloat(max_k), 0.0f);
-  float logsum_k =
-    blockReduce<SumFloat, LSMFinal>(
-      buffer, threadExp, SumFloat(), 0.0f, LSMFinal(max_k));
+  AccumT threadExp = ilpReduce<SumExpFloat, ILP, T, AccumT>(
+      input, classes, SumExpFloat<T, AccumT>(max_k_non_accum), AccumT(0));
+  T logsum_k = ScalarConvert<AccumT, T>::to(
+      blockReduce<SumFloat, LSMFinal, AccumT>(
+          buffer, threadExp, SumFloat<AccumT, AccumT>(), AccumT(0), LSMFinal<AccumT>(max_k)));
 
   // Output LSM (hand ILP)
   int offset = threadIdx.x;
@@ -256,7 +269,7 @@ cunn_LogSoftMax_updateOutput_kernel(float *output, float *input, int classes)
   int last = classes % (ILP * blockDim.x);
   for (; offset < classes - last; offset += blockDim.x * ILP)
   {
-    float tmp[ILP];
+    T tmp[ILP];
 
 #pragma unroll
     for (int j = 0; j < ILP; ++j) {
@@ -276,30 +289,32 @@ cunn_LogSoftMax_updateOutput_kernel(float *output, float *input, int classes)
   }
 }
 
-template <int ILP>
+template <int ILP, typename T, typename AccumT>
 __global__ void
-cunn_LogSoftMax_updateGradInput_kernel(float *gradInput,
-                                       float *output,
-                                       float *gradOutput,
+cunn_LogSoftMax_updateGradInput_kernel(T *gradInput,
+                                       T *output,
+                                       T *gradOutput,
                                        int classes)
 {
-  extern __shared__ float buffer[];
+  SharedMem<AccumT> smem;
+  AccumT *buffer = smem.getPointer();
   gradInput += blockIdx.x * classes;
   output += blockIdx.x * classes;
   gradOutput += blockIdx.x * classes;
 
-  float threadSum =
-    ilpReduce<SumFloat, 4>(gradOutput, classes, SumFloat(), 0.0f);
-  float sum_k =
-    blockReduce<SumFloat>(buffer, threadSum, SumFloat(), 0.0f);
+  AccumT threadSum = ilpReduce<SumFloat, 4, T, AccumT>(
+      gradOutput, classes, SumFloat<T, AccumT>(), AccumT(0));
+  T sum_k = ScalarConvert<AccumT, T>::to(
+      blockReduce<SumFloat, AccumT>(
+          buffer, threadSum, SumFloat<AccumT, AccumT>(), AccumT(0)));
 
   // Update gradInput (hand ILP)
   int offset = threadIdx.x;
   int last = classes % (ILP * blockDim.x);
   for (; offset < classes - last; offset += blockDim.x * ILP)
   {
-    float tmpGradOutput[ILP];
-    float tmpOutput[ILP];
+    T tmpGradOutput[ILP];
+    T tmpOutput[ILP];
 
 #pragma unroll
     for (int j = 0; j < ILP; ++j)
@@ -312,237 +327,16 @@ cunn_LogSoftMax_updateGradInput_kernel(float *gradInput,
     for (int j = 0; j < ILP; ++j)
     {
       gradInput[offset + j * blockDim.x] =
-        tmpGradOutput[j] - __expf(tmpOutput[j]) * sum_k;
+        tmpGradOutput[j] - THCNumerics<T>::exp(tmpOutput[j]) * sum_k;
     }
   }
 
   for (; offset < classes; offset += blockDim.x)
   {
     gradInput[offset] =
-      gradOutput[offset] - __expf(output[offset]) * sum_k;
+      gradOutput[offset] - THCNumerics<T>::exp(output[offset]) * sum_k;
   }
 }
 
-void THNN_CudaLogSoftMax_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output)
-{
-  THCUNN_assertSameGPU(state, 2, input, output);
-
-  THCudaTensor_resizeAs(state, output, input);
-
-  bool spatial  = false;
-  int batchSize = 1;
-  int classSize = 0;
-  int height = 0;
-  int width = 0;
-
-  int ndims = THCudaTensor_nDimension(state, input);
-
-  if (ndims == 1)
-  {
-    classSize = THCudaTensor_size(state, input, 0);
-    input = THCudaTensor_newContiguous(state, input);
-  }
-  else if (ndims == 2)
-  {
-    batchSize = THCudaTensor_size(state, input, 0);
-    classSize = THCudaTensor_size(state, input, 1);
-    input = THCudaTensor_newContiguous(state, input);
-  }
-  else if (ndims == 3)
-  {
-    spatial = true;
-    classSize = THCudaTensor_size(state, input, 0);
-    height = THCudaTensor_size(state, input, 1);
-    width = THCudaTensor_size(state, input, 2);
-
-    // create contiguous tensor with cuda layout from tensor with torch layout
-    // C x H x W -> W x H x C
-    THCudaTensor_transpose(state, input, input, 0, 2);
-    // W x H x C -> H x W x C
-    THCudaTensor_transpose(state, input, input, 0, 1);
-    THCudaTensor *transposedInput = THCudaTensor_newContiguous(state, input);
-    THCudaTensor_transpose(state, input, input, 0, 1);
-    THCudaTensor_transpose(state, input, input, 0, 2);
-    input = transposedInput;
-  }
-  else if (ndims == 4)
-  {
-    spatial = true;
-    batchSize = THCudaTensor_size(state, input, 0);
-    classSize = THCudaTensor_size(state, input, 1);
-    height = THCudaTensor_size(state, input, 2);
-    width = THCudaTensor_size(state, input, 3);
-
-    // create contiguous tensor with cuda layout from tensor with torch layout
-    // B x C x H x W -> B x W x H x C
-    THCudaTensor_transpose(state, input, input, 1, 3);
-    // B x W x H x C -> B x H x W x C
-    THCudaTensor_transpose(state, input, input, 1, 2);
-    THCudaTensor *transposedInput = THCudaTensor_newContiguous(state, input);
-    THCudaTensor_transpose(state, input, input, 1, 2);
-    THCudaTensor_transpose(state, input, input, 1, 3);
-    input = transposedInput;
-  }
-  else
-  {
-    THError("1D, 2D, 3D or 4D Tensor expected");
-  }
-
-  if (!spatial)
-  {
-    dim3 grid(batchSize);
-    dim3 block(1024);
-
-    cunn_LogSoftMax_updateOutput_kernel<2>
-      <<<grid, block, block.x * sizeof(float), THCState_getCurrentStream(state)>>>(
-        THCudaTensor_data(state, output),
-        THCudaTensor_data(state, input),
-        classSize
-    );
-  }
-  else
-  {
-    dim3 grid(batchSize);
-    dim3 block(1024);
-
-    cunn_SpatialLogSoftMax_updateOutput_kernel
-      <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
-        THCudaTensor_data(state, output),
-        THCudaTensor_data(state, input),
-        classSize, height, width
-    );
-  }
-
-  cudaError errcode = cudaGetLastError();
-  if (errcode != cudaSuccess)
-  {
-    THError(cudaGetErrorString(errcode));
-  }
-
-  THCudaTensor_free(state, input);
-}
-
-void THNN_CudaLogSoftMax_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput,
-  THCudaTensor *gradInput, THCudaTensor *output)
-{
-  THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
-
-  THCudaTensor_resizeAs(state, gradInput, output);
-
-  bool spatial  = false;
-  int batchSize = 1;
-  int classSize = 0;
-  int height = 0;
-  int width = 0;
-
-  int ndims = THCudaTensor_nDimension(state, input);
-
-  if (ndims == 1)
-  {
-    classSize = THCudaTensor_size(state, gradInput, 0);
-    output = THCudaTensor_newContiguous(state, output);
-    gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-  }
-  else if (ndims == 2)
-  {
-    batchSize = THCudaTensor_size(state, gradInput, 0);
-    classSize = THCudaTensor_size(state, gradInput, 1);
-    output = THCudaTensor_newContiguous(state, output);
-    gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-  }
-  else if (ndims == 3)
-  {
-    spatial = true;
-    classSize = THCudaTensor_size(state, input, 0);
-    height = THCudaTensor_size(state, input, 1);
-    width = THCudaTensor_size(state, input, 2);
-
-    // create contiguous tensor with cuda layout from tensor with torch layout
-    // C x H x W -> W x H x C
-    THCudaTensor_transpose(state, output, output, 0, 2);
-    // W x H x C -> H x W x C
-    THCudaTensor_transpose(state, output, output, 0, 1);
-    THCudaTensor *transposedOutput = THCudaTensor_newContiguous(state, output);
-    THCudaTensor_transpose(state, output, output, 0, 1);
-    THCudaTensor_transpose(state, output, output, 0, 2);
-    output = transposedOutput;
-
-    // create contiguous tensor with cuda layout from tensor with torch layout
-    // C x H x W -> W x H x C
-    THCudaTensor_transpose(state, gradOutput, gradOutput, 0, 2);
-    // W x H x C -> H x W x C
-    THCudaTensor_transpose(state, gradOutput, gradOutput, 0, 1);
-    THCudaTensor *transposedGradOutput = THCudaTensor_newContiguous(state, gradOutput);
-    THCudaTensor_transpose(state, gradOutput, gradOutput, 0, 1);
-    THCudaTensor_transpose(state, gradOutput, gradOutput, 0, 2);
-    gradOutput = transposedGradOutput;
-  }
-  else if (ndims == 4)
-  {
-    spatial = true;
-    batchSize = THCudaTensor_size(state, gradInput, 0);
-    classSize = THCudaTensor_size(state, input, 1);
-    height = THCudaTensor_size(state, input, 2);
-    width = THCudaTensor_size(state, input, 3);
-
-    // create contiguous tensor with cuda layout from tensor with torch layout
-    // B x C x H x W -> B x W x H x C
-    THCudaTensor_transpose(state, output, output, 1, 3);
-    // B x W x H x C -> B x H x W x C
-    THCudaTensor_transpose(state, output, output, 1, 2);
-    THCudaTensor *transposedOutput = THCudaTensor_newContiguous(state, output);
-    THCudaTensor_transpose(state, output, output, 1, 2);
-    THCudaTensor_transpose(state, output, output, 1, 3);
-    output = transposedOutput;
-
-    // create contiguous tensor with cuda layout from tensor with torch layout
-    // B x C x H x W -> B x W x H x C
-    THCudaTensor_transpose(state, gradOutput, gradOutput, 1, 3);
-    // B x W x H x C -> B x H x W x C
-    THCudaTensor_transpose(state, gradOutput, gradOutput, 1, 2);
-    THCudaTensor *transposedGradOutput = THCudaTensor_newContiguous(state, gradOutput);
-    THCudaTensor_transpose(state, gradOutput, gradOutput, 1, 2);
-    THCudaTensor_transpose(state, gradOutput, gradOutput, 1, 3);
-    gradOutput = transposedGradOutput;
-  }
-  else
-  {
-    THError("1D, 2D, 3D or 4D Tensor expected");
-  }
-
-  if (!spatial)
-  {
-    dim3 grid(batchSize);
-    dim3 block(1024);
-
-    cunn_LogSoftMax_updateGradInput_kernel<2>
-      <<<grid, block, block.x * sizeof(float), THCState_getCurrentStream(state)>>>(
-        THCudaTensor_data(state, gradInput),
-        THCudaTensor_data(state, output),
-        THCudaTensor_data(state, gradOutput),
-        classSize
-    );
-  }
-  else
-  {
-    dim3 grid(batchSize);
-    dim3 block(1024);
-
-    cunn_SpatialLogSoftMax_updateGradInput_kernel
-      <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
-        THCudaTensor_data(state, gradInput),
-        THCudaTensor_data(state, output),
-        THCudaTensor_data(state, gradOutput),
-        classSize, height, width
-    );
-  }
-
-  cudaError errcode = cudaGetLastError();
-  if (errcode != cudaSuccess)
-  {
-    THError(cudaGetErrorString(errcode));
-  }
-
-  THCudaTensor_free(state, gradOutput);
-  THCudaTensor_free(state, output);
-}
+#include "generic/LogSoftMax.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/LookupTable.cu b/lib/THCUNN/LookupTable.cu
index 2b2040e..bb91c7e 100644
--- a/lib/THCUNN/LookupTable.cu
+++ b/lib/THCUNN/LookupTable.cu
@@ -9,6 +9,8 @@
 #include <thrust/system/cuda/execution_policy.h>
 #endif
 #include <thrust/unique.h>
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
 #ifndef DIVUP
 #define DIVUP(x, y) (((x) + (y) - 1) / (y))
@@ -49,8 +51,9 @@ __device__ __forceinline__ bool warpHasCollision(int val)
   return __any(dup) != 0;
 }
 
+template <typename Dtype>
 __global__ void cunn_LookupTable_accGradParametersKernelByFeature(
-  long *input, float *gradOutput, float *gradWeight, float scale, long numel,
+  long *input, Dtype *gradOutput, Dtype *gradWeight, Dtype scale, long numel,
   long stride, int paddingValue) {
 
   const int featureDim = blockIdx.x * 4 + threadIdx.x / 32;
@@ -78,8 +81,9 @@ __global__ void cunn_LookupTable_accGradParametersKernelByFeature(
       continue;
     }
 
-    float update = gradOutput[i*stride + featureDim] * scale;
+    Dtype update = gradOutput[i*stride + featureDim] * scale;
 
+    // FIXME: should we accumulate as accreal?
     // Check for collision
     if (warpHasCollision(weightIndex)) {
       // Run all lanes sequentially; warp divergence
@@ -95,9 +99,10 @@ __global__ void cunn_LookupTable_accGradParametersKernelByFeature(
   }
 }
 
+template <typename Dtype, typename Acctype>
 __global__ void cunn_LookupTable_accGradParametersKernel(
-  long *input, long *indices, float *gradOutput, float *gradWeight,
-  long *count, float defaultScale, long numel, long stride, int paddingValue) {
+  long *input, long *indices, Dtype *gradOutput, Dtype *gradWeight,
+  long *count, Dtype defaultScale, long numel, long stride, int paddingValue) {
 
   int idx = blockIdx.x * 4 + threadIdx.y;
 
@@ -122,10 +127,10 @@ __global__ void cunn_LookupTable_accGradParametersKernel(
       const int startFeature = threadIdx.x + blockIdx.y * blockDim.x * SZ;
       const int weightRow = ((int) input[idx] - TH_INDEX_BASE) * stride;
       const int gradOutputRow = ((int) indices[idx] - TH_INDEX_BASE) * stride;
-      const float scale = count ? defaultScale / count[idx] : defaultScale;
+      const Acctype scale = count ? ScalarConvert<Dtype, Acctype>::to(defaultScale) / count[idx] : ScalarConvert<Dtype, Acctype>::to(defaultScale);
 
-      float gradient[SZ];
-      float weight[SZ];
+      Acctype gradient[SZ];
+      Acctype weight[SZ];
 
       #pragma unroll
       for (int ii = 0; ii < SZ; ii++)
@@ -133,8 +138,8 @@ __global__ void cunn_LookupTable_accGradParametersKernel(
         int featureDim = startFeature + ii * WARP_SIZE;
         if (featureDim < stride)
         {
-          gradient[ii] = gradOutput[gradOutputRow + featureDim];
-          weight[ii] = gradWeight[weightRow + featureDim];
+          gradient[ii] = ScalarConvert<Dtype, Acctype>::to(gradOutput[gradOutputRow + featureDim]);
+          weight[ii] = ScalarConvert<Dtype, Acctype>::to(gradWeight[weightRow + featureDim]);
         }
       }
 
@@ -150,7 +155,7 @@ __global__ void cunn_LookupTable_accGradParametersKernel(
         int featureDim = startFeature + ii * WARP_SIZE;
         if (featureDim < stride)
         {
-          gradWeight[weightRow + featureDim] = weight[ii];
+          gradWeight[weightRow + featureDim] = ScalarConvert<Acctype, Dtype>::to(weight[ii]);
         }
       }
 
@@ -159,129 +164,23 @@ __global__ void cunn_LookupTable_accGradParametersKernel(
   }
 }
 
-void THNN_CudaLookupTable_accGradParameters(
-  THCState *state,
-  THIndexTensor *input,
-  THCudaTensor *gradOutput,
-  THCudaTensor *gradWeight,
-  THIndexTensor *count,
-  THIndexTensor *sorted,
-  THIndexTensor *indices,
-  bool scaleGradByFreq,
-  int paddingValue,
-  float scale)
-{
-  THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, sorted, indices);
-  if (!(THIndexTensor_(isContiguous)(state, input) &&
-        THCudaTensor_isContiguous(state, gradOutput) &&
-        THCudaTensor_isContiguous(state, gradWeight)))
-  {
-    THError("Tensors must be contiguous");
-  }
-
-  int nDim = THIndexTensor_(nDimension)(state, input);
-  if (nDim != 1 && nDim != 2)
-    THError("input must be a vector or matrix");
-
-  long numel = THIndexTensor_(nElement)(state, input);
-  long stride = gradWeight->stride[0];
-
-  cudaStream_t stream = THCState_getCurrentStream(state);
-
-  if (numel <= 768 && !scaleGradByFreq) {
-    cunn_LookupTable_accGradParametersKernelByFeature<<<DIVUP(stride,4), 128, 0, stream>>>(
-      THIndexTensor_(data)(state, input),
-      THCudaTensor_data(state, gradOutput),
-      THCudaTensor_data(state, gradWeight),
-      scale,
-      numel,
-      stride,
-      paddingValue);
-    THCudaCheck(cudaGetLastError());
-    return;
-  }
-
-  THLongStorage *inputSize = THIndexTensor_(newSizeOf)(state, input);
-  THIndexTensor_(resize)(state, sorted, inputSize, NULL);
-  THIndexTensor_(resize)(state, indices, inputSize, NULL);
-  THLongStorage_free(inputSize);
-
-  // Sort the inputs into sorted with the corresponding indices
-  THIndexTensor_(sort)(state, sorted, indices, input, 0, 0);
-
-  long *sorted_data = THIndexTensor_(data)(state, sorted);
-  long  *indices_data = THIndexTensor_(data)(state, indices);
-  long *count_data = NULL;
-
-  if (scaleGradByFreq)
-  {
-    THIndexTensor_(resizeAs)(state, count, input);
-    count_data = THIndexTensor_(data)(state, count);
-
-    thrust::device_ptr<long> sorted_ptr(sorted_data);
-    thrust::device_ptr<long> count_ptr(count_data);
-
-    // Compute an increasing sequence per unique item in sorted:
-    // sorted: 2 5 5 5 7 7 8 9 9
-    //  count: 1 1 2 3 1 2 1 1 2
-    thrust::inclusive_scan_by_key(
-#if CUDA_VERSION >= 7000
-      thrust::cuda::par.on(THCState_getCurrentStream(state)),
-#endif
-      sorted_ptr,
-      sorted_ptr + numel,
-      thrust::make_constant_iterator(1),
-      count_ptr
-    );
-
-    // Take the maximum of each count per unique key in reverse:
-    // sorted: 2 5 5 5 7 7 8 9 9
-    //  count: 1 3 3 3 2 2 1 2 2
-    thrust::inclusive_scan_by_key(
-#if CUDA_VERSION >= 7000
-      thrust::cuda::par.on(THCState_getCurrentStream(state)),
-#endif
-      thrust::make_reverse_iterator(sorted_ptr + numel),
-      thrust::make_reverse_iterator(sorted_ptr),
-      thrust::make_reverse_iterator(count_ptr + numel),
-      thrust::make_reverse_iterator(count_ptr + numel),
-      thrust::equal_to<long>(),
-      thrust::maximum<long>()
-    );
-  }
-
-  dim3 grid(DIVUP(numel,4), DIVUP(stride,128));
-  dim3 block(32, 4);
-  cunn_LookupTable_accGradParametersKernel<<<grid, block, 0, stream>>>(
-    sorted_data,
-    indices_data,
-    THCudaTensor_data(state, gradOutput),
-    THCudaTensor_data(state, gradWeight),
-    count_data,
-    scale,
-    numel,
-    stride,
-    paddingValue
-  );
-  THCudaCheck(cudaGetLastError());
-}
-
 /*
  * Keep the norm of weight smaller than maxNorm
  */
-template <typename T>
+template <typename Dtype, typename Acctype>
 struct pow_v
 {
-  T normType;
-  pow_v(T v) : normType(v) {}
+  Acctype normType;
+  pow_v(Dtype v) : normType(ScalarConvert<Dtype, Acctype>::to(v)) {}
   __host__ __device__
-  T operator()(const T& x) const {
+  Acctype operator()(const Dtype& x) const {
+    Acctype xA = ScalarConvert<Dtype, Acctype>::to(x);
     if (normType == 1)
-      return std::abs(x);
+      return std::abs(xA);
     else if (normType == 2)
-      return x * x;
+      return xA * xA;
     else
-      return std::pow(std::abs(x), normType);
+      return std::pow(std::abs(xA), normType);
   }
 };
 
@@ -296,47 +195,5 @@ struct multiply_s
   }
 };
 
-void THNN_CudaLookupTable_renorm(
-  THCState *state,
-  THIndexTensor *idx,
-  THCudaTensor *weight,
-  float maxNorm,
-  float normType)
-{
-  THCUNN_assertSameGPU(state, 2, idx, weight);
-  if (!(THIndexTensor_(isContiguous)(state, idx) &&
-        THCudaTensor_isContiguous(state, weight)))
-  {
-    THError("Tensors must be contiguous");
-  }
-  if (THIndexTensor_(nDimension)(state, idx) != 1)
-    THError("idx must be a vector");
-  if (normType <= 0)
-    THError("non-positive-norm not supported");
-
-  long numel = THIndexTensor_(nElement)(state, idx);
-  long stride = weight->stride[0];
-
-  // get the unique indices
-  thrust::device_ptr<float> weight_ptr(THCudaTensor_data(state, weight));
-  thrust::device_ptr<long> idx_ptr(THIndexTensor_(data)(state, idx));
-  thrust::device_ptr<long> end_ptr = thrust::unique(idx_ptr, idx_ptr+numel);
-  numel = end_ptr - idx_ptr;
-
-  pow_v<float> unary_pow(normType);
-  thrust::plus<float> binary_plus;
-  // numel << stride, since idx usually contains sparse row indices
-  for (long i = 0; i < numel; i++)
-  {
-    long k = idx_ptr[i] - TH_INDEX_BASE;
-    thrust::device_ptr<float> row_ptr = weight_ptr + k * stride;
-    float norm = thrust::transform_reduce(row_ptr, row_ptr + stride,
-      unary_pow, 0, binary_plus);
-    norm = std::pow(norm, (float) (1.0 / normType));
-    if (norm > maxNorm)
-    {
-      multiply_s<float> unary_mul(maxNorm / (norm + 1e-7));
-      thrust::transform(row_ptr, row_ptr + stride, row_ptr, unary_mul);
-    }
-  }
-}
+#include "generic/LookupTable.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/MSECriterion.cu b/lib/THCUNN/MSECriterion.cu
index 982e19a..26a35a5 100644
--- a/lib/THCUNN/MSECriterion.cu
+++ b/lib/THCUNN/MSECriterion.cu
@@ -1,5 +1,7 @@
 #include "THCUNN.h"
 #include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
 #include <thrust/fill.h>
 #include <thrust/functional.h>
@@ -10,87 +12,32 @@
 #include <thrust/system/cuda/execution_policy.h>
 #endif
 
+template <typename Dtype, typename Acctype>
 struct mse_functor
 {
   mse_functor() {}
 
-  __host__ __device__ float operator()(const float &x, const float &y) const
+  __host__ __device__ Acctype operator()(const Dtype &x, const Dtype &y) const
   {
-    float z = x-y;
+    Acctype z = ScalarConvert<Dtype, Acctype>::to(x)-y;
     return z*z;
   }
 };
 
-void THNN_CudaMSECriterion_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *output, bool sizeAverage)
-{
-  THCUNN_assertSameGPU(state, 2, input, target);
-  THArgCheck(THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2,
-    "input and target need to have the same number of elements"
-  );
-
-  long size = THCudaTensor_nElement(state, input);
-
-  input = THCudaTensor_newContiguous(state, input);
-  target = THCudaTensor_newContiguous(state, target);
-
-  thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
-  thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
-  float sum = thrust::inner_product(
-#if CUDA_VERSION >= 7000
-    thrust::cuda::par.on(THCState_getCurrentStream(state)),
-#endif
-    input_data, input_data+size, target_data, (float) 0,
-    thrust::plus<float>(), mse_functor());
-
-  if (sizeAverage)
-    sum /= size;
-
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, target);
-
-  THCudaTensor_set1d(state, output, 0, sum);
-}
-
+template <typename Dtype, typename Acctype>
 struct mse_updateGradInput_functor
 {
-  const float norm;
+  const Acctype norm;
 
-  mse_updateGradInput_functor(float norm_)
+  mse_updateGradInput_functor(Acctype norm_)
     : norm(norm_)
   {}
 
-  __host__ __device__ float operator()(const float &x, const float &y) const
+  __host__ __device__ Dtype operator()(const Dtype &x, const Dtype &y) const
   {
-    return norm * (x - y);
+    return ScalarConvert<Acctype, Dtype>::to(norm * (ScalarConvert<Dtype, Acctype>::to(x) - y));
   }
 };
 
-void THNN_CudaMSECriterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage)
-{
-  THCUNN_assertSameGPU(state, 3, input, target, gradInput);
-  THArgCheck(THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2,
-    "input and target need to have the same number of elements"
-  );
-
-  long size = THCudaTensor_nElement(state, input);
-  float norm = sizeAverage ? 2.f/size : 2.f;
-
-  input = THCudaTensor_newContiguous(state, input);
-  target = THCudaTensor_newContiguous(state, target);
-
-  THCudaTensor_resizeAs(state, gradInput, input);
-
-  thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
-  thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
-  thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput));
-
-  thrust::transform(
-#if CUDA_VERSION >= 7000
-    thrust::cuda::par.on(THCState_getCurrentStream(state)),
-#endif
-    input_data, input_data+size, target_data, gradInput_data,
-    mse_updateGradInput_functor(norm));
-
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, target);
-}
+#include "generic/MSECriterion.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/MarginCriterion.cu b/lib/THCUNN/MarginCriterion.cu
index b49e1dc..7ccdbb7 100644
--- a/lib/THCUNN/MarginCriterion.cu
+++ b/lib/THCUNN/MarginCriterion.cu
@@ -1,5 +1,7 @@
 #include "THCUNN.h"
 #include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
 #include <thrust/fill.h>
 #include <thrust/functional.h>
@@ -7,76 +9,37 @@
 #include <thrust/reduce.h>
 #include <thrust/inner_product.h>
 
+template <typename Dtype, typename Acctype>
 struct margin_functor
 {
-  margin_functor(float margin)
+  margin_functor(Acctype margin)
     : margin(margin)
   {}
 
-  __host__ __device__ float operator()(const float &x, const float &y) const
+  __host__ __device__ Acctype operator()(const Dtype &x, const Dtype &y) const
   {
-    float z = margin - x * y;
+    Acctype z = margin - ScalarConvert<Dtype, Acctype>::to(x) * y;
     return z >= 0 ? z : 0;
   }
 
-  const float margin;
+  const Acctype margin;
 };
 
-void THNN_CudaMarginCriterion_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *output, bool sizeAverage, float margin)
-{
-  THCUNN_assertSameGPU(state, 2, input, target);
-
-  long size = THCudaTensor_nElement(state, input);
-
-  input = THCudaTensor_newContiguous(state, input);
-  target = THCudaTensor_newContiguous(state, target);
-
-  thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
-  thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
-  float sum = thrust::inner_product(input_data, input_data+size, target_data, (float) 0, thrust::plus<float>(), margin_functor(margin));
-
-  if (sizeAverage)
-    sum /= size;
-
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, target);
-
-  THCudaTensor_set1d(state, output, 0, sum);
-}
-
+template <typename Dtype, typename Acctype>
 struct margin_updateGradInput_functor
 {
-  const float margin, norm;
+  const Acctype margin, norm;
 
-  margin_updateGradInput_functor(float margin_, float norm_)
+  margin_updateGradInput_functor(Acctype margin_, Acctype norm_)
     : margin(margin_)
     , norm(norm_)
   {}
 
-  __host__ __device__ float operator()(const float &x, const float &y) const
+  __host__ __device__ Dtype operator()(const Dtype &x, const Dtype &y) const
   {
-    return (x * y) < margin ? -norm * y : 0;
+    return ScalarConvert<Acctype, Dtype>::to((ScalarConvert<Dtype, Acctype>::to(x) * y) < margin ? -norm * y : 0);
   }
 };
 
-void THNN_CudaMarginCriterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage, float margin)
-{
-  THCUNN_assertSameGPU(state, 3, input, target, gradInput);
-
-  long size = THCudaTensor_nElement(state, input);
-  float norm = sizeAverage ? 1.f/size : 1;
-
-  input = THCudaTensor_newContiguous(state, input);
-  target = THCudaTensor_newContiguous(state, target);
-
-  THCudaTensor_resizeAs(state, gradInput, input);
-
-  thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
-  thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
-  thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput));
-
-  thrust::transform(input_data, input_data+size, target_data, gradInput_data, margin_updateGradInput_functor(margin, norm));
-
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, target);
-}
+#include "generic/MarginCriterion.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/MultiLabelMarginCriterion.cu b/lib/THCUNN/MultiLabelMarginCriterion.cu
index 903e064..72fc486 100644
--- a/lib/THCUNN/MultiLabelMarginCriterion.cu
+++ b/lib/THCUNN/MultiLabelMarginCriterion.cu
@@ -1,60 +1,63 @@
 #include "THCUNN.h"
 #include "common.h"
 #include "THCReduceApplyUtils.cuh"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
 #include <thrust/functional.h>
 
 #define MULTILABELMARGIN_THREADS 1024
 
-__global__ void cunn_MultiLabelMarginCriterion_updateOutput_kernel(float *output,
-                                                                   float *input,
-                                                                   float *target,
-                                                                   float *istarget,
+template <typename Dtype, typename Acctype>
+__global__ void cunn_MultiLabelMarginCriterion_updateOutput_kernel(Dtype *output,
+                                                                   Dtype *input,
+                                                                   THCIndex_t *target,
+                                                                   Dtype *istarget,
                                                                    int nframe,
                                                                    int dim,
                                                                    int sizeaverage)
 {
   // Temporary sums (for mapreduce)
-  __shared__ float sums[MULTILABELMARGIN_THREADS];
+  __shared__ Acctype sums[MULTILABELMARGIN_THREADS];
 
   // vectors:
   int k = blockIdx.x;
-  float *input_k = input + k*dim;
-  float *target_k = target + k*dim;
-  float *output_k = output + k;
-  float *istarget_k = istarget + k*dim;
+  Dtype *input_k = input + k*dim;
+  THCIndex_t *target_k = target + k*dim;
+  Dtype *output_k = output + k;
+  Dtype *istarget_k = istarget + k*dim;
 
   // zero istarget
   for (int d = threadIdx.x; d < dim; d += blockDim.x) {
-    istarget_k[d] = 0;
+    istarget_k[d] = ScalarConvert<int, Dtype>::to(0);
   }
   __syncthreads();
 
   // mark targets in istarget
   if (threadIdx.x == 0) {
     for (int dt = 0; dt < dim; dt++) {
-      int target_idx = (int)target_k[dt] - TH_INDEX_BASE;
+      int target_idx = target_k[dt] - TH_INDEX_BASE;
       if (target_idx < 0) break;
-      istarget_k[target_idx] = 1;
+      istarget_k[target_idx] = ScalarConvert<int, Dtype>::to(1);
     }
   }
   __syncthreads();
 
   // iterate over targets
-  float sum = 0;
+  Acctype sum = 0;
   for (int dt = 0; dt < dim; dt++) {
     // next target:
-    int target_idx = (int)target_k[dt] - TH_INDEX_BASE;
+    int target_idx = target_k[dt] - TH_INDEX_BASE;
     if (target_idx < 0) break;
 
     // current value for target
-    float input_target_k = input_k[target_idx];
+    Dtype input_target_k = input_k[target_idx];
 
     // compare to all inputs (multithreaded):
     for (int d = threadIdx.x; d < dim; d += blockDim.x) {
       // contribute to loss only if not a target
-      if (!istarget_k[d]) {
-        float z = 1 - input_target_k + input_k[d];
+      if (!ScalarConvert<Dtype, int>::to(istarget_k[d])) {
+        Dtype z = 1 - input_target_k + input_k[d];
         if (z > 0)
           sum += z;
       }
@@ -62,40 +65,41 @@ __global__ void cunn_MultiLabelMarginCriterion_updateOutput_kernel(float *output
   }
 
   // reduce
-  float totalSum = reduceBlock(sums, blockDim.x, sum, thrust::plus<float>(), 0.0f);
+  Acctype totalSum = reduceBlock(sums, blockDim.x, sum, thrust::plus<Acctype>(), (Acctype)0);
   if (threadIdx.x == 0) {
     if (sizeaverage) {
-      *output_k = (totalSum / dim) / nframe;
+      *output_k = ScalarConvert<Acctype, Dtype>::to((totalSum / dim) / nframe);
     } else {
-      *output_k = totalSum / dim;
+      *output_k = ScalarConvert<Acctype, Dtype>::to(totalSum / dim);
     }
   }
 }
 
-__global__ void cunn_MultiLabelMarginCriterion_updateGradInput_kernel(float *gradInput,
-                                                                      float *input,
-                                                                      float *target,
-                                                                      float *istarget,
+template <typename Dtype, typename Acctype>
+__global__ void cunn_MultiLabelMarginCriterion_updateGradInput_kernel(Dtype *gradInput,
+                                                                      Dtype *input,
+                                                                      THCIndex_t *target,
+                                                                      Dtype *istarget,
                                                                       int nframe,
                                                                       int dim,
                                                                       int sizeaverage)
 {
   // Temporary sums (for mapreduce)
-  __shared__ float sums[MULTILABELMARGIN_THREADS];
+  __shared__ Acctype sums[MULTILABELMARGIN_THREADS];
 
   // vectors:
   int k = blockIdx.x;
-  float *input_k = input + k*dim;
-  float *gradInput_k = gradInput + k*dim;
-  float *target_k = target + k*dim;
-  float *istarget_k = istarget + k*dim;
+  Dtype *input_k = input + k*dim;
+  Dtype *gradInput_k = gradInput + k*dim;
+  THCIndex_t *target_k = target + k*dim;
+  Dtype *istarget_k = istarget + k*dim;
 
   // gain:
-  float g = ( sizeaverage ? 1./((float)(nframe*dim)) : 1./((float)dim) );
+  Dtype g = ScalarConvert<Acctype, Dtype>::to( sizeaverage ? 1./((Acctype)(nframe*dim)) : 1./((Acctype)dim) );
 
   // zero gradients:
   for (int d = threadIdx.x; d < dim; d += blockDim.x) {
-    gradInput_k[d] = 0;
+    gradInput_k[d] = ScalarConvert<int, Dtype>::to(0);
   }
   __syncthreads();
 
@@ -106,14 +110,14 @@ __global__ void cunn_MultiLabelMarginCriterion_updateGradInput_kernel(float *gra
     if (target_idx < 0) break;
 
     // current value for target
-    float input_target_k = input_k[target_idx];
+    Dtype input_target_k = input_k[target_idx];
 
     // compare to all inputs (multithreaded):
-    float sum = 0;
+    Acctype sum = 0;
     for (int d = threadIdx.x; d < dim; d += blockDim.x) {
       // contribute to loss only if not a target
-      if (!istarget_k[d]) {
-        float z = 1 - input_target_k + input_k[d];
+      if (!ScalarConvert<Dtype, int>::to(istarget_k[d])) {
+        Dtype z = 1 - input_target_k + input_k[d];
         if (z > 0) {
           sum -= g;
           gradInput_k[d] += g;
@@ -123,118 +127,15 @@ __global__ void cunn_MultiLabelMarginCriterion_updateGradInput_kernel(float *gra
     __syncthreads();
 
     // reduce sum
-    float totalSum = reduceBlock(sums, blockDim.x, sum, thrust::plus<float>(), 0.0f);
+    Acctype totalSum = reduceBlock(sums, blockDim.x, sum, thrust::plus<Acctype>(), (Acctype)0);
     if (threadIdx.x == 0) {
-      gradInput_k[target_idx] += totalSum;
+      gradInput_k[target_idx] += ScalarConvert<Acctype, Dtype>::to(totalSum);
     }
     __syncthreads();
   }
 }
 
-void THNN_CudaMultiLabelMarginCriterion_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *target,
-          THCudaTensor *output,
-          THCudaTensor *istarget,
-          bool sizeaverage)
-{
-  input = THCudaTensor_newContiguous(state, input);
-  target = THCudaTensor_newContiguous(state, target);
-  istarget = THCudaTensor_newContiguous(state, istarget);
-  THCudaTensor_resizeAs(state, istarget, input);
-
-  if(input->nDimension == 1)
-  {
-    THCudaTensor_resize1d(state, output, 1);
-
-    dim3 blocks(1);
-    dim3 threads(MULTILABELMARGIN_THREADS);
-
-    cunn_MultiLabelMarginCriterion_updateOutput_kernel<<<blocks,threads>>>(
-        THCudaTensor_data(state, output),
-        THCudaTensor_data(state, input),
-        THCudaTensor_data(state, target),
-        THCudaTensor_data(state, istarget),
-        1, input->size[0],
-        sizeaverage
-        );
-    THCudaCheck(cudaGetLastError());
-  }
-  else if(input->nDimension == 2)
-  {
-    THCudaTensor *output_tmp = THCudaTensor_newWithSize1d(state, input->size[0]);
-
-    dim3 blocks(input->size[0]);
-    dim3 threads(MULTILABELMARGIN_THREADS);
-
-    cunn_MultiLabelMarginCriterion_updateOutput_kernel<<<blocks,threads>>>(
-        THCudaTensor_data(state, output_tmp),
-        THCudaTensor_data(state, input),
-        THCudaTensor_data(state, target),
-        THCudaTensor_data(state, istarget),
-        input->size[0], input->size[1],
-        sizeaverage
-        );
-    THCudaCheck(cudaGetLastError());
-    THCudaTensor_resize1d(state, output, 1);
-    THCudaTensor_set1d(state, output, 0, THCudaTensor_sumall(state, output_tmp));
-    THCudaTensor_free(state, output_tmp);
-  }
-  else
-    THError("vector or matrix expected");
-
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, target);
-  THCudaTensor_free(state, istarget);
-}
-
-void THNN_CudaMultiLabelMarginCriterion_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *target,
-          THCudaTensor *gradInput,
-          THCudaTensor *istarget,
-          bool sizeaverage)
-{
-  input = THCudaTensor_newContiguous(state, input);
-  target = THCudaTensor_newContiguous(state, target);
-  istarget = THCudaTensor_newContiguous(state, istarget);
-  THCudaTensor_resizeAs(state, gradInput, input);
-
-  if(gradInput->nDimension == 1)
-  {
-    dim3 blocks(1);
-    dim3 threads(MULTILABELMARGIN_THREADS);
-
-    cunn_MultiLabelMarginCriterion_updateGradInput_kernel<<<blocks,threads>>>(THCudaTensor_data(state, gradInput),
-        THCudaTensor_data(state, input),
-        THCudaTensor_data(state, target),
-        THCudaTensor_data(state, istarget),
-        1, gradInput->size[0],
-        sizeaverage);
-
-  }
-  else if(gradInput->nDimension == 2)
-  {
-    dim3 blocks(gradInput->size[0]);
-    dim3 threads(MULTILABELMARGIN_THREADS);
-
-    cunn_MultiLabelMarginCriterion_updateGradInput_kernel<<<blocks,threads>>>(THCudaTensor_data(state, gradInput),
-        THCudaTensor_data(state, input),
-        THCudaTensor_data(state, target),
-        THCudaTensor_data(state, istarget),
-        gradInput->size[0], gradInput->size[1],
-        sizeaverage);
-  }
-  else
-    THError("vector or matrix expected");
-
-  THCudaCheck(cudaGetLastError());
-
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, target);
-  THCudaTensor_free(state, istarget);
-}
+#include "generic/MultiLabelMarginCriterion.cu"
+#include "THCGenerateFloatTypes.h"
 
 #undef MULTILABELMARGIN_THREADS
diff --git a/lib/THCUNN/MultiMarginCriterion.cu b/lib/THCUNN/MultiMarginCriterion.cu
index 31caa75..b5ed75b 100644
--- a/lib/THCUNN/MultiMarginCriterion.cu
+++ b/lib/THCUNN/MultiMarginCriterion.cu
@@ -1,17 +1,19 @@
 #include "THCUNN.h"
 #include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
 #define MULTIMARGIN_THREADS 128
 
-template <int P>
-__global__ void cunn_MultiMarginCriterion_updateOutput_kernel(float *output, float *input, float *target, float *weights, int nframe, int dim, bool sizeAverage, float margin)
+template <int P, typename Dtype, typename Acctype>
+__global__ void cunn_MultiMarginCriterion_updateOutput_kernel(Dtype *output, Dtype *input, THCIndex_t *target, Dtype *weights, int nframe, int dim, bool sizeAverage, Dtype margin)
 {
-  __shared__ float buffer[MULTIMARGIN_THREADS];
+  __shared__ Acctype buffer[MULTIMARGIN_THREADS];
   int k = blockIdx.x;
-  float *input_k = input + k*dim;
-  float *output_k = output + k;
+  Dtype *input_k = input + k*dim;
+  Dtype *output_k = output + k;
   int target_k = ((int)target[k]) - TH_INDEX_BASE;
-  float input_target_k = input_k[target_k];
+  Dtype input_target_k = input_k[target_k];
 
   int i_start = threadIdx.x;
   int i_end = dim;
@@ -20,12 +22,12 @@ __global__ void cunn_MultiMarginCriterion_updateOutput_kernel(float *output, flo
   buffer[threadIdx.x] = 0;
   for (int i = i_start; i < i_end; i += i_step)
   {
-    float z = margin - input_target_k + input_k[i];
+    Dtype z = margin - input_target_k + input_k[i];
     if (i == target_k)
       continue;
 
     if (z > 0) {
-      float h = (P==1) ? z : z*z;
+      Dtype h = (P==1) ? z : z*z;
       if(weights)
         h *= weights[target_k];
       buffer[threadIdx.x] += h;
@@ -36,26 +38,26 @@ __global__ void cunn_MultiMarginCriterion_updateOutput_kernel(float *output, flo
   // reduce
   if (threadIdx.x == 0)
   {
-    float sum = 0;
+    Acctype sum = 0;
     for (int i=0; i < blockDim.x; i++)
       sum += buffer[i];
 
-    *output_k = sum/dim;
+    *output_k = ScalarConvert<Acctype, Dtype>::to(sum/dim);
     if(sizeAverage)
       *output_k /= nframe;
   }
 }
 
-template <int P>
-__global__ void cunn_MultiMarginCriterion_updateGradInput_kernel(float *gradInput, float *input, float *target, float *weights, int nframe, int dim, bool sizeAverage, float margin)
+template <int P, typename Dtype, typename Acctype>
+__global__ void cunn_MultiMarginCriterion_updateGradInput_kernel(Dtype *gradInput, Dtype *input, THCIndex_t *target, Dtype *weights, int nframe, int dim, bool sizeAverage, Dtype margin)
 {
-  __shared__ float buffer[MULTIMARGIN_THREADS];
+  __shared__ Acctype buffer[MULTIMARGIN_THREADS];
   int k = blockIdx.x;
-  float *input_k = input + k*dim;
-  float *gradInput_k = gradInput + k*dim;
+  Dtype *input_k = input + k*dim;
+  Dtype *gradInput_k = gradInput + k*dim;
   int target_k = ((int)target[k]) - TH_INDEX_BASE;
-  float input_target_k = input_k[target_k];
-  float g = (sizeAverage ? 1./((float)(nframe*dim)) : 1./((float)dim));
+  Dtype input_target_k = input_k[target_k];
+  Acctype g = (sizeAverage ? 1./((Acctype)(nframe*dim)) : 1./((Acctype)dim));
 
   int i_start = threadIdx.x;
   int i_end = dim;
@@ -64,20 +66,20 @@ __global__ void cunn_MultiMarginCriterion_updateGradInput_kernel(float *gradInpu
   buffer[threadIdx.x] = 0;
   for (int i=i_start; i<i_end; i+=i_step)
   {
-    float z = margin - input_target_k + input_k[i];
+    Dtype z = margin - input_target_k + input_k[i];
     if (i == target_k)
       continue;
 
     if (z > 0)
     {
-      float h = (P == 1) ? g : 2*g*z;
+      Dtype h = ScalarConvert<Acctype, Dtype>::to((P == 1) ? g : 2*g*z);
       if(weights)
         h *= weights[target_k];
       buffer[threadIdx.x] -= h;
       gradInput_k[i] = h;
     }
     else
-      gradInput_k[i] = 0;
+      gradInput_k[i] = ScalarConvert<int, Dtype>::to(0);
   }
 
   __syncthreads();
@@ -85,175 +87,14 @@ __global__ void cunn_MultiMarginCriterion_updateGradInput_kernel(float *gradInpu
   // reduce
   if (threadIdx.x == 0)
   {
-    float gradInput_target_k = 0;
+    Acctype gradInput_target_k = 0;
     for (int i=0; i<blockDim.x; i++)
       gradInput_target_k += buffer[i];
-    gradInput_k[target_k] = gradInput_target_k;
+    gradInput_k[target_k] = ScalarConvert<Acctype, Dtype>::to(gradInput_target_k);
   }
 }
 
-void THNN_CudaMultiMarginCriterion_updateOutput(THCState *state, THCudaTensor *input,
-                                                THCudaTensor *target, THCudaTensor *output,
-                                                bool sizeAverage, int p, THCudaTensor *weights,
-                                                float margin)
-{
-  THCUNN_assertSameGPU(state, 2, input, target);
-  input = THCudaTensor_newContiguous(state, input);
-  if(weights)
-    weights = THCudaTensor_newContiguous(state, weights);
-  if (input->nDimension == 1)
-  {
-    dim3 blocks(1);
-    dim3 threads(MULTIMARGIN_THREADS);
-    if (p == 1)
-    {
-      cunn_MultiMarginCriterion_updateOutput_kernel<1> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
-        THCudaTensor_data(state, output),
-        THCudaTensor_data(state, input),
-        THCudaTensor_data(state, target),
-        weights ? THCudaTensor_data(state, weights) : NULL,
-        1, input->size[0],
-        sizeAverage,
-        margin
-      );
-    }
-    else if (p == 2)
-    {
-      cunn_MultiMarginCriterion_updateOutput_kernel<2> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
-        THCudaTensor_data(state, output),
-        THCudaTensor_data(state, input),
-        THCudaTensor_data(state, target),
-        weights ? THCudaTensor_data(state, weights) : NULL,
-        1, input->size[0],
-        sizeAverage,
-        margin
-      );
-    }
-    THCudaCheck(cudaGetLastError());
-  }
-  else if (input->nDimension == 2)
-  {
-    THCudaTensor *output_ = THCudaTensor_newWithSize1d(state, input->size[0]);  // tmp outupt buffer
-    dim3 blocks(input->size[0]);
-    dim3 threads(MULTIMARGIN_THREADS);
-    if (p == 1)
-    {
-      cunn_MultiMarginCriterion_updateOutput_kernel<1> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
-        THCudaTensor_data(state, output_),
-        THCudaTensor_data(state, input),
-        THCudaTensor_data(state, target),
-        weights ? THCudaTensor_data(state, weights) : NULL,
-        input->size[0], input->size[1],
-        sizeAverage,
-        margin
-      );
-    }
-    else if (p == 2)
-    {
-      cunn_MultiMarginCriterion_updateOutput_kernel<2> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
-        THCudaTensor_data(state, output_),
-        THCudaTensor_data(state, input),
-        THCudaTensor_data(state, target),
-        weights ? THCudaTensor_data(state, weights) : NULL,
-        input->size[0], input->size[1],
-        sizeAverage,
-        margin
-      );
-    }
-    THCudaCheck(cudaGetLastError());
-    float sum = THCudaTensor_sumall(state, output_);
-    THCudaTensor_set1d(state, output, 0, sum);
-    THCudaTensor_free(state, output_);
-  }
-  else
-  {
-    THError("vector or matrix expected");
-  }
-
-  THCudaTensor_free(state, input);
-  if(weights)
-    THCudaTensor_free(state, weights);
-}
-
-void THNN_CudaMultiMarginCriterion_updateGradInput(THCState *state, THCudaTensor *input,
-                                                   THCudaTensor *target, THCudaTensor *gradInput,
-                                                   bool sizeAverage, int p, THCudaTensor *weights,
-                                                   float margin)
-{
-  THCUNN_assertSameGPU(state, 3, input, gradInput, target);
-  input = THCudaTensor_newContiguous(state, input);
-  THCudaTensor_resizeAs(state, gradInput, input);
-  if(weights)
-    weights = THCudaTensor_newContiguous(state, weights);
+#include "generic/MultiMarginCriterion.cu"
+#include "THCGenerateFloatTypes.h"
 
-  if (input->nDimension == 1)
-  {
-    dim3 blocks(1);
-    dim3 threads(MULTIMARGIN_THREADS);
-
-    if (p == 1)
-    {
-      cunn_MultiMarginCriterion_updateGradInput_kernel<1> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
-        THCudaTensor_data(state, gradInput),
-        THCudaTensor_data(state, input),
-        THCudaTensor_data(state, target),
-        weights ? THCudaTensor_data(state, weights) : NULL,
-        1, gradInput->size[0],
-        sizeAverage,
-        margin
-      );
-    }
-    else if (p == 2)
-    {
-      cunn_MultiMarginCriterion_updateGradInput_kernel<2> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
-        THCudaTensor_data(state, gradInput),
-        THCudaTensor_data(state, input),
-        THCudaTensor_data(state, target),
-        weights ? THCudaTensor_data(state, weights) : NULL,
-        1, gradInput->size[0],
-        sizeAverage,
-        margin
-      );
-    }
-    THCudaCheck(cudaGetLastError());
-  }
-  else if (input->nDimension == 2)
-  {
-    dim3 blocks(gradInput->size[0]);
-    dim3 threads(MULTIMARGIN_THREADS);
-
-    if (p == 1)
-    {
-      cunn_MultiMarginCriterion_updateGradInput_kernel<1> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
-        THCudaTensor_data(state, gradInput),
-        THCudaTensor_data(state, input),
-        THCudaTensor_data(state, target),
-        weights ? THCudaTensor_data(state, weights) : NULL,
-        gradInput->size[0], gradInput->size[1],
-        sizeAverage,
-        margin
-      );
-    }
-    else if (p == 2)
-    {
-      cunn_MultiMarginCriterion_updateGradInput_kernel<2> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
-        THCudaTensor_data(state, gradInput),
-        THCudaTensor_data(state, input),
-        THCudaTensor_data(state, target),
-        weights ? THCudaTensor_data(state, weights) : NULL,
-        gradInput->size[0], gradInput->size[1],
-        sizeAverage,
-        margin
-      );
-    }
-    THCudaCheck(cudaGetLastError());
-  }
-  else
-  {
-    THError("vector or matrix expected");
-  }
-
-  THCudaTensor_free(state, input);
-  if(weights)
-    THCudaTensor_free(state, weights);
-}
+#undef MULTIMARGIN_THREADS
diff --git a/lib/THCUNN/PReLU.cu b/lib/THCUNN/PReLU.cu
index 048c0b4..dfac397 100644
--- a/lib/THCUNN/PReLU.cu
+++ b/lib/THCUNN/PReLU.cu
@@ -1,25 +1,27 @@
 #include "THCUNN.h"
-#include "THCReduce.cuh"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
-#include <thrust/functional.h>
+#include "common.h"
 
+template <typename T>
 struct PReLUUpdateOutput
 {
-  float* weight_;
+  T* weight_;
 
-  PReLUUpdateOutput(float* weight)
+  PReLUUpdateOutput(T* weight)
     : weight_(weight)
   {}
 
-  __device__ __forceinline__ void operator()(float *out, float *in)
+  __device__ __forceinline__ void operator()(T *out, T *in)
   {
-    float x = *in;
+    T x = *in;
     *out = (x > 0) ? x : weight_[0] * x;
   }
 };
 
-__global__ void preluForward(float *output, const float *input, const float *weight, int n, int nElemsPerSample, int mapSize)
+template <typename T>
+__global__ void preluForward(T *output, const T *input, const T *weight, int n, int nElemsPerSample, int mapSize)
 {
   CUDA_KERNEL_LOOP(i, n)
   {
@@ -29,63 +31,27 @@ __global__ void preluForward(float *output, const float *input, const float *wei
   }
 }
 
-void THNN_CudaPReLU_updateOutput(
-  THCState *state,
-  THCudaTensor *input,
-  THCudaTensor *output,
-  THCudaTensor *weight,
-  long nOutputPlane)
-{
-  THCudaTensor_resizeAs(state, output, input);
-
-  float *w = THCudaTensor_data(state, weight);
-
-  if (nOutputPlane == 0)
-  {
-    THC_pointwiseApply2(state, output, input, PReLUUpdateOutput(w));
-  }
-  else
-  {
-    int ndim = THCudaTensor_nDimension(state, input);
-    input = THCudaTensor_newContiguous(state, input);
-
-    int n = THCudaTensor_nElement(state, input);
-    int mapSize = 1;
-    if (ndim == 3)
-      mapSize = (input->size[1] * input->size[2]);
-    else if (ndim == 4)
-      mapSize = (input->size[2] * input->size[3]);
-    int nElemsPerSample = nOutputPlane * mapSize;
-    preluForward<<<GET_BLOCKS(n), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
-      THCudaTensor_data(state, output),
-      THCudaTensor_data(state, input),
-      w,
-      n, nElemsPerSample, mapSize
-    );
-    THCudaCheck(cudaGetLastError());
-    THCudaTensor_free(state, input);
-  }
-}
-
+template <typename T>
 struct PReLUUpdateGradInput
 {
-  float *weight_;
+  T *weight_;
 
-  PReLUUpdateGradInput(float *weight)
+  PReLUUpdateGradInput(T *weight)
     : weight_(weight)
   {}
 
-  __device__ __forceinline__ void operator()(float *gradInput, float *gradOutput, float *input)
+  __device__ __forceinline__ void operator()(T *gradInput, T *gradOutput, T *input)
   {
     *gradInput = *input > 0 ? *gradOutput : *gradOutput * *weight_;
   }
 };
 
+template <typename T>
 __global__ void preluBackward(
-  float *gradInput,
-  const float *input,
-  const float *weight,
-  const float *gradOutput,
+  T *gradInput,
+  const T *input,
+  const T *weight,
+  const T *gradOutput,
   int n, int nElemsPerSample, int mapSize)
 {
   CUDA_KERNEL_LOOP(i, n)
@@ -96,149 +62,44 @@ __global__ void preluBackward(
   }
 }
 
-void THNN_CudaPReLU_updateGradInput(
-  THCState *state,
-  THCudaTensor *input,
-  THCudaTensor *gradOutput,
-  THCudaTensor *gradInput,
-  THCudaTensor *weight,
-  long nOutputPlane)
-{
-  THCudaTensor_resizeAs(state, gradInput, input);
-
-  float *w = THCudaTensor_data(state, weight);
-  if (nOutputPlane == 0)
-  {
-    THC_pointwiseApply3(state, gradInput, gradOutput, input, PReLUUpdateGradInput(w));
-  }
-  else
-  {
-    int ndim = THCudaTensor_nDimension(state, input);
-    input = THCudaTensor_newContiguous(state, input);
-    gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-
-    int n = THCudaTensor_nElement(state, input);
-    int mapSize = 1;
-    if (ndim == 3)
-      mapSize = (input->size[1] * input->size[2]);
-    else if (ndim == 4)
-      mapSize = (input->size[2] * input->size[3]);
-    int nElemsPerSample = nOutputPlane * mapSize;
-    preluBackward<<<GET_BLOCKS(n), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
-      THCudaTensor_data(state, gradInput),
-      THCudaTensor_data(state, input),
-      w,
-      THCudaTensor_data(state, gradOutput),
-      n, nElemsPerSample, mapSize
-    );
-    THCudaCheck(cudaGetLastError());
-    THCudaTensor_free(state, input);
-    THCudaTensor_free(state, gradOutput);
-  }
-}
-
+template <typename T>
 struct PReLUAccGradParametersShared
 {
-  __device__ __forceinline__ void operator()(float *gradInput, float  *input, float *gradOutput)
+  __device__ __forceinline__ void operator()(T *gradInput, T  *input, T *gradOutput)
   {
     *gradInput = (*input) * (*gradOutput) * (*input <= 0);
   }
 };
 
+template <typename T>
 struct PReLUAccGradParameters
 {
-  float scale;
+  T scale;
 
-  PReLUAccGradParameters(float scale)
+  PReLUAccGradParameters(T scale)
     : scale(scale)
   {}
 
-  __device__ __forceinline__ void operator()(float *gradInput, float *input, float *gradOutput)
+  __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput)
   {
     *gradInput = (*input) * (*gradOutput) * scale * (*input <= 0);
   }
 };
 
+template <typename T>
 struct PReLUAccGradParameters1to1
 {
-  float scale;
+  T scale;
 
-  PReLUAccGradParameters1to1(float scale)
+  PReLUAccGradParameters1to1(T scale)
     : scale(scale)
   {}
 
-  __device__ __forceinline__ void operator()(float *gradWeight, float *input, float *gradOutput)
+  __device__ __forceinline__ void operator()(T *gradWeight, T *input, T *gradOutput)
   {
     *gradWeight += (*input) * (*gradOutput) * scale * (*input <= 0);
   }
 };
 
-void THNN_CudaPReLU_accGradParameters(
-  THCState *state,
-  THCudaTensor *input,
-  THCudaTensor *gradOutput,
-  THCudaTensor *gradInput,
-  THCudaTensor *weight,
-  THCudaTensor *gradWeight,
-  THCudaTensor *gradWeightBuf,
-  THCudaTensor *gradWeightBuf2,
-  long nOutputPlane,
-  float scale)
-{
-  // use grad input for temporary storage, then call updateGradInput again
-
-  if (nOutputPlane == 0)
-  {
-    THC_pointwiseApply3(state, gradInput, input, gradOutput, PReLUAccGradParametersShared());
-
-    // introduces a sync point
-    float sum = THCudaTensor_sumall(state, gradInput);
-    float w = THCudaTensor_get1d(state, gradWeight, 0);
-    THCudaTensor_set1d(state, gradWeight, 0, w + sum * scale);
-
-    // restore gradInput
-    THNN_CudaPReLU_updateGradInput(state, input, gradOutput, gradInput, weight, nOutputPlane);
-  }
-  else
-  {
-    int ndim = THCudaTensor_nDimension(state, input);
-
-    if (ndim == 1)
-    {
-      THC_pointwiseApply3(state, gradWeight, input, gradOutput, PReLUAccGradParameters1to1(scale));
-    }
-    else
-    {
-      THC_pointwiseApply3(state, gradInput, input, gradOutput, PReLUAccGradParameters(scale));
-      THCudaTensor *sumbuf = gradWeightBuf2;
-      THCudaTensor_resizeAs(state, gradWeightBuf, gradWeight);
-
-      if (ndim == 2)
-      {
-        THCudaTensor_sum(state, gradWeightBuf, gradInput, 0);
-        THCudaTensor_cadd(state, gradWeight, gradWeight, scale, gradWeightBuf);
-      }
-      else if (ndim == 3)
-      {
-        THCudaTensor *buffer = THCudaTensor_newContiguous(state, gradInput);
-        THCudaTensor_resize2d(state, buffer, nOutputPlane, input->size[1] * input->size[2]);
-        THCudaTensor_sum(state, gradWeightBuf, buffer, 1);
-        THCudaTensor_cadd(state, gradWeight, gradWeight, scale, gradWeightBuf);
-        THCudaTensor_free(state, buffer);
-      }
-      else if (ndim == 4)
-      {
-        THCudaTensor *buffer = THCudaTensor_newContiguous(state, gradInput);
-        THCudaTensor_resize3d(state, buffer, input->size[0], nOutputPlane, input->size[2] * input->size[3]);
-        THCudaTensor_resize2d(state, sumbuf, input->size[0], nOutputPlane);
-        THCudaTensor_sum(state, sumbuf, buffer, 2);
-        THCudaTensor_sum(state, gradWeightBuf, sumbuf, 0);
-        THCudaTensor_cadd(state, gradWeight, gradWeight, scale, gradWeightBuf);
-        THCudaTensor_free(state, buffer);
-      }
-
-      // restore gradInput
-      THNN_CudaPReLU_updateGradInput(state, input, gradOutput, gradInput, weight, nOutputPlane);
-    }
-  }
-}
+#include "generic/PReLU.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/RReLU.cu b/lib/THCUNN/RReLU.cu
index 245598b..55a34ec 100644
--- a/lib/THCUNN/RReLU.cu
+++ b/lib/THCUNN/RReLU.cu
@@ -1,4 +1,6 @@
 #include "THCUNN.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 #include "common.h"
 #include <curand.h>
 #include <curand_kernel.h>
@@ -8,51 +10,72 @@
 #define BLOCK_SIZE 256
 #define NUM_BLOCKS(n) min((int)THCCeilDiv(n, (long) BLOCK_SIZE), MAX_NUM_BLOCKS)
 
+template<typename T>
+inline T __device__ curand_uniform_type(curandStateMtgp32 *state);
+
+template <>
+inline half __device__ curand_uniform_type<half>(curandStateMtgp32 *state) {
+  return ScalarConvert<float, half>::to(curand_uniform(state));
+}
+
+template <>
+inline float __device__ curand_uniform_type<float>(curandStateMtgp32 *state) {
+  return curand_uniform(state);
+}
+
+template <>
+inline double __device__ curand_uniform_type<double>(curandStateMtgp32 *state) {
+  return curand_uniform_double(state);
+}
+
+template <typename T>
 __global__ void rreluUpdateOutputTrain(int n, curandStateMtgp32 *state,
-  float *input, float* noise, float *output, double a, double b)
+  T *input, T* noise, T *output, double a, double b)
 {
   CUDA_KERNEL_LOOP(i, n)
   {
     if (input[i] <= 0)
     {
-      float r = curand_uniform(&state[blockIdx.x]);
-      r = r * (b-a) + a;
+      T r = curand_uniform_type<T>(&state[blockIdx.x]);
+      r = ScalarConvert<double, T>::to(r * (b-a) + a);
       output[i] = input[i] * r;
       noise[i] = r;
     }
     else
     {
       output[i] = input[i];
-      noise[i] = 1;
+      noise[i] = ScalarConvert<int, T>::to(1);
     }
   }
 }
 
+template <typename T>
 struct RReLUUpdateOutputEval_functor
 {
-  const float negSlope_;
+  const T negSlope_;
 
-  RReLUUpdateOutputEval_functor(float negSlope)
+  RReLUUpdateOutputEval_functor(T negSlope)
     : negSlope_(negSlope)
   {}
 
-  __device__ __forceinline__ void operator()(float *out, float *in)
+  __device__ __forceinline__ void operator()(T *out, T *in)
   {
-    const float x = *in;
-    const float r = x <= 0 ? negSlope_ : 1;
+    const T x = *in;
+    const T r = x <= 0 ? negSlope_ : ScalarConvert<int, T>::to(1);
     *out = x * r;
   }
 };
 
+template <typename T>
 struct RReLUUpdateOutputEvalIP_functor
 {
-  const float negSlope_;
+  const T negSlope_;
 
-  RReLUUpdateOutputEvalIP_functor(float negSlope)
+  RReLUUpdateOutputEvalIP_functor(T negSlope)
     : negSlope_(negSlope)
   {}
 
-  __device__ __forceinline__ void operator()(float *x)
+  __device__ __forceinline__ void operator()(T *x)
   {
     if (*x <= 0)
     {
@@ -61,74 +84,31 @@ struct RReLUUpdateOutputEvalIP_functor
   }
 };
 
-void THNN_CudaRReLU_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output,
-  THCudaTensor *noise, double lower, double upper, bool train, bool inplace, void *generator)
-{
-  THCUNN_assertSameGPU(state, 3, input, output, noise);
-  struct curandStateMtgp32* gen_states = THCRandom_generatorStates(state);
-
-  if (train)
-  {
-    input = THCudaTensor_newContiguous(state, input);
-    THCudaTensor_resizeAs(state, noise, input);
-    float *input_data = THCudaTensor_data(state, input);
-    float *noise_data = THCudaTensor_data(state, noise);
-    long n = THCudaTensor_nElement(state, input);
-    if (inplace)
-    {
-      rreluUpdateOutputTrain<<<NUM_BLOCKS(n), BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
-        n, gen_states, input_data, noise_data, input_data, lower, upper);
-      THCudaTensor_set(state, output, input);
-    }
-    else
-    {
-      THCudaTensor_resizeAs(state, output, input);
-      float *output_data = THCudaTensor_data(state, output);
-      rreluUpdateOutputTrain<<<NUM_BLOCKS(n), BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
-        n, gen_states, input_data, noise_data, output_data, lower, upper);
-    }
-    THCudaCheck(cudaGetLastError());
-    THCudaTensor_free(state, input);
-  }
-  else
-  {
-    const double negSlope = (lower + upper) / 2;
-    if (inplace)
-    {
-      THC_pointwiseApply1(state, input, RReLUUpdateOutputEvalIP_functor(negSlope));
-      THCudaTensor_set(state, output, input);
-    }
-    else
-    {
-      THCudaTensor_resizeAs(state, output, input);
-      THC_pointwiseApply2(state, output, input, RReLUUpdateOutputEval_functor(negSlope));
-    }
-  }
-}
-
+template <typename T>
 struct RReLUupdateGradInputEval_functor
 {
-  const float negSlope_;
+  const T negSlope_;
 
-  RReLUupdateGradInputEval_functor(float negSlope)
+  RReLUupdateGradInputEval_functor(T negSlope)
     : negSlope_(negSlope)
   {}
 
-  __device__ __forceinline__ void operator()(float *gradIn, float *gradOut, float *in)
+  __device__ __forceinline__ void operator()(T *gradIn, T *gradOut, T *in)
   {
     *gradIn = (*in) <= 0 ? (*gradOut) * negSlope_ : (*gradOut);
   }
 };
 
+template <typename T>
 struct RReLUupdateGradInputEvalIP_functor
 {
-  const float negSlope_;
+  const T negSlope_;
 
-  RReLUupdateGradInputEvalIP_functor(float negSlope)
+  RReLUupdateGradInputEvalIP_functor(T negSlope)
     : negSlope_(negSlope)
   {}
 
-  __device__ __forceinline__ void operator()(float *gradOut, float *in)
+  __device__ __forceinline__ void operator()(T *gradOut, T *in)
   {
     if (*in <= 0)
     {
@@ -137,42 +117,5 @@ struct RReLUupdateGradInputEvalIP_functor
   }
 };
 
-void THNN_CudaRReLU_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput,
-  THCudaTensor *gradInput, THCudaTensor *noise, double lower, double upper, bool train, bool inplace)
-{
-  THCUNN_assertSameGPU(state, 4, input, gradOutput, gradInput, noise);
-
-  gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-
-  if (train && upper - lower > 1E-6)    // e.g. if upper == lower, RReLU behaves like LeakyReLU
-  {
-    // multiply the gradient by the noise tensor
-    if (inplace)
-    {
-      THCudaTensor_cmul(state, gradOutput, gradOutput, noise);
-      THCudaTensor_set(state, gradInput, gradOutput);
-    }
-    else
-    {
-      THCudaTensor_resizeAs(state, gradInput, input);
-      THCudaTensor_cmul(state, gradInput, gradOutput, noise);
-    }
-  }
-  else
-  {
-    // use constant factor for negative input values
-    const double negSlope = (lower + upper) / 2;
-    if (inplace)
-    {
-      THC_pointwiseApply2(state, gradOutput, input, RReLUupdateGradInputEvalIP_functor(negSlope));
-      THCudaTensor_set(state, gradInput, gradOutput);
-    }
-    else
-    {
-      THCudaTensor_resizeAs(state, gradInput, input);
-      THC_pointwiseApply3(state, gradInput, gradOutput, input, RReLUupdateGradInputEval_functor(negSlope));
-    }
-  }
-
-  THCudaTensor_free(state, gradOutput);
-}
+#include "generic/RReLU.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SharedMem.cuh b/lib/THCUNN/SharedMem.cuh
new file mode 100644
index 0000000..8d83d9f
--- /dev/null
+++ b/lib/THCUNN/SharedMem.cuh
@@ -0,0 +1,43 @@
+// Based on the simpleTemplates CUDA example
+
+#ifndef THCUNN_SHAREDMEM_H
+#define THCUNN_SHAREDMEM_H
+
+template <typename T>
+struct SharedMem {
+  __device__ T *getPointer()
+  {
+    extern __device__ void error(void);
+    error();
+    return NULL;
+  }
+};
+
+template <>
+struct SharedMem<half>
+{
+  __device__ half *getPointer() {
+    extern __shared__ half s_half[];
+    return s_half;
+  }
+};
+
+template <>
+struct SharedMem<float>
+{
+  __device__ float *getPointer() {
+    extern __shared__ float s_float[];
+    return s_float;
+  }
+};
+
+template <>
+struct SharedMem<double>
+{
+  __device__ double *getPointer() {
+    extern __shared__ double s_double[];
+    return s_double;
+  }
+};
+
+#endif
diff --git a/lib/THCUNN/Sigmoid.cu b/lib/THCUNN/Sigmoid.cu
index f2a3675..623b84d 100644
--- a/lib/THCUNN/Sigmoid.cu
+++ b/lib/THCUNN/Sigmoid.cu
@@ -1,32 +1,24 @@
 #include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
+template <typename T>
 struct sigmoidupdateOutput_functor
 {
-  __device__ void operator()(float *output, const float *input) const
+  __device__ void operator()(T *output, const T *input) const
   {
-    *output = 1./(1.+ exp(-*input));
+    *output = ScalarConvert<double, T>::to(1./(1.+ exp(-*input)));
   }
 };
 
-void THNN_CudaSigmoid_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output)
-{
-  THCUNN_assertSameGPU(state, 2, input, output);
-  THCudaTensor_resizeAs(state, output, input);
-  THC_pointwiseApply2(state, output, input, sigmoidupdateOutput_functor());
-}
-
+template <typename T>
 struct sigmoidupdateGradInput_functor
 {
-  __device__ void operator()(float *gradInput, const float *output, const float *gradOutput) const
+  __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const
   {
-    *gradInput = *gradOutput * (1.-*output) * (*output);
+    *gradInput = ScalarConvert<double, T>::to(*gradOutput * (1.-*output) * (*output));
   }
 };
 
-void THNN_CudaSigmoid_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *output)
-{
-  THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
-  THCudaTensor_resizeAs(state, gradInput, output);
-  THC_pointwiseApply3(state, gradInput, output, gradOutput, sigmoidupdateGradInput_functor());
-}
+#include "generic/Sigmoid.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SmoothL1Criterion.cu b/lib/THCUNN/SmoothL1Criterion.cu
index 2b528ef..8e94fbc 100644
--- a/lib/THCUNN/SmoothL1Criterion.cu
+++ b/lib/THCUNN/SmoothL1Criterion.cu
@@ -1,5 +1,7 @@
 #include "THCUNN.h"
 #include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
 #include <thrust/fill.h>
 #include <thrust/functional.h>
@@ -10,97 +12,38 @@
 #include <thrust/system/cuda/execution_policy.h>
 #endif
 
+template <typename Dtype, typename Acctype>
 struct smoothl1_functor
 {
   smoothl1_functor() {}
 
-  __host__ __device__ float operator()(const float &x, const float &y) const
+  __host__ __device__ Acctype operator()(const Dtype &x, const Dtype &y) const
   {
-    float z = fabsf(x-y);
-    return z < 1.f ? 0.5f*z*z : z - 0.5f;
+    Acctype z = ScalarConvert<Dtype, Acctype>::to(THCNumerics<Dtype>::abs(x-y));
+    return z < Acctype(1) ? 0.5f*z*z : z - 0.5f;
   }
 };
 
-void THNN_CudaSmoothL1Criterion_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *output, bool sizeAverage)
-{
-  THCUNN_assertSameGPU(state, 2, input, target);
-  THArgCheck(
-    THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2,
-    "input and target need to have the same number of elements"
-  );
-
-  long size = THCudaTensor_nElement(state, input);
-
-  input = THCudaTensor_newContiguous(state, input);
-  target = THCudaTensor_newContiguous(state, target);
-
-  thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
-  thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
-  float sum = thrust::inner_product(
-#if CUDA_VERSION >= 7000
-    thrust::cuda::par.on(THCState_getCurrentStream(state)),
-#endif
-    input_data, input_data+size, target_data, (float) 0,
-    thrust::plus<float>(), smoothl1_functor()
-  );
-
-  if (sizeAverage)
-    sum /= size;
-
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, target);
-
-  THCudaTensor_set1d(state, output, 0, sum);
-}
-
+template <typename Dtype>
 struct smoothl1_updateGradInput_functor
 {
-  const float norm;
+  const Dtype norm;
 
-  smoothl1_updateGradInput_functor(float norm_)
+  smoothl1_updateGradInput_functor(Dtype norm_)
     : norm(norm_)
   {}
 
-  __host__ __device__ float operator()(const float &x, const float &y) const
+  __host__ __device__ Dtype operator()(const Dtype &x, const Dtype &y) const
   {
-    float z = x - y;
-    if (z < -1.f)
+    Dtype z = x - y;
+    if (z < ScalarConvert<int, Dtype>::to(-1))
       return -norm;
-    else if (z > 1.f)
+    else if (z > ScalarConvert<int, Dtype>::to(1))
       return norm;
     else
       return norm * z;
   }
 };
 
-void THNN_CudaSmoothL1Criterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage)
-{
-  THCUNN_assertSameGPU(state, 3, input, target, gradInput);
-  THArgCheck(
-    THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2,
-    "input and target need to have the same number of elements"
-  );
-
-  long size = THCudaTensor_nElement(state, input);
-  float norm = sizeAverage ? 1./size : 1.;
-
-  input = THCudaTensor_newContiguous(state, input);
-  target = THCudaTensor_newContiguous(state, target);
-
-  THCudaTensor_resizeAs(state, gradInput, input);
-
-  thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
-  thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
-  thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput));
-
-  thrust::transform(
-#if CUDA_VERSION >= 7000
-    thrust::cuda::par.on(THCState_getCurrentStream(state)),
-#endif
-    input_data, input_data+size, target_data, gradInput_data,
-    smoothl1_updateGradInput_functor(norm)
-  );
-
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, target);
-}
+#include "generic/SmoothL1Criterion.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SoftMarginCriterion.cu b/lib/THCUNN/SoftMarginCriterion.cu
index 2a15f4a..95c8e95 100644
--- a/lib/THCUNN/SoftMarginCriterion.cu
+++ b/lib/THCUNN/SoftMarginCriterion.cu
@@ -1,5 +1,7 @@
 #include "THCUNN.h"
 #include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
 #include <thrust/fill.h>
 #include <thrust/functional.h>
@@ -7,81 +9,29 @@
 #include <thrust/reduce.h>
 #include <thrust/inner_product.h>
 
+template <typename Dtype, typename Acctype>
 struct softmargin_functor
 {
-  __host__ __device__ float operator()(const float& x, const float& y) const
+  __host__ __device__ Acctype operator()(const Dtype& x, const Dtype& y) const
   {
-    return log(1 + exp(-x*y));
+    return log(1 + exp(ScalarConvert<Dtype, Acctype>::to(-x)*y));
   }
 };
 
-
-void THNN_CudaSoftMarginCriterion_updateOutput(THCState *state,
-                                               THCudaTensor *input,
-                                               THCudaTensor *target,
-                                               THCudaTensor *output,
-                                               int sizeAverage
-                                              )
-{
-  THCUNN_assertSameGPU(state, 2, input, target);
-  float sum;
-
-  long size = THCudaTensor_nElement(state, input);
-
-  input = THCudaTensor_newContiguous(state, input);
-  target = THCudaTensor_newContiguous(state, target);
-
-  thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
-  thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
-  sum = thrust::inner_product(input_data, input_data+size, target_data, (float) 0, thrust::plus<float>(), softmargin_functor());
-
-  if(sizeAverage)
-    sum /= size;
-
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, target);
-
-  THCudaTensor_set1d(state, output, 0, sum);
-}
-
-
+template <typename Dtype, typename Acctype>
 struct softmargin_updateGradInput_functor
 {
-  const float norm;
+  const Acctype norm;
 
-  softmargin_updateGradInput_functor(float norm_) :
+  softmargin_updateGradInput_functor(Acctype norm_) :
     norm(norm_) {}
 
-  __host__ __device__ float operator()(const float& x, const float& y) const
+  __host__ __device__ Dtype operator()(const Dtype& x, const Dtype& y) const
     {
-      float temp = exp(-x*y);
-      return -y*temp*norm/(1.f + temp);
+      Acctype temp = exp(ScalarConvert<Dtype, Acctype>::to(-x)*y);
+      return ScalarConvert<Acctype, Dtype>::to(-y*temp*norm/(ScalarConvert<int, Acctype>::to(1) + temp));
     }
 };
 
-void THNN_CudaSoftMarginCriterion_updateGradInput(THCState *state,
-                                                  THCudaTensor *input,
-                                                  THCudaTensor *target,
-                                                  THCudaTensor *gradInput,
-                                                  int sizeAverage
-                                                 )
-{
-  THCUNN_assertSameGPU(state, 3, input, target, gradInput);
-
-  long size = THCudaTensor_nElement(state, input);
-  float norm = (sizeAverage ? 1./size : 1.);
-
-  input = THCudaTensor_newContiguous(state, input);
-  target = THCudaTensor_newContiguous(state, target);
-
-  THCudaTensor_resizeAs(state, gradInput, input);
-
-  thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
-  thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
-  thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput));
-
-  thrust::transform(input_data, input_data+size, target_data, gradInput_data, softmargin_updateGradInput_functor(norm));
-
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, target);
-}
+#include "generic/SoftMarginCriterion.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SoftMax.cu b/lib/THCUNN/SoftMax.cu
index 987325f..5ae6b70 100644
--- a/lib/THCUNN/SoftMax.cu
+++ b/lib/THCUNN/SoftMax.cu
@@ -1,26 +1,29 @@
 #include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
 #define SOFTMAX_THREADS 128
 
+template <typename T, typename AccumT>
 __global__ void cunn_SoftMax_updateOutput_kernel(
-  float *output, float *input, int nframe, int dim, int stride0, int stride1)
+  T *output, T *input, int nframe, int dim, int stride0, int stride1)
 {
-  __shared__ float buffer[SOFTMAX_THREADS+1];
-  float *input_k  = input  + blockIdx.x*dim*stride0 + blockIdx.y*stride1 + blockIdx.z;
-  float *output_k = output + blockIdx.x*dim*stride0 + blockIdx.y*stride1 + blockIdx.z;
+  __shared__ AccumT buffer[SOFTMAX_THREADS+1];
+  T *input_k  = input  + blockIdx.x*dim*stride0 + blockIdx.y*stride1 + blockIdx.z;
+  T *output_k = output + blockIdx.x*dim*stride0 + blockIdx.y*stride1 + blockIdx.z;
 
   int i_start = threadIdx.x;
   int i_end = dim;
   int i_step = blockDim.x;
 
   // max?
-  buffer[threadIdx.x] = -FLT_MAX;
+  buffer[threadIdx.x] = -THCNumerics<AccumT>::max();
   for (int i=i_start; i<i_end; i+=i_step)
   {
-    float z = input_k[i*stride0];
-    if (buffer[threadIdx.x] < z)
-      buffer[threadIdx.x] = z;
+    T z = input_k[i*stride0];
+    AccumT zAcc = ScalarConvert<T, AccumT>::to(z);
+    if (buffer[threadIdx.x] < zAcc)
+      buffer[threadIdx.x] = zAcc;
   }
 
 
@@ -29,7 +32,7 @@ __global__ void cunn_SoftMax_updateOutput_kernel(
   // reduce
   if (threadIdx.x == 0)
   {
-    float max_k = -FLT_MAX;
+    AccumT max_k = -THCNumerics<AccumT>::max();
     for (int i=0; i<blockDim.x; i++)
     {
       if (max_k < buffer[i])
@@ -41,11 +44,11 @@ __global__ void cunn_SoftMax_updateOutput_kernel(
   __syncthreads();
 
   // sum?
-  float max_k = buffer[SOFTMAX_THREADS];
-  buffer[threadIdx.x] = 0;
+  T max_k = ScalarConvert<AccumT, T>::to(buffer[SOFTMAX_THREADS]);
+  buffer[threadIdx.x] = ScalarConvert<int, AccumT>::to(0);
   for (int i=i_start; i<i_end; i+=i_step) {
-    float z = __expf(input_k[i*stride0]-max_k);
-    buffer[threadIdx.x] += z;
+    T z = THCNumerics<T>::exp(input_k[i*stride0]-max_k);
+    buffer[threadIdx.x] += ScalarConvert<T, AccumT>::to(z);
     output_k[i*stride0] = z;
   }
 
@@ -54,7 +57,7 @@ __global__ void cunn_SoftMax_updateOutput_kernel(
   // reduce
   if (threadIdx.x == 0)
   {
-    float sum_k = 0;
+    AccumT sum_k = ScalarConvert<int, AccumT>::to(0);
     for (int i=0; i<blockDim.x; i++)
       sum_k += buffer[i];
     buffer[SOFTMAX_THREADS] = sum_k;
@@ -63,34 +66,35 @@ __global__ void cunn_SoftMax_updateOutput_kernel(
   __syncthreads();
 
   // softmax
-  float sum_k = buffer[SOFTMAX_THREADS];
+  T sum_k = ScalarConvert<AccumT, T>::to(buffer[SOFTMAX_THREADS]);
   for (int i=i_start; i<i_end; i+=i_step)
     output_k[i*stride0] = output_k[i*stride0] / sum_k;
 }
 
+template <typename T, typename AccumT>
 __global__ void cunn_SoftMax_updateGradInput_kernel(
-  float *gradInput, float *output, float *gradOutput, int nframe, int dim, int stride0, int stride1)
+  T *gradInput, T *output, T *gradOutput, int nframe, int dim, int stride0, int stride1)
 {
-  __shared__ float buffer[SOFTMAX_THREADS];
-  float *gradInput_k  = gradInput  + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z;
-  float *output_k     = output     + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z;
-  float *gradOutput_k = gradOutput + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z;
+  __shared__ AccumT buffer[SOFTMAX_THREADS];
+  T *gradInput_k  = gradInput  + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z;
+  T *output_k     = output     + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z;
+  T *gradOutput_k = gradOutput + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z;
 
   int i_start = threadIdx.x;
   int i_end = dim;
   int i_step = blockDim.x;
 
   // sum?
-  buffer[threadIdx.x] = 0;
+  buffer[threadIdx.x] = ScalarConvert<int, AccumT>::to(0);
   for (int i=i_start; i<i_end; i+=i_step)
-    buffer[threadIdx.x] += gradOutput_k[i*stride0] * output_k[i*stride0];
+    buffer[threadIdx.x] += ScalarConvert<T, AccumT>::to(gradOutput_k[i*stride0] * output_k[i*stride0]);
 
   __syncthreads();
 
   // reduce
   if (threadIdx.x == 0)
   {
-    float sum_k = 0;
+    AccumT sum_k = ScalarConvert<int, AccumT>::to(0);
     for (int i=0; i<blockDim.x; i++)
       sum_k += buffer[i];
     buffer[0] = sum_k;
@@ -98,147 +102,12 @@ __global__ void cunn_SoftMax_updateGradInput_kernel(
 
   __syncthreads();
 
-  float sum_k = buffer[0];
+  T sum_k = ScalarConvert<AccumT, T>::to(buffer[0]);
   for (int i=i_start; i<i_end; i+=i_step)
     gradInput_k[i*stride0] = output_k[i*stride0] * (gradOutput_k[i*stride0] - sum_k);
 }
 
-void THNN_CudaSoftMax_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output)
-{
-  THCUNN_assertSameGPU(state, 2, input, output);
-
-  input = THCudaTensor_newContiguous(state, input);
-  THCudaTensor_resizeAs(state, output, input);
-  long batchSize, dim, stride0, stride1 = 1;
-  long blocksY = 1, blocksZ = 1;
-
-  if (input->nDimension == 1)
-  {
-    batchSize = 1;
-    dim = input->size[0];
-    stride0 = 1;
-  }
-  else if (input->nDimension == 2)
-  {
-    batchSize = input->size[0];
-    dim = input->size[1];
-    stride0 = 1;
-  }
-  else if (input->nDimension == 3)
-  {
-    batchSize = 1;
-    dim = input->size[0];
-    blocksY = input->size[1];
-    blocksZ = input->size[2];
-    stride0 = blocksY * blocksZ;
-    stride1 = blocksZ;
-  }
-  else if (input->nDimension == 4)
-  {
-    batchSize = input->size[0];
-    dim = input->size[1];
-    blocksY = input->size[2];
-    blocksZ = input->size[3];
-    stride0 = blocksY * blocksZ;
-    stride1 = blocksZ;
-  }
-  else
-  {
-    THError("1D, 2D, 3D or 4D tensor expected");
-  }
-
-  // when possible use only 2d grid of thread blocks to stay compatible with compute capability 2.X devices.
-  if (blocksY * blocksZ < 65536)
-  {
-    blocksY *= blocksZ;
-    blocksZ = 1;
-    if (input->nDimension == 3 || input->nDimension == 4) {
-      stride0 = blocksY * blocksZ;
-      stride1 = blocksZ;
-    }
-  }
-
-  dim3 blocks(batchSize, blocksY, blocksZ);
-  dim3 threads(SOFTMAX_THREADS);
-  cunn_SoftMax_updateOutput_kernel<<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(
-    THCudaTensor_data(state, output),
-    THCudaTensor_data(state, input),
-    batchSize, dim, stride0, stride1
-  );
-  THCudaCheck(cudaGetLastError());
-
-  THCudaTensor_free(state, input);
-}
-
-void THNN_CudaSoftMax_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *output)
-{
-  THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
-
-  output = THCudaTensor_newContiguous(state, output);
-  gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-
-  THCudaTensor_resizeAs(state, gradInput, output);
-  long batchSize, dim, stride0, stride1 = 1;
-  long blocksY = 1, blocksZ = 1;
-
-  if (gradInput->nDimension == 1)
-  {
-    batchSize = 1;
-    dim = gradInput->size[0];
-    stride0 = 1;
-  }
-  else if (gradInput->nDimension == 2)
-  {
-    batchSize = gradInput->size[0];
-    dim = gradInput->size[1];
-    stride0 = 1;
-  }
-  else if (gradInput->nDimension == 3)
-  {
-    batchSize = 1;
-    dim = gradInput->size[0];
-    blocksY = gradInput->size[1];
-    blocksZ = gradInput->size[2];
-    stride0 = blocksY * blocksZ;
-    stride1 = blocksZ;
-  }
-  else if (gradInput->nDimension == 4)
-  {
-    batchSize = gradInput->size[0];
-    dim = gradInput->size[1];
-    blocksY = gradInput->size[2];
-    blocksZ = gradInput->size[3];
-    stride0 = blocksY * blocksZ;
-    stride1 = blocksZ;
-  }
-  else
-  {
-    THError("1D, 2D, 3D or 4D tensor expected");
-  }
-
-  // when possible use only 2d grid of thread blocks to stay compatible with compute capability 2.X devices.
-  if (blocksY * blocksZ < 65536)
-  {
-    blocksY *= blocksZ;
-    blocksZ = 1;
-    if (input->nDimension == 3 || input->nDimension == 4) {
-      stride0 = blocksY * blocksZ;
-      stride1 = blocksZ;
-    }
-  }
-
-  dim3 blocks(batchSize, blocksY, blocksZ);
-  dim3 threads(SOFTMAX_THREADS);
-  cunn_SoftMax_updateGradInput_kernel<<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(
-    THCudaTensor_data(state, gradInput),
-    THCudaTensor_data(state, output),
-    THCudaTensor_data(state, gradOutput),
-    batchSize, dim, stride0, stride1
-  );
-  THCudaCheck(cudaGetLastError());
-
-  THCudaTensor_free(state, gradOutput);
-  THCudaTensor_free(state, output);
-}
+#include "generic/SoftMax.cu"
+#include "THCGenerateFloatTypes.h"
 
 #undef SOFTMAX_THREADS
diff --git a/lib/THCUNN/SoftPlus.cu b/lib/THCUNN/SoftPlus.cu
index 0d1609a..cb9ecb7 100644
--- a/lib/THCUNN/SoftPlus.cu
+++ b/lib/THCUNN/SoftPlus.cu
@@ -1,52 +1,42 @@
 #include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
+template <typename T>
 struct softPlusupdateOutput_functor
 {
-  const float threshold;
-  const float beta;
+  const T threshold;
+  const T beta;
 
-  softPlusupdateOutput_functor(float threshold_, float beta_)
+  softPlusupdateOutput_functor(T threshold_, T beta_)
     : threshold(threshold_)
     , beta(beta_)
   {}
 
-  __device__ void operator()(float *output, const float *input) const
-  {
-    float betain = beta * (*input);
+  __device__ void operator()(T *output, const T *input) const {
+    T betain = beta * (*input);
     *output = ((betain) > threshold) ? *input : (1/beta) * log1p(exp(betain));
   }
 };
 
-void THNN_CudaSoftPlus_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, float beta, float threshold)
-{
-  THCUNN_assertSameGPU(state, 2, input, output);
-  THCudaTensor_resizeAs(state, output, input);
-  THC_pointwiseApply2(state, output, input, softPlusupdateOutput_functor(threshold, beta));
-}
-
+template <typename T>
 struct softPlusupdateGradInput_functor
 {
-  const float threshold;
-  const float beta;
+  const T threshold;
+  const T beta;
 
-  softPlusupdateGradInput_functor(float threshold_, float beta_)
+  softPlusupdateGradInput_functor(T threshold_, T beta_)
     : threshold(threshold_)
     , beta(beta_)
   {}
 
-  __device__ void operator()(float *gradInput, const float *output, const float *gradOutput) const
+  __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const
   {
-    float betaout = beta * (*output);
-    float exp_bo = exp(betaout);
+    T betaout = beta * (*output);
+    T exp_bo = exp(betaout);
     *gradInput = ((betaout) > threshold) ? *gradOutput : *gradOutput * (exp_bo - 1) / exp_bo;
   }
 };
 
-void THNN_CudaSoftPlus_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput,
-  THCudaTensor *output, float beta, float threshold)
-{
-  THCUNN_assertSameGPU(state, 4, input, output, gradOutput, gradInput);
-  THCudaTensor_resizeAs(state, gradInput, output);
-  THC_pointwiseApply3(state, gradInput, output, gradOutput, softPlusupdateGradInput_functor(threshold, beta));
-}
+#include "generic/SoftPlus.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SoftShrink.cu b/lib/THCUNN/SoftShrink.cu
index 2a08570..7762449 100644
--- a/lib/THCUNN/SoftShrink.cu
+++ b/lib/THCUNN/SoftShrink.cu
@@ -1,54 +1,43 @@
 #include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
+template <typename T>
 struct SoftShrinkUpdateOutput
 {
-  const float lambda_;
+  const T lambda_;
 
-  SoftShrinkUpdateOutput(float lambda)
+  SoftShrinkUpdateOutput(T lambda)
     : lambda_(lambda)
   {}
 
-  __device__ __forceinline__ void operator()(float *out, float *in)
+  __device__ __forceinline__ void operator()(T *out, T *in)
   {
-    float x = *in;
+    T x = *in;
     if (x > lambda_) *out = x - lambda_;
     else if (x < -lambda_) *out = x + lambda_;
-    else *out = 0;
+    else *out = ScalarConvert<int, T>::to(0);
   }
 };
 
-void THNN_CudaSoftShrink_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, double lambda)
-{
-  THCUNN_assertSameGPU(state, 2, input, output);
-  THCudaTensor_resizeAs(state, output, input);
-  THC_pointwiseApply2(state, output, input, SoftShrinkUpdateOutput(lambda));
-  THCudaCheck(cudaGetLastError());
-}
-
+template <typename T>
 struct SoftShrinkUpdateGradInput
 {
-  const float lambda_;
+  const T lambda_;
 
-  SoftShrinkUpdateGradInput(float lambda)
+  SoftShrinkUpdateGradInput(T lambda)
     : lambda_(lambda)
   {}
 
-  __device__ __forceinline__ void operator()(float *gradInput, float *input, float *gradOutput) const
+  __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) const
   {
-    float x = *input;
+    T x = *input;
     if (x > lambda_ || x < -lambda_)
       *gradInput = *gradOutput;
     else
-      *gradInput = 0;
+      *gradInput = ScalarConvert<int, T>::to(0);
   }
 };
 
-
-void THNN_CudaSoftShrink_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, double lambda)
-{
-  THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
-  THCudaTensor_resizeAs(state, gradInput, input);
-  THC_pointwiseApply3(state, gradInput, input, gradOutput, SoftShrinkUpdateGradInput(lambda));
-  THCudaCheck(cudaGetLastError());
-}
+#include "generic/SoftShrink.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SparseLinear.cu b/lib/THCUNN/SparseLinear.cu
index 577eec5..9435735 100644
--- a/lib/THCUNN/SparseLinear.cu
+++ b/lib/THCUNN/SparseLinear.cu
@@ -1,4 +1,7 @@
 #include "THCUNN.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+
 #include <cusparse.h>
 #include <thrust/device_vector.h>
 
@@ -13,250 +16,70 @@ static void init_cusparse() {
   }
 }
 
-static bool checkInput(THCudaTensor* t)
-{
-  return t->nDimension == 2 && t->size[1] == 3;
-}
-
-static bool checkSize2D(THCudaTensor* t, long size0, long size1)
-{
-  return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1;
-}
-
-static bool checkSize1D(THCudaTensor* t, long size0)
-{
-  return t->nDimension == 1 && t->size[0] == size0;
-}
-
-void THNN_CudaSparseLinear_updateOutput(THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *weight,
-          THCudaTensor *bias)
-{
-  THAssert(THCudaTensor_checkGPU(state, 4, input, output, weight, bias));
-
-  long h;
-  long outDim = THCudaTensor_size(state, weight, 0);
-  long inDim = THCudaTensor_size(state, weight, 1);
-
-  THArgCheck(checkInput(input), 2, "input size must be nnz x 3");
-  THArgCheck(THCudaTensor_nDimension(state, output) == 2, 3, "output must be batchsize x outputsize");
-  THArgCheck(checkSize1D(bias, outDim), 5, "bias size wrong");
-
-  long batchnum = THCudaTensor_size(state, output, 0);
-  long nnz = THCudaTensor_size(state, input, 0);
-
-  THCudaTensor *buffer = THCudaTensor_new(state);
-  THCudaTensor *sel = THCudaTensor_new(state);
-  THCudaTensor *values = THCudaTensor_new(state);
-  THCudaIntTensor *rowbuf = THCudaIntTensor_new(state);
-  THCudaIntTensor *csrPtrs = THCudaIntTensor_new(state);
-  THCudaIntTensor *colInds = THCudaIntTensor_new(state);
-
-  THCudaTensor_resize1d(state, values, nnz);
-  THCudaIntTensor_resize1d(state, rowbuf, nnz);
-  THCudaIntTensor_resize1d(state, colInds, nnz);
-  THCudaIntTensor_resize1d(state, csrPtrs, batchnum+1);
-
-  // Get data ready for cusparse, need CudaInt buffers
-  // We do not need to sort, since rows are already in order
-  // If rows might get out of order in future implementations, or if cusparse
-  //    complains with an illegal memory access, sort like we do in AccGradParameters
-  THCudaTensor_select(state, sel, input, 1, 0);
-  THCudaIntTensor_copyCudaFloat(state, rowbuf, sel);
-  THCudaTensor_select(state, sel, input, 1, 1);
-  THCudaIntTensor_copyCudaFloat(state, colInds, sel);
-  THCudaTensor_select(state, sel, input, 1, 2);
-  THCudaTensor_copyCuda(state, values, sel);
-
-  init_cusparse();
-  cusparseXcoo2csr(cusparse_handle,
-      THCudaIntTensor_data(state, rowbuf), nnz, batchnum,
-      THCudaIntTensor_data(state, csrPtrs), CUSPARSE_INDEX_BASE_ONE);
-
-  // output = bias
-  THCudaTensor_resize2d(state, buffer, outDim, batchnum);
-  THCudaTensor_zero(state, buffer);
-  for (h=0; h<batchnum; h++) {
-    THCudaTensor_select(state, sel, buffer, 1, h);
-    THCudaTensor_copy(state, sel, bias);
-  }
-
-  // output = W * x
-  float one = 1;
-  cusparseMatDescr_t descr = 0;
-  cusparseCreateMatDescr(&descr);
-  cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
-  cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
-  cusparseScsrmm(cusparse_handle,
-      CUSPARSE_OPERATION_NON_TRANSPOSE,
-      batchnum, outDim, inDim, nnz,
-      &one,
-      descr,
-      THCudaTensor_data(state, values),
-      THCudaIntTensor_data(state, csrPtrs),
-      THCudaIntTensor_data(state, colInds),
-      THCudaTensor_data(state, weight), inDim,
-      &one, THCudaTensor_data(state, buffer), batchnum
-  );
-  THCudaTensor_transpose(state, buffer, NULL, 0, 1);
-
-  // We do work in the buffer to keep the output contiguous
-  THCudaTensor_copy(state, output, buffer);
-
-  cusparseDestroyMatDescr(descr);
-  descr = 0;
-  THCudaTensor_free(state, buffer);
-  THCudaTensor_free(state, sel);
-  THCudaTensor_free(state, values);
-  THCudaIntTensor_free(state, rowbuf);
-  THCudaIntTensor_free(state, colInds);
-  THCudaIntTensor_free(state, csrPtrs);
+void THNN_CudaHalfSparseLinear_updateOutput(
+          THCState *state,
+          THCudaHalfTensor *input,
+          THCudaHalfTensor *output,
+          THCudaHalfTensor *weight,
+          THCudaHalfTensor *bias) {
+  THError("THCudaHalfTensor not supported with SparseLinear");
 }
 
-void THNN_CudaSparseLinear_accGradParameters(
+void THNN_CudaHalfSparseLinear_accGradParameters(
           THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradWeight,
-          THCudaTensor *gradBias,
-          THCudaTensor *weight,
-          THCudaTensor *bias,
+          THCudaHalfTensor *input,
+          THCudaHalfTensor *gradOutput,
+          THCudaHalfTensor *gradWeight,
+          THCudaHalfTensor *gradBias,
+          THCudaHalfTensor *weight,
+          THCudaHalfTensor *bias,
           double weightDecay,
-          double scale)
-{
-  long outDim = THCudaTensor_size(state, weight, 0);
-  long inDim = THCudaTensor_size(state, weight, 1);
-
-  THArgCheck(checkInput(input), 2, "input size must be batchsize x nnz x 2");
-  THArgCheck(checkSize2D(gradWeight, outDim, inDim), 4, "gradWeight size wrong");
-  THArgCheck(checkSize1D(gradBias, outDim), 5, "gradBias size wrong");
-  THArgCheck(THCudaTensor_isContiguous(state, gradOutput), 3,
-             "gradOutput must be contiguous");
-
-  long nnz = THCudaTensor_size(state, input, 0);
-  long batchnum = THCudaTensor_size(state, gradOutput, 0);
-
-  THCudaTensor *buf = THCudaTensor_new(state);
-  THCudaTensor *cols = THCudaTensor_new(state);
-  THCudaTensor *sel = THCudaTensor_new(state);
-  THCudaLongTensor *inds = THCudaLongTensor_new(state);
-  THCudaTensor *values = THCudaTensor_new(state);
-  THCudaIntTensor *colbuf = THCudaIntTensor_new(state);
-  THCudaIntTensor *colPtrs = THCudaIntTensor_new(state);
-  THCudaIntTensor *rowInds = THCudaIntTensor_new(state);
-
-  THCudaTensor_select(state, sel, input, 1, 0); // rowInds
-  THCudaTensor_select(state, cols, input, 1, 1); // colInds
-  THCudaTensor_cadd(state, buf, sel, batchnum, cols); // colInds * buatchdim + rowInds
-  THCudaTensor_sort(state, buf, inds, buf, 0, 0); // Indicies are now in ind
-  THCudaTensor_indexSelect(state, buf, input, 0, inds);
-
-  THCudaTensor_resize1d(state, values, nnz);
-  THCudaIntTensor_resize1d(state, colbuf, nnz);
-  THCudaIntTensor_resize1d(state, rowInds, nnz);
-  THCudaIntTensor_resize1d(state, colPtrs, inDim+1);
-
-  // Get data ready for cusparse, need CudaInt buffers
-  THCudaTensor_select(state, sel, buf, 1, 0);
-  THCudaIntTensor_copyCudaFloat(state, rowInds, sel);
-  THCudaTensor_select(state, sel, buf, 1, 1);
-  THCudaIntTensor_copyCudaFloat(state, colbuf, sel);
-  THCudaTensor_select(state, sel, buf, 1, 2);
-  THCudaTensor_copyCuda(state, values, sel);
-
-  init_cusparse();
-  // Secretly coo2csc
-  cusparseXcoo2csr(cusparse_handle,
-      THCudaIntTensor_data(state, colbuf), nnz, inDim,
-      THCudaIntTensor_data(state, colPtrs), CUSPARSE_INDEX_BASE_ONE);
-
-  // FORTRAN expects contiguous col-major matricies
-  THCudaTensor_transpose(state, gradOutput, NULL, 0, 1);
-  THCudaTensor_resize2d(state, buf, batchnum, outDim);
-  THCudaTensor_copy(state, buf, gradOutput);
-  THCudaTensor_transpose(state, gradOutput, NULL, 0, 1); // Restore gradOutput
-
-  float one = 1;
-  cusparseMatDescr_t descr = 0;
-  cusparseCreateMatDescr(&descr);
-  cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
-  cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
-  cusparseScsrmm(cusparse_handle,
-      CUSPARSE_OPERATION_NON_TRANSPOSE,
-      inDim, outDim, batchnum, nnz,
-      &one,
-      descr,
-      THCudaTensor_data(state, values),
-      THCudaIntTensor_data(state, colPtrs),
-      THCudaIntTensor_data(state, rowInds),
-      THCudaTensor_data(state, buf), batchnum,
-      &one, THCudaTensor_data(state, gradWeight), inDim
-  );
-
-  THCudaTensor_sum(state, buf, gradOutput, 0);
-  THCudaTensor_resize1d(state, buf, outDim);
-  THCudaTensor_cadd(state, gradBias, gradBias, scale, buf);
-
-  if (weightDecay != 0)
-  {
-    THCudaTensor_cadd(state, gradWeight, gradWeight, weightDecay, weight);
-    THCudaTensor_cadd(state, gradBias, gradBias, weightDecay, bias);
-  }
-
-  THCudaTensor_free(state, buf);
-  THCudaTensor_free(state, sel);
-  THCudaTensor_free(state, cols);
-  THCudaLongTensor_free(state, inds);
-  THCudaTensor_free(state, values);
-  THCudaIntTensor_free(state, colbuf);
-  THCudaIntTensor_free(state, rowInds);
-  THCudaIntTensor_free(state, colPtrs);
+          double scale) {
+  THError("THCudaHalfTensor not supported with SparseLinear");
 }
 
-void THNN_CudaSparseLinear_legacyUpdateOutput(
+void THNN_CudaHalfSparseLinear_legacyUpdateOutput(
           THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *weight,
-          THCudaTensor *bias) {
-  THError("CUDA does not support legacy input format, please use a table of nnz x 2 vectors");
+          THCudaHalfTensor *input,
+          THCudaHalfTensor *output,
+          THCudaHalfTensor *weight,
+          THCudaHalfTensor *bias) {
+  THError("THCudaHalfTensor not supported with SparseLinear");
 }
-void THNN_CudaSparseLinear_legacyAccGradParameters(
+
+void THNN_CudaHalfSparseLinear_legacyAccGradParameters(
           THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradWeight,
-          THCudaTensor *gradBias,
-          THCudaTensor *weight,
-          THCudaTensor *bias,
+          THCudaHalfTensor *input,
+          THCudaHalfTensor *gradOutput,
+          THCudaHalfTensor *gradWeight,
+          THCudaHalfTensor *gradBias,
+          THCudaHalfTensor *weight,
+          THCudaHalfTensor *bias,
           double weightDecay,
           double scale) {
-  THError("CUDA does not support legacy input format, please use a table of nnz x 2 vectors");
+  THError("THCudaHalfTensor not supported with SparseLinear");
 }
 
-// Dense updates are pretty fast on the GPU
-void THNN_CudaSparseLinear_zeroGradParameters(
+void THNN_CudaHalfSparseLinear_zeroGradParameters(
           THCState *state,
-          THCudaTensor *gradWeight,
-          THCudaTensor *gradBias,
-          THCudaTensor *lastInput) {
-  THCudaTensor_zero(state, gradWeight);
-  THCudaTensor_zero(state, gradBias);
+          THCudaHalfTensor *gradWeight,
+          THCudaHalfTensor *gradBias,
+          THCudaHalfTensor *lastInput) {
+  THError("THCudaHalfTensor not supported with SparseLinear");
 }
 
-TH_API void THNN_CudaSparseLinear_updateParameters(
+void THNN_CudaHalfSparseLinear_updateParameters(
           THCState *state,
-          THCudaTensor *weight,
-          THCudaTensor *bias,
-          THCudaTensor *gradWeight,
-          THCudaTensor *gradBias,
-          THCudaTensor *lastInput,
+          THCudaHalfTensor *weight,
+          THCudaHalfTensor *bias,
+          THCudaHalfTensor *gradWeight,
+          THCudaHalfTensor *gradBias,
+          THCudaHalfTensor *lastInput,
           double learningRate) {
-  THCudaTensor_cadd(state, weight, weight, -learningRate, gradWeight);
-  THCudaTensor_cadd(state, bias, bias, -learningRate, gradBias);
+  THError("THCudaHalfTensor not supported with SparseLinear");
 }
 
-void THNN_CudaSparseLinear_cudaClearState(THCState *state) {
-}
+#include "generic/SparseLinear.cu"
+#include "THCGenerateFloatType.h"
+#include "generic/SparseLinear.cu"
+#include "THCGenerateDoubleType.h"
diff --git a/lib/THCUNN/SpatialAdaptiveMaxPooling.cu b/lib/THCUNN/SpatialAdaptiveMaxPooling.cu
index 5dd8659..2957953 100644
--- a/lib/THCUNN/SpatialAdaptiveMaxPooling.cu
+++ b/lib/THCUNN/SpatialAdaptiveMaxPooling.cu
@@ -1,5 +1,7 @@
 #include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
 
 #define CUDA_MAX_THREADS 1024   // this is safe, in reality 256 is our limit
 
@@ -8,7 +10,8 @@
  *    this function adaptively maxpools an input 4D tensor along dimensions 2 and 3
  *    4D input, 4D output, 4D argmax x and y
  */
-__global__ void adaptivemaxpool(float *input, float *output, float *indices_x, float *indices_y,
+ template <typename T>
+__global__ void adaptivemaxpool(T *input, T *output, THCIndex_t *indices_x, THCIndex_t *indices_y,
                         int input_n, int input_h, int input_w,
                         int output_h, int output_w,
                         int strideh, int stridew,
@@ -29,7 +32,6 @@ __global__ void adaptivemaxpool(float *input, float *output, float *indices_x, f
   int yy_start = blockDim.y*blockIdx.y + threadIdx.y;
   int yy_end = output_h;
   const int yy_step = blockDim.y*gridDim.y;
-
   // select input/output plane
   output = output + o*output_w*output_h;
   input = input + i*strided;
@@ -50,17 +52,17 @@ __global__ void adaptivemaxpool(float *input, float *output, float *indices_x, f
       int kW = x_end-x_start;
 
       // Compute the mean of the input image...
-      float *ptr_input = input + y_start*strideh + x_start*stridew;
-      float *ptr_output = output + yy*output_w + xx;
-      float *ptr_ind_x = indices_x + yy*output_w + xx;
-      float *ptr_ind_y = indices_y + yy*output_w + xx;
+      T *ptr_input = input + y_start*strideh + x_start*stridew;
+      T *ptr_output = output + yy*output_w + xx;
+      THCIndex_t *ptr_ind_x = indices_x + yy*output_w + xx;
+      THCIndex_t *ptr_ind_y = indices_y + yy*output_w + xx;
       int argmax_x = -1;
       int argmax_y = -1;
-      float max = -FLT_MAX;
+      T max = THCNumerics<T>::min();
       int kx, ky;
       for(ky = 0; ky < kH; ky++) {
         for(kx = 0; kx < kW; kx++) {
-          float val = ptr_input[kx*stridew];
+          T val = ptr_input[kx*stridew];
           if (val > max) {
             max = val;
             argmax_x = kx;
@@ -81,7 +83,8 @@ __global__ void adaptivemaxpool(float *input, float *output, float *indices_x, f
  * Description:
  *    this function computes the gradInput from weight and gradOutput
  */
-__global__ void adaptivemaxgradinput(float *gradInput, float *gradOutput, float *indices_x, float *indices_y,
+ template <typename T>
+__global__ void adaptivemaxgradinput(T *gradInput, T *gradOutput, THCIndex_t *indices_x, THCIndex_t *indices_y,
                              int input_n, int input_h, int input_w,
                              int output_h, int output_w)
 {
@@ -116,11 +119,11 @@ __global__ void adaptivemaxgradinput(float *gradInput, float *gradOutput, float
 
       int x_start = (int)floor(float(xx) / output_w * input_w);
 
-      float *ptr_gradInput = gradInput + y_start*input_w + x_start;
-      float *ptr_gradOutput = gradOutput + yy*output_w + xx;
-      float *ptr_ind_x = indices_x + yy*output_w + xx;
-      float *ptr_ind_y = indices_y + yy*output_w + xx;
-      float z = *ptr_gradOutput;
+      T *ptr_gradInput = gradInput + y_start*input_w + x_start;
+      T *ptr_gradOutput = gradOutput + yy*output_w + xx;
+      THCIndex_t *ptr_ind_x = indices_x + yy*output_w + xx;
+      THCIndex_t *ptr_ind_y = indices_y + yy*output_w + xx;
+      T z = *ptr_gradOutput;
 
       int argmax_x = (*ptr_ind_x) - TH_INDEX_BASE;
       int argmax_y = (*ptr_ind_y) - TH_INDEX_BASE;
@@ -135,8 +138,9 @@ __global__ void adaptivemaxgradinput(float *gradInput, float *gradOutput, float
  *    this function computes the gradInput from weight and gradOutput
  *    when kH != dH or kW != dW (uses atomic add)
  */
+ template <typename T>
 __global__ void atomicadaptivemaxgradinput(
-  float *gradInput, float *gradOutput, float *indices_x, float *indices_y,
+  T *gradInput, T *gradOutput, THCIndex_t *indices_x, THCIndex_t *indices_y,
   int input_n, int input_h, int input_w, int output_h, int output_w
 )
 {
@@ -170,11 +174,11 @@ __global__ void atomicadaptivemaxgradinput(
 
       int x_start = (int)floor(float(xx) / output_w * input_w);
 
-      float *ptr_gradInput = gradInput + y_start*input_w + x_start;
-      float *ptr_gradOutput = gradOutput + yy*output_w + xx;
-      float *ptr_ind_x = indices_x + yy*output_w + xx;
-      float *ptr_ind_y = indices_y + yy*output_w + xx;
-      float z = *ptr_gradOutput;
+      T *ptr_gradInput = gradInput + y_start*input_w + x_start;
+      T *ptr_gradOutput = gradOutput + yy*output_w + xx;
+      THCIndex_t *ptr_ind_x = indices_x + yy*output_w + xx;
+      THCIndex_t *ptr_ind_y = indices_y + yy*output_w + xx;
+      T z = *ptr_gradOutput;
 
       int argmax_x = (*ptr_ind_x) - TH_INDEX_BASE;
       int argmax_y = (*ptr_ind_y) - TH_INDEX_BASE;
@@ -185,174 +189,7 @@ __global__ void atomicadaptivemaxgradinput(
   }
 }
 
-void THNN_CudaSpatialAdaptiveMaxPooling_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices, int nOutputCols, int nOutputRows)
-{
-  THCUNN_assertSameGPU(state, 3, input, output, indices);
-
-  float *indices_data;
-  float *output_data;
-  float *input_data;
-
-  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
-
-  if (input->nDimension == 3) {
-    long nInputCols = input->size[2];
-    long nInputRows = input->size[1];
-    long nInputPlane = input->size[0];
-
-    long istride_d = input->stride[0];
-    long istride_h = input->stride[1];
-    long istride_w = input->stride[2];
-
-    input_data = THCudaTensor_data(state, input);
-
-    THCudaTensor_resize3d(state, output, nInputPlane, nOutputRows, nOutputCols);
-    THCudaTensor_resize4d(state, indices, 2, nInputPlane, nOutputRows, nOutputCols);
-
-    indices_data = THCudaTensor_data(state, indices);
-    output_data = THCudaTensor_data(state, output);
-
-    // cuda blocks & threads:
-    int yblocks = (int)(16L / nInputPlane);
-    yblocks = yblocks < 1 ? 1 : yblocks;
-    dim3 blocks(nInputPlane,yblocks);
-    dim3 threads(32,8);
-
-    // run maxpool kernel
-    adaptivemaxpool <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (input_data, output_data,
-                                   indices_data+nInputPlane*nOutputCols*nOutputRows, indices_data,
-                                   nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
-                                   istride_h, istride_w, istride_d);
-    THCudaCheck(cudaGetLastError());
-
-  } else {
-    long nInputCols = input->size[3];
-    long nInputRows = input->size[2];
-    long nInputPlane = input->size[1];
-    long nbatch = input->size[0];
-
-    long istride_d = input->stride[1];
-    long istride_h = input->stride[2];
-    long istride_w = input->stride[3];
-
-    input = THCudaTensor_newContiguous(state, input);
-    input_data = THCudaTensor_data(state, input);
-
-    THCudaTensor_resize4d(state, output, nbatch, nInputPlane, nOutputRows, nOutputCols);
-    THCudaTensor_resize5d(state, indices, 2, nbatch, nInputPlane, nOutputRows, nOutputCols);
-
-    indices_data = THCudaTensor_data(state, indices);
-    output_data = THCudaTensor_data(state, output);
-
-    // cuda blocks & threads:
-    int yblocks = (int)(16L / nInputPlane);
-    yblocks = yblocks < 1 ? 1 : yblocks;
-    dim3 blocks(nInputPlane*nbatch,yblocks);
-    dim3 threads(32,8);
-
-    // run maxpool kernel
-    adaptivemaxpool <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (input_data, output_data,
-                                   indices_data+nbatch*nInputPlane*nOutputCols*nOutputRows, indices_data,
-                                   nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
-                                   istride_h, istride_w, istride_d);
-    THCudaCheck(cudaGetLastError());
-    // clean
-    THCudaTensor_free(state, input);
-  }
-}
-
-void THNN_CudaSpatialAdaptiveMaxPooling_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *indices)
-{
-  bool atomic = true; // suboptimal, but without atomic it doesn't pass the tests
-
-  THCUNN_assertSameGPU(state, 4, input, indices, gradOutput, gradInput);
-
-  float *indices_data;
-  float *gradInput_data;
-  float *gradOutput_data;
-
-  gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-
-  if (input->nDimension == 3) {
-    long nInputCols = input->size[2];
-    long nInputRows = input->size[1];
-    long nInputPlane = input->size[0];
-    long nOutputCols = gradOutput->size[2];
-    long nOutputRows = gradOutput->size[1];
-
-    //bool atomic = (nInputCols%nOutputCols != 0) || (nInputRows%nOutputRows != 0);
-
-    THCudaTensor_resizeAs(state, gradInput, input);
-    THCudaTensor_zero(state, gradInput);
-
-    indices_data = THCudaTensor_data(state, indices);
-    gradOutput_data = THCudaTensor_data(state, gradOutput);
-    gradInput_data = THCudaTensor_data(state, gradInput);
-
-    // cuda blocks & threads:
-    int yblocks = (int)(16L / nInputPlane);
-    yblocks = yblocks < 1 ? 1 : yblocks;
-    dim3 blocks(nInputPlane,yblocks);
-    dim3 threads(32,8);
-
-    if(atomic)
-    {
-      // run updateGradInput kernel, accumulate gradients atomically
-      atomicadaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data,
-                                          indices_data+nInputPlane*nOutputCols*nOutputRows, indices_data,
-                                          nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
-    }
-    else
-    {
-      // run updateGradInput kernel
-      atomicadaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data,
-                                          indices_data+nInputPlane*nOutputCols*nOutputRows, indices_data,
-                                          nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
-    }
-    THCudaCheck(cudaGetLastError());
-  } else {
-    long nInputCols = input->size[3];
-    long nInputRows = input->size[2];
-    long nInputPlane = input->size[1];
-    long nbatch = input->size[0];
-    long nOutputCols = gradOutput->size[3];
-    long nOutputRows = gradOutput->size[2];
-
-    //bool atomic = //(nInputCols%nOutputCols != 0) || (nInputRows%nOutputRows != 0);
-
-    THCudaTensor_resizeAs(state, gradInput, input);
-    THCudaTensor_zero(state, gradInput);
-
-    indices_data = THCudaTensor_data(state, indices);
-    gradOutput_data = THCudaTensor_data(state, gradOutput);
-    gradInput_data = THCudaTensor_data(state, gradInput);
-
-    // cuda blocks & threads:
-    int yblocks = (int)(16L / nInputPlane);
-    yblocks = yblocks < 1 ? 1 : yblocks;
-    dim3 blocks(nInputPlane*nbatch,yblocks);
-    dim3 threads(32,8);
-
-    if(atomic)
-    {
-      // run updateGradInput kernel, accumulate gradients atomically
-      atomicadaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data,
-                                          indices_data+nbatch*nInputPlane*nOutputCols*nOutputRows, indices_data,
-                                          nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
-    }
-    else
-    {
-      // run updateGradInput kernel, accumulate gradients atomically
-      adaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data,
-                                          indices_data+nbatch*nInputPlane*nOutputCols*nOutputRows, indices_data,
-                                          nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
-    }
-    THCudaCheck(cudaGetLastError());
-  }
-
-  // clean
-  THCudaTensor_free(state,gradOutput);
-
-}
+#include "generic/SpatialAdaptiveMaxPooling.cu"
+#include "THCGenerateFloatTypes.h"
 
 #undef CUDA_MAX_THREADS
diff --git a/lib/THCUNN/SpatialAveragePooling.cu b/lib/THCUNN/SpatialAveragePooling.cu
index 65060c9..5f77e06 100644
--- a/lib/THCUNN/SpatialAveragePooling.cu
+++ b/lib/THCUNN/SpatialAveragePooling.cu
@@ -1,7 +1,9 @@
 #include "THCUNN.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 #include "common.h"
 
-template <typename Dtype, bool COUNT_INCLUDE_PAD>
+template <typename Dtype, typename Acctype, bool COUNT_INCLUDE_PAD>
 __global__ void AvePoolForward(const int nthreads,
     const Dtype* const bottom_data, const int num, const int channels,
     const int height, const int width, const int pooled_height,
@@ -22,7 +24,7 @@ __global__ void AvePoolForward(const int nthreads,
     wstart = max(wstart, 0);
     hend = min(hend, height);
     wend = min(wend, width);
-    Dtype aveval = 0;
+    Acctype aveval = Acctype(0);
     const Dtype* const bottom_slice = bottom_data + (n * channels + c) * height * width;
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
@@ -30,87 +32,13 @@ __global__ void AvePoolForward(const int nthreads,
       }
     }
     if(COUNT_INCLUDE_PAD)
-      top_data[index] = aveval / pool_size;
+      top_data[index] = ScalarConvert<Acctype, Dtype>::to(aveval / pool_size);
     else
-      top_data[index] = aveval / ((hend - hstart) * (wend - wstart));
+      top_data[index] = ScalarConvert<Acctype, Dtype>::to(aveval / ((hend - hstart) * (wend - wstart)));
   }
 }
 
-
-void THNN_CudaSpatialAveragePooling_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode, bool count_include_pad)
-{
-  THCUNN_assertSameGPU(state, 2, input, output);
-  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
-
-  long nInputCols, nInputRows, nInputPlane, batchSize;
-  long nOutputCols, nOutputRows;
-
-  if (input->nDimension == 3) {
-    nInputCols = input->size[2];
-    nInputRows = input->size[1];
-    nInputPlane = input->size[0];
-    batchSize = 1;
-  }
-  else
-  {
-    nInputCols = input->size[3];
-    nInputRows = input->size[2];
-    nInputPlane = input->size[1];
-    batchSize = input->size[0];
-  }
-
-  THArgCheck(nInputCols >= kW - 2*padW && nInputRows >= kH - 2*padH, 2, "input image smaller than kernel size");
-  THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");
-
-  if(ceil_mode) {
-    nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
-    nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
-  }
-  else {
-    nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
-    nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
-  }
-  if (padW || padH)
-  {
-    // ensure that the last pooling starts inside the image
-    // needed to avoid problems in ceil mode
-    if ((nOutputRows - 1)*dH >= nInputRows + padH)
-      --nOutputRows;
-    if ((nOutputCols  - 1)*dW >= nInputCols  + padW)
-      --nOutputCols;
-  }
-
-  input = THCudaTensor_newContiguous(state, input);
-  float* input_data = THCudaTensor_data(state, input);
-
-  THCudaTensor_resize4d(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols);
-
-  float* output_data = THCudaTensor_data(state, output);
-
-  int count = THCudaTensor_nElement(state, output);
-
-  if(count_include_pad)
-    AvePoolForward<float, true>
-      <<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>(
-        count, input_data,
-        batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
-        kH, kW, dH, dW, padH, padW, output_data);
-  else
-    AvePoolForward<float, false>
-      <<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>(
-        count, input_data,
-        batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
-        kH, kW, dH, dW, padH, padW, output_data);
-  THCudaCheck(cudaGetLastError());
-
-  if(input->nDimension == 3)
-    THCudaTensor_resize3d(state, output, nInputPlane, nOutputRows, nOutputCols);
-
-  THCudaTensor_free(state, input);
-
-}
-
-template <typename Dtype, bool COUNT_INCLUDE_PAD>
+template <typename Dtype, typename Acctype, bool COUNT_INCLUDE_PAD>
 __global__ void AvePoolBackward(const int nthreads, const Dtype* const top_diff,
     const int num, const int channels, const int height,
     const int width, const int pooled_height, const int pooled_width,
@@ -128,7 +56,7 @@ __global__ void AvePoolBackward(const int nthreads, const Dtype* const top_diff,
     const int phend = min(h / stride_h + 1, pooled_height);
     const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
     const int pwend = min(w / stride_w + 1, pooled_width);
-    Dtype gradient = 0;
+    Acctype gradient = Acctype(0);
     const Dtype* const top_diff_slice =
         top_diff + (n * channels + c) * pooled_height * pooled_width;
     for (int ph = phstart; ph < phend; ++ph) {
@@ -149,76 +77,9 @@ __global__ void AvePoolBackward(const int nthreads, const Dtype* const top_diff,
           gradient += top_diff_slice[ph * pooled_width + pw] / ((hend - hstart) * (wend - wstart));
       }
     }
-    bottom_diff[index] = gradient;
-  }
-}
-
-void THNN_CudaSpatialAveragePooling_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode, bool count_include_pad)
-{
-  THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
-
-  input = THCudaTensor_newContiguous(state, input);
-  gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-
-  long nInputCols, nInputRows, nInputPlane, batchSize;
-  long nOutputCols, nOutputRows;
-
-  if (input->nDimension == 3) {
-    nInputCols = input->size[2];
-    nInputRows = input->size[1];
-    nInputPlane = input->size[0];
-    batchSize = 1;
-  }
-  else
-  {
-    nInputCols = input->size[3];
-    nInputRows = input->size[2];
-    nInputPlane = input->size[1];
-    batchSize = input->size[0];
+    bottom_diff[index] = ScalarConvert<Acctype, Dtype>::to(gradient);
   }
-
-  if(ceil_mode) {
-    nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
-    nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
-  }
-  else {
-    nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
-    nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
-  }
-  if (padW || padH)
-  {
-    // ensure that the last pooling starts inside the image
-    // needed to avoid problems in ceil mode
-    if ((nOutputRows - 1)*dH >= nInputRows + padH)
-      --nOutputRows;
-    if ((nOutputCols  - 1)*dW >= nInputCols  + padW)
-      --nOutputCols;
-  }
-
-  THCudaTensor_resizeAs(state, gradInput, input);
-
-  int count = THCudaTensor_nElement(state, input);
-
-  if(count_include_pad)
-    AvePoolBackward<float, true>
-      <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
-        (count,
-        THCudaTensor_data(state, gradOutput),
-        batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
-        kH, kW, dH, dW, padH, padW,
-        THCudaTensor_data(state, gradInput));
-  else
-    AvePoolBackward<float, false>
-      <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
-        (count,
-        THCudaTensor_data(state, gradOutput),
-        batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
-        kH, kW, dH, dW, padH, padW,
-        THCudaTensor_data(state, gradInput));
-  THCudaCheck(cudaGetLastError());
-
-  // clean
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, gradOutput);
 }
 
+#include "generic/SpatialAveragePooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialClassNLLCriterion.cu b/lib/THCUNN/SpatialClassNLLCriterion.cu
index 0469520..4b2f523 100644
--- a/lib/THCUNN/SpatialClassNLLCriterion.cu
+++ b/lib/THCUNN/SpatialClassNLLCriterion.cu
@@ -1,29 +1,30 @@
 #include "THCUNN.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
 #include "common.h"
 
-#include <stdio.h>
-#include <assert.h>
-
 #include <thrust/functional.h>
 
+template <typename T, typename AccumT>
 __global__ void cunn_SpatialClassNLLCriterion_updateOutput_kernel(
-          float *output,
-          float *total_weight,
-          float *input,
-          long *target,
-          float *weights,
+          T *output,
+          T *total_weight,
+          T *input,
+          THCIndex_t *target,
+          T *weights,
           int size_average,
           int batch_size,
           int n_classes,
           int map_nelem,
           int blocks_per_sample)
 {
-  __shared__ float partial_sums[CUDA_NUM_THREADS];
+  __shared__ AccumT partial_sums[CUDA_NUM_THREADS];
 
   int i, t;
-  float cur_weight;
-  float input_sum = 0;
-  float acc_weight = 0;
+  T cur_weight;
+  AccumT input_sum = 0;
+  AccumT acc_weight = 0;
 
   int sample = blockIdx.x / blocks_per_sample;
   int toffset = sample * map_nelem;
@@ -34,30 +35,31 @@ __global__ void cunn_SpatialClassNLLCriterion_updateOutput_kernel(
        i += step) {
     t = target[toffset + i] - TH_INDEX_BASE;
     assert(t >= 0 && t < n_classes);
-    cur_weight = weights ? weights[t] : 1.0f;
+    cur_weight = weights ? weights[t] : ScalarConvert<int, T>::to(1);
     input_sum -= input[ioffset + i + map_nelem * t] * cur_weight;
     acc_weight += cur_weight;
   }
 
   __syncthreads();
 
-  input_sum = reduceBlock(partial_sums, blockDim.x, input_sum, thrust::plus<float>(), 0.0f);
-  acc_weight = reduceBlock(partial_sums, blockDim.x, acc_weight, thrust::plus<float>(), 0.0f);
+  input_sum = reduceBlock(partial_sums, blockDim.x, input_sum, thrust::plus<AccumT>(), AccumT(0));
+  acc_weight = reduceBlock(partial_sums, blockDim.x, acc_weight, thrust::plus<AccumT>(), AccumT(0));
 
   if (threadIdx.x == 0) {
-    atomicAdd(total_weight, acc_weight);
+    atomicAdd(total_weight, ScalarConvert<AccumT, T>::to(acc_weight));
     if (size_average && acc_weight > 0)
-      atomicAdd(output, input_sum / acc_weight / gridDim.x);
+      atomicAdd(output, ScalarConvert<AccumT, T>::to(input_sum / acc_weight / gridDim.x));
     else
-      atomicAdd(output, input_sum);
+      atomicAdd(output, ScalarConvert<AccumT, T>::to(input_sum));
   }
 }
 
+template<typename T>
 __global__ void cunn_SpatialClassNLLCriterion_updateGradInput_kernel(
-          float *gradInput,
-          long *target,
-          float *weights,
-          float *total_weight,
+          T *gradInput,
+          THCIndex_t *target,
+          T *weights,
+          T *total_weight,
           int size_average,
           int batch_size,
           int n_classes,
@@ -68,7 +70,7 @@ __global__ void cunn_SpatialClassNLLCriterion_updateGradInput_kernel(
     return;
 
   int i, t;
-  float norm = size_average ? (1.0f / *total_weight) : 1.0f;
+  T norm = size_average ? (ScalarConvert<int, T>::to(1) / *total_weight) : ScalarConvert<int, T>::to(1);
 
   int sample = blockIdx.x / blocks_per_sample;
   int step = blockDim.x * blocks_per_sample;
@@ -79,127 +81,9 @@ __global__ void cunn_SpatialClassNLLCriterion_updateGradInput_kernel(
        i += step) {
     t = (int)target[toffset + i] - TH_INDEX_BASE;
     assert(t >= 0 && t < n_classes);
-    gradInput[ioffset + i + map_nelem * t] = -(weights ? weights[t] : 1.0f) * norm;
+    gradInput[ioffset + i + map_nelem * t] = -(weights ? weights[t] : ScalarConvert<int, T>::to(1)) * norm;
   }
 }
 
-void THNN_CudaSpatialClassNLLCriterion_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaLongTensor *target,
-          THCudaTensor *output,
-          bool sizeAverage,
-          THCudaTensor *weights,
-          THCudaTensor *total_weight)
-{
-  THArgCheck(THCudaLongTensor_nDimension(state, target) == 3, 1,
-               "only batches of spatial targets supported (3D tensors)");
-  THArgCheck(THCudaTensor_nDimension(state, input) == 4, 2,
-               "only batches of spatial inputs supported (4D tensors)");
-  if (weights && THCudaTensor_nElement(state, weights) != THCudaTensor_size(state, input, 1)) {
-    THError("weight tensor should be defined either for all or no classes");
-  }
-
-  if (weights)
-    THCUNN_assertSameGPU(state, 5, input, target, weights, output, total_weight);
-  else
-    THCUNN_assertSameGPU(state, 4, input, target, output, total_weight);
-
-  input = THCudaTensor_newContiguous(state, input);
-  weights = weights ? THCudaTensor_newContiguous(state, weights) : NULL;
-  target = THCudaLongTensor_newContiguous(state, target);
-
-  float *input_data = THCudaTensor_data(state, input);
-  float *weights_data = weights ? THCudaTensor_data(state, weights) : NULL;
-  long  *target_data = THCudaLongTensor_data(state, target);
-  float *output_data = THCudaTensor_data(state, output);
-  float *total_weight_data = THCudaTensor_data(state, total_weight);
-
-  long batch_size = THCudaLongTensor_size(state, target, 0);
-  long map_nelem = THCudaLongTensor_nElement(state, target) / batch_size;
-  int blocks_per_sample = GET_BLOCKS(map_nelem) / 128;
-  blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample;
-  int total_blocks = blocks_per_sample * batch_size;
-
-  THCudaTensor_fill(state, output, 0);
-  THCudaTensor_fill(state, total_weight, 0);
-
-  cunn_SpatialClassNLLCriterion_updateOutput_kernel
-    <<<total_blocks, CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
-      output_data,
-      total_weight_data,
-      input_data,
-      target_data,
-      weights_data,
-      sizeAverage,
-      THCudaTensor_size(state, input, 0),
-      THCudaTensor_size(state, input, 1),
-      THCudaTensor_size(state, input, 2) * THCudaTensor_size(state, input, 3),
-      blocks_per_sample
-  );
-  THCudaCheck(cudaGetLastError());
-
-  if (weights)
-    THCudaTensor_free(state, weights);
-  THCudaLongTensor_free(state, target);
-  THCudaTensor_free(state, input);
-}
-
-void THNN_CudaSpatialClassNLLCriterion_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaLongTensor *target,
-          THCudaTensor *gradInput,
-          bool sizeAverage,
-          THCudaTensor *weights,
-          THCudaTensor *total_weight)
-{
-  THArgCheck(THCudaLongTensor_nDimension(state, target) == 3, 1,
-               "only batches of spatial targets supported (3D tensors)");
-  THArgCheck(THCudaTensor_nDimension(state, input) == 4, 2,
-               "only batches of spatial inputs supported (4D tensors)");
-  THArgCheck(THCudaTensor_isContiguous(state, gradInput), 4,
-               "gradInput must be contiguous");
-  if (weights && THCudaTensor_nElement(state, weights) != THCudaTensor_size(state, input, 1)) {
-    THError("weight tensor should be defined either for all or no classes");
-  }
-
-  if (weights)
-    THCUNN_assertSameGPU(state, 5, weights, input, target, gradInput, total_weight);
-  else
-    THCUNN_assertSameGPU(state, 4, input, target, gradInput, total_weight);
-
-  input = THCudaTensor_newContiguous(state, input);
-  weights = weights ? THCudaTensor_newContiguous(state, weights) : NULL;
-  target = THCudaLongTensor_newContiguous(state, target);
-
-  float *weights_data = weights ? THCudaTensor_data(state, weights) : NULL;
-  float *gradInput_data = THCudaTensor_data(state, gradInput);
-  long *target_data = THCudaLongTensor_data(state, target);
-  float *total_weight_data = THCudaTensor_data(state, total_weight);
-
-  long batch_size = THCudaLongTensor_size(state, target, 0);
-  long map_nelem = THCudaLongTensor_nElement(state, target) / batch_size;
-  int blocks_per_sample = GET_BLOCKS(map_nelem) / 128;
-  blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample;
-  int total_blocks = blocks_per_sample * batch_size;
-
-  cunn_SpatialClassNLLCriterion_updateGradInput_kernel
-    <<<total_blocks, CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
-      gradInput_data,
-      target_data,
-      weights_data,
-      total_weight_data,
-      sizeAverage,
-      THCudaTensor_size(state, input, 0),
-      THCudaTensor_size(state, input, 1),
-      THCudaTensor_size(state, input, 2) *THCudaTensor_size(state, input, 3),
-      blocks_per_sample
-  );
-  THCudaCheck(cudaGetLastError());
-
-  if (weights)
-    THCudaTensor_free(state, weights);
-  THCudaLongTensor_free(state, target);
-  THCudaTensor_free(state, input);
-}
+#include "generic/SpatialClassNLLCriterion.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialConvolutionLocal.cu b/lib/THCUNN/SpatialConvolutionLocal.cu
index 7f758ef..e5b1f98 100644
--- a/lib/THCUNN/SpatialConvolutionLocal.cu
+++ b/lib/THCUNN/SpatialConvolutionLocal.cu
@@ -2,321 +2,8 @@
 #include "common.h"
 #include "im2col.h"
 
-void THNN_CudaSpatialConvolutionLocal_updateOutput(
-    THCState *state,
-    THCudaTensor *input,
-    THCudaTensor *output,
-    THCudaTensor *weight,
-    THCudaTensor *bias,
-    THCudaTensor *finput,
-    THCudaTensor *fgradInput,
-    int kW, int kH,
-    int dW, int dH,
-    int padW, int padH,
-    long inputWidth, long inputHeight,
-    long outputWidth, long outputHeight)
-{
-  THCUNN_assertSameGPU(state, 5, input, output, weight,
-                                 bias, finput);
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
-  long nInputPlane = THCudaTensor_size(state,weight,2)/(kW*kH);
-  long nOutputPlane = THCudaTensor_size(state,weight,1);
-
-  int batch = 1;
-  if (input->nDimension == 3) {
-    // Force batch
-    batch = 0;
-    THCudaTensor_resize4d(state, input, 1, nInputPlane, inputHeight, inputWidth);
-  }
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
-
-  // Resize output
-  THCudaTensor_resize4d(state, output, batchSize, nOutputPlane, outputHeight, outputWidth);
-
-  // Augment the input
-  THCudaTensor_resize3d(state, finput, batchSize, nInputPlane*kW*kH, outputHeight*outputWidth);
-
-  // Helpers
-  THCudaTensor *input_n = THCudaTensor_new(state);
-  THCudaTensor *finput_n = THCudaTensor_new(state);
-  THCudaTensor *output_n = THCudaTensor_new(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    THCudaTensor *finput3d, *output3d;
-    THCudaTensor *wslice = THCudaTensor_new(state);
-    THCudaTensor *islice = THCudaTensor_new(state);
-    THCudaTensor *oslice = THCudaTensor_new(state);
-
-    // Matrix mulitply per output:
-    THCudaTensor_select(state, input_n, input, 0, elt);
-    THCudaTensor_select(state, finput_n, finput, 0, elt);
-    THCudaTensor_select(state, output_n, output, 0, elt);
-
-    // Extract columns:
-    im2col(
-      THCState_getCurrentStream(state),
-      THCudaTensor_data(state, input_n),
-      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
-      1, 1, THCudaTensor_data(state, finput_n)
-    );
-
-    output3d = THCudaTensor_newWithStorage3d(state, output_n->storage, output_n->storageOffset,
-                                             outputHeight*outputWidth, 1,
-                                             nOutputPlane, outputHeight*outputWidth,
-                                             1, nOutputPlane*outputHeight*outputWidth);
-
-    finput3d = THCudaTensor_newWithStorage3d(state, finput_n->storage, finput_n->storageOffset,
-                                             outputHeight*outputWidth, 1,
-                                             kW*kH*nInputPlane, outputHeight*outputWidth,
-                                             1, kW*kH*nInputPlane*outputHeight*outputWidth);
-
-    THCudaTensor_copy(state, output_n, bias);
-
-    for (int i = 0; i < outputHeight; i++) {
-      for(int j = 0; j < outputWidth; j++) {
-        int sliceidx = i * outputWidth + j;
-        THCudaTensor_select(state, wslice, weight, 0, sliceidx);
-        THCudaTensor_select(state, islice, finput3d, 0, sliceidx);
-        THCudaTensor_select(state, oslice, output3d, 0, sliceidx);
-        THCudaTensor_addmm(state, oslice, 1.0, oslice, 1.0, wslice, islice);
-      }
-    }
-
-
-    // weight:    oH*oW x nOutputPlane x nInputPlane*kH*kW
-    // finput3d:  oH*oW x nInputPlane*kH*kW x 1
-    // THCudaTensor_baddbmm(state, output3d, 1.0, output3d, 1.0, weight, finput3d);
-    // output3d:  oH*oW x nOutputPlane x 1
-
-    THCudaTensor_free(state, output3d);
-    THCudaTensor_free(state, finput3d);
-    THCudaTensor_free(state, wslice);
-    THCudaTensor_free(state, islice);
-    THCudaTensor_free(state, oslice);
-  }
-
-  // Free
-  THCudaTensor_free(state, input_n);
-  THCudaTensor_free(state, finput_n);
-  THCudaTensor_free(state, output_n);
-
-  // Resize output
-  if (batch == 0) {
-    THCudaTensor_resize3d(state, output, nOutputPlane, outputHeight, outputWidth);
-    THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
-  }
-}
-
-void THNN_CudaSpatialConvolutionLocal_updateGradInput(
-    THCState *state,
-    THCudaTensor *input,
-    THCudaTensor *gradOutput,
-    THCudaTensor *gradInput,
-    THCudaTensor *weight,
-    THCudaTensor *finput,
-    THCudaTensor *fgradInput,
-    int kW, int kH,
-    int dW, int dH,
-    int padW, int padH,
-    long inputWidth, long inputHeight,
-    long outputWidth, long outputHeight)
-{
-  THCUNN_assertSameGPU(state, 5, input, gradOutput, weight,
-                                 fgradInput, gradInput);
-
-  long nInputPlane = THCudaTensor_size(state,weight,2)/(kW*kH);
-  long nOutputPlane = THCudaTensor_size(state,weight,1);
-
-  int batch = 1;
-  if (input->nDimension == 3) {
-    // Force batch
-    batch = 0;
-    THCudaTensor_resize4d(state, input, 1, nInputPlane, inputHeight, inputWidth);
-    THCudaTensor_resize4d(state, gradOutput, 1, nOutputPlane, outputHeight, outputWidth);
-  }
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
-
-  // Resize output
-  THCudaTensor_resize4d(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
-
-  // Resize temporary columns
-  THCudaTensor_resize3d(state, fgradInput, batchSize, nInputPlane*kW*kH, outputHeight*outputWidth);
-
-  // Helpers
-  THCudaTensor *gradInput_n = THCudaTensor_new(state);
-  THCudaTensor *fgradInput_n = THCudaTensor_new(state);
-  THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
-  THCudaTensor_transpose(state, weight, weight, 1, 2);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    THCudaTensor *gradOutput3d, *fgradInput3d;
-    THCudaTensor *wslice = THCudaTensor_new(state);
-    THCudaTensor *gislice = THCudaTensor_new(state);
-    THCudaTensor *goslice = THCudaTensor_new(state);
-
-    // Matrix mulitply per sample:
-    THCudaTensor_select(state, gradInput_n, gradInput, 0, elt);
-    THCudaTensor_select(state, fgradInput_n, fgradInput, 0, elt);
-    THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
-    gradOutput3d = THCudaTensor_newWithStorage3d(state, gradOutput_n->storage, gradOutput_n->storageOffset,
-                                               outputHeight*outputWidth, 1,
-                                               nOutputPlane, outputHeight*outputWidth,
-                                               1, nOutputPlane*outputHeight*outputWidth);
-    fgradInput3d = THCudaTensor_newWithStorage3d(state, fgradInput_n->storage, fgradInput_n->storageOffset,
-                                               outputHeight*outputWidth, 1,
-                                               kW*kH*nInputPlane, outputHeight*outputWidth,
-                                               1, kW*kH*nInputPlane*outputHeight*outputWidth);
-
-    for (int i = 0; i < outputHeight; i++) {
-      for(int j = 0; j < outputWidth; j++) {
-        int sliceidx = i * outputWidth + j;
-        THCudaTensor_select(state, wslice, weight, 0, sliceidx);
-        THCudaTensor_select(state, gislice, fgradInput3d, 0, sliceidx);
-        THCudaTensor_select(state, goslice, gradOutput3d, 0, sliceidx);
-        THCudaTensor_addmm(state, gislice, 0.0, gislice, 1.0, wslice, goslice);
-      }
-    }
-
-    // weight:        oH*oW x nInputPlane*kH*kW x nOutputPlane
-    // gradOutput3d:  oH*oW x nOutputPlane x 1
-    //THCudaTensor_baddbmm(state, fgradInput3d, 0.0, fgradInput3d, 1.0, weight, gradOutput3d);
-    // fgradInput3d:  oH*oW x nInputPlane*kH*kW x 1
-
-    // Unpack columns back into input:
-    col2im(
-      THCState_getCurrentStream(state),
-      THCudaTensor_data(state, fgradInput_n),
-      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
-      1, 1, THCudaTensor_data(state, gradInput_n)
-    );
-
-    THCudaTensor_free(state, gradOutput3d);
-    THCudaTensor_free(state, fgradInput3d);
-    THCudaTensor_free(state, wslice);
-    THCudaTensor_free(state, gislice);
-    THCudaTensor_free(state, goslice);
-  }
-
-  // Free
-  THCudaTensor_free(state, gradInput_n);
-  THCudaTensor_free(state, fgradInput_n);
-  THCudaTensor_free(state, gradOutput_n);
-
-  // Resize output
-  if (batch == 0) {
-    THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
-    THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
-    THCudaTensor_resize3d(state, gradInput, nInputPlane, inputHeight, inputWidth);
-  }
-
-  THCudaTensor_transpose(state, weight, weight, 1, 2);
-}
-
-void THNN_CudaSpatialConvolutionLocal_accGradParameters(
-    THCState *state,
-    THCudaTensor *input,
-    THCudaTensor *gradOutput,
-    THCudaTensor *gradWeight,
-    THCudaTensor *gradBias,
-    THCudaTensor *finput,
-    THCudaTensor *fgradInput,
-    int kW, int kH,
-    int dW, int dH,
-    int padW, int padH,
-    long inputWidth, long inputHeight,
-    long outputWidth, long outputHeight,
-    float scale)
-{
-  THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight,
-                                 gradBias, finput);
-
-  long nInputPlane = THCudaTensor_size(state,gradWeight,2)/(kW*kH);
-  long nOutputPlane = THCudaTensor_size(state,gradWeight,1);
-
-  int batch = 1;
-  if (input->nDimension == 3) {
-    // Force batch
-    batch = 0;
-    THCudaTensor_resize4d(state, input, 1, nInputPlane, inputHeight, inputWidth);
-    THCudaTensor_resize4d(state, gradOutput, 1, nOutputPlane, outputHeight, outputWidth);
-  }
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
-
-  // Helpers
-  THCudaTensor *input_n = THCudaTensor_new(state);
-  THCudaTensor *finput_n = THCudaTensor_new(state);
-  THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    THCudaTensor *gradOutput3d, *finput3d;
-    THCudaTensor *gwslice = THCudaTensor_new(state);
-    THCudaTensor *islice = THCudaTensor_new(state);
-    THCudaTensor *goslice = THCudaTensor_new(state);
-
-    // Matrix mulitply per output:
-    THCudaTensor_select(state, input_n, input, 0, elt);
-    THCudaTensor_select(state, finput_n, finput, 0, elt);
-    THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
-    gradOutput3d = THCudaTensor_newWithStorage3d(state, gradOutput_n->storage, gradOutput_n->storageOffset,
-                                                 outputHeight*outputWidth, 1,
-                                                 nOutputPlane, outputHeight*outputWidth,
-                                                 1, nOutputPlane*outputHeight*outputWidth);
-    finput3d = THCudaTensor_newWithStorage3d(state, finput_n->storage, finput_n->storageOffset,
-                                             outputHeight*outputWidth, 1,
-                                             1, kW*kH*nInputPlane*outputHeight*outputWidth,
-                                             kW*kH*nInputPlane, outputHeight*outputWidth);
-
-    // Extract columns:
-    im2col(
-      THCState_getCurrentStream(state),
-      THCudaTensor_data(state, input_n),
-      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
-      1, 1, THCudaTensor_data(state, finput_n)
-    );
-
-    for (int i = 0; i < outputHeight; i++) {
-      for(int j = 0; j < outputWidth; j++) {
-        int sliceidx = i * outputWidth + j;
-        THCudaTensor_select(state, gwslice, gradWeight, 0, sliceidx);
-        THCudaTensor_select(state, goslice, gradOutput3d, 0, sliceidx);
-        THCudaTensor_select(state, islice, finput3d, 0, sliceidx);
-        THCudaTensor_addmm(state, gwslice, 1.0, gwslice, scale, goslice, islice);
-      }
-    }
-    // gradOutput3d:  oH*oW x nOutputPlane x 1
-    // finput3d:      oH*oW x 1 x kW*kH*nInputPlane
-    //THCudaTensor_baddbmm(state, gradWeight, 1.0, gradWeight, scale, gradOutput3d, finput3d);
-    // gradWeight:    oH*oW x nOutputPlane x kW*kH*nInputPlane
-
-    THCudaTensor_cadd(state, gradBias, gradBias, scale, gradOutput_n);
-
-    THCudaTensor_free(state, gradOutput3d);
-    THCudaTensor_free(state, finput3d);
-    THCudaTensor_free(state, gwslice);
-    THCudaTensor_free(state, goslice);
-    THCudaTensor_free(state, islice);
-  }
-
-  // Free
-  THCudaTensor_free(state, input_n);
-  THCudaTensor_free(state, finput_n);
-  THCudaTensor_free(state, gradOutput_n);
-
-  // Resize
-  if (batch == 0) {
-    THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
-    THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
-  }
-}
+#include "generic/SpatialConvolutionLocal.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialConvolutionMM.cu b/lib/THCUNN/SpatialConvolutionMM.cu
index 4714e9d..2a88047 100644
--- a/lib/THCUNN/SpatialConvolutionMM.cu
+++ b/lib/THCUNN/SpatialConvolutionMM.cu
@@ -2,352 +2,8 @@
 #include "common.h"
 #include "im2col.h"
 
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
-void THNN_CudaSpatialConvolutionMM_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *weight, THCudaTensor *bias, THCudaTensor *columns, THCudaTensor *ones, int kW, int kH, int dW, int dH, int padW, int padH) {
-
-  THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones);
-  if (bias) {
-    THCUNN_assertSameGPU(state, 2, weight, bias);
-  }
-  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
-  THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
-  THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
-  THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
-  THArgCheck(weight->nDimension == 2 || weight->nDimension == 4, 4, "weight tensor should be 2D or 4D");
-
-  int freeWeight = 0;
-
-  // Params:
-  int nInputPlane = weight->nDimension == 2 ? weight->size[1]/(kH*kW) : weight->size[1];
-  int nOutputPlane = weight->size[0];
-
-  if (weight->nDimension == 4) {
-    long s1 = weight->size[0];
-    long s2 = weight->size[1] * weight->size[2] * weight->size[3];
-    weight = THCudaTensor_newWithStorage2d(state, weight->storage, weight->storageOffset, s1, -1, s2, -1);
-    freeWeight = 1;
-  }
-
-  int batch = 1;
-  if (input->nDimension == 3) {
-    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
-    // Force batch
-    batch = 0;
-    THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
-  } else {
-    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
-  }
-
-  long inputWidth   = input->size[3];
-  long inputHeight  = input->size[2];
-  long outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
-  long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
-
-  if (outputWidth < 1 || outputHeight < 1)
-    THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
-        nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
-
-  // Resize output
-  THCudaTensor_resize4d(state, output, batchSize, nOutputPlane, outputHeight, outputWidth);
-
-  // Resize temporary columns
-  THCudaTensor_resize2d(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth);
-
-  // Define a buffer of ones, for bias accumulation
-  // Note: this buffer can be shared with other modules, it only ever gets increased,
-  // and always contains ones.
-  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
-    // Resize plane and fill with ones...
-    THCudaTensor_resize2d(state, ones, outputHeight, outputWidth);
-    THCudaTensor_fill(state, ones, 1);
-  }
-
-  // Helpers
-  THCudaTensor *input_n = THCudaTensor_new(state);
-  THCudaTensor *output_n = THCudaTensor_new(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    // Matrix mulitply per output:
-    THCudaTensor_select(state, input_n, input, 0, elt);
-    THCudaTensor_select(state, output_n, output, 0, elt);
-
-    // Do Bias first:
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m_ = nOutputPlane;
-    long n_ = outputHeight * outputWidth;
-    long k_ = 1;
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    if (bias) {
-      THCudaBlas_Sgemm(
-          state,
-          't', 'n',
-          n_, m_, k_,
-          1,
-          THCudaTensor_data(state, ones), k_,
-          THCudaTensor_data(state, bias), k_,
-          0,
-          THCudaTensor_data(state, output_n), n_
-      );
-    } else {
-      THCudaTensor_zero(state, output_n);
-    }
-
-    // Extract columns:
-    im2col(
-      THCState_getCurrentStream(state),
-      THCudaTensor_data(state, input_n),
-      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
-      1, 1, THCudaTensor_data(state, columns)
-    );
-
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m = nOutputPlane;
-    long n = columns->size[1];
-    long k = nInputPlane*kH*kW;
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THCudaBlas_Sgemm(
-        state,
-        'n', 'n',
-        n, m, k,
-        1,
-        THCudaTensor_data(state, columns), n,
-        THCudaTensor_data(state, weight), k,
-        1,
-        THCudaTensor_data(state, output_n), n
-    );
-  }
-
-  // Free
-  THCudaTensor_free(state, input_n);
-  THCudaTensor_free(state, output_n);
-  if (freeWeight)
-    THCudaTensor_free(state, weight);
-
-  // Resize output
-  if (batch == 0) {
-    THCudaTensor_resize3d(state, output, nOutputPlane, outputHeight, outputWidth);
-    THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
-  }
-}
-
-void THNN_CudaSpatialConvolutionMM_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *weight, THCudaTensor *gradColumns, THCudaTensor *ones, int kW, int kH, int dW, int dH, int padW, int padH) {
-
-  THCUNN_assertSameGPU(state, 5, input, gradOutput, weight,
-                                 gradColumns, gradInput);
-  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
-  THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero");
-  THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero");
-  THArgCheck(weight->nDimension == 2 || weight->nDimension == 4, 4, "weight tensor should be 2D or 4D");
-
-  // Params
-  int nInputPlane = weight->nDimension == 2 ? weight->size[1]/(kW*kH) : weight->size[1];
-  int nOutputPlane = weight->size[0];
-
-  int freeWeight = 0;
-  if (weight->nDimension == 4) {
-    long s1 = weight->size[0];
-    long s2 = weight->size[1] * weight->size[2] * weight->size[3];
-    weight = THCudaTensor_newWithStorage2d(state, weight->storage, weight->storageOffset, s1, -1, s2, -1);
-    freeWeight = 1;
-  }
-
-  int batch = 1;
-  if (input->nDimension == 3) {
-    // Force batch
-    batch = 0;
-    THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
-    THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
-  }
-
-  long inputWidth   = input->size[3];
-  long inputHeight  = input->size[2];
-  long outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
-  long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
-
-  // Resize output
-  THCudaTensor_resize4d(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
-
-  // Resize temporary columns
-  THCudaTensor_resize2d(state, gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth);
-
-  // Helpers
-  THCudaTensor *gradInput_n = THCudaTensor_new(state);
-  THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    // Matrix mulitply per sample:
-    THCudaTensor_select(state, gradInput_n, gradInput, 0, elt);
-    THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m = nInputPlane*kW*kH;
-    long n = gradColumns->size[1];
-    long k = nOutputPlane;
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THCudaBlas_Sgemm(
-        state,
-        'n', 't',
-        n, m, k,
-        1,
-        THCudaTensor_data(state, gradOutput_n), n,
-        THCudaTensor_data(state, weight), m,
-        0,
-        THCudaTensor_data(state, gradColumns), n
-    );
-
-    // Unpack columns back into input:
-    col2im(
-      THCState_getCurrentStream(state),
-      THCudaTensor_data(state, gradColumns),
-      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
-      1, 1, THCudaTensor_data(state, gradInput_n)
-    );
-  }
-
-  // Free
-  THCudaTensor_free(state, gradInput_n);
-  THCudaTensor_free(state, gradOutput_n);
-  if (freeWeight)
-    THCudaTensor_free(state, weight);
-
-  // Resize output
-  if (batch == 0) {
-    THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
-    THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
-    THCudaTensor_resize3d(state, gradInput, nInputPlane, inputHeight, inputWidth);
-  }
-}
-
-void THNN_CudaSpatialConvolutionMM_accGradParameters(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradWeight, THCudaTensor *gradBias, THCudaTensor *columns, THCudaTensor *ones, int kW, int kH, int dW, int dH, int padW, int padH, float scale) {
-
-  THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, columns, ones);
-  if (gradBias) {
-   THCUNN_assertSameGPU(state, 2, gradWeight, gradBias);
-  }
-  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
-  THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias");
-  THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
-  THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
-  THArgCheck(gradWeight->nDimension == 2 || gradWeight->nDimension == 4, 4, "gradWeight tensor should be 2D or 4D");
-
-  // Params
-  int nInputPlane = gradWeight->nDimension == 2 ? gradWeight->size[1]/(kW*kH) : gradWeight->size[1];
-  int nOutputPlane = gradWeight->size[0];
-
-  int freeWeight = 0;
-  if (gradWeight->nDimension == 4) {
-    long s1 = gradWeight->size[0];
-    long s2 = gradWeight->size[1] * gradWeight->size[2] * gradWeight->size[3];
-    gradWeight = THCudaTensor_newWithStorage2d(state, gradWeight->storage, gradWeight->storageOffset, s1, -1, s2, -1);
-    freeWeight = 1;
-  }
-
-  int batch = 1;
-  if (input->nDimension == 3) {
-    // Force batch
-    batch = 0;
-    THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
-    THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
-  }
-
-  long inputWidth   = input->size[3];
-  long inputHeight  = input->size[2];
-  long outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
-  long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
-
-  // Define a buffer of ones, for bias accumulation
-  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
-    // Resize plane and fill with ones...
-    THCudaTensor_resize2d(state, ones, outputHeight, outputWidth);
-    THCudaTensor_fill(state, ones, 1);
-  }
-
-  // Resize temporary columns
-  THCudaTensor_resize2d(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth);
-
-  // Helpers
-  THCudaTensor *input_n = THCudaTensor_new(state);
-  THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    // Matrix mulitply per output:
-    THCudaTensor_select(state, input_n, input, 0, elt);
-    THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
-    // Extract columns:
-    im2col(
-      THCState_getCurrentStream(state),
-      THCudaTensor_data(state, input_n),
-      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
-      1, 1, THCudaTensor_data(state, columns)
-    );
-
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m = nOutputPlane;
-    long n = nInputPlane*kW*kH;
-    long k = columns->size[1];
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THCudaBlas_Sgemm(
-        state,
-        't', 'n',
-        n, m, k,
-        scale,
-        THCudaTensor_data(state, columns), k,
-        THCudaTensor_data(state, gradOutput_n), k,
-        1,
-        THCudaTensor_data(state, gradWeight), n
-    );
-
-    // Do Bias:
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m_ = nOutputPlane;
-    long k_ = outputHeight * outputWidth;
-
-    // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
-    if (gradBias) {
-      THCudaBlas_Sgemv(
-          state,
-          't',
-          k_, m_,
-          scale,
-          THCudaTensor_data(state, gradOutput_n), k_,
-          THCudaTensor_data(state, ones), 1,
-          1,
-          THCudaTensor_data(state, gradBias), 1
-      );
-    }
-  }
-
-  // Free
-  THCudaTensor_free(state, input_n);
-  THCudaTensor_free(state, gradOutput_n);
-  if (freeWeight)
-    THCudaTensor_free(state, gradWeight);
-
-  // Resize
-  if (batch == 0) {
-    THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
-    THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
-  }
-}
+#include "generic/SpatialConvolutionMM.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialCrossMapLRN.cu b/lib/THCUNN/SpatialCrossMapLRN.cu
index 9c93926..cd37320 100644
--- a/lib/THCUNN/SpatialCrossMapLRN.cu
+++ b/lib/THCUNN/SpatialCrossMapLRN.cu
@@ -1,7 +1,9 @@
 #include "THCUNN.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 #include "common.h"
 
-template <typename Dtype>
+template <typename Dtype, typename Acctype>
 __global__ void
 #if __CUDA_ARCH__ >= 320
 __launch_bounds__(CUDA_NUM_THREADS)
@@ -22,7 +24,7 @@ LRNFillScale(const int nthreads, const Dtype* const in,
     int head = 0;
     const int pre_pad = (size - 1) / 2;
     const int post_pad = size - pre_pad - 1;
-    Dtype accum_scale = 0;
+    Acctype accum_scale = Acctype(0);
     // fill the scale at [n, :, h, w]
     // accumulate values
     while (head < post_pad && head < channels) {
@@ -36,7 +38,7 @@ LRNFillScale(const int nthreads, const Dtype* const in,
         accum_scale -= in_off[(head - size) * step]
                        * in_off[(head - size) * step];
       }
-      scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
+      scale_off[(head - post_pad) * step] = ScalarConvert<Acctype, Dtype>::to(k + accum_scale * alpha_over_size);
       ++head;
     }
     // subtract only
@@ -45,20 +47,21 @@ LRNFillScale(const int nthreads, const Dtype* const in,
         accum_scale -= in_off[(head - size) * step]
                        * in_off[(head - size) * step];
       }
-      scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
+      scale_off[(head - post_pad) * step] = ScalarConvert<Acctype, Dtype>::to(k + accum_scale * alpha_over_size);
       ++head;
     }
   }
 }
 
-__global__ void LRNComputeOutput(const int nthreads, const float* in,
-    const float* scale, const float negative_beta, float* out) {
+template <typename Dtype>
+__global__ void LRNComputeOutput(const int nthreads, const Dtype* in,
+    const Dtype* scale, const Dtype negative_beta, Dtype* out) {
   CUDA_KERNEL_LOOP(index, nthreads) {
     out[index] = in[index] * pow(scale[index], negative_beta);
   }
 }
 
-template <typename Dtype>
+template <typename Dtype, typename Acctype>
 __global__ void LRNComputeDiff(const int nthreads,
     const Dtype* const bottom_data, const Dtype* const top_data,
     const Dtype* const scale, const Dtype* const top_diff,
@@ -80,7 +83,7 @@ __global__ void LRNComputeDiff(const int nthreads,
     int head = 0;
     const int pre_pad = size - (size + 1) / 2;
     const int post_pad = size - pre_pad - 1;
-    Dtype accum_ratio = 0;
+    Acctype accum_ratio = Acctype(0);
     // accumulate values
     while (head < post_pad && head < channels) {
       accum_ratio += top_diff_off[head * step] * top_off[head * step] /
@@ -96,9 +99,9 @@ __global__ void LRNComputeDiff(const int nthreads,
             top_off[(head - size) * step] / scale_off[(head - size) * step];
       }
       bottom_diff_off[(head - post_pad) * step] =
-          top_diff_off[(head - post_pad) * step]
+          ScalarConvert<Acctype, Dtype>::to(top_diff_off[(head - post_pad) * step]
             * pow(scale_off[(head - post_pad) * step], negative_beta)
-          - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio;
+          - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio);
       ++head;
     }
     // subtract only
@@ -108,121 +111,14 @@ __global__ void LRNComputeDiff(const int nthreads,
             top_off[(head - size) * step] / scale_off[(head - size) * step];
       }
       bottom_diff_off[(head - post_pad) * step] =
-          top_diff_off[(head - post_pad) * step]
+          ScalarConvert<Acctype, Dtype>::to(top_diff_off[(head - post_pad) * step]
             * pow(scale_off[(head - post_pad) * step], negative_beta)
-          - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio;
+          - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio);
       ++head;
     }
   }
 }
 
-extern "C"
-void LRNforward(THCState* state, THCudaTensor* input, THCudaTensor* output,
-    THCudaTensor* scale, int local_size, float alpha, float beta, float k)
-{
-  THCudaTensor_resizeAs(state, output, input);
-  THCudaTensor_resizeAs(state, scale, input);
-  
-  int batchSize;
-  int nInputPlane;
-  int imsize_h;
-  int imsize_w;
-
-  if (input->nDimension == 3) {
-    batchSize = 1;
-    nInputPlane = input->size[0];
-    imsize_h = input->size[1];
-    imsize_w = input->size[2];
-  }
-  else
-  {
-    batchSize = input->size[0];
-    nInputPlane = input->size[1];
-    imsize_h = input->size[2];
-    imsize_w = input->size[3];
-  }
-
-  input = THCudaTensor_newContiguous(state, input);
-
-  int n_threads = batchSize * imsize_h * imsize_w;
-  LRNFillScale<<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
-      n_threads, THCudaTensor_data(state, input), batchSize, nInputPlane, imsize_h, imsize_w, local_size,
-      alpha / local_size, k, THCudaTensor_data(state, scale));
-  n_threads *= nInputPlane;
-  THCudaCheck(cudaGetLastError());
-  LRNComputeOutput<<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
-    n_threads, THCudaTensor_data(state, input), THCudaTensor_data(state, scale), -beta, THCudaTensor_data(state, output));
-  THCudaCheck(cudaGetLastError());
-
-  THCudaTensor_free(state, input);
-}
-
-
-extern "C"
-void LRNbackward(THCState* state, THCudaTensor* input, THCudaTensor* output,
-    THCudaTensor* gradOutput, THCudaTensor* gradInput, THCudaTensor* scale,
-    int local_size, float alpha, float beta, float k)
-{
-  THCudaTensor_resizeAs(state, gradInput, input);
-  
-  int batchSize;
-  int nInputPlane;
-  int imsize_h;
-  int imsize_w;
-
-  if (input->nDimension == 3) {
-    batchSize = 1;
-    nInputPlane = input->size[0];
-    imsize_h = input->size[1];
-    imsize_w = input->size[2];
-  }
-  else
-  {
-    batchSize = input->size[0];
-    nInputPlane = input->size[1];
-    imsize_h = input->size[2];
-    imsize_w = input->size[3];
-  }
-
-  input = THCudaTensor_newContiguous(state, input);
-  gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-
-  int n_threads = batchSize * imsize_h * imsize_w;
-  LRNComputeDiff<<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
-      n_threads, THCudaTensor_data(state, input), THCudaTensor_data(state, output),
-      THCudaTensor_data(state, scale), THCudaTensor_data(state, gradOutput), batchSize, nInputPlane, imsize_h, imsize_w,
-      local_size, -beta, float(2. * alpha * beta / local_size),
-      THCudaTensor_data(state, gradInput));
-  THCudaCheck(cudaGetLastError());
-
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, gradOutput);
-}
-
-void THNN_CudaSpatialCrossMapLRN_updateOutput(
-    THCState *state,
-    THCudaTensor *input,
-    THCudaTensor *output,
-    THCudaTensor *scale,
-    int size,
-    float alpha,
-    float beta,
-    float k)
-{
-  LRNforward(state, input, output, scale, size, alpha, beta, k);
-}
 
-void THNN_CudaSpatialCrossMapLRN_updateGradInput(
-    THCState *state,
-    THCudaTensor *input,
-    THCudaTensor *gradOutput,
-    THCudaTensor *gradInput,
-    THCudaTensor *scale,
-    THCudaTensor *output,
-    int size,
-    float alpha,
-    float beta,
-    float k)
-{
-  LRNbackward(state, input, output, gradOutput, gradInput, scale, size, alpha, beta, k);
-}
+#include "generic/SpatialCrossMapLRN.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialDilatedConvolution.cu b/lib/THCUNN/SpatialDilatedConvolution.cu
index 5b2f0e6..a4a8e38 100644
--- a/lib/THCUNN/SpatialDilatedConvolution.cu
+++ b/lib/THCUNN/SpatialDilatedConvolution.cu
@@ -2,338 +2,8 @@
 #include "common.h"
 #include "im2col.h"
 
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
-void THNN_CudaSpatialDilatedConvolution_updateOutput(THCState *state,
-            THCudaTensor *input, THCudaTensor *output, THCudaTensor *weight,
-            THCudaTensor *bias, THCudaTensor *columns,
-            THCudaTensor *ones, int kW, int kH, int dW, int dH,
-            int padW, int padH, int dilationW, int dilationH) {
-
-  THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones);
-  if (bias) {
-    THCUNN_assertSameGPU(state, 2, weight, bias);
-  }
-  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
-  THArgCheck(weight->nDimension == 4, 4, "weight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
-  THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
-  THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
-  THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
-
-  // Params:
-  int nInputPlane = weight->size[1];
-  int nOutputPlane = weight->size[0];
-
-  int batch = 1;
-  if (input->nDimension == 3) {
-    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
-    // Force batch
-    batch = 0;
-    THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
-  } else {
-    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
-  }
-
-  long inputWidth   = input->size[3];
-  long inputHeight  = input->size[2];
-  long outputWidth  = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
-  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
-
-  if (outputWidth < 1 || outputHeight < 1)
-    THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
-        nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
-
-  // Resize output
-  THCudaTensor_resize4d(state, output, batchSize, nOutputPlane, outputHeight, outputWidth);
-
-  // Resize temporary columns
-  THCudaTensor_resize2d(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth);
-
-  // Define a buffer of ones, for bias accumulation
-  // Note: this buffer can be shared with other modules, it only ever gets increased,
-  // and always contains ones.
-  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
-    // Resize plane and fill with ones...
-    THCudaTensor_resize2d(state, ones, outputHeight, outputWidth);
-    THCudaTensor_fill(state, ones, 1);
-  }
-
-  // Helpers
-  THCudaTensor *input_n = THCudaTensor_new(state);
-  THCudaTensor *output_n = THCudaTensor_new(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    // Matrix mulitply per output:
-    THCudaTensor_select(state, input_n, input, 0, elt);
-    THCudaTensor_select(state, output_n, output, 0, elt);
-
-    // Do Bias first:
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m_ = nOutputPlane;
-    long n_ = outputHeight * outputWidth;
-    long k_ = 1;
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    if (bias) {
-      THCudaBlas_Sgemm(
-          state,
-          't', 'n',
-          n_, m_, k_,
-          1,
-          THCudaTensor_data(state, ones), k_,
-          THCudaTensor_data(state, bias), k_,
-          0,
-          THCudaTensor_data(state, output_n), n_
-      );
-    } else {
-      THCudaTensor_zero(state, output_n);
-    }
-
-    // Extract columns:
-    im2col(
-      THCState_getCurrentStream(state),
-      THCudaTensor_data(state, input_n),
-      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
-      dilationH, dilationW,
-      THCudaTensor_data(state, columns)
-    );
-
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m = nOutputPlane;
-    long n = columns->size[1];
-    long k = nInputPlane*kH*kW;
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THCudaBlas_Sgemm(
-        state,
-        'n', 'n',
-        n, m, k,
-        1,
-        THCudaTensor_data(state, columns), n,
-        THCudaTensor_data(state, weight), k,
-        1,
-        THCudaTensor_data(state, output_n), n
-    );
-  }
-
-  // Free
-  THCudaTensor_free(state, input_n);
-  THCudaTensor_free(state, output_n);
-
-  // Resize output
-  if (batch == 0) {
-    THCudaTensor_resize3d(state, output, nOutputPlane, outputHeight, outputWidth);
-    THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
-  }
-}
-
-void THNN_CudaSpatialDilatedConvolution_updateGradInput(THCState *state,
-               THCudaTensor *input, THCudaTensor *gradOutput,
-               THCudaTensor *gradInput, THCudaTensor *weight,
-               THCudaTensor *gradColumns,
-               int kW, int kH, int dW, int dH, int padW, int padH,
-               int dilationW, int dilationH ) {
-
-  THCUNN_assertSameGPU(state, 5, input, gradOutput, weight,
-                                 gradColumns, gradInput);
-  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
-  THArgCheck(weight->nDimension == 4, 4, "weight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
-  THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero");
-  THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero");
-
-  // Params
-  int nInputPlane = weight->size[1];
-  int nOutputPlane = weight->size[0];
-
-  int batch = 1;
-  if (input->nDimension == 3) {
-    // Force batch
-    batch = 0;
-    THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
-    THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
-  }
-
-  long inputWidth   = input->size[3];
-  long inputHeight  = input->size[2];
-  long outputWidth  = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
-  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
-
-  // Resize output
-  THCudaTensor_resize4d(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
-
-  // Resize temporary columns
-  THCudaTensor_resize2d(state, gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth);
-
-  // Helpers
-  THCudaTensor *gradInput_n = THCudaTensor_new(state);
-  THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    // Matrix mulitply per sample:
-    THCudaTensor_select(state, gradInput_n, gradInput, 0, elt);
-    THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m = nInputPlane*kW*kH;
-    long n = gradColumns->size[1];
-    long k = nOutputPlane;
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THCudaBlas_Sgemm(
-        state,
-        'n', 't',
-        n, m, k,
-        1,
-        THCudaTensor_data(state, gradOutput_n), n,
-        THCudaTensor_data(state, weight), m,
-        0,
-        THCudaTensor_data(state, gradColumns), n
-    );
-
-    // Unpack columns back into input:
-    col2im(
-      THCState_getCurrentStream(state),
-      THCudaTensor_data(state, gradColumns),
-      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
-      dilationH, dilationW,
-      THCudaTensor_data(state, gradInput_n)
-    );
-  }
-
-  // Free
-  THCudaTensor_free(state, gradInput_n);
-  THCudaTensor_free(state, gradOutput_n);
-
-  // Resize output
-  if (batch == 0) {
-    THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
-    THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
-    THCudaTensor_resize3d(state, gradInput, nInputPlane, inputHeight, inputWidth);
-  }
-}
-
-void THNN_CudaSpatialDilatedConvolution_accGradParameters(THCState *state,
-                     THCudaTensor *input, THCudaTensor *gradOutput,
-                     THCudaTensor *gradWeight, THCudaTensor *gradBias,
-                     THCudaTensor *columns, THCudaTensor *ones,
-                     int kW, int kH, int dW, int dH,
-                     int padW, int padH, int dilationW, int dilationH, float scale) {
-
-  THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, columns, ones);
-  if (gradBias) {
-   THCUNN_assertSameGPU(state, 2, gradWeight, gradBias);
-  }
-  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
-  THArgCheck(gradWeight->nDimension == 4, 4, "gradWeight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
-  THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias");
-  THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
-  THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
-
-  // Params
-  int nInputPlane = gradWeight->size[1];
-  int nOutputPlane = gradWeight->size[0];
-
-  int batch = 1;
-  if (input->nDimension == 3) {
-    // Force batch
-    batch = 0;
-    THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
-    THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
-  }
-
-  long inputWidth   = input->size[3];
-  long inputHeight  = input->size[2];
-  long outputWidth  = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
-  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
-
-  // Define a buffer of ones, for bias accumulation
-  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
-    // Resize plane and fill with ones...
-    THCudaTensor_resize2d(state, ones, outputHeight, outputWidth);
-    THCudaTensor_fill(state, ones, 1);
-  }
-
-  // Resize temporary columns
-  THCudaTensor_resize2d(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth);
-
-  // Helpers
-  THCudaTensor *input_n = THCudaTensor_new(state);
-  THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    // Matrix mulitply per output:
-    THCudaTensor_select(state, input_n, input, 0, elt);
-    THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
-    // Extract columns:
-    im2col(
-      THCState_getCurrentStream(state),
-      THCudaTensor_data(state, input_n),
-      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
-      dilationH, dilationW,
-      THCudaTensor_data(state, columns)
-    );
-
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m = nOutputPlane;
-    long n = nInputPlane*kW*kH;
-    long k = columns->size[1];
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THCudaBlas_Sgemm(
-        state,
-        't', 'n',
-        n, m, k,
-        scale,
-        THCudaTensor_data(state, columns), k,
-        THCudaTensor_data(state, gradOutput_n), k,
-        1,
-        THCudaTensor_data(state, gradWeight), n
-    );
-
-    // Do Bias:
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m_ = nOutputPlane;
-    long k_ = outputHeight * outputWidth;
-
-    // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
-    if (gradBias) {
-      THCudaBlas_Sgemv(
-          state,
-          't',
-          k_, m_,
-          scale,
-          THCudaTensor_data(state, gradOutput_n), k_,
-          THCudaTensor_data(state, ones), 1,
-          1,
-          THCudaTensor_data(state, gradBias), 1
-      );
-    }
-  }
-
-  // Free
-  THCudaTensor_free(state, input_n);
-  THCudaTensor_free(state, gradOutput_n);
-
-  // Resize
-  if (batch == 0) {
-    THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
-    THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
-  }
-}
+#include "generic/SpatialDilatedConvolution.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialDilatedMaxPooling.cu b/lib/THCUNN/SpatialDilatedMaxPooling.cu
index 26ac65d..94fd681 100644
--- a/lib/THCUNN/SpatialDilatedMaxPooling.cu
+++ b/lib/THCUNN/SpatialDilatedMaxPooling.cu
@@ -1,15 +1,17 @@
 #include "THCUNN.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 #include "common.h"
 
 // kernels borrowed from Caffe
-template <typename Dtype>
+template <typename Dtype, typename AccType>
 __global__ void MaxPoolForward(const int nthreads, const Dtype* bottom_data,
     const int num, const int channels, const int height,
     const int width, const int pooled_height, const int pooled_width,
     const int kernel_h, const int kernel_w, const int stride_h,
     const int stride_w, const int pad_h, const int pad_w,
     const int dilation_h, const int dilation_w, Dtype* top_data,
-    Dtype* top_mask) {
+    long* top_mask) {
   CUDA_KERNEL_LOOP(index, nthreads) {
     int pw = index % pooled_width;
     int ph = (index / pooled_width) % pooled_height;
@@ -23,26 +25,26 @@ __global__ void MaxPoolForward(const int nthreads, const Dtype* bottom_data,
       hstart += dilation_h;
     while(wstart < 0)
       wstart += dilation_w;
-    Dtype maxval = -FLT_MAX;
+    AccType maxval = THCNumerics<AccType>::min();
     int maxidx = -1;
     bottom_data += (n * channels + c) * height * width;
     for (int h = hstart; h < hend; h += dilation_h) {
       for (int w = wstart; w < wend; w += dilation_w) {
-        if (bottom_data[h * width + w] > maxval) {
+        if (ScalarConvert<Dtype, AccType>::to(bottom_data[h * width + w]) > maxval) {
           maxidx = h * width + w;
-          maxval = bottom_data[maxidx];
+          maxval = ScalarConvert<Dtype, AccType>::to(bottom_data[maxidx]);
         }
       }
     }
-    top_data[index] = maxval;
+    top_data[index] = ScalarConvert<AccType, Dtype>::to(maxval);
     top_mask[index] = maxidx + TH_INDEX_BASE;
   }
 }
 
 
-template <typename Dtype>
+template <typename Dtype, typename AccType>
 __global__ void MaxPoolBackward(const int nthreads, const Dtype* top_diff,
-    const Dtype* top_mask, const int num, const int channels,
+    const long* top_mask, const int num, const int channels,
     const int height, const int width, const int pooled_height,
     const int pooled_width, const int kernel_h, const int kernel_w,
     const int stride_h, const int stride_w, const int pad_h, const int pad_w,
@@ -61,147 +63,21 @@ __global__ void MaxPoolBackward(const int nthreads, const Dtype* top_diff,
     int pwstart =
         (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) / stride_w + 1;
     int pwend = min((w + pad_w) / stride_w + 1, pooled_width);
-    
-    Dtype gradient = 0;
+
+    AccType gradient = AccType(0);
     int offset = (n * channels + c) * pooled_height * pooled_width;
     top_diff += offset;
     top_mask += offset;
     for (int ph = phstart; ph < phend; ++ph) {
       for (int pw = pwstart; pw < pwend; ++pw) {
-	if (top_mask[ph * pooled_width + pw] - TH_INDEX_BASE == h * width + w) {
-	  gradient += top_diff[ph * pooled_width + pw];
-	}
+        if (top_mask[ph * pooled_width + pw] - TH_INDEX_BASE == h * width + w) {
+          gradient += ScalarConvert<Dtype, AccType>::to(top_diff[ph * pooled_width + pw]);
+        }
       }
     }
-    bottom_diff[index] = gradient;
-  }
-}
-
-void THNN_CudaSpatialDilatedMaxPooling_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, bool ceil_mode)
-{
-
-  THCUNN_assertSameGPU(state, 3, input, output, indices);
-  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
-
-  long nInputCols, nInputRows, nInputPlane, batchSize;
-  long nOutputCols, nOutputRows;
-
-  if (input->nDimension == 3) {
-    nInputCols = input->size[2];
-    nInputRows = input->size[1];
-    nInputPlane = input->size[0];
-    batchSize = 1;
+    bottom_diff[index] = ScalarConvert<AccType, Dtype>::to(gradient);
   }
-  else
-  {
-    nInputCols = input->size[3];
-    nInputRows = input->size[2];
-    nInputPlane = input->size[1];
-    batchSize = input->size[0];
-  }
-
-  THArgCheck(nInputCols >= kW - padW && nInputRows >= kH - padH, 2, "input image smaller than kernel size");
-  THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");
-
-  if(ceil_mode) {
-    nOutputCols = ceil(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1;
-    nOutputRows = ceil(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1;
-  }
-  else {
-    nOutputCols = floor(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1;
-    nOutputRows = floor(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1;
-  }
-
-if (nOutputCols < 1 || nOutputRows < 1)
-    THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
-            nInputPlane,nInputRows,nInputCols,nInputPlane,nOutputRows,nOutputCols);
-
-if (padW || padH)
-  {
-    // ensure that the last pooling starts inside the image
-    if ((nOutputRows - 1)*dH >= nInputRows + padH)
-      --nOutputRows;
-    if ((nOutputCols  - 1)*dW >= nInputCols  + padW)
-      --nOutputCols;
-  }
-
-  input = THCudaTensor_newContiguous(state, input);
-  float* input_data = THCudaTensor_data(state, input);
-
-  THCudaTensor_resize4d(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols);
-  THCudaTensor_resizeAs(state, indices, output);
-
-  float* indices_data = THCudaTensor_data(state, indices);
-  float* output_data = THCudaTensor_data(state, output);
-
-  int count = THCudaTensor_nElement(state, output);
-
-  MaxPoolForward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
-      (count, input_data,
-      batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
-      kH, kW, dH, dW, padH, padW, dilationH, dilationW, output_data, indices_data);
-  THCudaCheck(cudaGetLastError());
-
-  if(input->nDimension == 3)
-    THCudaTensor_resize3d(state, output, nInputPlane, nOutputRows, nOutputCols);
-
-  THCudaTensor_free(state, input);
 }
 
-void THNN_CudaSpatialDilatedMaxPooling_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *indices, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, bool ceil_mode)
-{
-  THCUNN_assertSameGPU(state, 4, input, gradOutput, indices, gradInput);
-
-  input = THCudaTensor_newContiguous(state, input);
-  gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-
-  long nInputCols, nInputRows, nInputPlane, batchSize;
-  long nOutputCols, nOutputRows;
-
-  if (input->nDimension == 3) {
-    nInputCols = input->size[2];
-    nInputRows = input->size[1];
-    nInputPlane = input->size[0];
-    batchSize = 1;
-  }
-  else
-  {
-    nInputCols = input->size[3];
-    nInputRows = input->size[2];
-    nInputPlane = input->size[1];
-    batchSize = input->size[0];
-  }
-
-  if(ceil_mode) {
-    nOutputCols = ceil(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1;
-    nOutputRows = ceil(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1;
-  }
-  else {
-    nOutputCols = floor(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1;
-    nOutputRows = floor(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1;
-  }
-
-  if (nOutputCols < 1 || nOutputRows < 1)
-    THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
-            nInputPlane,nInputRows,nInputCols,nInputPlane,nOutputRows,nOutputCols);
-
-  gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-  THCudaTensor_resizeAs(state, gradInput, input);
-
-  int count = THCudaTensor_nElement(state, input);
-
-  MaxPoolBackward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
-      (count,
-      THCudaTensor_data(state, gradOutput),
-      THCudaTensor_data(state, indices),
-      batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
-      kH, kW, dH, dW, padH, padW, dilationH, dilationW,
-      THCudaTensor_data(state, gradInput));
-  THCudaCheck(cudaGetLastError());
-
-  THCudaTensor_free(state, gradOutput);
-
-  // clean
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, gradOutput);
-}
+#include "generic/SpatialDilatedMaxPooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialFractionalMaxPooling.cu b/lib/THCUNN/SpatialFractionalMaxPooling.cu
index 289b1d6..22a2878 100644
--- a/lib/THCUNN/SpatialFractionalMaxPooling.cu
+++ b/lib/THCUNN/SpatialFractionalMaxPooling.cu
@@ -3,15 +3,19 @@
 #include "THCDeviceTensor.cuh"
 #include "THCDeviceTensorUtils.cuh"
 #include "THCDeviceUtils.cuh"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
 
 #include <cfloat>
 
-__device__ inline float getInterval(float sample,
-                                    int index,
-                                    int inputSize,
-                                    int outputSize,
-                                    int poolSize) {
-  float alpha = (float)(inputSize - poolSize) / (float) (outputSize - 1);
+template <typename Dtype, typename Acctype>
+__device__ inline int getInterval(Acctype sample,
+                                  int index,
+                                  int inputSize,
+                                  int outputSize,
+                                  int poolSize) {
+  Acctype alpha = (Acctype)(inputSize - poolSize) / (Acctype) (outputSize - 1);
   if (index == outputSize - 1) {
     return inputSize - poolSize;
   } else {
@@ -20,12 +24,12 @@ __device__ inline float getInterval(float sample,
 }
 
 // We template on poolSizeW to allow the innermost loop to be unrolled
-template <int PoolSizeWStatic>
+template <int PoolSizeWStatic, typename Dtype, typename Acctype>
 __global__ void SpatialFractionalMaxPooling_updateOutput(
-  THCDeviceTensor<float, 4> input,
-  THCDeviceTensor<float, 4> output,
-  THCDeviceTensor<float, 4> indices,
-  THCDeviceTensor<float, 3> samples,
+  THCDeviceTensor<Dtype, 4> input,
+  THCDeviceTensor<Dtype, 4> output,
+  THCDeviceTensor<THCIndex_t, 4> indices,
+  THCDeviceTensor<Dtype, 3> samples,
   int poolSizeW, int poolSizeH) {
 
   // Output (h, w) point that this thread is responsible for
@@ -38,33 +42,39 @@ __global__ void SpatialFractionalMaxPooling_updateOutput(
     int outputW = ourOutputPoint % output.getSize(3);
     int outputH = ourOutputPoint / output.getSize(3);
 
-    int poolW = getInterval(samples[batch][plane][0], outputW,
+    int poolW = getInterval<Dtype, Acctype>(ScalarConvert<Dtype, Acctype>::to(samples[batch][plane][0]), outputW,
                             input.getSize(3), output.getSize(3), poolSizeW);
-    int poolH = getInterval(samples[batch][plane][1], outputH,
+    int poolH = getInterval<Dtype, Acctype>(ScalarConvert<Dtype, Acctype>::to(samples[batch][plane][1]), outputH,
                             input.getSize(2), output.getSize(2), poolSizeH);
 
-    float maxVal = -FLT_MAX;
+    Dtype maxVal = THCNumerics<Dtype>::min();
     int maxIndex = -1;
 
     for (int h = poolH; h < poolH + poolSizeH; ++h) {
       if (PoolSizeWStatic == -1) {
         for (int w = poolW; w < poolW + poolSizeW; ++w) {
-          float val = input[batch][plane][h][w];
-          maxVal = fmaxf(val, maxVal);
-          maxIndex = (maxVal == val) ? (h * input.getSize(3) + w) : maxIndex;
+          Dtype val = input[batch][plane][h][w];
+          // for consistency with THNN, favor the first max
+          if (val > maxVal) {
+            maxIndex = h * input.getSize(3) + w;
+            maxVal = val;
+          }
         }
       } else {
 #pragma unroll
         for (int i = 0; i < PoolSizeWStatic; ++i) {
           int w = i + poolW;
-          float val = input[batch][plane][h][w];
-          maxVal = fmaxf(val, maxVal);
-          maxIndex = (maxVal == val) ? (h * input.getSize(3) + w) : maxIndex;
+          Dtype val = input[batch][plane][h][w];
+          // for consistency with THNN, favor the first max
+          if (val > maxVal) {
+            maxIndex = h * input.getSize(3) + w;
+            maxVal = val;
+          }
         }
       }
     }
 
-    assert(maxVal != -FLT_MAX);
+    assert(maxVal != THCNumerics<Dtype>::min());
     assert(maxIndex != -1);
 
     // +1 for Lua index
@@ -73,100 +83,11 @@ __global__ void SpatialFractionalMaxPooling_updateOutput(
   }
 }
 
-void THNN_CudaSpatialFractionalMaxPooling_updateOutput(
-    THCState *state,
-    THCudaTensor *input,
-    THCudaTensor *output,
-    int outputW, int outputH,
-    int poolSizeW, int poolSizeH,
-    THCudaTensor *indices,
-    THCudaTensor *randomSamples)
-{
-  int planeDim = 0;
-  int dimh = 1;
-  int dimw = 2;
-  long numBatch = 1;
-
-  long numInputDims = THCudaTensor_nDimension(state, input);
-  THArgCheck(numInputDims == 3 || numInputDims == 4, 2,
-                "3D or 4D (batch mode) tensor expected");
-
-  if (numInputDims == 4) {
-    numBatch = THCudaTensor_size(state, input, 0);
-    planeDim++;
-    dimh++;
-    dimw++;
-  }
-
-  /* sizes */
-  long numPlanes = THCudaTensor_size(state, input, planeDim);
-  long inputH = THCudaTensor_size(state, input, dimh);
-  long inputW = THCudaTensor_size(state, input, dimw);
-
-  THArgCheck(outputH + poolSizeH - 1 < inputH, 6,
-                "poolSizeH too large relative to input height");
-  THArgCheck(outputW + poolSizeW - 1 < inputW, 5,
-                "poolSizeW too large relative to input width");
-
-  THCDeviceTensor<float, 4> devInput;
-  THCDeviceTensor<float, 4> devOutput;
-  THCDeviceTensor<float, 4> devIndices;
-  THCDeviceTensor<float, 3> devSamples =
-    toDeviceTensor<float, 3>(state, randomSamples);
-
-  if (numInputDims == 3) {
-    /* resize output */
-    THCudaTensor_resize3d(state, output, numPlanes, outputH, outputW);
-    /* indices will contain the locations for each output point */
-    THCudaTensor_resize3d(state, indices, numPlanes, outputH, outputW);
-
-    devInput = toDeviceTensor<float, 3>(state, input).upcastOuter<4>();
-    devOutput = toDeviceTensor<float, 3>(state, output).upcastOuter<4>();
-    devIndices = toDeviceTensor<float, 3>(state, indices).upcastOuter<4>();
-  } else {
-    THCudaTensor_resize4d(state, output, numBatch, numPlanes, outputH, outputW);
-    /* indices will contain the locations for each output point */
-    THCudaTensor_resize4d(state, indices, numBatch, numPlanes, outputH, outputW);
-
-    devInput = toDeviceTensor<float, 4>(state, input);
-    devOutput = toDeviceTensor<float, 4>(state, output);
-    devIndices = toDeviceTensor<float, 4>(state, indices);
-  }
-
-  // block is limited to 4 warps
-  // grid handles overflow per each plane
-  int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3);
-  dim3 grid(THCCeilDiv(outputPlaneSize, 128),
-            devInput.getSize(1),
-            devInput.getSize(0));
-  dim3 block(outputPlaneSize > 128 ? 128 : outputPlaneSize);
-
-#define SFMP_UPDATE_OUTPUT(POOL_W)                                      \
-  SpatialFractionalMaxPooling_updateOutput<POOL_W>                      \
-    <<<grid, block, 0, THCState_getCurrentStream(state)>>>(             \
-      devInput, devOutput, devIndices, devSamples, poolSizeW, poolSizeH);
-
-#define SFMP_UPDATE_OUTPUT_CASE(POOL_W)                 \
-  case POOL_W: SFMP_UPDATE_OUTPUT(POOL_W); break
-
-  switch (poolSizeW) {
-    SFMP_UPDATE_OUTPUT_CASE(2);
-    SFMP_UPDATE_OUTPUT_CASE(3);
-    SFMP_UPDATE_OUTPUT_CASE(4);
-    SFMP_UPDATE_OUTPUT_CASE(5);
-    SFMP_UPDATE_OUTPUT_CASE(6);
-    SFMP_UPDATE_OUTPUT_CASE(7);
-    default:
-      // dynamic pool width
-      SFMP_UPDATE_OUTPUT_CASE(-1);
-  }
-  THCudaCheck(cudaGetLastError());
-}
-
+template <typename Dtype>
 __global__ void SpatialFractionalMaxPooling_updateGradInput(
-  THCDeviceTensor<float, 4> gradInput,
-  THCDeviceTensor<float, 4> gradOutput,
-  THCDeviceTensor<float, 4> indices) {
+  THCDeviceTensor<Dtype, 4> gradInput,
+  THCDeviceTensor<Dtype, 4> gradOutput,
+  THCDeviceTensor<THCIndex_t, 4> indices) {
   // Output (h, w) point that this thread is responsible for
   int ourOutputPoint = threadIdx.x + blockIdx.x * blockDim.x;
   int plane = blockIdx.y;
@@ -188,62 +109,5 @@ __global__ void SpatialFractionalMaxPooling_updateGradInput(
   }
 }
 
-void THNN_CudaSpatialFractionalMaxPooling_updateGradInput(
-    THCState *state,
-    THCudaTensor *input,
-    THCudaTensor *gradOutput,
-    THCudaTensor *gradInput,
-    int outputW, int outputH,
-    int poolSizeW, int poolSizeH,
-    THCudaTensor *indices)
-{
-  int dimh = 1;
-  int dimw = 2;
-
-  long numInputDims = THCudaTensor_nDimension(state, input);
-  if (numInputDims == 4) {
-    dimh++;
-    dimw++;
-  }
-
-  /* sizes */
-  long inputH = THCudaTensor_size(state, input, dimh);
-  long inputW = THCudaTensor_size(state, input, dimw);
-
-  THArgCheck(outputH == THCudaTensor_size(state, gradOutput, dimh), 3,
-                "gradOutput height unexpected");
-  THArgCheck(outputW == THCudaTensor_size(state, gradOutput, dimw), 3,
-                "gradOutput width unexpected");
-
-  /* resize */
-  THCudaTensor_resizeAs(state, gradInput, input);
-  THCudaTensor_zero(state, gradInput);
-
-  THCDeviceTensor<float, 4> devGradInput;
-  THCDeviceTensor<float, 4> devGradOutput;
-  THCDeviceTensor<float, 4> devIndices;
-
-  /* backprop */
-  if (numInputDims == 3) {
-    devGradInput = toDeviceTensor<float, 3>(state, gradInput).upcastOuter<4>();
-    devGradOutput = toDeviceTensor<float, 3>(state, gradOutput).upcastOuter<4>();
-    devIndices = toDeviceTensor<float, 3>(state, indices).upcastOuter<4>();
-  } else {
-    devGradInput = toDeviceTensor<float, 4>(state, gradInput);
-    devGradOutput = toDeviceTensor<float, 4>(state, gradOutput);
-    devIndices = toDeviceTensor<float, 4>(state, indices);
-  }
-
-  // block is limited to 4 warps
-  // grid handles overflow per each plane
-  int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3);
-  dim3 grid(THCCeilDiv(outputPlaneSize, 128),
-            devGradInput.getSize(1),
-            devGradInput.getSize(0));
-  dim3 block(outputPlaneSize > 128 ? 128 : outputPlaneSize);
-
-  SpatialFractionalMaxPooling_updateGradInput
-    <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
-      devGradInput, devGradOutput, devIndices);
-  THCudaCheck(cudaGetLastError());
-}
+#include "generic/SpatialFractionalMaxPooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialFullConvolution.cu b/lib/THCUNN/SpatialFullConvolution.cu
index 4c31485..4e37ecf 100644
--- a/lib/THCUNN/SpatialFullConvolution.cu
+++ b/lib/THCUNN/SpatialFullConvolution.cu
@@ -1,337 +1,8 @@
 #include "THCUNN.h"
 #include "im2col.h"
 
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
-void THNN_CudaSpatialFullConvolution_updateOutput(
-    THCState *state,
-    THCudaTensor *input,
-    THCudaTensor *output,
-    THCudaTensor *weight,
-    THCudaTensor *bias,
-    THCudaTensor *columns,
-    THCudaTensor *ones,
-    int kW, int kH,
-    int dW, int dH,
-    int padW, int padH,
-    int adjW, int adjH)
-{
-
-  int nInputPlane = THCudaTensor_size(state, weight, 0);
-  int nOutputPlane = THCudaTensor_size(state, weight, 1);
-
-  THCUNN_assertSameGPU(state, 6, input, output, weight,
-                                 bias, columns, ones);
-
-  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
-
-  int batch = 1;
-  if (input->nDimension == 3) {
-    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
-    // Force batch
-    batch = 0;
-    THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
-  } else {
-    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
-  }
-
-  long inputWidth   = input->size[3];
-  long inputHeight  = input->size[2];
-  long outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
-  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
-
-  // Resize output
-  THCudaTensor_resize4d(state, output, batchSize, nOutputPlane, outputHeight, outputWidth);
-
-  // Resize temporary columns
-  THCudaTensor_resize2d(state, columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
-
-  // Define a buffer of ones, for bias accumulation
-  // Note: this buffer can be shared with other modules, it only ever gets increased,
-  // and always contains ones.
-  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
-    // Resize plane and fill with ones...
-    THCudaTensor_resize2d(state, ones, outputHeight, outputWidth);
-    THCudaTensor_fill(state, ones, 1);
-  }
-
-  // Helpers
-  THCudaTensor *input_n = THCudaTensor_new(state);
-  THCudaTensor *output_n = THCudaTensor_new(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    // Matrix mulitply per output:
-    THCudaTensor_select(state, input_n, input, 0, elt);
-    THCudaTensor_select(state, output_n, output, 0, elt);
-
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m = weight->size[1] * weight->size[2] * weight->size[3];
-    long n = columns->size[1];
-    long k = weight->size[0];
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THCudaBlas_Sgemm(
-        state,
-        'n', 't',
-        n, m, k,
-        1,
-        THCudaTensor_data(state, input_n), n,
-        THCudaTensor_data(state, weight), m,
-        0,
-        THCudaTensor_data(state, columns), n
-    );
-
-    // Unpack columns back into input:
-    col2im(
-      THCState_getCurrentStream(state),
-      THCudaTensor_data(state, columns),
-      nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
-      1, 1, THCudaTensor_data(state, output_n)
-    );
-
-    // Do Bias after:
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m_ = nOutputPlane;
-    long n_ = outputHeight * outputWidth;
-    long k_ = 1;
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    if (bias) {
-      THCudaBlas_Sgemm(
-          state,
-          't', 'n',
-          n_, m_, k_,
-          1,
-          THCudaTensor_data(state, ones), k_,
-          THCudaTensor_data(state, bias), k_,
-          1,
-          THCudaTensor_data(state, output_n), n_
-      );
-    }
-
-  }
-
-  // Free
-  THCudaTensor_free(state, input_n);
-  THCudaTensor_free(state, output_n);
-
-  // Resize output
-  if (batch == 0) {
-    THCudaTensor_resize3d(state, output, nOutputPlane, outputHeight, outputWidth);
-    THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
-  }
-}
-
-void THNN_CudaSpatialFullConvolution_updateGradInput(
-    THCState *state,
-    THCudaTensor *input,
-    THCudaTensor *gradOutput,
-    THCudaTensor *gradInput,
-    THCudaTensor *weight,
-    THCudaTensor *gradColumns,
-    int kW, int kH,
-    int dW, int dH,
-    int padW, int padH,
-    int adjW, int adjH)
-{
-  int nInputPlane = THCudaTensor_size(state, weight, 0);
-  int nOutputPlane = THCudaTensor_size(state, weight, 1);
-
-  THCUNN_assertSameGPU(state, 5, input, gradOutput, weight,
-                                 gradColumns, gradInput);
-  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
-
-  int batch = 1;
-  if (input->nDimension == 3) {
-    // Force batch
-    batch = 0;
-    THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
-    THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
-  }
-
-  long inputWidth   = input->size[3];
-  long inputHeight  = input->size[2];
-  long outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
-  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
-
-  // Resize output
-  THCudaTensor_resize4d(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
-
-  // Resize temporary columns
-  THCudaTensor_resize2d(state, gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth);
-
-  // Helpers
-  THCudaTensor *gradInput_n = THCudaTensor_new(state);
-  THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    // Matrix mulitply per sample:
-    THCudaTensor_select(state, gradInput_n, gradInput, 0, elt);
-    THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
-    // Extract columns:
-    im2col(
-      THCState_getCurrentStream(state),
-      THCudaTensor_data(state, gradOutput_n),
-      nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
-      1, 1, THCudaTensor_data(state, gradColumns)
-    );
-
-
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m = weight->size[0];
-    long n = gradColumns->size[1];
-    long k = weight->size[1] * weight->size[2] * weight->size[3];
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THCudaBlas_Sgemm(
-        state,
-        'n', 'n',
-        n, m, k,
-        1,
-        THCudaTensor_data(state, gradColumns), n,
-        THCudaTensor_data(state, weight), k,
-        0,
-        THCudaTensor_data(state, gradInput_n), n
-    );
-  }
-
-
-  // Free
-  THCudaTensor_free(state, gradInput_n);
-  THCudaTensor_free(state, gradOutput_n);
-
-  // Resize output
-  if (batch == 0) {
-    THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
-    THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
-    THCudaTensor_resize3d(state, gradInput, nInputPlane, inputHeight, inputWidth);
-  }
-}
-
-
-void THNN_CudaSpatialFullConvolution_accGradParameters(
-    THCState *state,
-    THCudaTensor *input,
-    THCudaTensor *gradOutput,
-    THCudaTensor *gradWeight,
-    THCudaTensor *gradBias,
-    THCudaTensor *columns,
-    THCudaTensor *ones,
-    int kW, int kH,
-    int dW, int dH,
-    int padW, int padH,
-    int adjW, int adjH,
-    float scale)
-{
-  int nInputPlane = THCudaTensor_size(state, gradWeight, 0);
-  int nOutputPlane = THCudaTensor_size(state, gradWeight, 1);
-
-  THCUNN_assertSameGPU(state, 6, input, gradOutput, gradWeight,
-                                 gradBias, columns, ones);
-
-  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
-
-  int batch = 1;
-  if (input->nDimension == 3) {
-    // Force batch
-    batch = 0;
-    THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
-    THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
-  }
-
-  long inputWidth   = input->size[3];
-  long inputHeight  = input->size[2];
-  long outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
-  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
-
-  // Define a buffer of ones, for bias accumulation
-  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
-    // Resize plane and fill with ones...
-    THCudaTensor_resize2d(state, ones, outputHeight, outputWidth);
-    THCudaTensor_fill(state, ones, 1);
-  }
-
-  // Resize temporary columns
-  THCudaTensor_resize2d(state, columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
-
-  // Helpers
-  THCudaTensor *input_n = THCudaTensor_new(state);
-  THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    // Matrix mulitply per output:
-    THCudaTensor_select(state, input_n, input, 0, elt);
-    THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
-    // Extract columns:
-    im2col(
-      THCState_getCurrentStream(state),
-      THCudaTensor_data(state, gradOutput_n),
-      nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
-      1, 1, THCudaTensor_data(state, columns)
-    );
-
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long n = columns->size[0];   // nOutputPlane * kh * kw
-    long m = input_n->size[0];   // nInputPlane
-    long k = columns->size[1];   // inputHeight * inputWidth
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THCudaBlas_Sgemm(
-        state,
-        't', 'n',
-        n, m, k,
-        scale,
-        THCudaTensor_data(state, columns), k,
-        THCudaTensor_data(state, input_n), k,
-        1,
-        THCudaTensor_data(state, gradWeight), n
-    );
-
-    // Do Bias:
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m_ = nOutputPlane;
-    long k_ = outputHeight * outputWidth;
-
-    // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
-    if (gradBias) {
-      THCudaBlas_Sgemv(
-          state,
-          't',
-          k_, m_,
-          scale,
-          THCudaTensor_data(state, gradOutput_n), k_,
-          THCudaTensor_data(state, ones), 1,
-          1,
-          THCudaTensor_data(state, gradBias), 1
-      );
-    }
-  }
-
-  // Free
-  THCudaTensor_free(state, input_n);
-  THCudaTensor_free(state, gradOutput_n);
-
-  // Resize
-  if (batch == 0) {
-    THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
-    THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
-  }
-}
+#include "generic/SpatialFullConvolution.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialMaxPooling.cu b/lib/THCUNN/SpatialMaxPooling.cu
index ac6e3fd..90e6fe4 100644
--- a/lib/THCUNN/SpatialMaxPooling.cu
+++ b/lib/THCUNN/SpatialMaxPooling.cu
@@ -1,18 +1,4 @@
 #include "THCUNN.h"
-#include "common.h"
 
-void THNN_CudaSpatialMaxPooling_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode)
-{
-  THNN_CudaSpatialDilatedMaxPooling_updateOutput(
-    state, input, output, indices, 
-    kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode);
-
-}
-
-void THNN_CudaSpatialMaxPooling_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *indices, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode)
-{
-  THNN_CudaSpatialDilatedMaxPooling_updateGradInput(
-    state, input, gradOutput, gradInput, indices,
-    kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode);
-
-}
+#include "generic/SpatialMaxPooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialMaxUnpooling.cu b/lib/THCUNN/SpatialMaxUnpooling.cu
index b56bd56..2d8a481 100644
--- a/lib/THCUNN/SpatialMaxUnpooling.cu
+++ b/lib/THCUNN/SpatialMaxUnpooling.cu
@@ -2,7 +2,7 @@
 #include "common.h"
 
 template <typename Dtype>
-__global__ void MaxUnpoolForward(const int nthreads, const Dtype* bottom_data, const Dtype* bottom_mask,
+__global__ void MaxUnpoolForward(const int nthreads, const Dtype* bottom_data, const long* bottom_mask,
     const int num, const int channels, const int iheight, const int iwidth, const int oheight, const int owidth, Dtype* top_data) {
   CUDA_KERNEL_LOOP(index, nthreads) { //index here indices the input pixels
     int c = (index / iwidth / iheight) % channels;
@@ -15,7 +15,7 @@ __global__ void MaxUnpoolForward(const int nthreads, const Dtype* bottom_data, c
 }
 
 template <typename Dtype>
-__global__ void MaxUnpoolBackward(const int nthreads, const Dtype* top_diff, const Dtype* bottom_mask,
+__global__ void MaxUnpoolBackward(const int nthreads, const Dtype* top_diff, const long* bottom_mask,
     const int num, const int channels, const int iheight, const int iwidth, const int oheight, const int owidth, Dtype* bottom_diff) {
   CUDA_KERNEL_LOOP(index, nthreads) {
     int c = (index / iwidth / iheight) % channels;
@@ -27,79 +27,5 @@ __global__ void MaxUnpoolBackward(const int nthreads, const Dtype* top_diff, con
   }
 }
 
-void THNN_CudaSpatialMaxUnpooling_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices, int owidth, int oheight)
-{
-  THCUNN_assertSameGPU(state, 3, input, output, indices);
-  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
-
-  long nInputCols, nInputRows, nInputPlane, batchSize;
-
-  if (input->nDimension == 3) {
-    nInputCols = input->size[2];
-    nInputRows = input->size[1];
-    nInputPlane = input->size[0];
-    batchSize = 1;
-  }
-  else
-  {
-    nInputCols = input->size[3];
-    nInputRows = input->size[2];
-    nInputPlane = input->size[1];
-    batchSize = input->size[0];
-  }
-
-  input = THCudaTensor_newContiguous(state, input);
-  indices = THCudaTensor_newContiguous(state, indices);
-  THCudaTensor_resize4d(state, output, batchSize, nInputPlane, oheight, owidth);
-  THCudaTensor_zero(state, output);
-
-  int count = THCudaTensor_nElement(state, input);
-
-  MaxUnpoolForward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
-      (count, THCudaTensor_data(state, input), THCudaTensor_data(state, indices),
-      batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCudaTensor_data(state, output));
-  THCudaCheck(cudaGetLastError());
-
-  if(input->nDimension == 3)
-    THCudaTensor_resize3d(state, output, nInputPlane, oheight, owidth);
-
-  THCudaTensor_free(state, input);
-
-}
-
-void THNN_CudaSpatialMaxUnpooling_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *indices, int owidth, int oheight)
-{
-  THCUNN_assertSameGPU(state, 4, input, gradOutput, indices, gradInput);
-
-  long nInputCols, nInputRows, nInputPlane, batchSize;
-
-  if (input->nDimension == 3) {
-    nInputCols = input->size[2];
-    nInputRows = input->size[1];
-    nInputPlane = input->size[0];
-    batchSize = 1;
-  }
-  else
-  {
-    nInputCols = input->size[3];
-    nInputRows = input->size[2];
-    nInputPlane = input->size[1];
-    batchSize = input->size[0];
-  }
-
-  input = THCudaTensor_newContiguous(state, input);
-  indices = THCudaTensor_newContiguous(state, indices);
-  gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-  THCudaTensor_resizeAs(state, gradInput, input);
-
-  int count = THCudaTensor_nElement(state, input);
-
-  MaxUnpoolBackward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
-      (count, THCudaTensor_data(state, gradOutput), THCudaTensor_data(state, indices),
-      batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCudaTensor_data(state, gradInput));
-  THCudaCheck(cudaGetLastError());
-
-  // clean
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, gradOutput);
-}
+#include "generic/SpatialMaxUnpooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialReflectionPadding.cu b/lib/THCUNN/SpatialReflectionPadding.cu
index c3ae14e..75aae1a 100644
--- a/lib/THCUNN/SpatialReflectionPadding.cu
+++ b/lib/THCUNN/SpatialReflectionPadding.cu
@@ -5,9 +5,14 @@
 #include "THCDeviceUtils.cuh"
 #include "THCReduceApplyUtils.cuh"
 
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
+
+template<typename Dtype>
 __global__ void SpatialReflectionPadding_updateOutput(
-  THCDeviceTensor<float, 4> input,
-  THCDeviceTensor<float, 4> output,
+  THCDeviceTensor<Dtype, 4> input,
+  THCDeviceTensor<Dtype, 4> output,
   int padT, int padB, int padL, int padR) {
 
   int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
@@ -36,70 +41,14 @@ __global__ void SpatialReflectionPadding_updateOutput(
                   + 2 * padT + input.getSize(2) - 1
                   - oStartY + iStartY;
 
-  float valueToCopy = input[batch][plane][inputPointY][inputPointX];
+  Dtype valueToCopy = input[batch][plane][inputPointY][inputPointX];
   output[batch][plane][outputPointY][outputPointX] = valueToCopy;
 }
 
-void THNN_CudaSpatialReflectionPadding_updateOutput(THCState *state,
-                                                    THCudaTensor *input,
-                                                    THCudaTensor *output,
-                                                    int padL, int padR,
-                                                    int padT, int padB
-                                                   ) {
-  THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input), 2,
-             "input tensor must fit into 32-bit index math");
-
-  int planeDim = 0;
-  int dimh = 1;
-  int dimw = 2;
-  int numBatch = 1;
-
-  int numInputDims = THCudaTensor_nDimension(state, input);
-  THArgCheck(numInputDims == 3 || numInputDims == 4, 2,
-                "input must be 3 or 4-dimensional");
-
-  if (numInputDims == 4) {
-    numBatch = THCudaTensor_size(state, input, 0);
-    planeDim++;
-    dimh++;
-    dimw++;
-  }
-
-  int numPlanes = THCudaTensor_size(state, input, planeDim);
-  int inputH = THCudaTensor_size(state, input, dimh);
-  int inputW = THCudaTensor_size(state, input, dimw);
-  int outputH = inputH + padT + padB;
-  int outputW  = inputW + padL + padR;
-
-  THCDeviceTensor<float, 4> devInput;
-  THCDeviceTensor<float, 4> devOutput;
-
-  if (numInputDims == 3) {
-    THCudaTensor_resize3d(state, output, numPlanes, outputH, outputW);
-
-    devInput = toDeviceTensor<float, 3>(state, input).upcastOuter<4>();
-    devOutput = toDeviceTensor<float, 3>(state, output).upcastOuter<4>();
-  } else {
-    THCudaTensor_resize4d(state, output, numBatch, numPlanes, outputH, outputW);
-
-    devInput = toDeviceTensor<float, 4>(state, input);
-    devOutput = toDeviceTensor<float, 4>(state, output);
-  }
-
-  int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3);
-  dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
-            devOutput.getSize(1),
-            devOutput.getSize(0));
-  dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
-
-  SpatialReflectionPadding_updateOutput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
-    devInput, devOutput, padT, padB, padL, padR);
-  THCudaCheck(cudaGetLastError());
-}
-
+template <typename Dtype>
 __global__ void SpatialReflectionPadding_updateGradInput(
-  THCDeviceTensor<float, 4> gradInput,
-  THCDeviceTensor<float, 4> gradOutput,
+  THCDeviceTensor<Dtype, 4> gradInput,
+  THCDeviceTensor<Dtype, 4> gradOutput,
   int padT, int padB, int padL, int padR) {
 
   int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
@@ -128,54 +77,9 @@ __global__ void SpatialReflectionPadding_updateGradInput(
                   + 2 * padT + gradInput.getSize(2) - 1
                   - oStartY + iStartY;
 
-  float valueToCopy = gradOutput[batch][plane][outputPointY][outputPointX];
+  Dtype valueToCopy = gradOutput[batch][plane][outputPointY][outputPointX];
   atomicAdd(&gradInput[batch][plane][inputPointY][inputPointX], valueToCopy);
 }
 
-void THNN_CudaSpatialReflectionPadding_updateGradInput(THCState *state,
-                                                       THCudaTensor *input,
-                                                       THCudaTensor *gradOutput,
-                                                       THCudaTensor *gradInput,
-                                                       int padL, int padR,
-                                                       int padT, int padB) {
-
-  THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input), 2,
-                "input tensor must fit into 32-bit index math");
-  THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, gradOutput), 3,
-                "output gradient tensor must fit into 32-bit index math");
-
-  int planeDim = 0;
-  int dimh = 1;
-  int dimw = 2;
-
-  int numInputDims = THCudaTensor_nDimension(state, input);
-  if (numInputDims == 4) {
-    planeDim++;
-    dimh++;
-    dimw++;
-  }
-
-  THCudaTensor_resizeAs(state, gradInput, input);
-  THCudaTensor_zero(state, gradInput);
-
-  THCDeviceTensor<float, 4> devGradInput;
-  THCDeviceTensor<float, 4> devGradOutput;
-
-  if (numInputDims == 3) {
-    devGradInput = toDeviceTensor<float, 3>(state, gradInput).upcastOuter<4>();
-    devGradOutput = toDeviceTensor<float, 3>(state, gradOutput).upcastOuter<4>();
-  } else {
-    devGradInput = toDeviceTensor<float, 4>(state, gradInput);
-    devGradOutput = toDeviceTensor<float, 4>(state, gradOutput);
-  }
-
-  int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3);
-  dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
-            devGradOutput.getSize(1),
-            devGradOutput.getSize(0));
-  dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
-
-  SpatialReflectionPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
-    devGradInput, devGradOutput, padT, padB, padL, padR);
-  THCudaCheck(cudaGetLastError());
-}
+#include "generic/SpatialReflectionPadding.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialReplicationPadding.cu b/lib/THCUNN/SpatialReplicationPadding.cu
index fc09291..f022e57 100644
--- a/lib/THCUNN/SpatialReplicationPadding.cu
+++ b/lib/THCUNN/SpatialReplicationPadding.cu
@@ -5,9 +5,14 @@
 #include "THCDeviceUtils.cuh"
 #include "THCReduceApplyUtils.cuh"
 
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
+
+template <typename Dtype>
 __global__ void SpatialReplicationPadding_updateOutput(
-  THCDeviceTensor<float, 4> input,
-  THCDeviceTensor<float, 4> output,
+  THCDeviceTensor<Dtype, 4> input,
+  THCDeviceTensor<Dtype, 4> output,
   int padT, int padB, int padL, int padR) {
 
   int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
@@ -27,70 +32,14 @@ __global__ void SpatialReplicationPadding_updateOutput(
   int inputPointX = min(max(padL, outputPointX), input.getSize(3) + padL - 1) - oStartX + iStartX;
   int inputPointY = min(max(padT, outputPointY), input.getSize(2) + padT - 1) - oStartY + iStartY;
 
-  float valueToCopy = input[batch][plane][inputPointY][inputPointX];
+  Dtype valueToCopy = input[batch][plane][inputPointY][inputPointX];
   output[batch][plane][outputPointY][outputPointX] = valueToCopy;
 }
 
-void THNN_CudaSpatialReplicationPadding_updateOutput(THCState *state,
-                                                     THCudaTensor *input,
-                                                     THCudaTensor *output,
-                                                     int padL, int padR,
-                                                     int padT, int padB
-                                                    ) {
-  THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input), 2,
-             "input tensor must fit into 32-bit index math");
-
-  int planeDim = 0;
-  int dimh = 1;
-  int dimw = 2;
-  int numBatch = 1;
-
-  int numInputDims = THCudaTensor_nDimension(state, input);
-  THArgCheck(numInputDims == 3 || numInputDims == 4, 2,
-             "input must be 3 or 4-dimensional");
-
-  if (numInputDims == 4) {
-    numBatch = THCudaTensor_size(state, input, 0);
-    planeDim++;
-    dimh++;
-    dimw++;
-  }
-
-  int numPlanes = THCudaTensor_size(state, input, planeDim);
-  int inputH = THCudaTensor_size(state, input, dimh);
-  int inputW = THCudaTensor_size(state, input, dimw);
-  int outputH = inputH + padT + padB;
-  int outputW  = inputW + padL + padR;
-
-  THCDeviceTensor<float, 4> devInput;
-  THCDeviceTensor<float, 4> devOutput;
-
-  if (numInputDims == 3) {
-    THCudaTensor_resize3d(state, output, numPlanes, outputH, outputW);
-
-    devInput = toDeviceTensor<float, 3>(state, input).upcastOuter<4>();
-    devOutput = toDeviceTensor<float, 3>(state, output).upcastOuter<4>();
-  } else {
-    THCudaTensor_resize4d(state, output, numBatch, numPlanes, outputH, outputW);
-
-    devInput = toDeviceTensor<float, 4>(state, input);
-    devOutput = toDeviceTensor<float, 4>(state, output);
-  }
-
-  int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3);
-  dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
-            devOutput.getSize(1),
-            devOutput.getSize(0));
-  dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
-
-  SpatialReplicationPadding_updateOutput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
-    devInput, devOutput, padT, padB, padL, padR);
-
-}
-
+template <typename Dtype>
 __global__ void SpatialReplicationPadding_updateGradInput(
-  THCDeviceTensor<float, 4> gradInput,
-  THCDeviceTensor<float, 4> gradOutput,
+  THCDeviceTensor<Dtype, 4> gradInput,
+  THCDeviceTensor<Dtype, 4> gradOutput,
   int padT, int padB, int padL, int padR) {
 
   int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
@@ -110,54 +59,10 @@ __global__ void SpatialReplicationPadding_updateGradInput(
   int inputPointX = min(max(padL, outputPointX), gradInput.getSize(3) + padL - 1) - oStartX + iStartX;
   int inputPointY = min(max(padT, outputPointY), gradInput.getSize(2) + padT - 1) - oStartY + iStartY;
 
-  float valueToCopy = gradOutput[batch][plane][outputPointY][outputPointX];
+  Dtype valueToCopy = gradOutput[batch][plane][outputPointY][outputPointX];
   atomicAdd(&gradInput[batch][plane][inputPointY][inputPointX], valueToCopy);
 }
 
-void THNN_CudaSpatialReplicationPadding_updateGradInput(THCState *state,
-                                                        THCudaTensor *input,
-                                                        THCudaTensor *gradOutput,
-                                                        THCudaTensor *gradInput,
-                                                        int padL, int padR,
-                                                        int padT, int padB) {
-
-  THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input), 2,
-                "input tensor must fit into 32-bit index math");
-  THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, gradOutput), 3,
-                "output gradient tensor must fit into 32-bit index math");
-
-  int planeDim = 0;
-  int dimh = 1;
-  int dimw = 2;
-
-  int numInputDims = THCudaTensor_nDimension(state, input);
-  if (numInputDims == 4) {
-    planeDim++;
-    dimh++;
-    dimw++;
-  }
-
-  THCudaTensor_resizeAs(state, gradInput, input);
-  THCudaTensor_zero(state, gradInput);
 
-  THCDeviceTensor<float, 4> devGradInput;
-  THCDeviceTensor<float, 4> devGradOutput;
-
-  if (numInputDims == 3) {
-    devGradInput = toDeviceTensor<float, 3>(state, gradInput).upcastOuter<4>();
-    devGradOutput = toDeviceTensor<float, 3>(state, gradOutput).upcastOuter<4>();
-  } else {
-    devGradInput = toDeviceTensor<float, 4>(state, gradInput);
-    devGradOutput = toDeviceTensor<float, 4>(state, gradOutput);
-  }
-
-  int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3);
-  dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
-            devGradOutput.getSize(1),
-            devGradOutput.getSize(0));
-  dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
-
-  SpatialReplicationPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
-    devGradInput, devGradOutput, padT, padB, padL, padR);
-
-}
+#include "generic/SpatialReplicationPadding.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialSubSampling.cu b/lib/THCUNN/SpatialSubSampling.cu
index d801862..a38661b 100644
--- a/lib/THCUNN/SpatialSubSampling.cu
+++ b/lib/THCUNN/SpatialSubSampling.cu
@@ -1,5 +1,7 @@
 #include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
 
 #define CUDA_MAX_THREADS 1024   // this is safe, in reality 256 is our limit
 
@@ -8,7 +10,8 @@
  *    this function subsamples an input 3D tensor along dimensions 1 and 2
  *    3D input, 3D output, 1D weight, 1D bias
  */
-__global__ void subsample(float *input, float *output, float *weight, float *bias,
+ template <typename Dtype, typename Acctype>
+__global__ void subsample(Dtype *input, Dtype *output, Dtype *weight, Dtype *bias,
                           int input_n, int input_h, int input_w,
                           int kH, int kW, int dH, int dW)
 {
@@ -37,18 +40,18 @@ __global__ void subsample(float *input, float *output, float *weight, float *bia
   input = input + i*input_w*input_h;
 
   // Get the good mask for (k,i) (k out, i in)
-  float the_weight = weight[k];
+  Dtype the_weight = weight[k];
 
   // Initialize to the bias
-  float the_bias = bias[k];
+  Dtype the_bias = bias[k];
 
   // For all output pixels...
   for(yy = yy_start; yy < yy_end; yy+=yy_step) {
     for(xx = xx_start; xx < xx_end; xx+=xx_step) {
       // Compute the mean of the input image...
-      float *ptr_input = input + yy*dH*input_w + xx*dW;
-      float *ptr_output = output + yy*output_w + xx;
-      float sum = 0;
+      Dtype *ptr_input = input + yy*dH*input_w + xx*dW;
+      Dtype *ptr_output = output + yy*output_w + xx;
+      Acctype sum = 0;
       int kx, ky;
       for(ky = 0; ky < kH; ky++) {
         for(kx = 0; kx < kW; kx++)
@@ -56,7 +59,7 @@ __global__ void subsample(float *input, float *output, float *weight, float *bia
         ptr_input += input_w; // next input line
       }
       // Update output
-      *ptr_output = the_weight*sum + the_bias;
+      *ptr_output = ScalarConvert<Acctype, Dtype>::to(the_weight*sum + the_bias);
     }
   }
 }
@@ -65,7 +68,8 @@ __global__ void subsample(float *input, float *output, float *weight, float *bia
  * Description:
  *    this function computes the gradWeight from input and gradOutput
  */
-__global__ void subgradweight(float *input, float *gradOutput, float *gradWeight, float *gradBias,
+ template <typename Dtype, typename Acctype>
+__global__ void subgradweight(Dtype *input, Dtype *gradOutput, Dtype *gradWeight, Dtype *gradBias,
                               int input_n, int input_h, int input_w,
                               int kH, int kW, int dH, int dW,
                               float scale)
@@ -98,15 +102,15 @@ __global__ void subgradweight(float *input, float *gradOutput, float *gradWeight
   int tid = blockDim.x*threadIdx.y + threadIdx.x;
 
   // create array to hold partial sums
-  __shared__ float sums[CUDA_MAX_THREADS];
+  __shared__ Acctype sums[CUDA_MAX_THREADS];
   sums[tid] = 0;
 
   // compute partial sums
   for(yy = yy_start; yy < yy_end; yy+=yy_step) {
     for(xx = xx_start; xx < xx_end; xx+=xx_step) {
-      float *ptr_input = input + yy*dH*input_w + xx*dW;
-      float *ptr_gradOutput = gradOutput + yy*output_w + xx;
-      float z = *ptr_gradOutput;
+      Dtype *ptr_input = input + yy*dH*input_w + xx*dW;
+      Dtype *ptr_gradOutput = gradOutput + yy*output_w + xx;
+      Dtype z = *ptr_gradOutput;
       long kx, ky;
       for(ky = 0; ky < kH; ky++) {
         for(kx = 0; kx < kW; kx++) {
@@ -120,7 +124,11 @@ __global__ void subgradweight(float *input, float *gradOutput, float *gradWeight
 
   // reduce: accumulate all partial sums to produce final gradWeight
   if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
-    for(int i = 0; i < blockDim.x*blockDim.y; i++) gradWeight[k] += scale*sums[i];
+    Acctype scaledSums = Acctype(0);
+    for(int i = 0; i < blockDim.x*blockDim.y; i++) {
+      scaledSums += scale*sums[i];
+    }
+    gradWeight[k] += ScalarConvert<Acctype, Dtype>::to(scaledSums);
   }
   __syncthreads();
 
@@ -133,8 +141,11 @@ __global__ void subgradweight(float *input, float *gradOutput, float *gradWeight
 
   // reduce gradBias
   if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
-    for (int i=0; i<(blockDim.x*blockDim.y); i++)
-      gradBias[k] += scale*sums[i];
+    Acctype scaledSums = Acctype(0);
+    for (int i=0; i<(blockDim.x*blockDim.y); i++) {
+      scaledSums += scale*sums[i];
+    }
+    gradBias[k] += ScalarConvert<Acctype, Dtype>::to(scaledSums);
   }
 }
 
@@ -142,7 +153,8 @@ __global__ void subgradweight(float *input, float *gradOutput, float *gradWeight
  * Description:
  *    this function computes the gradInput from weight and gradOutput
  */
-__global__ void subgradinput(float *gradInput, float *gradOutput, float *weight,
+ template <typename Dtype>
+__global__ void subgradinput(Dtype *gradInput, Dtype *gradOutput, Dtype *weight,
                              int input_n, int input_h, int input_w,
                              int kH, int kW, int dH, int dW)
 {
@@ -171,18 +183,20 @@ __global__ void subgradinput(float *gradInput, float *gradOutput, float *weight,
   gradInput = gradInput + i*input_w*input_h;
 
   // get weight
-  float the_weight = weight[k];
+  Dtype the_weight = weight[k];
 
   // compute gradInput
   for(yy = yy_start; yy < yy_end; yy+=yy_step) {
     for(xx = xx_start; xx < xx_end; xx+=xx_step) {
-      float *ptr_gradInput = gradInput + yy*dH*input_w + xx*dW;
-      float *ptr_gradOutput = gradOutput + yy*output_w + xx;
-      float z = *ptr_gradOutput * the_weight;
+      Dtype *ptr_gradInput = gradInput + yy*dH*input_w + xx*dW;
+      Dtype *ptr_gradOutput = gradOutput + yy*output_w + xx;
+      Dtype z = *ptr_gradOutput * the_weight;
       int kx, ky;
       for(ky = 0; ky < kH; ky++) {
-        for(kx = 0; kx < kW; kx++)
+        for(kx = 0; kx < kW; kx++) {
+          // FIXME: should this be done at accreal precision?
           ptr_gradInput[kx] += z;
+        }
         ptr_gradInput += input_w;
       }
     }
@@ -193,7 +207,8 @@ __global__ void subgradinput(float *gradInput, float *gradOutput, float *weight,
  * Description:
  *    this function computes the gradInput from weight and gradOutput
  */
-__global__ void subgradinputAtomic(float *gradInput, float *gradOutput, float *weight,
+ template <typename Dtype>
+__global__ void subgradinputAtomic(Dtype *gradInput, Dtype *gradOutput, Dtype *weight,
                                    int input_n, int input_h, int input_w,
                                    int kH, int kW, int dH, int dW)
 {
@@ -222,17 +237,18 @@ __global__ void subgradinputAtomic(float *gradInput, float *gradOutput, float *w
   gradInput = gradInput + i*input_w*input_h;
 
   // get weight
-  float the_weight = weight[k];
+  Dtype the_weight = weight[k];
 
   // compute gradInput
   for(yy = yy_start; yy < yy_end; yy+=yy_step) {
     for(xx = xx_start; xx < xx_end; xx+=xx_step) {
-      float *ptr_gradInput = gradInput + yy*dH*input_w + xx*dW;
-      float *ptr_gradOutput = gradOutput + yy*output_w + xx;
-      float z = *ptr_gradOutput * the_weight;
+      Dtype *ptr_gradInput = gradInput + yy*dH*input_w + xx*dW;
+      Dtype *ptr_gradOutput = gradOutput + yy*output_w + xx;
+      Dtype z = *ptr_gradOutput * the_weight;
       int kx, ky;
       for(ky = 0; ky < kH; ky++) {
         for(kx = 0; kx < kW; kx++) {
+          // FIXME: should this be done at accreal precision?
           atomicAdd(&(ptr_gradInput[kx]), z);
         }
         ptr_gradInput += input_w;
@@ -241,205 +257,8 @@ __global__ void subgradinputAtomic(float *gradInput, float *gradOutput, float *w
   }
 }
 
-void THNN_CudaSpatialSubSampling_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *weight, THCudaTensor *bias, int kW, int kH, int dW, int dH)
-{
-  float *weight_data = THCudaTensor_data(state, weight);
-  float *bias_data = THCudaTensor_data(state, bias);
-  float *output_data;
-  float *input_data;
-
-  int nInputPlane = THCudaTensor_size(state, weight, 0);
-
-  THCUNN_assertSameGPU(state, 4, input, output, weight, bias);
-  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
-
-  if (input->nDimension == 3) {
-    long nInputCols = input->size[2];
-    long nInputRows = input->size[1];
-    long nOutputCols = (nInputCols - kW) / dW + 1;
-    long nOutputRows = (nInputRows - kH) / dH + 1;
-
-    THArgCheck(input->size[0] == nInputPlane, 2, "invalid number of input planes");
-    THArgCheck(nInputCols >= kW && nInputRows >= kH, 2, "input image smaller than kernel size");
-
-    input = THCudaTensor_newContiguous(state, input);
-    input_data = THCudaTensor_data(state, input);
-
-    THCudaTensor_resize3d(state, output, nInputPlane, nOutputRows, nOutputCols);
-    output_data = THCudaTensor_data(state, output);
-
-    // cuda blocks & threads:
-    int yblocks = (int)(16L / nInputPlane);
-    yblocks = yblocks < 1 ? 1 : yblocks;
-    dim3 blocks(nInputPlane,yblocks);
-    dim3 threads(32,8);
-
-    // run subsample kernel
-    subsample <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
-      input_data, output_data, weight_data, bias_data,
-      nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
-    THCudaCheck(cudaGetLastError());
-  } else {
-    long nInputCols = input->size[3];
-    long nInputRows = input->size[2];
-    long nbatch = input->size[0];
-    long nOutputCols = (nInputCols - kW) / dW + 1;
-    long nOutputRows = (nInputRows - kH) / dH + 1;
-
-    THArgCheck(input->size[1] == nInputPlane, 2, "invalid number of input planes");
-    THArgCheck(nInputCols >= kW && nInputRows >= kH, 2, "input image smaller than kernel size");
-
-    input = THCudaTensor_newContiguous(state, input);
-    input_data = THCudaTensor_data(state, input);
-
-    THCudaTensor_resize4d(state, output, nbatch, nInputPlane, nOutputRows, nOutputCols);
-    output_data = THCudaTensor_data(state, output);
-
-    // cuda blocks & threads:
-    int yblocks = (int)(16L / nInputPlane);
-    yblocks = yblocks < 1 ? 1 : yblocks;
-    dim3 blocks(nInputPlane*nbatch,yblocks);
-    dim3 threads(32,8);
-
-    // run subsample kernel
-    subsample <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
-      input_data, output_data, weight_data, bias_data,
-      nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
-    THCudaCheck(cudaGetLastError());
-  }
-
-  // clean
-  THCudaTensor_free(state, input);
-
-}
-
-void THNN_CudaSpatialSubSampling_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *weight, int kW, int kH, int dW, int dH)
-{
-  THCUNN_assertSameGPU(state, 4, input, gradOutput, weight, gradInput);
-
-  int nInputPlane = THCudaTensor_size(state, weight, 0);
-
-  if (input->nDimension == 3) {
-    long nInputCols = input->size[2];
-    long nInputRows = input->size[1];
-
-    float *weight_data = THCudaTensor_data(state, weight);
-    float *gradOutput_data = THCudaTensor_data(state, gradOutput);
-    float *gradInput_data;
-
-    THCudaTensor_resizeAs(state, gradInput, input);
-    THCudaTensor_zero(state, gradInput);
-    gradInput_data = THCudaTensor_data(state, gradInput);
-
-    // cuda blocks & threads:
-    int yblocks = (int)(16L / nInputPlane);
-    yblocks = yblocks < 1 ? 1 : yblocks;
-    dim3 blocks(nInputPlane,yblocks);
-    dim3 threads(32,8);
-
-    // run updateGradInput kernel
-    if (kH <= dH && kW <= dW) {
-      subgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
-        gradInput_data, gradOutput_data, weight_data,
-        nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
-    } else {
-      subgradinputAtomic <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
-        gradInput_data, gradOutput_data, weight_data,
-        nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
-    }
-    THCudaCheck(cudaGetLastError());
-  } else {
-    long nInputCols = input->size[3];
-    long nInputRows = input->size[2];
-    long nbatch = input->size[0];
-
-    float *weight_data = THCudaTensor_data(state, weight);
-    float *gradOutput_data = THCudaTensor_data(state, gradOutput);
-    float *gradInput_data;
-
-    THCudaTensor_resizeAs(state, gradInput, input);
-    THCudaTensor_zero(state, gradInput);
-    gradInput_data = THCudaTensor_data(state, gradInput);
-
-    // cuda blocks & threads:
-    int yblocks = (int)(16L / nInputPlane);
-    yblocks = yblocks < 1 ? 1 : yblocks;
-    dim3 blocks(nInputPlane*nbatch,yblocks);
-    dim3 threads(32,8);
-
-    // run updateGradInput kernel
-    if (kH <= dH && kW <= dW) {
-      subgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
-        gradInput_data, gradOutput_data, weight_data,
-        nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
-    } else {
-      subgradinputAtomic <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
-        gradInput_data, gradOutput_data, weight_data,
-        nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
-    }
-    THCudaCheck(cudaGetLastError());
-  }
-}
-
-void THNN_CudaSpatialSubSampling_accGradParameters(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradWeight, THCudaTensor *gradBias, int kW, int kH, int dW, int dH, float scale)
-{
-  THCUNN_assertSameGPU(state, 4, input, gradOutput, gradWeight, gradBias);
-
-  int nInputPlane = THCudaTensor_size(state, gradWeight, 0);
-
-  if (input->nDimension == 3) {
-    long nInputCols = input->size[2];
-    long nInputRows = input->size[1];
-
-    float *gradWeight_data = THCudaTensor_data(state, gradWeight);
-    float *gradBias_data = THCudaTensor_data(state, gradBias);
-    float *gradOutput_data = THCudaTensor_data(state, gradOutput);
-    float *input_data;
-
-    input = THCudaTensor_newContiguous(state, input);
-    input_data = THCudaTensor_data(state, input);
-
-    // cuda blocks & threads:
-    dim3 blocks(nInputPlane);
-    dim3 threads(32,8);
-
-    // run gradweight kernel
-    subgradweight <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
-      input_data, gradOutput_data, gradWeight_data, gradBias_data,
-      nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale);
-    THCudaCheck(cudaGetLastError());
-  } else {
-    long nInputCols = input->size[3];
-    long nInputRows = input->size[2];
-    long nbatch = input->size[0];
-
-    float *gradWeight_data = THCudaTensor_data(state, gradWeight);
-    float *gradBias_data = THCudaTensor_data(state, gradBias);
-    float *gradOutput_data = THCudaTensor_data(state, gradOutput);
-    float *input_data;
-
-    input = THCudaTensor_newContiguous(state, input);
-    input_data = THCudaTensor_data(state, input);
-
-    // cuda blocks & threads:
-    dim3 blocks(nInputPlane);
-    dim3 threads(32,8);
-
-    // run gradweight kernel
-    long sl;
-    for (sl=0; sl<nbatch; sl++) {
-      subgradweight <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
-        input_data + sl*input->stride[0],
-        gradOutput_data + sl*gradOutput->stride[0],
-        gradWeight_data, gradBias_data,
-        nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale);
-    }
-    THCudaCheck(cudaGetLastError());
-  }
 
-  // clean
-  THCudaTensor_free(state, input);
-
-}
+#include "generic/SpatialSubSampling.cu"
+#include "THCGenerateFloatTypes.h"
 
 #undef CUDA_MAX_THREADS
diff --git a/lib/THCUNN/SpatialUpSamplingBilinear.cu b/lib/THCUNN/SpatialUpSamplingBilinear.cu
index 27fab85..039f8e7 100644
--- a/lib/THCUNN/SpatialUpSamplingBilinear.cu
+++ b/lib/THCUNN/SpatialUpSamplingBilinear.cu
@@ -5,10 +5,14 @@
 #include "THCDeviceTensor.cuh"
 #include "THCDeviceTensorUtils.cuh"
 #include "THCDeviceUtils.cuh"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
 
+template<typename Dtype, typename Acctype>
 __global__ void caffe_gpu_interp2_kernel(const int n,
-    const float rheight, const float rwidth,
-    const THCDeviceTensor<float, 4> data1, THCDeviceTensor<float, 4> data2) {
+    const Acctype rheight, const Acctype rwidth,
+    const THCDeviceTensor<Dtype, 4> data1, THCDeviceTensor<Dtype, 4> data2) {
   int index = threadIdx.x + blockIdx.x * blockDim.x;
   const int batchsize = data1.getSize(0);
   const int channels = data1.getSize(1);
@@ -26,72 +30,42 @@ __global__ void caffe_gpu_interp2_kernel(const int n,
       const int w1 = w2;
       for (int n = 0; n < batchsize ; n++){
         for (int c = 0; c < channels; ++c) {
-          const float val = data1[n][c][h1][w1];
+          const Dtype val = data1[n][c][h1][w1];
           data2[n][c][h2][w2] = val;
         }
       }
       return;
     }
     //
-    const float h1r = rheight * h2;
+    const Acctype h1r = rheight * h2;
     const int h1 = h1r;
     const int h1p = (h1 < height1 - 1) ? 1 : 0;
-    const float h1lambda = h1r - h1;
-    const float h0lambda = 1.0f - h1lambda;
+    const Acctype h1lambda = h1r - h1;
+    const Acctype h0lambda = Acctype(1) - h1lambda;
     //
-    const float w1r = rwidth * w2;
+    const Acctype w1r = rwidth * w2;
     const int w1 = w1r;
     const int w1p = (w1 < width1 - 1) ? 1 : 0;
-    const float w1lambda = w1r - w1;
-    const float w0lambda = 1.0f - w1lambda;
+    const Acctype w1lambda = w1r - w1;
+    const Acctype w0lambda = Acctype(1) - w1lambda;
     //
     for (int n = 0; n < batchsize ; n++){
         for (int c = 0; c < channels; ++c) {
-        const float val = h0lambda * (w0lambda * data1[n][c][h1][w1]
+        const Acctype val = h0lambda * (w0lambda * data1[n][c][h1][w1]
                             + w1lambda * data1[n][c][h1][w1+w1p])
                             + h1lambda * (w0lambda * data1[n][c][h1+h1p][w1]
                             + w1lambda * data1[n][c][h1+h1p][w1+w1p]);
-        data2[n][c][h2][w2] = val;
+        data2[n][c][h2][w2] = ScalarConvert<Acctype, Dtype>::to(val);
       }
     }
   }
 }
 
-void THNN_CudaSpatialUpSamplingBilinear_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-	  int outputHeight,
-          int outputWidth) {
-  input = THCudaTensor_newContiguous(state, input);
-  output = THCudaTensor_newContiguous(state, output);
-  THCUNN_assertSameGPU(state, 2, input, output);
-  THCudaTensor_zero(state, output);
-  THCDeviceTensor<float, 4> idata = toDeviceTensor<float, 4>(state, input);
-  THCDeviceTensor<float, 4> odata = toDeviceTensor<float, 4>(state, output);
-  int height1 = idata.getSize(2);
-  int width1 = idata.getSize(3);
-  int height2 = odata.getSize(2);
-  int width2 = odata.getSize(3);
-  assert( height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0);
-  const float rheight= (height2 > 1) ? (float)(height1 - 1)/(height2 - 1) : 0.f;
-  const float rwidth = (width2 > 1) ? (float)(width1 - 1)/(width2 - 1) : 0.f;
-  const int num_kernels = height2 * width2;
-  const int num_threads =
-    THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock;
-  cudaStream_t stream = THCState_getCurrentStream(state);
-  caffe_gpu_interp2_kernel<<<THCCeilDiv(num_kernels, num_threads), num_threads ,
-   0 , stream>>>(num_kernels, rheight, rwidth, idata, odata);
-  THCudaCheck(cudaGetLastError());
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, output);
-}
-
-
 // Backward (adjoint) operation 1 <- 2 (accumulates)
+template <typename Dtype, typename Acctype>
 __global__ void caffe_gpu_interp2_kernel_backward(const int n,
-    const float rheight, const float rwidth,
-    THCDeviceTensor<float, 4> data1, const THCDeviceTensor<float, 4> data2){
+    const Acctype rheight, const Acctype rwidth,
+    THCDeviceTensor<Dtype, 4> data1, const THCDeviceTensor<Dtype, 4> data2){
   int index = threadIdx.x + blockIdx.x * blockDim.x;
   const int batchsize = data1.getSize(0);
   const int channels = data1.getSize(1);
@@ -108,69 +82,41 @@ __global__ void caffe_gpu_interp2_kernel_backward(const int n,
       const int w1 = w2;
       for (int n = 0; n < batchsize ; n++){
         for (int c = 0; c < channels; ++c) {
-          const float val = data2[n][c][h1][w1];
+          const Dtype val = data2[n][c][h1][w1];
           data1[n][c][h2][w2] += val;
         }
       }
       return;
     }
     //
-    const float h1r = rheight * h2;
+    const Acctype h1r = rheight * h2;
     const int h1 = h1r;
     const int h1p = (h1 < height1 - 1) ? 1 : 0;
-    const float h1lambda = h1r - h1;
-    const float h0lambda = 1.0f - h1lambda;
+    const Acctype h1lambda = h1r - h1;
+    const Acctype h0lambda = Acctype(1) - h1lambda;
     //
-    const float w1r = rwidth * w2;
+    const Acctype w1r = rwidth * w2;
     const int w1 = w1r;
     const int w1p = (w1 < width1 - 1) ? 1 : 0;
-    const float w1lambda = w1r - w1;
-    const float w0lambda = 1.0f - w1lambda;
+    const Acctype w1lambda = w1r - w1;
+    const Acctype w0lambda = Acctype(1) - w1lambda;
     //
     for (int n = 0; n < batchsize ; n++){
       for (int c = 0; c < channels; ++c) {
-        const float d2val = data2[n][c][h2][w2];
-        atomicAdd(data1[n][c][h1][w1].data(), h0lambda * w0lambda * d2val);
-        atomicAdd(data1[n][c][h1][w1+w1p].data(), h0lambda * w1lambda * d2val);
-        atomicAdd(data1[n][c][h1+h1p][w1].data(), h1lambda * w0lambda * d2val);
+        const Dtype d2val = data2[n][c][h2][w2];
+        atomicAdd(data1[n][c][h1][w1].data(),
+                  ScalarConvert<Acctype, Dtype>::to(h0lambda * w0lambda * d2val));
+        atomicAdd(data1[n][c][h1][w1+w1p].data(),
+                  ScalarConvert<Acctype, Dtype>::to(h0lambda * w1lambda * d2val));
+        atomicAdd(data1[n][c][h1+h1p][w1].data(),
+                  ScalarConvert<Acctype, Dtype>::to(h1lambda * w0lambda * d2val));
         atomicAdd(data1[n][c][h1+h1p][w1+w1p].data(),
-                                                  h1lambda * w1lambda * d2val);
+                  ScalarConvert<Acctype, Dtype>::to(h1lambda * w1lambda * d2val));
       }
     }
   }
 }
 
 
-void THNN_CudaSpatialUpSamplingBilinear_updateGradInput(
-          THCState *state,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          int nbatch,
-          int nchannels,
-          int inputHeight,
-          int inputWidth,
-          int outputHeight,
-          int outputWidth) {
-  gradInput = THCudaTensor_newContiguous(state, gradInput);
-  gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-  THCUNN_assertSameGPU(state, 2, gradOutput, gradInput);
-  THCudaTensor_zero(state, gradInput);
-  THCDeviceTensor<float, 4> data1 = toDeviceTensor<float, 4>(state, gradInput);
-  THCDeviceTensor<float, 4> data2 = toDeviceTensor<float, 4>(state, gradOutput);
-  int height1 = data1.getSize(2);
-  int width1 = data1.getSize(3);
-  int height2 = data2.getSize(2);
-  int width2 = data2.getSize(3);
-  assert(height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0);
-  const float rheight= (height2 > 1) ? (float)(height1 - 1)/(height2 - 1) : 0.f;
-  const float rwidth = (width2 > 1) ? (float)(width1 - 1) / (width2 - 1) : 0.f;
-  const int num_kernels = height2 * width2;
-  const int num_threads =
-    THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock;
-  cudaStream_t stream = THCState_getCurrentStream(state);
-  caffe_gpu_interp2_kernel_backward<<<THCCeilDiv(num_kernels, num_threads),
-  num_threads, 0, stream>>>(num_kernels, rheight, rwidth, data1, data2);
-  THCudaCheck(cudaGetLastError());
-  THCudaTensor_free(state, gradInput);
-  THCudaTensor_free(state, gradOutput);
-}
+#include "generic/SpatialUpSamplingBilinear.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialUpSamplingNearest.cu b/lib/THCUNN/SpatialUpSamplingNearest.cu
index c188d3a..5753c6e 100644
--- a/lib/THCUNN/SpatialUpSamplingNearest.cu
+++ b/lib/THCUNN/SpatialUpSamplingNearest.cu
@@ -6,6 +6,8 @@
 #include <thrust/transform_reduce.h>
 #include <thrust/functional.h>
 
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
 /*
  * Description:
@@ -46,7 +48,8 @@ __device__ int translate_idx_inv(int ii, int d1, int d2, int d3, int scale_facto
 
 }
 
-__global__ void upscale(float *input, float *output, long no_elements,
+template <typename Dtype>
+__global__ void upscale(Dtype *input, Dtype *output, long no_elements,
                         int scale_factor, int d1, int d2, int d3)
 {
   // output offset:
@@ -57,121 +60,26 @@ __global__ void upscale(float *input, float *output, long no_elements,
   output[ii]=input[ipidx];
 }
 
-
-void THNN_CudaSpatialUpSamplingNearest_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, int scale_factor)
-{
-  THCudaTensor_zero(state, output);
-
-  THCUNN_assertSameGPU(state, 2, input, output);
-
-  input = THCudaTensor_newContiguous(state, input);
-  // This is for allocating output Tensor
-  long no_elements = 1;
-  for(int i = 0; i < input->nDimension; i++){
-    no_elements *= input->size[i];
-  }
-  no_elements *= scale_factor * scale_factor;
-
-  int d1;
-  int d2;
-  int d3;
-
-  if (input->nDimension == 3) {
-    d1 = output->size[0];
-    d2 = output->size[1];
-    d3 = output->size[2];
-  } else {
-    d1 = output->size[1];
-    d2 = output->size[2];
-    d3 = output->size[3];
-  }
-
-  float *input_data = THCudaTensor_data(state, input);
-  float *output_data = THCudaTensor_data(state, output);
-
-  // cuda blocks & threads:
-  long nthreads = 256;
-  // Max number of blocks: http://en.wikipedia.org/wiki/CUDA
-  // 65535 for SM 2.x, 2^32 -1 for >= 3.0
-  // TODO: When we move to SM 3.5 we should update this
-  long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
-  long n_yblocks = (long)ceil((float)no_elements / (float)(n_xblocks * nthreads));
-  if (n_yblocks > 65535) {
-    THError("Input size is too large!  aborting");
-  }
-  dim3 blocks(n_xblocks, n_yblocks);
-  dim3 threads(nthreads);
-
-  // kernel:
-  upscale<<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (input_data, output_data, no_elements, scale_factor, d1, d2, d3);
-  THCudaCheck(cudaGetLastError());
-
-  // final cut:
-  THCudaTensor_free(state, input);
-}
-
 /*
  * Description:
  */
-__global__ void downscale(float *gradInput_data, float *gradOutput_data, long no_elements,
+template <typename Dtype, typename Acctype>
+__global__ void downscale(Dtype *gradInput_data, Dtype *gradOutput_data, long no_elements,
                               int scale_factor, int d1, int d2, int d3)
 {
   // output offset:
   long ii = threadIdx.x + blockDim.x * blockIdx.x;
   ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y;
   if (ii >= no_elements) return;
+  Acctype sum = Acctype(0);
   for (int i=0; i < scale_factor; i++){
     for(int j=0; j < scale_factor; j++){
       int ipidx = translate_idx_inv(ii, d1, d2, d3, scale_factor, i, j);
-      gradInput_data[ii] += gradOutput_data[ipidx];
+      sum += gradOutput_data[ipidx];
     }
   }
+  gradInput_data[ii] += ScalarConvert<Acctype, Dtype>::to(sum);
 }
 
-
-void THNN_CudaSpatialUpSamplingNearest_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, int scale_factor)
-{
-  THCUNN_assertSameGPU(state, 2, gradOutput, gradInput);
-
-  THCudaTensor_zero(state, gradInput);
-
-  float *gradInput_data = THCudaTensor_data(state, gradInput);
-  float *gradOutput_data = THCudaTensor_data(state, gradOutput);
-
-  long no_elements = 1;
-  for(int i = 0; i < gradInput->nDimension; i++){
-    no_elements *= gradInput->size[i];
-  }
-
-  int d1;
-  int d2;
-  int d3;
-
-  if (gradInput->nDimension == 3) {
-    d1 = gradInput->size[0];
-    d2 = gradInput->size[1];
-    d3 = gradInput->size[2];
-  } else {
-    d1 = gradInput->size[1];
-    d2 = gradInput->size[2];
-    d3 = gradInput->size[3];
-  }
-
-  // cuda blocks & threads:
-  long nthreads = 256;
-  // Max number of blocks: http://en.wikipedia.org/wiki/CUDA
-  // 65535 for SM 2.x, 2^32 -1 for >= 3.0
-  // TODO: When we move to SM 3.5 we should update this
-  long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
-  long n_yblocks = (long)ceil((float)no_elements / (float)(n_xblocks * nthreads));
-  if (n_yblocks > 65535) {
-    THError("Input size is too large!  aborting");
-  }
-  dim3 blocks(n_xblocks, n_yblocks);
-  dim3 threads(nthreads);
-
-  // kernel:
-  downscale<<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data, no_elements,
-    scale_factor, d1, d2, d3);
-  THCudaCheck(cudaGetLastError());
-}
+#include "generic/SpatialUpSamplingNearest.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/Sqrt.cu b/lib/THCUNN/Sqrt.cu
index e1a4fe3..9d0dddc 100644
--- a/lib/THCUNN/Sqrt.cu
+++ b/lib/THCUNN/Sqrt.cu
@@ -1,40 +1,32 @@
 #include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
+template <typename T>
 struct sqrtupdateOutput_functor
 {
-  const float bias;
+  const T bias;
 
-  sqrtupdateOutput_functor(float bias_)
+  sqrtupdateOutput_functor(T bias_)
     : bias(bias_)
   {}
 
-  __device__ void operator()(float *output, const float *input) const
+  __device__ void operator()(T *output, const T *input) const
   {
     *output = sqrt(*input + bias);
   }
 };
 
-void THNN_CudaSqrt_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, float eps)
-{
-  THCUNN_assertSameGPU(state, 2, input, output);
-  THCudaTensor_resizeAs(state, output, input);
-  THC_pointwiseApply2(state, output, input, sqrtupdateOutput_functor(eps));
-}
-
+template <typename T>
 struct sqrtupdateGradInput_functor
 {
   sqrtupdateGradInput_functor() {}
 
-  __device__ void operator()(float *gradInput, const float *output, const float *gradOutput) const
+  __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const
   {
-    *gradInput = (*output == 0.0f) ? 0.0f : ((0.5f * *gradOutput) / *output);
+    *gradInput = (THCNumerics<T>::eq(*output,ScalarConvert<float, T>::to(0.0f))) ? ScalarConvert<float, T>::to(0.0f) : ((ScalarConvert<float, T>::to(0.5f) * *gradOutput) / *output);
   }
 };
 
-void THNN_CudaSqrt_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *output)
-{
-  THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
-  THCudaTensor_resizeAs(state, gradInput, output);
-  THC_pointwiseApply3(state, gradInput, output, gradOutput, sqrtupdateGradInput_functor());
-}
+#include "generic/Sqrt.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/Square.cu b/lib/THCUNN/Square.cu
index a6d147c..4acf67f 100644
--- a/lib/THCUNN/Square.cu
+++ b/lib/THCUNN/Square.cu
@@ -1,32 +1,24 @@
 #include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
+template <typename T>
 struct squareupdateOutput_functor
 {
-  __device__ void operator()(float* output, const float* input) const
+  __device__ void operator()(T* output, const T* input) const
   {
     *output = (*input) * (*input);
   }
 };
 
-void THNN_CudaSquare_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output)
-{
-  THCUNN_assertSameGPU(state, 2, input, output);
-  THCudaTensor_resizeAs(state, output, input);
-  THC_pointwiseApply2(state, output, input, squareupdateOutput_functor());
-}
-
+template <typename T>
 struct squareupdateGradInput_functor
 {
-  __device__ void operator()(float* gradInput, const float* input, const float* gradOutput) const
+  __device__ void operator()(T* gradInput, const T* input, const T* gradOutput) const
   {
-    *gradInput = 2.0 * (*gradOutput) * (*input);
+    *gradInput = ScalarConvert<double, T>::to(2.0) * (*gradOutput) * (*input);
   }
 };
 
-void THNN_CudaSquare_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput)
-{
-  THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
-  THCudaTensor_resizeAs(state, gradInput, input);
-  THC_pointwiseApply3(state, gradInput, input, gradOutput, squareupdateGradInput_functor());
-}
+#include "generic/Square.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/THCHalfAutoNumerics.cuh b/lib/THCUNN/THCHalfAutoNumerics.cuh
new file mode 100644
index 0000000..89a9602
--- /dev/null
+++ b/lib/THCUNN/THCHalfAutoNumerics.cuh
@@ -0,0 +1,248 @@
+#ifndef THC_HALF_AUTO_NUMERICS_INC
+#define THC_HALF_AUTO_NUMERICS_INC
+
+#include "THCHalf.h"
+#include "THCNumerics.cuh"
+
+// Half numerics functions defined as free functions, so cunn code can be
+// written generically, i.e. without excessive calls to THCNumerics<half> functions.
+
+// these functions should move to THCNumerics
+// fmaxType(x, y) returns the larger argument. The float and double overloads use no
+// half type, so they sit outside the CUDA_HALF_TENSOR guard and exist in every build.
+inline __host__ __device__ float fmaxType(float x, float y) {
+  return fmaxf(x, y);
+}
+
+inline __host__ __device__ double fmaxType(double x, double y) {
+  return fmax(x, y);
+}
+
+#ifdef CUDA_HALF_TENSOR
+inline __host__ __device__ half fmaxType(half x, half y) {
+  return THCNumerics<half>::ge(x, y) ? x : y;
+}
+
+inline __host__ __device__ float fmaxType(float x, half y) {
+  return fmaxf(x, ScalarConvert<half, float>::to(y));
+}
+
+inline __host__ __device__ half mul(half a, half b) {
+  // Product of two halves; the implementation depends on the compilation pass.
+#if defined(__CUDA_ARCH__) && defined(CUDA_HALF_INSTRUCTIONS)
+  return __hmul(a, b);
+#elif defined(__CUDA_ARCH__)
+  const float lhs = __half2float(a);
+  const float rhs = __half2float(b);
+  return __float2half(lhs * rhs);
+#else
+  // not compiling device code: use the THC conversion helpers instead of intrinsics
+  return THC_float2half(THC_half2float(a) * THC_half2float(b));
+#endif
+}
+
+inline __host__ __device__ half div(half a, half b) {
+  // Quotient a / b of two halves; the implementation depends on the compilation pass.
+#if defined(__CUDA_ARCH__) && defined(CUDA_HALF_INSTRUCTIONS)
+  return __hdiv(a, b);
+#elif defined(__CUDA_ARCH__)
+  const float numer = __half2float(a);
+  const float denom = __half2float(b);
+  return __float2half(numer / denom);
+#else
+  // not compiling device code: use the THC conversion helpers instead of intrinsics
+  return THC_float2half(THC_half2float(a) / THC_half2float(b));
+#endif
+}
+
+// arithmetic functions
+
+inline __host__ __device__ half operator+(half a, half b) {
+  return THCNumerics<half>::add(a, b);
+}
+
+inline __host__ __device__ float operator+(half a, float b) {
+  return ScalarConvert<half, float>::to(a) + b;
+}
+
+inline __host__ __device__ float operator+(float a, half b) {
+  return a + ScalarConvert<half, float>::to(b);
+}
+
+inline __host__ __device__ double operator+(double a, half b) {
+  return a + ScalarConvert<half, double>::to(b);
+}
+
+inline __host__ __device__ half operator-(half a) {
+  return THCNumerics<half>::neg(a);
+}
+
+inline __host__ __device__ half operator-(half a, half b) {
+  return THCNumerics<half>::add(a, THCNumerics<half>::neg(b));
+}
+
+inline __host__ __device__ half operator-(half a, int b) {
+  return THCNumerics<half>::add(a, THCNumerics<half>::neg(ScalarConvert<int, half>::to(b)));
+}
+
+inline __host__ __device__ float operator-(half a, float b) {
+  return ScalarConvert<half, float>::to(a) - b;
+}
+
+inline __host__ __device__ double operator-(half a, double b) {
+  return ScalarConvert<half, double>::to(a) - b;
+}
+
+inline __host__ __device__ half operator-(int a, half b) {
+  return THCNumerics<half>::add(ScalarConvert<int, half>::to(a), THCNumerics<half>::neg(b));
+}
+
+inline __host__ __device__ float operator-(float a, half b) {
+  return a - ScalarConvert<half, float>::to(b);
+}
+
+inline __host__ __device__ double operator-(double a, half b) {
+  return a - ScalarConvert<half, double>::to(b);
+}
+
+inline __host__ __device__ half operator*(half a, half b) {
+  return mul(a, b);
+}
+
+inline __host__ __device__ float operator*(half a, float b) {
+  return ScalarConvert<half, float>::to(a) * b;
+}
+
+inline __host__ __device__ double operator*(half a, double b) {
+  return ScalarConvert<half, double>::to(a) * b;
+}
+
+inline __host__ __device__ half operator*(half a, int b) {
+  return a * ScalarConvert<int, half>::to(b);
+}
+
+inline __host__ __device__ float operator*(float a, half b) {
+  return a * ScalarConvert<half, float>::to(b);
+}
+
+inline __host__ __device__ double operator*(double a, half b) {
+  return a * ScalarConvert<half, double>::to(b);
+}
+
+inline __host__ __device__ half operator/(half a, half b) {
+  return div(a, b);
+}
+
+inline __host__ __device__ float operator/(float a, half b) {
+  return a / ScalarConvert<half, float>::to(b);
+}
+
+inline __host__ __device__ double operator/(double a, half b) {
+  return a / ScalarConvert<half, double>::to(b);
+}
+
+inline __host__ __device__ half operator/(int a, half b) {
+  return ScalarConvert<int, half>::to(a) / b;
+}
+
+inline __host__ __device__ float operator/(half a, float b) {
+  return ScalarConvert<half, float>::to(a) / b;
+}
+
+inline __host__ __device__ double operator/(half a, double b) {
+  return ScalarConvert<half, double>::to(a) / b;
+}
+
+inline __host__ __device__ half operator/(half a, int b) {
+  return a / ScalarConvert<int, half>::to(b);
+}
+
+inline __host__ __device__ half& operator+=(half &lhs, const half &rhs) {
+  lhs = lhs + rhs;
+  return lhs;
+}
+inline __host__ __device__ float& operator+=(float &lhs, const half &rhs) {
+  lhs = lhs + rhs;
+  return lhs;
+}
+
+inline __host__ __device__ float& operator-=(float &lhs, const half &rhs) {
+  lhs = lhs - rhs;
+  return lhs;
+}
+
+inline __host__ __device__ half& operator*=(half &lhs, const half &rhs) {
+  lhs = lhs * rhs;
+  return lhs;
+}
+
+inline __host__ __device__ half& operator/=(half &lhs, const int &rhs) {
+  lhs = lhs / rhs;
+  return lhs;
+}
+
+inline __host__ __device__ half& operator/=(half &lhs, const half &rhs) {
+  lhs = lhs / rhs;
+  return lhs;
+}
+
+inline __host__ __device__ half abs(half a) {
+  return THCNumerics<half>::abs(a);
+}
+
+inline __host__ __device__ half exp(half a) {
+  return THCNumerics<half>::exp(a);
+}
+
+inline __host__ __device__ half log1p(half a) {
+  return THCNumerics<half>::log1p(a);
+}
+
+inline __host__ __device__ half pow(half a, half b) {
+  return THCNumerics<half>::pow(a, b);
+}
+
+inline __host__ __device__ half sqrt(half a) {
+  return THCNumerics<half>::sqrt(a);
+}
+
+inline __host__ __device__ half tanh(half a) {
+  return THCNumerics<half>::tanh(a);
+}
+
+// comparison functions
+
+inline __host__ __device__ bool operator<(half a, half b) {
+  return THCNumerics<half>::lt(a, b);
+}
+
+inline __host__ __device__ bool operator<=(half a, half b) {
+  return THCNumerics<half>::le(a, b);
+}
+
+inline __host__ __device__ bool operator<=(half a, int b) {
+  return THCNumerics<half>::le(a, ScalarConvert<int, half>::to(b));
+}
+
+inline __host__ __device__ bool operator<(half a, int b) {
+  return THCNumerics<half>::lt(a, ScalarConvert<int, half>::to(b));
+}
+
+inline __host__ __device__ bool operator>(half a, half b) {
+  return THCNumerics<half>::gt(a, b);
+}
+
+inline __host__ __device__ bool operator>(half a, int b) {
+  return THCNumerics<half>::gt(a, ScalarConvert<int, half>::to(b));
+}
+
+inline __host__ __device__ bool operator>=(half a, half b) {
+  return THCNumerics<half>::ge(a, b);
+}
+
+inline __host__ __device__ bool operator>=(half a, int b) {
+  return THCNumerics<half>::ge(a, ScalarConvert<int ,half>::to(b));
+}
+
+#endif
+#endif
diff --git a/lib/THCUNN/THCUNN.h b/lib/THCUNN/THCUNN.h
index 9aeef2d..4e95b06 100644
--- a/lib/THCUNN/THCUNN.h
+++ b/lib/THCUNN/THCUNN.h
@@ -1,1090 +1,11 @@
 #include <THC/THC.h>
 #include <THC/THCApply.cuh>
 
-#define THIndexTensor THCudaLongTensor
-#define THIndexTensor_(NAME) THCudaLongTensor_ ## NAME
+#define THCIndexTensor THCudaLongTensor
+#define THCIndexTensor_(NAME) THCudaLongTensor_ ## NAME
+typedef long THCIndex_t;
 
-TH_API void THNN_CudaAbs_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output);
-TH_API void THNN_CudaAbs_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput);
+#define THNN_(NAME) TH_CONCAT_3(THNN_, CReal, NAME)
 
-TH_API void THNN_CudaAbsCriterion_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *target,
-          THCudaTensor *output,
-          bool sizeAverage);
-TH_API void THNN_CudaAbsCriterion_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *target,
-          THCudaTensor *gradInput,
-          bool sizeAverage);
-
-TH_API void THNN_CudaBCECriterion_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *target,
-          THCudaTensor *output,
-          bool sizeAverage,
-          THCudaTensor *weights);      // [OPTIONAL]
-TH_API void THNN_CudaBCECriterion_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *target,
-          THCudaTensor *gradInput,
-          bool sizeAverage,
-          THCudaTensor *weights);      // [OPTIONAL]
-
-TH_API void THNN_CudaClassNLLCriterion_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THIndexTensor *target,
-          THCudaTensor *output,
-          bool sizeAverage,
-          THCudaTensor *weights,       // [OPTIONAL]
-          THCudaTensor *total_weight);
-TH_API void THNN_CudaClassNLLCriterion_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THIndexTensor *target,
-          THCudaTensor *gradInput,
-          bool sizeAverage,
-          THCudaTensor *weights,       // [OPTIONAL]
-          THCudaTensor *total_weight);
-
-TH_API void THNN_CudaSpatialClassNLLCriterion_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THIndexTensor *target,
-          THCudaTensor *output,
-          bool sizeAverage,
-          THCudaTensor *weights,       // [OPTIONAL]
-          THCudaTensor *total_weight);
-TH_API void THNN_CudaSpatialClassNLLCriterion_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THIndexTensor *target,
-          THCudaTensor *gradInput,
-          bool sizeAverage,
-          THCudaTensor *weights,       // [OPTIONAL]
-          THCudaTensor *total_weight);
-
-TH_API void THNN_CudaDistKLDivCriterion_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *target,
-          THCudaTensor *output,
-          bool sizeAverage);
-TH_API void THNN_CudaDistKLDivCriterion_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *target,
-          THCudaTensor *gradInput,
-          bool sizeAverage);
-
-TH_API void THNN_CudaELU_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          float alpha,
-          bool inplace);
-TH_API void THNN_CudaELU_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *output,
-          float alpha,
-          bool inplace);
-
-TH_API void THNN_CudaHardTanh_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          float min_val,
-          float max_val,
-          bool inplace);
-TH_API void THNN_CudaHardTanh_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          float min_val,
-          float max_val,
-          bool inplace);
-
-TH_API void THNN_CudaL1Cost_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output);
-TH_API void THNN_CudaL1Cost_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,    // [OPTIONAL]
-          THCudaTensor *gradInput);
-
-TH_API void THNN_CudaLeakyReLU_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          double negval, bool inplace);
-TH_API void THNN_CudaLeakyReLU_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          double negval,
-          bool inplace);
-
-TH_API void THNN_CudaLogSigmoid_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *buffer);
-TH_API void THNN_CudaLogSigmoid_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *buffer);
-
-TH_API void THNN_CudaLogSoftMax_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output);
-TH_API void THNN_CudaLogSoftMax_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *output);
-
-TH_API void THNN_CudaLookupTable_accGradParameters(
-          THCState *state,
-          THIndexTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradWeight,
-          THIndexTensor *count,
-          THIndexTensor *sorted,        // [OPTIONAL]
-          THIndexTensor *indices,       // [OPTIONAL]
-          bool scaleGradByFreq,
-          int paddingValue,
-          float scale);
-
-TH_API void THNN_CudaLookupTable_renorm(
-          THCState *state,
-          THIndexTensor *idx,
-          THCudaTensor *weight,
-          float maxNorm,
-          float normType);
-
-TH_API void THNN_CudaMarginCriterion_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *target,
-          THCudaTensor *output,
-          bool sizeAverage,
-          float margin);
-TH_API void THNN_CudaMarginCriterion_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *target,
-          THCudaTensor *gradInput,
-          bool sizeAverage,
-          float margin);
-
-TH_API void THNN_CudaSoftMarginCriterion_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *target,
-          THCudaTensor *output,
-          int sizeAverage);
-
-TH_API void THNN_CudaSoftMarginCriterion_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *target,
-          THCudaTensor *gradInput,
-          int sizeAverage);
-
-TH_API void THNN_CudaMSECriterion_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *target,
-          THCudaTensor *output,
-          bool sizeAverage);
-TH_API void THNN_CudaMSECriterion_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *target,
-          THCudaTensor *gradInput,
-          bool sizeAverage);
-
-TH_API void THNN_CudaMultiMarginCriterion_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *target,
-          THCudaTensor *output,
-          bool sizeAverage,
-          int p,
-          THCudaTensor *weights,       // [OPTIONAL]
-          float margin);
-TH_API void THNN_CudaMultiMarginCriterion_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *target,
-          THCudaTensor *gradInput,
-          bool sizeAverage,
-          int p,
-          THCudaTensor *weights,       // [OPTIONAL]
-          float margin);
-
-TH_API void THNN_CudaMultiLabelMarginCriterion_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *target,
-          THCudaTensor *output,
-          THCudaTensor *istarget,
-          bool sizeAverage);
-TH_API void THNN_CudaMultiLabelMarginCriterion_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *target,
-          THCudaTensor *gradInput,
-          THCudaTensor *istarget,
-          bool sizeAverage);
-
-TH_API void THNN_CudaPReLU_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *weight,
-          long nOutputPlane);
-TH_API void THNN_CudaPReLU_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *weight,
-          long nOutputPlane);
-TH_API void THNN_CudaPReLU_accGradParameters(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *weight,
-          THCudaTensor *gradWeight,
-          THCudaTensor *gradWeightBuf,
-          THCudaTensor *gradWeightBuf2,
-          long nOutputPlane,
-          float scale);
-
-TH_API void THNN_CudaRReLU_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *noise,
-          double lower,
-          double upper,
-          bool train,
-          bool inplace,
-          void *generator);
-TH_API void THNN_CudaRReLU_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *noise,
-          double lower,
-          double upper,
-          bool train,
-          bool inplace);
-
-TH_API void THNN_CudaSigmoid_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output);
-TH_API void THNN_CudaSigmoid_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *output);
-
-TH_API void THNN_CudaSmoothL1Criterion_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *target,
-          THCudaTensor *output,
-          bool sizeAverage);
-TH_API void THNN_CudaSmoothL1Criterion_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *target,
-          THCudaTensor *gradInput,
-          bool sizeAverage);
-
-TH_API void THNN_CudaSoftMax_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output);
-TH_API void THNN_CudaSoftMax_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *output);
-
-TH_API void THNN_CudaSoftPlus_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          float beta,
-          float threshold);
-TH_API void THNN_CudaSoftPlus_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *output,
-          float beta,
-          float threshold);
-
-TH_API void THNN_CudaSoftShrink_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          double lambda);
-TH_API void THNN_CudaSoftShrink_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          double lambda);
-
-TH_API void THNN_CudaSqrt_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          float eps);
-TH_API void THNN_CudaSqrt_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *output);
-
-TH_API void THNN_CudaSquare_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output);
-TH_API void THNN_CudaSquare_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput);
-
-TH_API void THNN_CudaTanh_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output);
-TH_API void THNN_CudaTanh_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *output);
-
-TH_API void THNN_CudaThreshold_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          double threshold,
-          double val,
-          bool inplace);
-TH_API void THNN_CudaThreshold_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          double threshold,
-          double val,
-          bool inplace);
-
-TH_API void THNN_CudaTemporalConvolution_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *weight,
-          THCudaTensor *bias,
-          int kW, int dW,
-          int inputFrameSize,
-          int outputFrameSize);
-
-TH_API void THNN_CudaTemporalConvolution_updateGradInput(
-          THCState* state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *weight,
-          int kW, int dW);
-
-TH_API void THNN_CudaTemporalConvolution_accGradParameters(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradWeight,
-          THCudaTensor *gradBias,
-          int kW, int dW,
-          float scale);
-
-TH_API void THNN_CudaTemporalMaxPooling_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *indices,
-          int kW, int dW);
-
-TH_API void THNN_CudaTemporalMaxPooling_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *indices,
-          int kW, int dW);
-
-TH_API void THNN_CudaSparseLinear_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *weight,
-          THCudaTensor *bias);
-TH_API void THNN_CudaSparseLinear_accGradParameters(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradWeight,
-          THCudaTensor *gradBias,
-          THCudaTensor *weight,
-          THCudaTensor *bias,
-          double weightDecay,
-          double scale);
-TH_API void THNN_CudaSparseLinear_legacyUpdateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *weight,
-          THCudaTensor *bias);
-TH_API void THNN_CudaSparseLinear_legacyAccGradParameters(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradWeight,
-          THCudaTensor *gradBias,
-          THCudaTensor *weight,
-          THCudaTensor *bias,
-          double weightDecay,
-          double scale);
-TH_API void THNN_CudaSparseLinear_zeroGradParameters(
-          THCState *state,
-          THCudaTensor *gradWeight,
-          THCudaTensor *gradBias,
-          THCudaTensor *lastInput);
-TH_API void THNN_CudaSparseLinear_updateParameters(
-          THCState *state,
-          THCudaTensor *weight,
-          THCudaTensor *bias,
-          THCudaTensor *gradWeight,
-          THCudaTensor *gradBias,
-          THCudaTensor *lastInput,
-          double learningRate);
-
-TH_API void THNN_CudaBatchNormalization_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *weight,        // [OPTIONAL]
-          THCudaTensor *bias,          // [OPTIONAL]
-          THCudaTensor *runningMean,
-          THCudaTensor *runningVar,
-          THCudaTensor *saveMean,
-          THCudaTensor *saveStd,
-          bool train,
-          double momentum,
-          double eps);
-TH_API void THNN_CudaBatchNormalization_backward(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,     // [OPTIONAL]
-          THCudaTensor *gradWeight,    // [OPTIONAL]
-          THCudaTensor *gradBias,      // [OPTIONAL]
-          THCudaTensor *weight,        // [OPTIONAL]
-          THCudaTensor *running_mean,
-          THCudaTensor *running_var,
-          THCudaTensor *save_mean,
-          THCudaTensor *save_std,
-          bool train,
-          float scale,
-          double eps);
-
-TH_API void THNN_CudaSpatialConvolutionMM_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *weight,
-          THCudaTensor *bias,          // [OPTIONAL]
-          THCudaTensor *columns,
-          THCudaTensor *ones,
-          int kW, int kH,
-          int dW, int dH,
-          int padW, int padH);
-TH_API void THNN_CudaSpatialConvolutionMM_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *weight,
-          THCudaTensor *columns,
-          THCudaTensor *ones,
-          int kW, int kH,
-          int dW, int dH,
-          int padW, int padH);
-TH_API void THNN_CudaSpatialConvolutionMM_accGradParameters(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradWeight,
-          THCudaTensor *gradBias,      // [OPTIONAL]
-          THCudaTensor *columns,
-          THCudaTensor *ones,
-          int kW, int kH,
-          int dW, int dH,
-          int padW, int padH,
-          float scale);
-
-TH_API void THNN_CudaSpatialConvolutionLocal_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *weight,
-          THCudaTensor *bias,
-          THCudaTensor *finput,
-          THCudaTensor *fgradInput,
-          int kW, int kH,
-          int dW, int dH,
-          int padW, int padH,
-          long inputWidth, long inputHeight,
-          long outputWidth, long outputHeight);
-TH_API void THNN_CudaSpatialConvolutionLocal_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *weight,
-          THCudaTensor *finput,
-          THCudaTensor *fgradInput,
-          int kW, int kH,
-          int dW, int dH,
-          int padW, int padH,
-          long inputWidth, long inputHeight,
-          long outputWidth, long outputHeight);
-TH_API void THNN_CudaSpatialConvolutionLocal_accGradParameters(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradWeight,
-          THCudaTensor *gradBias,
-          THCudaTensor *finput,
-          THCudaTensor *fgradInput,
-          int kW, int kH,
-          int dW, int dH,
-          int padW, int padH,
-          long inputWidth, long inputHeight,
-          long outputWidth, long outputHeight,
-          float scale);
-
-TH_API void THNN_CudaSpatialFullConvolution_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *weight,
-          THCudaTensor *bias,          // [OPTIONAL]
-          THCudaTensor *columns,
-          THCudaTensor *ones,
-          int kW, int kH,
-          int dW, int dH,
-          int padW, int padH,
-          int adjW, int adjH);
-TH_API void THNN_CudaSpatialFullConvolution_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *weight,
-          THCudaTensor *gradColumns,
-          int kW, int kH,
-          int dW, int dH,
-          int padW, int padH,
-          int adjW, int adjH);
-TH_API void THNN_CudaSpatialFullConvolution_accGradParameters(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradWeight,
-          THCudaTensor *gradBias,      // [OPTIONAL]
-          THCudaTensor *columns,
-          THCudaTensor *ones,
-          int kW, int kH,
-          int dW, int dH,
-          int padW, int padH,
-          int adjW, int adjH,
-          float scale);
-
-TH_API void THNN_CudaSpatialDilatedConvolution_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *weight,
-          THCudaTensor *bias,          // [OPTIONAL]
-          THCudaTensor *columns,
-          THCudaTensor *ones,
-          int kW, int kH,
-          int dW, int dH,
-          int padW, int padH,
-          int dilationW, int dilationH);
-
-TH_API void THNN_CudaSpatialDilatedConvolution_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *weight,
-          THCudaTensor *gradColumns,
-          int kW, int kH,
-          int dW, int dH,
-          int padW, int padH,
-          int dilationW, int dilationH );
-
-TH_API void THNN_CudaSpatialDilatedConvolution_accGradParameters(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradWeight,
-          THCudaTensor *gradBias,      // [OPTIONAL]
-          THCudaTensor *columns,
-          THCudaTensor *ones,
-          int kW, int kH,
-          int dW, int dH,
-          int padW, int padH,
-          int dilationW, int dilationH,
-          float scale);
-
-TH_API void THNN_CudaSpatialCrossMapLRN_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *scale,
-          int size,
-          float alpha,
-          float beta,
-          float k);
-TH_API void THNN_CudaSpatialCrossMapLRN_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *scale,
-          THCudaTensor *output,
-          int size,
-          float alpha,
-          float beta,
-          float k);
-
-TH_API void THNN_CudaSpatialAdaptiveMaxPooling_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *indices,
-          int nOutputCols,
-          int nOutputRows);
-TH_API void THNN_CudaSpatialAdaptiveMaxPooling_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *indices);
-
-TH_API void THNN_CudaSpatialAveragePooling_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          int kW, int kH,
-          int dW, int dH,
-          int padW, int padH,
-          bool ceil_mode,
-          bool count_include_pad);
-TH_API void THNN_CudaSpatialAveragePooling_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          int kW, int kH,
-          int dW, int dH,
-          int padW, int padH,
-          bool ceil_mode,
-          bool count_include_pad);
-
-TH_API void THNN_CudaSpatialMaxPooling_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *indices,
-          int kW, int kH,
-          int dW, int dH,
-          int padW, int padH,
-          bool ceil_mode);
-TH_API void THNN_CudaSpatialMaxPooling_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *indices,
-          int kW, int kH,
-          int dW, int dH,
-          int padW, int padH,
-          bool ceil_mode);
-
-TH_API void THNN_CudaSpatialDilatedMaxPooling_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *indices,
-          int kW, int kH,
-          int dW, int dH,
-          int padW, int padH,
-          int dilationW, int dilationH,
-          bool ceil_mode);
-TH_API void THNN_CudaSpatialDilatedMaxPooling_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *indices,
-          int kW, int kH,
-          int dW, int dH,
-          int padW, int padH,
-          int dilationW, int dilationH,
-          bool ceil_mode);
-
-TH_API void THNN_CudaSpatialMaxUnpooling_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *indices,
-          int owidth, int oheight);
-TH_API void THNN_CudaSpatialMaxUnpooling_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *indices,
-          int owidth, int oheight);
-
-TH_API void THNN_CudaSpatialFractionalMaxPooling_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          int outputW, int outputH,
-          int poolSizeW, int poolSizeH,
-          THCudaTensor *indices,
-          THCudaTensor *randomSamples);
-TH_API void THNN_CudaSpatialFractionalMaxPooling_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          int outputW, int outputH,
-          int poolSizeW, int poolSizeH,
-          THCudaTensor *indices);
-
-TH_API void THNN_CudaSpatialSubSampling_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *weight,
-          THCudaTensor *bias,
-          int kW, int kH,
-          int dW, int dH);
-TH_API void THNN_CudaSpatialSubSampling_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *weight,
-          int kW, int kH,
-          int dW, int dH);
-TH_API void THNN_CudaSpatialSubSampling_accGradParameters(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradWeight,
-          THCudaTensor *gradBias,
-          int kW, int kH,
-          int dW, int dH,
-          float scale);
-
-TH_API void THNN_CudaSpatialUpSamplingNearest_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          int scale_factor);
-TH_API void THNN_CudaSpatialUpSamplingNearest_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          int scale_factor);
-
-TH_API void THNN_CudaSpatialUpSamplingBilinear_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-	  int outputHeight,
-          int outputWidth);
-TH_API void THNN_CudaSpatialUpSamplingBilinear_updateGradInput(
-          THCState *state,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          int nbatch,
-          int nchannels,
-          int inputHeight,
-          int inputWidth,
-          int outputHeight,
-          int outputWidth);
-
-TH_API void THNN_CudaVolumetricAveragePooling_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          int kT, int kW, int kH,
-          int dT, int dW, int dH);
-TH_API void THNN_CudaVolumetricAveragePooling_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          int kT, int kW, int kH,
-          int dT, int dW, int dH);
-
-TH_API void THNN_CudaVolumetricConvolution_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *weight,
-          THCudaTensor *bias,
-          THCudaTensor *finput,
-          THCudaTensor *fgradInput,
-          int dT, int dW, int dH,
-          int padT, int padW, int padH);
-TH_API void THNN_CudaVolumetricConvolution_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *weight,
-          THCudaTensor *finput,
-          int dT, int dW, int dH,
-          int padT, int padW, int padH);
-TH_API void THNN_CudaVolumetricConvolution_accGradParameters(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradWeight,
-          THCudaTensor *gradBias,
-          THCudaTensor *finput,
-          THCudaTensor *fgradInput,
-          int dT, int dW, int dH,
-          int padT, int padW, int padH,
-          float scale);
-
-TH_API void THNN_CudaVolumetricFullConvolution_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *weight,
-          THCudaTensor *bias,
-          THCudaTensor *finput,
-          THCudaTensor *fgradInput,
-          int dT, int dW, int dH,
-          int padT, int padW, int padH,
-          int adjT, int adjW, int adjH);
-TH_API void THNN_CudaVolumetricFullConvolution_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *weight,
-          THCudaTensor *finput,
-          THCudaTensor *fgradInput,
-          int dT, int dW, int dH,
-          int padT, int padW, int padH,
-          int adjT, int adjW, int adjH);
-TH_API void THNN_CudaVolumetricFullConvolution_accGradParameters(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradWeight,
-          THCudaTensor *gradBias,
-          THCudaTensor *finput,
-          THCudaTensor *fgradInput,
-          int dT, int dW, int dH,
-          int padT, int padW, int padH,
-          int adjT, int adjW, int adjH,
-          float scale);
-
-TH_API void THNN_CudaVolumetricDilatedConvolution_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *weight,
-          THCudaTensor *bias,
-          THCudaTensor *columns,
-          THCudaTensor *ones,
-          int kT, int kW, int kH,
-          int dT, int dW, int dH,
-          int padT, int padW, int padH,
-          int dilationT, int dilationW, int dilationH);
-
-TH_API void THNN_CudaVolumetricDilatedConvolution_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *weight,
-          THCudaTensor *gradColumns,
-          int kT, int kW, int kH,
-          int dT, int dW, int dH,
-          int padT, int padW, int padH,
-          int dilationT, int dilationW, int dilationH);
-
-TH_API void THNN_CudaVolumetricDilatedConvolution_accGradParameters(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradWeight,
-          THCudaTensor *gradBias,
-          THCudaTensor *columns,
-          THCudaTensor *ones,
-          int kT, int kW, int kH,
-          int dT, int dW, int dH,
-          int padT, int padW, int padH,
-          int dilationT, int dilationW, int dilationH,
-          float scale);
-
-TH_API void THNN_CudaVolumetricMaxPooling_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *indices,
-          int kT, int kW, int kH,
-          int dT, int dW, int dH,
-          int padT, int padW, int padH,
-          bool ceilMode);
-TH_API void THNN_CudaVolumetricMaxPooling_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *indices,
-          int dT, int dW, int dH,
-          int padT, int padW, int padH);
-
-TH_API void THNN_CudaVolumetricDilatedMaxPooling_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *indices,
-          int kT, int kW, int kH,
-          int dT, int dW, int dH,
-          int padT, int padW, int padH,
-          int dilationT, int dilationW, int dilationH,
-          bool ceilMode);
-TH_API void THNN_CudaVolumetricDilatedMaxPooling_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *indices,
-          int dT, int dW, int dH,
-          int padT, int padW, int padH,
-          int dilationT, int dilationW, int dilationH);
-
-TH_API void THNN_CudaVolumetricMaxUnpooling_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *indices,
-          int outputTime, int outputWidth, int outputHeight,
-          int dT, int dW, int dH,
-          int padT, int padW, int padH);
-TH_API void THNN_CudaVolumetricMaxUnpooling_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *indices,
-          int outputTime, int outputWidth, int outputHeight,
-          int dT, int dW, int dH,
-          int padT, int padW, int padH);
-
-TH_API void THNN_CudaSpatialReflectionPadding_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          int padL, int padR,
-          int padT, int padB);
-TH_API void THNN_CudaSpatialReflectionPadding_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          int padL, int padR,
-          int padT, int padB);
-
-TH_API void THNN_CudaSpatialReplicationPadding_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          int padL, int padR,
-          int padT, int padB);
-TH_API void THNN_CudaSpatialReplicationPadding_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          int padL, int padR,
-          int padT, int padB);
-
-TH_API void THNN_CudaVolumetricReplicationPadding_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          int pleft, int pright,
-          int ptop, int pbottom,
-          int pfront, int pback);
-TH_API void THNN_CudaVolumetricReplicationPadding_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          int pleft, int pright,
-          int ptop, int pbottom,
-          int pfront, int pback);
+#include "generic/THCUNN.h"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/Tanh.cu b/lib/THCUNN/Tanh.cu
index 726169a..0bd1ee8 100644
--- a/lib/THCUNN/Tanh.cu
+++ b/lib/THCUNN/Tanh.cu
@@ -1,32 +1,24 @@
 #include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
+template <typename T>
 struct tanhupdateOutput_functor
 {
-  __device__ void operator()(float *output, const float *input) const
+  __device__ void operator()(T *output, const T *input) const
   {
     *output = tanh(*input);
   }
 };
 
-void THNN_CudaTanh_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output)
-{
-  THCUNN_assertSameGPU(state, 2, input, output);
-  THCudaTensor_resizeAs(state, output, input);
-  THC_pointwiseApply2(state, output, input, tanhupdateOutput_functor());
-}
-
+template <typename T>
 struct tanhupdateGradInput_functor
 {
-  __device__ void operator()(float *gradInput, const float *output, const float *gradOutput) const
+  __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const
   {
     *gradInput = *gradOutput * (1 - *output * *output);
   }
 };
 
-void THNN_CudaTanh_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *output)
-{
-  THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
-  THCudaTensor_resizeAs(state, gradInput, output);
-  THC_pointwiseApply3(state, gradInput, output, gradOutput, tanhupdateGradInput_functor());
-}
+#include "generic/Tanh.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/TemporalConvolution.cu b/lib/THCUNN/TemporalConvolution.cu
index 078fd7b..f4e9c69 100644
--- a/lib/THCUNN/TemporalConvolution.cu
+++ b/lib/THCUNN/TemporalConvolution.cu
@@ -1,345 +1,7 @@
 #include "THCUNN.h"
 #include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
-void THNN_CudaTemporalConvolution_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *weight,
-          THCudaTensor *bias,
-          int kW, int dW,
-          int inputFrameSize,
-          int outputFrameSize) {
-
-  THCudaTensor *outputWindow, *inputWindow;
-  int nInputFrame, nOutputFrame;
-  long k, i;
-
-  int dimS = 0; // sequence dimension
-  int dimF = 1; // feature dimension
-
-  THCUNN_assertSameGPU(state, 4, input, output, weight, bias);
-  THArgCheck( input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");
-
-  if (input->nDimension == 3)
-  {
-    dimS = 1;
-    dimF = 2;
-  }
-  THArgCheck( input->size[dimF] == inputFrameSize, 2, "invalid input frame size");
-  THArgCheck( input->size[dimS] >= kW, 2, "input sequence smaller than kernel size");
-
-  input = THCudaTensor_newContiguous(state, input);
-  outputWindow = THCudaTensor_new(state);
-  inputWindow = THCudaTensor_new(state);
-
-  nInputFrame = input->size[dimS];
-  nOutputFrame = (nInputFrame - kW) / dW + 1;
-
-  if (input->nDimension == 2)
-  {
-    THCudaTensor_resize2d(state, output,
-                          nOutputFrame,
-                          outputFrameSize);
-
-    /* bias first */
-    for(k = 0; k < nOutputFrame; k++)
-    {
-      THCudaTensor_select(state, outputWindow, output, 0, k);
-      THCudaTensor_copy(state, outputWindow, bias);
-    }
-
-
-    /* ouch */
-    for(k = 0; nOutputFrame > 0; k++)
-    {
-      long outputFrameStride = (kW-1)/dW+1;
-      long inputFrameStride = outputFrameStride*dW;
-      long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
-      nOutputFrame -= nFrame;
-
-      THCudaTensor_setStorage2d(state, inputWindow, input->storage,
-                              input->storageOffset+k*dW*input->size[1],
-                              nFrame, inputFrameStride*input->size[1],
-                              kW*input->size[1], 1);
-
-      THCudaTensor_setStorage2d(state, outputWindow, output->storage,
-                              output->storageOffset + k*output->size[1],
-                              nFrame, outputFrameStride*output->size[1],
-                              output->size[1], 1);
-
-      THCudaTensor_transpose(state, weight, NULL, 0, 1);
-      THCudaTensor_addmm(state, outputWindow, 1, outputWindow, 1, inputWindow, weight);
-      THCudaTensor_transpose(state, weight, NULL, 0, 1);
-    }
-  }
-  else
-  {
-    THCudaTensor *outputSample = THCudaTensor_new(state);
-    THCudaTensor *inputSample = THCudaTensor_new(state);
-    int nBatchFrame = input->size[0];
-
-    THCudaTensor_resize3d(state, output,
-                          nBatchFrame,
-                          nOutputFrame,
-                          outputFrameSize);
-
-    for(i = 0; i < nBatchFrame; i++)
-    {
-      THCudaTensor_select(state, outputSample, output, 0, i);
-      THCudaTensor_select(state, inputSample, input, 0, i);
-      long nOutputSampleFrame = nOutputFrame;
-
-      /* bias first */
-      for(k = 0; k < nOutputFrame; k++)
-      {
-        THCudaTensor_select(state, outputWindow, outputSample, 0, k);
-        THCudaTensor_copy(state, outputWindow, bias);
-      }
-
-      /* ouch */
-      for(k = 0; nOutputSampleFrame > 0; k++)
-      {
-        long outputFrameStride = (kW-1)/dW+1;
-        long inputFrameStride = outputFrameStride*dW;
-        long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
-        nOutputSampleFrame -= nFrame;
-
-        THCudaTensor_setStorage2d(state, inputWindow, inputSample->storage,
-                                inputSample->storageOffset+k*dW*inputSample->size[1],
-                                nFrame, inputFrameStride*inputSample->size[1],
-                                kW*inputSample->size[1], 1);
-
-        THCudaTensor_setStorage2d(state, outputWindow, outputSample->storage,
-                                outputSample->storageOffset + k*outputSample->size[1],
-                                nFrame, outputFrameStride*outputSample->size[1],
-                                outputSample->size[1], 1);
-
-        THCudaTensor_transpose(state, weight, NULL, 0, 1);
-        THCudaTensor_addmm(state, outputWindow, 1, outputWindow, 1, inputWindow, weight);
-        THCudaTensor_transpose(state, weight, NULL, 0, 1);
-      }
-    }
-    THCudaTensor_free(state, outputSample);
-    THCudaTensor_free(state, inputSample);
-  }
-
-  THCudaTensor_free(state, outputWindow);
-  THCudaTensor_free(state, inputWindow);
-  THCudaTensor_free(state, input);
-
-}
-
-void THNN_CudaTemporalConvolution_updateGradInput(
-          THCState* state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *weight,
-          int kW, int dW) {
-
-  long nInputFrame;
-  long nOutputFrame;
-
-  THCudaTensor *gradOutputWindow;
-  THCudaTensor *gradInputWindow;
-  long k, i;
-
-  int dimS = 0; // sequence dimension
-
-  THCUNN_assertSameGPU(state, 4, input, gradOutput, weight, gradInput);
-
-  if (gradOutput->nDimension == 3)
-  {
-    dimS = 1;
-  }
-
-  nInputFrame = input->size[dimS];
-  nOutputFrame = gradOutput->size[dimS];
-
-
-  /* Not necessary with partial backprop: */
-  gradOutputWindow = THCudaTensor_new(state);
-  gradInputWindow = THCudaTensor_new(state);
-
-  THCudaTensor_resizeAs(state, gradInput, input);
-  THCudaTensor_zero(state, gradInput);
-
-  if (gradOutput->nDimension == 2)
-  {
-    /* ouch */
-    for(k = 0; nOutputFrame > 0; k++)
-    {
-      long outputFrameStride = (kW-1)/dW+1;
-      long inputFrameStride = outputFrameStride*dW;
-      long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
-      nOutputFrame -= nFrame;
-
-      THCudaTensor_setStorage2d(state, gradOutputWindow, gradOutput->storage,
-                              gradOutput->storageOffset + k*gradOutput->size[1],
-                              nFrame, outputFrameStride*gradOutput->size[1],
-                              gradOutput->size[1], 1);
-
-      THCudaTensor_setStorage2d(state, gradInputWindow, gradInput->storage,
-                              gradInput->storageOffset+k*dW*gradInput->size[1],
-                              nFrame, inputFrameStride*gradInput->size[1],
-                              kW*gradInput->size[1], 1);
-
-      THCudaTensor_addmm(state, gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight);
-    }
-  }
-  else
-  {
-    THCudaTensor *gradOutputSample = THCudaTensor_new(state);
-    THCudaTensor *gradInputSample = THCudaTensor_new(state);
-    long nBatchFrame = input->size[0];
-    for(i = 0; i < nBatchFrame; i++)
-    {
-      THCudaTensor_select(state, gradOutputSample, gradOutput, 0, i);
-      THCudaTensor_select(state, gradInputSample, gradInput, 0, i);
-      long nOutputSampleFrame = nOutputFrame;
-
-      /* ouch */
-      for(k = 0; nOutputSampleFrame > 0; k++)
-      {
-        long outputFrameStride = (kW-1)/dW+1;
-        long inputFrameStride = outputFrameStride*dW;
-        long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
-        nOutputSampleFrame -= nFrame;
-
-        THCudaTensor_setStorage2d(state, gradOutputWindow, gradOutputSample->storage,
-                                gradOutputSample->storageOffset + k*gradOutputSample->size[1],
-                                nFrame, outputFrameStride*gradOutputSample->size[1],
-                                gradOutputSample->size[1], 1);
-
-        THCudaTensor_setStorage2d(state, gradInputWindow, gradInputSample->storage,
-                                gradInputSample->storageOffset+k*dW*gradInputSample->size[1],
-                                nFrame, inputFrameStride*gradInputSample->size[1],
-                                kW*gradInputSample->size[1], 1);
-
-        THCudaTensor_addmm(state, gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight);
-      }
-    }
-    THCudaTensor_free(state, gradOutputSample);
-    THCudaTensor_free(state, gradInputSample);
-  }
-
-  THCudaTensor_free(state, gradOutputWindow);
-  THCudaTensor_free(state, gradInputWindow);
-
-}
-
-void THNN_CudaTemporalConvolution_accGradParameters(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradWeight,
-          THCudaTensor *gradBias,
-          int kW, int dW,
-          float scale) {
-
-  long nInputFrame;
-  long nOutputFrame;
-
-  THCudaTensor *gradOutputWindow;
-  THCudaTensor *inputWindow;
-  long k, i;
-
-  int dimS = 0; // sequence dimension
-
-  if (gradOutput->nDimension == 3)
-  {
-    dimS = 1;
-  }
-
-  nInputFrame = input->size[dimS];
-  nOutputFrame = gradOutput->size[dimS];
-
-  /* Not necessary with partial backprop: */
-  input = THCudaTensor_newContiguous(state, input);
-  gradOutputWindow = THCudaTensor_new(state);
-  inputWindow = THCudaTensor_new(state);
-
-  if (input->nDimension == 2)
-  {
-    /* bias first */
-    for(k = 0; k < nOutputFrame; k++)
-    {
-      THCudaTensor_select(state, gradOutputWindow, gradOutput, 0, k);
-      THCudaTensor_cadd(state, gradBias, gradBias, scale, gradOutputWindow);
-    }
-
-    /* ouch */
-    for(k = 0; nOutputFrame > 0; k++)
-    {
-      long outputFrameStride = (kW-1)/dW+1;
-      long inputFrameStride = outputFrameStride*dW;
-      long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
-      nOutputFrame -= nFrame;
-
-      THCudaTensor_setStorage2d(state, inputWindow, input->storage,
-                              input->storageOffset+k*dW*input->size[1],
-                              nFrame, inputFrameStride*input->size[1],
-                              kW*input->size[1], 1);
-
-      THCudaTensor_setStorage2d(state, gradOutputWindow, gradOutput->storage,
-                              gradOutput->storageOffset + k*gradOutput->size[1],
-                              nFrame, outputFrameStride*gradOutput->size[1],
-                              gradOutput->size[1], 1);
-
-      THCudaTensor_transpose(state, gradOutputWindow, NULL, 0, 1);
-      THCudaTensor_addmm(state, gradWeight, 1, gradWeight, scale, gradOutputWindow, inputWindow);
-      THCudaTensor_transpose(state, gradOutputWindow, NULL, 0, 1);
-    }
-  }
-  else
-  {
-    THCudaTensor *gradOutputSample = THCudaTensor_new(state);
-    THCudaTensor *inputSample = THCudaTensor_new(state);
-    long nBatchFrame = input->size[0];
-
-    for(i = 0; i < nBatchFrame; i++)
-    {
-      THCudaTensor_select(state, gradOutputSample, gradOutput, 0, i);
-      THCudaTensor_select(state, inputSample, input, 0, i);
-      long nOutputSampleFrame = nOutputFrame;
-
-      /* bias first */
-      for(k = 0; k < nOutputFrame; k++)
-      {
-        THCudaTensor_select(state, gradOutputWindow, gradOutputSample, 0, k);
-        THCudaTensor_cadd(state, gradBias, gradBias, scale, gradOutputWindow);
-      }
-
-      /* ouch */
-      for(k = 0; nOutputSampleFrame > 0; k++)
-      {
-        long outputFrameStride = (kW-1)/dW+1;
-        long inputFrameStride = outputFrameStride*dW;
-        long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
-        nOutputSampleFrame -= nFrame;
-
-        THCudaTensor_setStorage2d(state, inputWindow, inputSample->storage,
-                                inputSample->storageOffset+k*dW*inputSample->size[1],
-                                nFrame, inputFrameStride*inputSample->size[1],
-                                kW*inputSample->size[1], 1);
-
-        THCudaTensor_setStorage2d(state, gradOutputWindow, gradOutputSample->storage,
-                                gradOutputSample->storageOffset + k*gradOutputSample->size[1],
-                                nFrame, outputFrameStride*gradOutputSample->size[1],
-                                gradOutputSample->size[1], 1);
-
-        THCudaTensor_transpose(state, gradOutputWindow, NULL, 0, 1);
-        THCudaTensor_addmm(state, gradWeight, 1, gradWeight, scale, gradOutputWindow, inputWindow);
-        THCudaTensor_transpose(state, gradOutputWindow, NULL, 0, 1);
-      }
-    }
-    THCudaTensor_free(state, gradOutputSample);
-    THCudaTensor_free(state, inputSample);
-  }
-
-  THCudaTensor_free(state, gradOutputWindow);
-  THCudaTensor_free(state, inputWindow);
-  THCudaTensor_free(state, input);
-
-}
+#include "generic/TemporalConvolution.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/TemporalMaxPooling.cu b/lib/THCUNN/TemporalMaxPooling.cu
index 0b31c60..384e409 100644
--- a/lib/THCUNN/TemporalMaxPooling.cu
+++ b/lib/THCUNN/TemporalMaxPooling.cu
@@ -1,28 +1,32 @@
 #include "THCUNN.h"
 #include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
 
 #define TEMPORAL_MAX_POOLING_THREADS 1024
 
-__global__ void cunn_TemporalMaxPooling_updateOutputKernel(float *input, float *output, float *indices, int input_w, int input_n, int output_w, int kW, int dW) {
+template <typename Dtype>
+__global__ void cunn_TemporalMaxPooling_updateOutputKernel(Dtype *input, Dtype *output, THCIndex_t *indices, int input_w, int input_n, int output_w, int kW, int dW) {
   // Block idx is the batch index, thread idx + block idx y * MAX_THREADS is the time index
-  float *input_data = input + blockIdx.x * input_w * input_n + (
+  Dtype *input_data = input + blockIdx.x * input_w * input_n + (
       threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n * dW;
-  float *output_data = output + blockIdx.x * output_w * input_n + (
+  Dtype *output_data = output + blockIdx.x * output_w * input_n + (
       threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n;
-  float *indices_data = indices + blockIdx.x * output_w * input_n + (
+  THCIndex_t *indices_data = indices + blockIdx.x * output_w * input_n + (
       threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n;
 
   int feat = 0;
   int time = 0;
   int max_time = input_n * kW;
 
-  float max_value;
-  float max_index = 0.0;
+  Dtype max_value;
+  THCIndex_t max_index = 0;
 
   if (threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS < output_w) {
     // For all features
     for (feat = 0; feat < input_n; ++feat) {
-      max_value = -FLT_MAX;
+      max_value = THCNumerics<Dtype>::min();
       // For all values in the kernel space
       for (time = 0; time < max_time; time += input_n) {
         if (max_value < input_data[time + feat]) {
@@ -31,18 +35,19 @@ __global__ void cunn_TemporalMaxPooling_updateOutputKernel(float *input, float *
         }
       }
       output_data[feat] = max_value;
-      indices_data[feat] = (float)max_index;
+      indices_data[feat] = max_index;
     }
   }
 }
 
-__global__ void cunn_TemporalMaxPooling_updateGradInputKernel(float *gradInput, float *gradOutput, float *indices, int input_w, int input_n, int output_w, int kW, int dW) {
+template <typename Dtype>
+__global__ void cunn_TemporalMaxPooling_updateGradInputKernel(Dtype *gradInput, Dtype *gradOutput, THCIndex_t *indices, int input_w, int input_n, int output_w, int kW, int dW) {
   // Block idx is the batch index, thread idx + block idx y * MAX_THREADS is the time index
-  float *gradInput_data = gradInput + blockIdx.x * input_w * input_n + (
+  Dtype *gradInput_data = gradInput + blockIdx.x * input_w * input_n + (
       threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n * dW;
-  float *gradOutput_data = gradOutput + blockIdx.x * output_w * input_n + (
+  Dtype *gradOutput_data = gradOutput + blockIdx.x * output_w * input_n + (
       threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n;
-  float *indices_data = indices + blockIdx.x * output_w * input_n + (
+  THCIndex_t *indices_data = indices + blockIdx.x * output_w * input_n + (
       threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n;
 
   int feat = 0;
@@ -50,18 +55,19 @@ __global__ void cunn_TemporalMaxPooling_updateGradInputKernel(float *gradInput,
   if (threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS < output_w) {
     // For all features
     for (feat = 0; feat < input_n; ++feat) {
-      gradInput_data[(int)indices_data[feat] * input_n + feat] += gradOutput_data[feat];
+      gradInput_data[indices_data[feat] * input_n + feat] += gradOutput_data[feat];
     }
   }
 }
 
-__global__ void cunn_TemporalMaxPooling_updateGradInputKernelAtomic(float *gradInput, float *gradOutput, float *indices, int input_w, int input_n, int output_w, int kW, int dW) {
+template <typename Dtype>
+__global__ void cunn_TemporalMaxPooling_updateGradInputKernelAtomic(Dtype *gradInput, Dtype *gradOutput, THCIndex_t *indices, int input_w, int input_n, int output_w, int kW, int dW) {
   // Block idx is the batch index, thread idx + block idx y * MAX_THREADS is the time index
-  float *gradInput_data = gradInput + blockIdx.x * input_w * input_n + (
+  Dtype *gradInput_data = gradInput + blockIdx.x * input_w * input_n + (
       threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n * dW;
-  float *gradOutput_data = gradOutput + blockIdx.x * output_w * input_n + (
+  Dtype *gradOutput_data = gradOutput + blockIdx.x * output_w * input_n + (
       threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n;
-  float *indices_data = indices + blockIdx.x * output_w * input_n + (
+  THCIndex_t *indices_data = indices + blockIdx.x * output_w * input_n + (
       threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n;
 
   int feat = 0;
@@ -69,153 +75,10 @@ __global__ void cunn_TemporalMaxPooling_updateGradInputKernelAtomic(float *gradI
   if (threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS < output_w) {
     // For all features
     for (feat = 0; feat < input_n; ++feat) {
-      atomicAdd(&gradInput_data[(int)indices_data[feat] * input_n + feat], gradOutput_data[feat]);
+      atomicAdd(&gradInput_data[indices_data[feat] * input_n + feat], gradOutput_data[feat]);
     }
   }
 }
 
-void THNN_CudaTemporalMaxPooling_updateOutput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *output,
-          THCudaTensor *indices,
-          int kW, int dW) {
-
-  int dimT = 0; // Temporal dimension
-  int dimF = 1; // Feature dimension
-
-  int batch = 1;
-  int input_w;
-  int input_n;
-  int output_w;
-  int nthreads;
-
-  float *input_data;
-  float *output_data;
-  float *indices_data;
-
-  THCUNN_assertSameGPU(state, 3, input, output, indices);
-  THArgCheck( input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");
-
-  if (input->nDimension == 3)
-  {
-    dimT = 1;
-    dimF = 2;
-    batch = input->size[0];
-  }
-  THArgCheck( input->size[dimT] >= kW, 2, "input sequence smaller than kernel size");
-
-  input = THCudaTensor_newContiguous(state, input);
-
-  input_w = input->size[dimT];
-  input_n = input->size[dimF];
-  output_w = (input_w - kW) / dW + 1;
-
-  if (input->nDimension == 2)
-  {
-    THCudaTensor_resize2d(state, output, output_w, input->size[dimF]);
-    THCudaTensor_resize2d(state, indices, output_w, input->size[dimF]);
-  }
-  else
-  {
-    THCudaTensor_resize3d(state, output, batch, output_w, input->size[dimF]);
-    THCudaTensor_resize3d(state, indices, batch, output_w, input->size[dimF]);
-  }
-
-  input_data = THCudaTensor_data(state, input);
-  output_data = THCudaTensor_data(state, output);
-  indices_data = THCudaTensor_data(state, indices);
-
-  dim3 blocks(batch);
-  nthreads = (output_w / 32) * 32;
-  if (output_w % 32 > 0) {
-    nthreads += 32;
-  }
-
-  if (nthreads > TEMPORAL_MAX_POOLING_THREADS) {
-    blocks.y = nthreads / TEMPORAL_MAX_POOLING_THREADS;
-    if (nthreads % TEMPORAL_MAX_POOLING_THREADS > 0) {
-      blocks.y += 1;
-    }
-    nthreads = TEMPORAL_MAX_POOLING_THREADS;
-  }
-
-  dim3 threads(nthreads);
-  cunn_TemporalMaxPooling_updateOutputKernel <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>(
-      input_data, output_data, indices_data, input_w, input_n, output_w, kW, dW);
-  THCudaCheck(cudaGetLastError());
-  THCudaTensor_free(state, input);
-
-}
-
-void THNN_CudaTemporalMaxPooling_updateGradInput(
-          THCState *state,
-          THCudaTensor *input,
-          THCudaTensor *gradOutput,
-          THCudaTensor *gradInput,
-          THCudaTensor *indices,
-          int kW, int dW) {
-
-  int dimT = 0; // Temporal dimension
-  int dimF = 1; // Feature dimension
-
-  int batch = 1;
-  int input_w;
-  int input_n;
-  int output_w;
-  int nthreads;
-
-  float *gradInput_data;
-  float *gradOutput_data;
-  float *indices_data;
-
-  THCUNN_assertSameGPU(state, 4, input, gradOutput, gradInput, indices);
-  THArgCheck( input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");
-
-  THCudaTensor_resizeAs(state, gradInput, input);
-  THCudaTensor_zero(state, gradInput);
-
-  if (input->nDimension == 3)
-  {
-    dimT = 1;
-    dimF = 2;
-    batch = input->size[0];
-  }
-  THArgCheck( input->size[dimT] >= kW, 2, "input sequence smaller than kernel size");
-
-  gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-
-  input_w = input->size[dimT];
-  input_n = input->size[dimF];
-  output_w = (input_w - kW) / dW + 1;
-
-  gradInput_data = THCudaTensor_data(state, gradInput);
-  gradOutput_data = THCudaTensor_data(state, gradOutput);
-  indices_data = THCudaTensor_data(state, indices);
-
-  dim3 blocks(batch);
-  nthreads = (output_w / 32) * 32;
-  if (output_w % 32 > 0) {
-    nthreads += 32;
-  }
-
-  if (nthreads > TEMPORAL_MAX_POOLING_THREADS) {
-    blocks.y = nthreads / TEMPORAL_MAX_POOLING_THREADS;
-    if (nthreads % TEMPORAL_MAX_POOLING_THREADS > 0) {
-      blocks.y += 1;
-    }
-    nthreads = TEMPORAL_MAX_POOLING_THREADS;
-  }
-
-  dim3 threads(nthreads);
-  if (kW <= dW) {
-    cunn_TemporalMaxPooling_updateGradInputKernel <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>(
-        gradInput_data, gradOutput_data, indices_data, input_w, input_n, output_w, kW, dW);
-  } else {
-    cunn_TemporalMaxPooling_updateGradInputKernelAtomic <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>(
-        gradInput_data, gradOutput_data, indices_data, input_w, input_n, output_w, kW, dW);
-  }
-  THCudaCheck(cudaGetLastError());
-  THCudaTensor_free(state, gradOutput);
-
-}
+#include "generic/TemporalMaxPooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/Threshold.cu b/lib/THCUNN/Threshold.cu
index 623add8..47fe794 100644
--- a/lib/THCUNN/Threshold.cu
+++ b/lib/THCUNN/Threshold.cu
@@ -1,112 +1,74 @@
 #include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
+template <typename T>
 struct ThresholdUpdateOutput
 {
-  const float threshold_;
-  const float val_;
+  const T threshold_;
+  const T val_;
 
-  ThresholdUpdateOutput(float threshold, float val)
+  ThresholdUpdateOutput(T threshold, T val)
     : threshold_(threshold)
     , val_(val)
   {}
 
-  __device__ __forceinline__ void operator()(float *out, float *in)
+  __device__ __forceinline__ void operator()(T *out, T *in)
   {
-    float x = *in;
+    T x = *in;
     *out = (x > threshold_) ? x : val_;
   }
 };
 
 // in-place variant
+template <typename T>
 struct ThresholdUpdateOutputIP
 {
-  const float threshold_;
-  const float val_;
+  const T threshold_;
+  const T val_;
 
-  ThresholdUpdateOutputIP(float threshold, float val)
+  ThresholdUpdateOutputIP(T threshold, T val)
     : threshold_(threshold)
     , val_(val)
   {}
 
-  __device__ __forceinline__ void operator()(float *x)
+  __device__ __forceinline__ void operator()(T *x)
   {
     *x = (*x > threshold_) ? *x : val_;
   }
 };
 
-void THNN_CudaThreshold_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output,
-  double threshold, double val, bool inplace)
-{
-  THCUNN_assertSameGPU(state, 2, input, output);
-
-  if (inplace)
-  {
-    THC_pointwiseApply1(state, input,
-      ThresholdUpdateOutputIP(threshold, val)
-    );
-    THCudaTensor_set(state, output, input);
-  }
-  else
-  {
-    THCudaTensor_resizeAs(state, output, input);
-    THC_pointwiseApply2(state, output, input,
-      ThresholdUpdateOutput(threshold, val)
-    );
-  }
-
-  THCudaCheck(cudaGetLastError());
-}
-
+template <typename T>
 struct ThresholdUpdateGradInput
 {
-  const float threshold_;
+  const T threshold_;
 
-  ThresholdUpdateGradInput(float threshold)
+  ThresholdUpdateGradInput(T threshold)
     : threshold_(threshold)
   {}
 
   __device__ __forceinline__ void operator()(
-    float *gradInput, float *input, float *gradOutput) const
+    T *gradInput, T *input, T *gradOutput) const
   {
-    *gradInput = (*input > threshold_) ? *gradOutput : 0;
+    *gradInput = (*input > threshold_) ? *gradOutput : ScalarConvert<int, T>::to(0);
   }
 };
 
+template <typename T>
 struct ThresholdUpdateGradInputIP
 {
-  const float threshold_;
+  const T threshold_;
 
-  ThresholdUpdateGradInputIP(float threshold)
+  ThresholdUpdateGradInputIP(T threshold)
     : threshold_(threshold)
   {}
 
   __device__ __forceinline__ void operator()(
-    float *gradOutput, float *input) const
+    T *gradOutput, T *input) const
   {
-    *gradOutput = (*input > threshold_) ? *gradOutput : 0;
+    *gradOutput = (*input > threshold_) ? *gradOutput : ScalarConvert<int, T>::to(0);
   }
 };
 
-void THNN_CudaThreshold_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput,
-  THCudaTensor *gradInput, double threshold, double val, bool inplace)
-{
-  THCUNN_assertSameGPU(state, 3, input, gradInput, gradOutput);
-
-  if (inplace)
-  {
-    THC_pointwiseApply2(state, gradOutput, input,
-      ThresholdUpdateGradInputIP(threshold)
-    );
-    THCudaTensor_set(state, gradInput, gradOutput);
-  }
-  else
-  {
-    THCudaTensor_resizeAs(state, gradInput, input);
-    THC_pointwiseApply3(state, gradInput, input, gradOutput,
-       ThresholdUpdateGradInput(threshold)
-    );
-  }
-
-  THCudaCheck(cudaGetLastError());
-}
+#include "generic/Threshold.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/VolumetricAveragePooling.cu b/lib/THCUNN/VolumetricAveragePooling.cu
index 9542232..f584dcf 100644
--- a/lib/THCUNN/VolumetricAveragePooling.cu
+++ b/lib/THCUNN/VolumetricAveragePooling.cu
@@ -3,10 +3,14 @@
 #include "THCDeviceTensor.cuh"
 #include "THCDeviceTensorUtils.cuh"
 #include "THCDeviceUtils.cuh"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
 
+template <typename Dtype, typename Acctype>
 __global__ void cuda_VolumetricAveragePooling_updateOutput(
-  THCDeviceTensor<float, 4> input, THCDeviceTensor<float, 4> output,
-  int kT, int kH, int kW, int dT, int dH, int dW, float normFactor, int offsetZ)
+  THCDeviceTensor<Dtype, 4> input, THCDeviceTensor<Dtype, 4> output,
+  int kT, int kH, int kW, int dT, int dH, int dW, Acctype normFactor, int offsetZ)
 {
   int oCol   = blockIdx.x * blockDim.x + threadIdx.x;
   int oRow   = blockIdx.y * blockDim.y + threadIdx.y;
@@ -15,7 +19,7 @@ __global__ void cuda_VolumetricAveragePooling_updateOutput(
 
   if (oRow < output.getSize(2) && oCol < output.getSize(3))
   {
-    float sum = 0.0;
+    Acctype sum = 0.0;
 
     int iColumn = oCol * dW;
     int iRow    = oRow    * dH;
@@ -33,7 +37,7 @@ __global__ void cuda_VolumetricAveragePooling_updateOutput(
             {
               if (iColumn + column < input.getSize(3))
               {
-                float val = input[slice][iFrame + frame][iRow + row][iColumn + column];
+                Dtype val = input[slice][iFrame + frame][iRow + row][iColumn + column];
                 sum += val;
               }
             }
@@ -42,17 +46,17 @@ __global__ void cuda_VolumetricAveragePooling_updateOutput(
       }
     }
 
-    output[slice][oFrame][oRow][oCol] = sum * normFactor;
+    output[slice][oFrame][oRow][oCol] = ScalarConvert<Acctype, Dtype>::to(sum * normFactor);
   }
 }
 
 // Inner-most loop size (kW) passed as template parameter for
 // performance reasons.
 //
-template<int KERNEL_WIDTH>
+template<int KERNEL_WIDTH, typename Dtype, typename Acctype>
 __global__ void cuda_VolumetricAveragePooling_updateOutput(
-  THCDeviceTensor<float, 4> input, THCDeviceTensor<float, 4> output,
-  int kT, int kH, int dT, int dH, int dW, float normFactor, int offsetZ)
+  THCDeviceTensor<Dtype, 4> input, THCDeviceTensor<Dtype, 4> output,
+  int kT, int kH, int dT, int dH, int dW, Acctype normFactor, int offsetZ)
 {
   int oCol   = blockIdx.x * blockDim.x + threadIdx.x;
   int oRow   = blockIdx.y * blockDim.y + threadIdx.y;
@@ -61,7 +65,7 @@ __global__ void cuda_VolumetricAveragePooling_updateOutput(
 
   if (oRow < output.getSize(2) && oCol < output.getSize(3))
   {
-    float sum = 0.0;
+    Acctype sum = 0.0;
 
     int iColumn = oCol * dW;
     int iRow    = oRow    * dH;
@@ -79,7 +83,7 @@ __global__ void cuda_VolumetricAveragePooling_updateOutput(
             {
               if (iColumn + column < input.getSize(3))
               {
-                float val = input[slice][iFrame + frame][iRow + row][iColumn + column];
+                Dtype val = input[slice][iFrame + frame][iRow + row][iColumn + column];
                 sum += val;
               }
             }
@@ -88,7 +92,7 @@ __global__ void cuda_VolumetricAveragePooling_updateOutput(
       }
     }
 
-    output[slice][oFrame][oRow][oCol] = sum * normFactor;
+    output[slice][oFrame][oRow][oCol] = ScalarConvert<Acctype, Dtype>::to(sum * normFactor);
   }
 }
 
@@ -97,127 +101,11 @@ __global__ void cuda_VolumetricAveragePooling_updateOutput(
     cudaInput, cudaOutput, kT, kH, dT, dH, dW, normFactor, offsetZ); \
   break
 
-
-void THNN_CudaVolumetricAveragePooling_updateOutput(
-  THCState *state, THCudaTensor *input, THCudaTensor *output,
-  int kT, int kW, int kH,
-  int dT, int dW, int dH)
-{
-  int batchSize;
-  int inputSlices;
-  int inputTime;
-  int inputHeight;
-  int inputWidth;
-
-  if (THCudaTensor_nDimension(state, input) == 4)
-  {
-    THArgCheck(
-      THCudaTensor_size(state, input, 1) >= kT &&
-      THCudaTensor_size(state, input, 2) >= kH &&
-      THCudaTensor_size(state, input, 3) >= kW, 2,
-      "input image smaller than kernel size"
-    );
-
-    /* sizes */
-    batchSize   = 1;
-    inputSlices = THCudaTensor_size(state, input, 0);
-    inputTime   = THCudaTensor_size(state, input, 1);
-    inputHeight = THCudaTensor_size(state, input, 2);
-    inputWidth  = THCudaTensor_size(state, input, 3);
-  }
-  else if (THCudaTensor_nDimension(state, input) == 5)
-  {
-    THArgCheck(
-      THCudaTensor_size(state, input, 2) >= kT &&
-      THCudaTensor_size(state, input, 3) >= kH &&
-      THCudaTensor_size(state, input, 4) >= kW, 2,
-      "input image smaller than kernel size"
-
-    );
-    /* sizes */
-    batchSize   = THCudaTensor_size(state, input, 0);
-    inputSlices = THCudaTensor_size(state, input, 1);
-    inputTime   = THCudaTensor_size(state, input, 2);
-    inputHeight = THCudaTensor_size(state, input, 3);
-    inputWidth  = THCudaTensor_size(state, input, 4);
-  }
-  else
-  {
-    THArgCheck(false, 2, "4D or 5D tensor expected");
-  }
-
-  int outputTime   = (inputTime   - kT) / dT + 1;
-  int outputHeight = (inputHeight - kH) / dH + 1;
-  int outputWidth  = (inputWidth  - kW) / dW + 1;
-
-  if (input->nDimension == 4) /* 4D */
-  {
-    /* resize output */
-    THCudaTensor_resize4d(state, output, inputSlices,
-                          outputTime, outputHeight, outputWidth);
-  }
-  else /* 5D */
-  {
-    THCudaTensor_resize5d(state, output, batchSize, inputSlices,
-                          outputTime, outputHeight, outputWidth);
-  }
-
-  input = THCudaTensor_newContiguous(state, input);
-
-  // Collapse batch and feature dimensions
-  THCDeviceTensor<float, 4> cudaInput;
-  THCDeviceTensor<float, 4> cudaOutput;
-  if (THCudaTensor_nDimension(state, input) == 4)
-  {
-    cudaInput  = toDeviceTensor<float, 4>(state, input);
-    cudaOutput = toDeviceTensor<float, 4>(state, output);
-  }
-  else
-  {
-    cudaInput  = toDeviceTensor<float, 5>(state, input).downcastOuter<4>();
-    cudaOutput = toDeviceTensor<float, 5>(state, output).downcastOuter<4>();
-  }
-
-  int totalZ = outputTime * inputSlices * batchSize;
-  int offsetZ = 0;
-  dim3 block(32, 8);
-  while (totalZ > 0) {
-    dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)),
-              THCCeilDiv(outputHeight, static_cast<int>(block.y)),
-              totalZ > 65535 ? 65535 : totalZ);
-
-    float normFactor = 1.0f / static_cast<float>(kT * kH * kW);
-    switch (kW)
-      {
-        LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(1);
-        LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(2);
-        LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(3);
-        LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(4);
-        LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(5);
-        LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(6);
-        LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(7);
-      default:
-        cuda_VolumetricAveragePooling_updateOutput<<<grid, block>>>(
-                                                                    cudaInput,
-                                                                    cudaOutput,
-                                                                    kT, kH, kW,
-                                                                    dT, dH, dW,
-                                                                    normFactor,
-                                                                    offsetZ
-                                                                    );
-        break;
-      }
-    totalZ -= 65535;
-    offsetZ += 65535;
-    THCudaCheck(cudaGetLastError());
-  }
-  THCudaTensor_free(state, input);
-}
-
+template <typename Dtype, typename Acctype>
 __global__ void cuda_VolumetricAveragePooling_updateGradInput_Stride1(
-  THCDeviceTensor<float, 4> gradOutput,
-  THCDeviceTensor<float, 4> gradInput,
-  int kT, int kH, int kW, float normFactor, int offsetZ)
+  THCDeviceTensor<Dtype, 4> gradOutput,
+  THCDeviceTensor<Dtype, 4> gradInput,
+  int kT, int kH, int kW, Acctype normFactor, int offsetZ)
 {
   int iCol   = blockIdx.x * blockDim.x + threadIdx.x;
   int iRow   = blockIdx.y * blockDim.y + threadIdx.y;
@@ -227,8 +115,8 @@ __global__ void cuda_VolumetricAveragePooling_updateGradInput_Stride1(
   // guard against over-tiled threads
   if (iRow < gradInput.getSize(2) && iCol < gradInput.getSize(3))
   {
-    float sum = 0.0;
-    float *gOut = &gradOutput[slice][max(0, iFrame - kT + 1)]
+    Acctype sum = 0.0;
+    Dtype *gOut = &gradOutput[slice][max(0, iFrame - kT + 1)]
       [max(0, iRow - kH + 1)][max(0, iCol - kW + 1)];
     int frameOffset = 0;
     for (int oFrame  = max(0, iFrame - kT + 1);
@@ -252,13 +140,14 @@ __global__ void cuda_VolumetricAveragePooling_updateGradInput_Stride1(
       }
       frameOffset += gradOutput.getSize(2) * gradOutput.getSize(3);
     }
-    gradInput[slice][iFrame][iRow][iCol] = sum * normFactor;
+    gradInput[slice][iFrame][iRow][iCol] = ScalarConvert<Acctype, Dtype>::to(sum * normFactor);
   }
 }
 
+template <typename Dtype, typename Acctype>
 __global__ void cuda_VolumetricAveragePooling_updateGradInput_atomicAdd(
-  THCDeviceTensor<float, 4> gradOutput,
-  THCDeviceTensor<float, 4> gradInput,
+  THCDeviceTensor<Dtype, 4> gradOutput,
+  THCDeviceTensor<Dtype, 4> gradInput,
   int kT, int kH, int kW, int dT, int dH, int dW, int offsetZ)
 {
   int oCol   = blockIdx.x * blockDim.x + threadIdx.x;
@@ -269,7 +158,8 @@ __global__ void cuda_VolumetricAveragePooling_updateGradInput_atomicAdd(
   // guard against over-tiled threads
   if (oRow < gradOutput.getSize(2) && oCol < gradOutput.getSize(3))
   {
-    float val = gradOutput[slice][oFrame][oRow][oCol] / (kT * kH * kW);
+    Dtype val = ScalarConvert<Acctype, Dtype>::to(
+      ScalarConvert<Dtype, Acctype>::to(gradOutput[slice][oFrame][oRow][oCol]) / (kT * kH * kW));
     for (int iFrame = oFrame * dT; iFrame < oFrame * dT + kT; ++iFrame)
     {
       for (int iRow = oRow * dH; iRow < oRow * dH + kH; ++iRow)
@@ -283,9 +173,10 @@ __global__ void cuda_VolumetricAveragePooling_updateGradInput_atomicAdd(
   }
 }
 
+template <typename Dtype, typename Acctype>
 __global__ void cuda_VolumetricAveragePooling_updateGradInput(
-  THCDeviceTensor<float, 4> gradOutput,
-  THCDeviceTensor<float, 4> gradInput,
+  THCDeviceTensor<Dtype, 4> gradOutput,
+  THCDeviceTensor<Dtype, 4> gradInput,
   int kT, int kH, int kW,
   int dT, int dH, int dW, int offsetZ)
 {
@@ -297,7 +188,8 @@ __global__ void cuda_VolumetricAveragePooling_updateGradInput(
   // guard against over-tiled threads
   if (oRow < gradOutput.getSize(2) && oCol < gradOutput.getSize(3))
   {
-    float val = gradOutput[slice][oFrame][oRow][oCol] / (kT * kH * kW);
+    Dtype val = ScalarConvert<Acctype, Dtype>::to(
+      ScalarConvert<Dtype, Acctype>::to(gradOutput[slice][oFrame][oRow][oCol]) / (kT * kH * kW));
     for (int iFrame = oFrame * dT; iFrame < oFrame * dT + kT; ++iFrame)
     {
       for (int iRow = oRow * dH; iRow < oRow * dH + kH; ++iRow)
@@ -311,116 +203,5 @@ __global__ void cuda_VolumetricAveragePooling_updateGradInput(
   }
 }
 
-void THNN_CudaVolumetricAveragePooling_updateGradInput(
-  THCState *state,
-  THCudaTensor *input,
-  THCudaTensor *gradOutput,
-  THCudaTensor *gradInput,
-  int kT, int kW, int kH,
-  int dT, int dW, int dH)
-{
-  bool kernelsOverlap = (dT < kT) || (dH < kH) || (dW < kW);
-
-  // Resize and initialize result tensor.
-  THCudaTensor_resizeAs(state, gradInput, input);
-  THCudaTensor_zero(state, gradInput);
-
-  int batchSize;
-  int inputSlices;
-  int inputTime;
-  int inputHeight;
-  int inputWidth;
-
-  int outputTime;
-  int outputHeight;
-  int outputWidth;
-
-  if (THCudaTensor_nDimension(state, input) == 4) /* 4D */
-  {
-    batchSize = 1;
-    inputSlices  = THCudaTensor_size(state, input, 0);
-    inputTime    = THCudaTensor_size(state, input, 1);
-    inputHeight  = THCudaTensor_size(state, input, 2);
-    inputWidth   = THCudaTensor_size(state, input, 3);
-
-    outputTime   = THCudaTensor_size(state, gradOutput, 1);
-    outputHeight = THCudaTensor_size(state, gradOutput, 2);
-    outputWidth  = THCudaTensor_size(state, gradOutput, 3);
-  }
-  else
-  {
-    batchSize    = THCudaTensor_size(state, input, 0);
-    inputSlices  = THCudaTensor_size(state, input, 1);
-    inputTime    = THCudaTensor_size(state, input, 2);
-    inputHeight  = THCudaTensor_size(state, input, 3);
-    inputWidth   = THCudaTensor_size(state, input, 4);
-
-    outputTime   = THCudaTensor_size(state, gradOutput, 2);
-    outputHeight = THCudaTensor_size(state, gradOutput, 3);
-    outputWidth  = THCudaTensor_size(state, gradOutput, 4);
-  }
-
-  gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-
-  // Collapse batch and feature dimensions
-  THCDeviceTensor<float, 4> cudaGradInput;
-  THCDeviceTensor<float, 4> cudaGradOutput;
-  if (THCudaTensor_nDimension(state, input) == 4)
-  {
-    cudaGradInput  = toDeviceTensor<float, 4>(state, gradInput);
-    cudaGradOutput = toDeviceTensor<float, 4>(state, gradOutput);
-  }
-  else
-  {
-    cudaGradInput =
-      toDeviceTensor<float, 5>(state, gradInput).downcastOuter<4>();
-    cudaGradOutput =
-      toDeviceTensor<float, 5>(state, gradOutput).downcastOuter<4>();
-  }
-
-  dim3 block(32, 8);
-
-  // Optimizing for stride 1 is probably only of limited value, but this
-  // specialization yields 3x speedup over the atomicAdd implementation.
-  if (dT == 1 && dH == 1 && dW == 1)
-  {
-    int totalZ = inputTime * inputSlices * batchSize;
-    int offsetZ = 0;
-    while (totalZ > 0) {
-      dim3 grid(THCCeilDiv(inputWidth, static_cast<int>(block.x)),
-                THCCeilDiv(inputHeight, static_cast<int>(block.y)),
-                totalZ > 65535 ? 65535 : totalZ);
-      cuda_VolumetricAveragePooling_updateGradInput_Stride1<<<grid, block>>>(
-         cudaGradOutput, cudaGradInput, kT, kH, kW, 1.0f/(kT * kH * kW), offsetZ);
-      THCudaCheck(cudaGetLastError());
-      totalZ -= 65535;
-      offsetZ += 65535;
-    }
-  }
-  else
-  {
-    int totalZ = outputTime * inputSlices * batchSize;
-    int offsetZ = 0;
-    while (totalZ > 0) {
-
-      dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)),
-                THCCeilDiv(outputHeight, static_cast<int>(block.y)),
-                totalZ > 65535 ? 65535 : totalZ);
-      if (kernelsOverlap)
-        {
-          cuda_VolumetricAveragePooling_updateGradInput_atomicAdd<<<grid, block>>>(
-            cudaGradOutput, cudaGradInput, kT, kH, kW, dT, dH, dW, offsetZ);
-        }
-      else
-        {
-          cuda_VolumetricAveragePooling_updateGradInput<<<grid, block>>>(
-             cudaGradOutput, cudaGradInput, kT, kH, kW, dT, dH, dW, offsetZ);
-        }
-      THCudaCheck(cudaGetLastError());
-      totalZ -= 65535;
-      offsetZ += 65535;
-    }
-  }
-
-  THCudaTensor_free(state, gradOutput);
-}
+#include "generic/VolumetricAveragePooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/VolumetricConvolution.cu b/lib/THCUNN/VolumetricConvolution.cu
index 8576ae8..78f45f9 100644
--- a/lib/THCUNN/VolumetricConvolution.cu
+++ b/lib/THCUNN/VolumetricConvolution.cu
@@ -1,16 +1,19 @@
 #include "THCUNN.h"
 #include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
 // Kernel for fast unfold+copy
 // Borrowed from Theano
 // Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter, Nicolas Ballas
-__global__ void im3d2col_kernel(const int n, const float* data_im,
+template <typename Dtype>
+__global__ void im3d2col_kernel(const int n, const Dtype* data_im,
                                 const int height, const int width, const int depth,
                                 const int kernel_h, const int kernel_w, const int kernel_d,
                                 const int pad_h, const int pad_w, const int pad_d,
                                 const int stride_h, const int stride_w, const int stride_d,
                                 const int height_col, const int width_col, const int depth_col,
-                                float* data_col)
+                                Dtype* data_col)
 {
   CUDA_KERNEL_LOOP(index, n)
   {
@@ -29,11 +32,11 @@ __global__ void im3d2col_kernel(const int n, const float* data_im,
     int w_in = w_out * stride_w - pad_w;
     int d_in = d_out * stride_d - pad_d;
 
-    float* data_col_ptr = data_col;
+    Dtype* data_col_ptr = data_col;
     data_col_ptr += channel_out * (height_col * width_col * depth_col) +
       h_out * (width_col * depth_col) + w_out * depth_col + d_out;
 
-    const float* data_im_ptr = data_im;
+    const Dtype* data_im_ptr = data_im;
     data_im_ptr += channel_in * (height * width * depth) +
       h_in * (width * depth) + w_in * depth + d_in;
 
@@ -48,7 +51,7 @@ __global__ void im3d2col_kernel(const int n, const float* data_im,
           int d = d_in + k;
           *data_col_ptr = (h >= 0 && w >= 0 && d >= 0 &&
                            h < height && w < width && d < depth) ?
-                           data_im_ptr[i * (width * depth) + j *depth + k] : 0;
+                           data_im_ptr[i * (width * depth) + j *depth + k] : ScalarConvert<int, Dtype>::to(0);
           data_col_ptr += height_col * width_col * depth_col;
         }
       }
@@ -56,12 +59,13 @@ __global__ void im3d2col_kernel(const int n, const float* data_im,
   }
 }
 
-void im3d2col(cudaStream_t stream, const float* data_im, const int channels,
+template <typename Dtype>
+void im3d2col(cudaStream_t stream, const Dtype* data_im, const int channels,
               const int height, const int width, const int depth,
               const int kernel_h, const int kernel_w, const int kernel_d,
               const int pad_h, const int pad_w, const int pad_d,
               const int stride_h, const int stride_w, const int stride_d,
-              float* data_col)
+              Dtype* data_col)
 {
   // We are going to launch channels * height_col * width_col * depth_col kernels, each
   // kernel responsible for copying a single-channel grid.
@@ -80,19 +84,19 @@ void im3d2col(cudaStream_t stream, const float* data_im, const int channels,
   THCudaCheck(cudaGetLastError());
 }
 
-
-__global__ void col2im3d_kernel(const int n, const float* data_col,
+template <typename Dtype, typename Acctype>
+__global__ void col2im3d_kernel(const int n, const Dtype* data_col,
                                 const int height, const int width, const int depth,
                                 const int channels,
                                 const int patch_h, const int patch_w, const int patch_d,
                                 const int pad_h, const int pad_w, const int pad_d,
                                 const int stride_h, const int stride_w, const int stride_d,
                                 const int height_col, const int width_col, const int depth_col,
-                                float* data_im)
+                                Dtype* data_im)
 {
   CUDA_KERNEL_LOOP(index, n)
   {
-    float val = 0;
+    Acctype val = 0;
     int d = index % depth + pad_d;
     int w_index = index / depth;
     int w = w_index % width + pad_w;
@@ -120,16 +124,17 @@ __global__ void col2im3d_kernel(const int n, const float* data_col,
           val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col];
       }
    }
-    data_im[index] = val;
+    data_im[index] = ScalarConvert<Acctype, Dtype>::to(val);
   }
 }
 
-void col2im3d(cudaStream_t stream, const float* data_col, const int channels,
+template <typename Dtype, typename Acctype>
+void col2im3d(cudaStream_t stream, const Dtype* data_col, const int channels,
               const int height, const int width, const int depth,
               const int patch_h, const int patch_w, const int patch_d,
               const int pad_h, const int pad_w, const int pad_d,
               const int stride_h, const int stride_w, const int stride_d,
-              float* data_im)
+              Dtype* data_im)
 {
   int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
   int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
@@ -138,7 +143,7 @@ void col2im3d(cudaStream_t stream, const float* data_col, const int channels,
 
   // To avoid involving atomic operations, we will launch one kernel per
   // bottom dimension, and then in the kernel add up the top dimensions.
-  col2im3d_kernel<<<GET_BLOCKS(num_kernels),
+  col2im3d_kernel<Dtype, Acctype><<<GET_BLOCKS(num_kernels),
     CUDA_NUM_THREADS, 0, stream>>>(num_kernels, data_col,
                                    height, width, depth, channels,
                                    patch_h, patch_w, patch_d,
@@ -149,371 +154,5 @@ void col2im3d(cudaStream_t stream, const float* data_col, const int channels,
   THCudaCheck(cudaGetLastError());
 }
 
-void THNN_CudaVolumetricConvolution_updateOutput(
-  THCState *state,
-  THCudaTensor *input,
-  THCudaTensor *output,
-  THCudaTensor *weight,
-  THCudaTensor *bias,
-  THCudaTensor *finput,
-  THCudaTensor *fgradInput,
-  int dT, int dW, int dH,
-  int padT, int padW, int padH)
-{
-  THCudaTensor *columns = finput;
-  THCudaTensor *ones = fgradInput;
-  THCUNN_assertSameGPU(state, 6, input, output, weight, bias, columns, ones);
-
-  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
-    "4D or 5D (batch mode) tensor is expected"
-  );
-
-  THArgCheck(weight->nDimension == 5, 4,
-    "5D weight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)"
-  );
-
-  int nOutputPlane = (int)weight->size[0];
-  int nInputPlane  = (int)weight->size[1];
-  int kT           = (int)weight->size[2];
-  int kH           = (int)weight->size[3];
-  int kW           = (int)weight->size[4];
-
-  int batch = 1;
-  if (input->nDimension == 4)
-  {
-    // Force batch
-    batch = 0;
-    THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1],
-                          input->size[2], input->size[3]);
-  }
-
-  long inputWidth   = input->size[3];
-  long inputHeight  = input->size[2];
-  long inputDepth   = input->size[4];
-  long outputWidth  = (inputWidth  + 2*padH - kH) / dH + 1;
-  long outputHeight = (inputHeight + 2*padT - kT) / dT + 1;
-  long outputDepth  = (inputDepth  + 2*padW - kW) / dW + 1;
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
-
-  // Resize output
-  THCudaTensor_resize5d(state, output, batchSize, nOutputPlane,
-                        outputHeight, outputWidth, outputDepth);
-
-  // Resize temporary columns
-  THCudaTensor_resize2d(state, columns, nInputPlane*kW*kH*kT, outputDepth*outputHeight*outputWidth);
-
-  // Define a buffer of ones, for bias accumulation
-  // Note: this buffer can be shared with other modules, it only ever gets increased,
-  // and always contains ones.
-  if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth)
-  {
-    // Resize plane and fill with ones...
-    THCudaTensor_resize3d(state, ones, outputHeight, outputWidth, outputDepth);
-    THCudaTensor_fill(state, ones, 1);
-  }
-
-  // Helpers
-  THCudaTensor *input_n = THCudaTensor_new(state);
-  THCudaTensor *output_n = THCudaTensor_new(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++)
-  {
-    // Matrix mulitply per output:
-    THCudaTensor_select(state, input_n, input, 0, elt);
-    THCudaTensor_select(state, output_n, output, 0, elt);
-
-    // Do Bias first:
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m_ = nOutputPlane;
-    long n_ = outputDepth * outputHeight * outputWidth;
-    long k_ = 1;
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THCudaBlas_Sgemm(
-      state,
-      't', 'n',
-      n_, m_, k_,
-      1,
-      THCudaTensor_data(state, ones), k_,
-      THCudaTensor_data(state, bias), k_,
-      0,
-      THCudaTensor_data(state, output_n), n_
-    );
-
-    // Extract columns:
-    im3d2col(
-      THCState_getCurrentStream(state),
-      THCudaTensor_data(state, input_n),
-      nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
-      THCudaTensor_data(state, columns)
-    );
-
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m = weight->size[0];
-    long n = columns->size[1];
-    long k = weight->size[1]*weight->size[2]*weight->size[3]*weight->size[4];
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THCudaBlas_Sgemm(
-      state,
-      'n', 'n',
-      n, m, k,
-      1,
-      THCudaTensor_data(state, columns), n,
-      THCudaTensor_data(state, weight), k,
-      1,
-      THCudaTensor_data(state, output_n), n
-    );
-  }
-
-  // Free
-  THCudaTensor_free(state, input_n);
-  THCudaTensor_free(state, output_n);
-
-  // Resize output
-  if (batch == 0)
-  {
-    THCudaTensor_resize4d(state, output, nOutputPlane, outputHeight, outputWidth, outputDepth);
-    THCudaTensor_resize4d(state, input, nInputPlane, inputHeight, inputWidth, inputDepth);
-  }
-}
-
-void THNN_CudaVolumetricConvolution_updateGradInput(
-  THCState *state,
-  THCudaTensor *input,
-  THCudaTensor *gradOutput,
-  THCudaTensor *gradInput,
-  THCudaTensor *weight,
-  THCudaTensor *finput,
-  int dT, int dW, int dH,
-  int padT, int padW, int padH)
-{
-  THArgCheck(weight->nDimension == 5, 4,
-    "5D weight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)"
-  );
-
-  int nOutputPlane = (int)weight->size[0];
-  int nInputPlane  = (int)weight->size[1];
-  int kT           = (int)weight->size[2];
-  int kH           = (int)weight->size[3];
-  int kW           = (int)weight->size[4];
-
-  THCudaTensor *gradColumns = finput;
-
-  THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, gradColumns, gradInput);
-  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
-    "4D or 5D (batch mode) tensor is expected"
-  );
-
-  int batch = 1;
-  if (input->nDimension == 4)
-  {
-    // Force batch
-    batch = 0;
-    THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
-    THCudaTensor_resize5d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
-  }
-
-  long inputWidth   = input->size[3];
-  long inputHeight  = input->size[2];
-  long inputDepth   = input->size[4];
-  long outputWidth  = (inputWidth  + 2*padH - kH) / dH + 1;
-  long outputHeight = (inputHeight + 2*padT - kT) / dT + 1;
-  long outputDepth  = (inputDepth  + 2*padW - kW) / dW + 1;
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
-
-  // Resize output
-  THCudaTensor_resize5d(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth, inputDepth);
-
-  // Resize temporary columns
-  THCudaTensor_resize2d(state, gradColumns, nInputPlane*kH*kT*kW, outputDepth*outputHeight*outputWidth);
-
-  // Helpers
-  THCudaTensor *input_n = THCudaTensor_new(state);
-  THCudaTensor *gradInput_n = THCudaTensor_new(state);
-  THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++)
-  {
-    // Matrix mulitply per sample:
-    THCudaTensor_select(state, input_n, input, 0, elt);
-    THCudaTensor_select(state, gradInput_n, gradInput, 0, elt);
-    THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m = weight->size[1]*weight->size[2]*weight->size[3]*weight->size[4];
-    long n = gradColumns->size[1];
-    long k = weight->size[0];
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THCudaBlas_Sgemm(
-      state,
-      'n', 't',
-      n, m, k,
-      1,
-      THCudaTensor_data(state, gradOutput_n), n,
-      THCudaTensor_data(state, weight), m,
-      0,
-      THCudaTensor_data(state, gradColumns), n
-    );
-
-    // Unpack columns back into input:
-    col2im3d(
-      THCState_getCurrentStream(state),
-      THCudaTensor_data(state, gradColumns),
-      nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
-      THCudaTensor_data(state, gradInput_n)
-    );
-  }
-
-  // Free
-  THCudaTensor_free(state, input_n);
-  THCudaTensor_free(state, gradInput_n);
-  THCudaTensor_free(state, gradOutput_n);
-
-  // Resize output
-  if (batch == 0)
-  {
-    THCudaTensor_resize4d(state, gradOutput, nOutputPlane, outputHeight, outputWidth, outputDepth);
-    THCudaTensor_resize4d(state, input, nInputPlane, inputHeight, inputWidth, inputDepth);
-    THCudaTensor_resize4d(state, gradInput, nInputPlane, inputHeight, inputWidth, inputDepth);
-  }
-}
-
-void THNN_CudaVolumetricConvolution_accGradParameters(
-  THCState *state,
-  THCudaTensor *input,
-  THCudaTensor *gradOutput,
-  THCudaTensor *gradWeight,
-  THCudaTensor *gradBias,
-  THCudaTensor *finput,
-  THCudaTensor *fgradInput,
-  int dT, int dW, int dH,
-  int padT, int padW, int padH,
-  float scale)
-{
-  THCudaTensor *columns = finput;
-  THCudaTensor *ones = fgradInput;
-  THCUNN_assertSameGPU(state, 6, input, gradOutput, gradWeight, gradBias, columns, ones);
-
-  THArgCheck(gradWeight->nDimension == 5, 4,
-    "5D gradWeight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)"
-  );
-
-  int nOutputPlane = (int)gradWeight->size[0];
-  int nInputPlane  = (int)gradWeight->size[1];
-  int kT           = (int)gradWeight->size[2];
-  int kH           = (int)gradWeight->size[3];
-  int kW           = (int)gradWeight->size[4];
-
-  THArgCheck(
-    input->nDimension == 4 || input->nDimension == 5, 2,
-    "3D or 4D (batch mode) tensor is expected"
-  );
-
-  int batch = 1;
-  if (input->nDimension == 4)
-  {
-    // Force batch
-    batch = 0;
-    THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
-    THCudaTensor_resize5d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
-  }
-
-  long inputWidth   = input->size[3];
-  long inputHeight  = input->size[2];
-  long inputDepth   = input->size[4];
-  long outputWidth  = (inputWidth  + 2*padH - kH) / dH + 1;
-  long outputHeight = (inputHeight + 2*padT - kT) / dT + 1;
-  long outputDepth  = (inputDepth  + 2*padW - kW) / dW + 1;
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
-
-  // Define a buffer of ones, for bias accumulation
-  if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth)
-  {
-    // Resize plane and fill with ones...
-    THCudaTensor_resize3d(state, ones, outputHeight, outputWidth, outputDepth);
-    THCudaTensor_fill(state, ones, 1);
-  }
-
-  // Resize temporary columns
-  THCudaTensor_resize2d(state, columns, nInputPlane*kH*kT*kW, outputDepth*outputHeight*outputWidth);
-
-  // Helpers
-  THCudaTensor *input_n = THCudaTensor_new(state);
-  THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++)
-  {
-    // Matrix mulitply per output:
-    THCudaTensor_select(state, input_n, input, 0, elt);
-    THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
-    // Extract columns:
-    im3d2col(
-      THCState_getCurrentStream(state),
-      THCudaTensor_data(state, input_n),
-      nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
-      THCudaTensor_data(state, columns)
-    );
-
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m = gradWeight->size[0];
-    long n = gradWeight->size[1]*gradWeight->size[2]*gradWeight->size[3]*gradWeight->size[4];
-    long k = columns->size[1];
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THCudaBlas_Sgemm(
-      state,
-      't', 'n',
-      n, m, k,
-      scale,
-      THCudaTensor_data(state, columns), k,
-      THCudaTensor_data(state, gradOutput_n), k,
-      1,
-      THCudaTensor_data(state, gradWeight), n
-    );
-
-    // Do Bias:
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m_ = nOutputPlane;
-    long k_ = outputDepth * outputHeight * outputWidth;
-
-    // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
-    THCudaBlas_Sgemv(
-      state,
-      't',
-      k_, m_,
-      scale,
-      THCudaTensor_data(state, gradOutput_n), k_,
-      THCudaTensor_data(state, ones), 1,
-      1,
-      THCudaTensor_data(state, gradBias), 1
-    );
-  }
-
-  // Free
-  THCudaTensor_free(state, input_n);
-  THCudaTensor_free(state, gradOutput_n);
-
-  // Resize
-  if (batch == 0)
-  {
-    THCudaTensor_resize4d(state, gradOutput, nOutputPlane, outputHeight, outputWidth, outputDepth);
-    THCudaTensor_resize4d(state, input, nInputPlane, inputHeight, inputWidth, inputDepth);
-  }
-}
+#include "generic/VolumetricConvolution.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/VolumetricDilatedConvolution.cu b/lib/THCUNN/VolumetricDilatedConvolution.cu
index 72caba7..d82e02d 100644
--- a/lib/THCUNN/VolumetricDilatedConvolution.cu
+++ b/lib/THCUNN/VolumetricDilatedConvolution.cu
@@ -1,367 +1,8 @@
 #include "THCUNN.h"
 #include "common.h"
 #include "vol2col.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
-
-void THNN_CudaVolumetricDilatedConvolution_updateOutput(
-  THCState *state,
-  THCudaTensor *input,
-  THCudaTensor *output,
-  THCudaTensor *weight,
-  THCudaTensor *bias,
-  THCudaTensor *columns,
-  THCudaTensor *ones,
-  int kT, int kW, int kH,
-  int dT, int dW, int dH,
-  int padT, int padW, int padH,
-  int dilationT, int dilationW, int dilationH) {
-
-  THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones);
-  if (bias) {
-    THCUNN_assertSameGPU(state, 2, weight, bias);
-  }
-  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected, but got: %d", input->nDimension);
-  THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
-  THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
-  THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
-  THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
-
-  // Params:
-  int nInputPlane = weight->size[1];
-  int nOutputPlane = weight->size[0];
-
-  int batch = 1;
-  if (input->nDimension == 4) {
-    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
-    // Force batch
-    batch = 0;
-    THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
-  } else {
-    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
-  }
-
-  long inputDepth  = input->size[2];
-  long inputHeight  = input->size[3];
-  long inputWidth   = input->size[4];
-  long outputDepth  = (inputDepth  + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
-  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
-  long outputWidth  = (inputWidth  + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
-
-  if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
-    THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
-            nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth);
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
-
-  // Resize output
-  THCudaTensor_resize5d(state, output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth);
-
-  // Resize temporary columns
-  THCudaTensor_resize2d(state, columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
-
-  // Define a buffer of ones, for bias accumulation
-  // Note: this buffer can be shared with other modules, it only ever gets increased,
-  // and always contains ones.
-  if (ones->nDimension != 2 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
-    // Resize plane and fill with ones...
-    THCudaTensor_resize3d(state, ones, outputDepth, outputHeight, outputWidth);
-    THCudaTensor_fill(state, ones, 1);
-  }
-
-  // Helpers
-  THCudaTensor *input_n = THCudaTensor_new(state);
-  THCudaTensor *output_n = THCudaTensor_new(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    // Matrix mulitply per output:
-    THCudaTensor_select(state, input_n, input, 0, elt);
-    THCudaTensor_select(state, output_n, output, 0, elt);
-
-    // Do Bias first:
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m_ = nOutputPlane;
-    long n_ = outputDepth * outputHeight * outputWidth;
-    long k_ = 1;
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    if (bias) {
-      THCudaBlas_Sgemm(
-          state,
-          't', 'n',
-          n_, m_, k_,
-          1,
-          THCudaTensor_data(state, ones), k_,
-          THCudaTensor_data(state, bias), k_,
-          0,
-          THCudaTensor_data(state, output_n), n_
-      );
-    } else {
-      THCudaTensor_zero(state, output_n);
-    }
-
-    // Extract columns:
-    vol2col(
-      THCState_getCurrentStream(state),
-      THCudaTensor_data(state, input_n),
-      nInputPlane, inputDepth, inputHeight, inputWidth,
-      kT, kH, kW, padT, padH, padW, dT, dH, dW,
-      dilationT, dilationH, dilationW,
-      THCudaTensor_data(state, columns)
-    );
-
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m = nOutputPlane;
-    long n = columns->size[1];
-    long k = nInputPlane*kT*kH*kW;
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THCudaBlas_Sgemm(
-        state,
-        'n', 'n',
-        n, m, k,
-        1,
-        THCudaTensor_data(state, columns), n,
-        THCudaTensor_data(state, weight), k,
-        1,
-        THCudaTensor_data(state, output_n), n
-    );
-  }
-
-  // Free
-  THCudaTensor_free(state, input_n);
-  THCudaTensor_free(state, output_n);
-
-  // Resize output
-  if (batch == 0) {
-    THCudaTensor_resize4d(state, output, nOutputPlane, outputDepth, outputHeight, outputWidth);
-    THCudaTensor_resize4d(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
-  }
-}
-
-void THNN_CudaVolumetricDilatedConvolution_updateGradInput(
-  THCState *state,
-  THCudaTensor *input,
-  THCudaTensor *gradOutput,
-  THCudaTensor *gradInput,
-  THCudaTensor *weight,
-  THCudaTensor *gradColumns,
-  int kT, int kW, int kH,
-  int dT, int dW, int dH,
-  int padT, int padW, int padH,
-  int dilationT, int dilationW, int dilationH) {
-
-  THCUNN_assertSameGPU(state, 5, input, gradOutput, weight,
-                                 gradColumns, gradInput);
-  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
-  THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected");
-  THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
-  THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
-  THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
-
-  // Params
-  int nInputPlane = weight->size[1];
-  int nOutputPlane = weight->size[0];
-
-  int batch = 1;
-  if (input->nDimension == 4) {
-    // Force batch
-    batch = 0;
-    THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
-    THCudaTensor_resize5d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
-  }
-
-  long inputDepth  = input->size[2];
-  long inputWidth   = input->size[4];
-  long inputHeight  = input->size[3];
-  long outputDepth  = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
-  long outputWidth  = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
-  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
-
-  // Resize output
-  THCudaTensor_resize5d(state, gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
-
-  // Resize temporary columns
-  THCudaTensor_resize2d(state, gradColumns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
-
-  // Helpers
-  THCudaTensor *gradInput_n = THCudaTensor_new(state);
-  THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    // Matrix mulitply per sample:
-    THCudaTensor_select(state, gradInput_n, gradInput, 0, elt);
-    THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m = nInputPlane*kT*kW*kH;
-    long n = gradColumns->size[1];
-    long k = nOutputPlane;
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THCudaBlas_Sgemm(
-        state,
-        'n', 't',
-        n, m, k,
-        1,
-        THCudaTensor_data(state, gradOutput_n), n,
-        THCudaTensor_data(state, weight), m,
-        0,
-        THCudaTensor_data(state, gradColumns), n
-    );
-
-    // Unpack columns back into input:
-    col2vol(
-      THCState_getCurrentStream(state),
-      THCudaTensor_data(state, gradColumns),
-      nInputPlane, inputDepth, inputHeight, inputWidth,
-      kT, kH, kW, padT, padH, padW, dT, dH, dW,
-      dilationT, dilationH, dilationW,
-      THCudaTensor_data(state, gradInput_n)
-    );
-  }
-
-  // Free
-  THCudaTensor_free(state, gradInput_n);
-  THCudaTensor_free(state, gradOutput_n);
-
-  // Resize output
-  if (batch == 0) {
-    THCudaTensor_resize4d(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
-    THCudaTensor_resize4d(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
-    THCudaTensor_resize4d(state, gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
-  }
-}
-
-void THNN_CudaVolumetricDilatedConvolution_accGradParameters(
-  THCState *state,
-  THCudaTensor *input,
-  THCudaTensor *gradOutput,
-  THCudaTensor *gradWeight,
-  THCudaTensor *gradBias,
-  THCudaTensor *columns,
-  THCudaTensor *ones,
-  int kT, int kW, int kH,
-  int dT, int dW, int dH,
-  int padT, int padW, int padH,
-  int dilationT, int dilationW, int dilationH,
-  float scale) {
-
-  THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, columns, ones);
-  if (gradBias) {
-   THCUNN_assertSameGPU(state, 2, gradWeight, gradBias);
-  }
-  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
-  THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected");
-  THArgCheck(gradWeight->nDimension == 5, 4, "gradWeight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
-  THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
-  THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
-
-  // Params
-  int nInputPlane = gradWeight->size[1];
-  int nOutputPlane = gradWeight->size[0];
-
-  int batch = 1;
-  if (input->nDimension == 4) {
-    // Force batch
-    batch = 0;
-    THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
-    THCudaTensor_resize5d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
-  }
-
-  long inputDepth  = input->size[2];
-  long inputWidth   = input->size[4];
-  long inputHeight  = input->size[3];
-  long outputDepth  = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
-  long outputWidth  = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
-  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
-
-  // Define a buffer of ones, for bias accumulation
-  if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
-    // Resize plane and fill with ones...
-    THCudaTensor_resize3d(state, ones, outputDepth, outputHeight, outputWidth);
-    THCudaTensor_fill(state, ones, 1);
-  }
-
-  // Resize temporary columns
-  THCudaTensor_resize2d(state, columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
-
-  // Helpers
-  THCudaTensor *input_n = THCudaTensor_new(state);
-  THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    // Matrix mulitply per output:
-    THCudaTensor_select(state, input_n, input, 0, elt);
-    THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
-    // Extract columns:
-    vol2col(
-      THCState_getCurrentStream(state),
-      THCudaTensor_data(state, input_n),
-      nInputPlane, inputDepth, inputHeight, inputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
-      dilationT, dilationH, dilationW,
-      THCudaTensor_data(state, columns)
-    );
-
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m = nOutputPlane;
-    long n = nInputPlane*kT*kW*kH;
-    long k = columns->size[1];
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THCudaBlas_Sgemm(
-        state,
-        't', 'n',
-        n, m, k,
-        scale,
-        THCudaTensor_data(state, columns), k,
-        THCudaTensor_data(state, gradOutput_n), k,
-        1,
-        THCudaTensor_data(state, gradWeight), n
-    );
-
-    // Do Bias:
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m_ = nOutputPlane;
-    long k_ = outputDepth * outputHeight * outputWidth;
-
-    // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
-    if (gradBias) {
-      THCudaBlas_Sgemv(
-          state,
-          't',
-          k_, m_,
-          scale,
-          THCudaTensor_data(state, gradOutput_n), k_,
-          THCudaTensor_data(state, ones), 1,
-          1,
-          THCudaTensor_data(state, gradBias), 1
-      );
-    }
-  }
-
-  // Free
-  THCudaTensor_free(state, input_n);
-  THCudaTensor_free(state, gradOutput_n);
-
-    // Resize output
-  if (batch == 0) {
-    THCudaTensor_resize4d(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
-    THCudaTensor_resize4d(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
-  }
-}
+#include "generic/VolumetricDilatedConvolution.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/VolumetricDilatedMaxPooling.cu b/lib/THCUNN/VolumetricDilatedMaxPooling.cu
index 49c56e7..0cff32c 100644
--- a/lib/THCUNN/VolumetricDilatedMaxPooling.cu
+++ b/lib/THCUNN/VolumetricDilatedMaxPooling.cu
@@ -3,16 +3,20 @@
 #include "THCDeviceTensor.cuh"
 #include "THCDeviceTensorUtils.cuh"
 #include "THCDeviceUtils.cuh"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
 
 #include <cfloat>
 
+template <typename Dtype>
 __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
-  THCDeviceTensor<float, 4> input,
-  THCDeviceTensor<float, 4> indices,
-  THCDeviceTensor<float, 4> output,
+  THCDeviceTensor<Dtype, 4> input,
+  THCDeviceTensor<THCIndex_t, 4> indices,
+  THCDeviceTensor<Dtype, 4> output,
   int kT, int kH, int kW,
   int dT, int dH, int dW,
-  int padT, int padH, int padW, 
+  int padT, int padH, int padW,
   int dilationT, int dilationH, int dilationW,
   int offsetZ)
 {
@@ -31,7 +35,7 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
     int maxRow = 0;
     int maxFrame = 0;
 
-    float max = -FLT_MAX;
+    Dtype max = THCNumerics<Dtype>::min();
 
     for (int frame = 0; frame < kT; ++frame)
     {
@@ -45,7 +49,7 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
             {
               if (iColumn + column * dilationW < input.getSize(3) && iColumn + column * dilationW >= 0)
               {
-                float val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW];
+                Dtype val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW];
 
                 if (max < val)
                 {
@@ -62,7 +66,7 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
     }
 
     output[slice][oFrame][oRow][oColumn] = max;
-    float *idx = &indices[slice][oFrame][oRow][oColumn];
+    THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn];
     ((unsigned char*)(idx))[0] = maxFrame;
     ((unsigned char*)(idx))[1] = maxRow;
     ((unsigned char*)(idx))[2] = maxColumn;
@@ -70,13 +74,13 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
   }
 }
 
-template <int KERNEL_WIDTH>
+template <int KERNEL_WIDTH, typename Dtype>
 __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
-  THCDeviceTensor<float, 4> input, THCDeviceTensor<float, 4> indices,
-  THCDeviceTensor<float, 4> output,
+  THCDeviceTensor<Dtype, 4> input, THCDeviceTensor<THCIndex_t, 4> indices,
+  THCDeviceTensor<Dtype, 4> output,
   int kT, int kH,
   int dT, int dH, int dW,
-  int padT, int padH, int padW, 
+  int padT, int padH, int padW,
   int dilationT, int dilationH, int dilationW,
   int offsetZ)
 {
@@ -95,7 +99,7 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
     int maxRow = 0;
     int maxFrame;
 
-    float max = -FLT_MAX;
+    Dtype max = THCNumerics<Dtype>::min();
 
     for (int frame = 0; frame < kT; ++frame)
     {
@@ -109,7 +113,7 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
             {
               if (iColumn + column * dilationW < input.getSize(3) && iColumn + column * dilationW >= 0)
               {
-                float val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW];
+                Dtype val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW];
 
                 if (max < val)
                 {
@@ -126,7 +130,7 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
     }
 
     output[slice][oFrame][oRow][oColumn] = max;
-    float *idx = &indices[slice][oFrame][oRow][oColumn];
+    THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn];
     ((unsigned char*)(idx))[0] = maxFrame;
     ((unsigned char*)(idx))[1] = maxRow;
     ((unsigned char*)(idx))[2] = maxColumn;
@@ -134,193 +138,13 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
   }
 }
 
-#define UPDATE_OUTPUT_KERNEL_WIDTH(KW) case KW:                         \
-  cuda_VolumetricDilatedMaxPooling_updateOutput<KW><<<grid, block,             \
-    0, THCState_getCurrentStream(state)>>>(                             \
-    cudaInput, cudaIndices, cudaOutput, kT, kH, dT, dH, dW, padT, padH, padW,\
-    dilationT, dilationH, dilationW, offsetZ); \
-    break
-
-
-void THNN_CudaVolumetricDilatedMaxPooling_updateOutput(
-  THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices,
-  int kT, int kW, int kH,
-  int dT, int dW, int dH,
-  int padT, int padW, int padH,
-  int dilationT, int dilationW, int dilationH,
-  bool ceilMode)
-{
-  int batchSize;
-  int inputSlices;
-  int inputTime;
-  int inputHeight;
-  int inputWidth;
-  int outputTime;
-  int outputHeight;
-  int outputWidth;
-
-  THCUNN_assertSameGPU(state, 3, input, indices, output);
-
-  if (THCudaTensor_nDimension(state, input) == 4)
-  {
-    THArgCheck(
-      THCudaTensor_size(state, input, 1) >= kT &&
-      THCudaTensor_size(state, input, 2) >= kH &&
-      THCudaTensor_size(state, input, 3) >= kW, 2,
-      "input image smaller than kernel size"
-    );
-
-    /* sizes */
-    batchSize   = 1;
-    inputSlices = THCudaTensor_size(state, input, 0);
-    inputTime   = THCudaTensor_size(state, input, 1);
-    inputHeight = THCudaTensor_size(state, input, 2);
-    inputWidth  = THCudaTensor_size(state, input, 3);
-  }
-  else if (THCudaTensor_nDimension(state, input) == 5)
-  {
-    THArgCheck(
-      THCudaTensor_size(state, input, 4) >= kW &&
-      THCudaTensor_size(state, input, 3) >= kH &&
-      THCudaTensor_size(state, input, 2) >= kT, 2,
-      "input image smaller than kernel size"
-    );
-
-    /* sizes */
-    batchSize   = THCudaTensor_size(state, input, 0);
-    inputSlices = THCudaTensor_size(state, input, 1);
-    inputTime   = THCudaTensor_size(state, input, 2);
-    inputHeight = THCudaTensor_size(state, input, 3);
-    inputWidth  = THCudaTensor_size(state, input, 4);
-  }
-  else
-  {
-    THArgCheck(false, 2, "4D or 5D tensor expected");
-  }
-
-  THArgCheck(kT/2 >= padT && kW/2 >= padW && kH/2 >= padH, 2,
-    "pad should be smaller than half of kernel size"
-  );
-
-  if (ceilMode)
-  {
-    outputTime   = (int)(ceil((float)(inputTime - (dilationT * (kT - 1) + 1) + 2*padT) / dT)) + 1;
-    outputHeight = (int)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
-    outputWidth  = (int)(ceil((float)(inputWidth  - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
-  }
-  else
-  {
-    outputTime   = (int)(floor((float)(inputTime - (dilationT * (kT - 1) + 1) + 2*padT) / dT)) + 1;
-    outputHeight = (int)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
-    outputWidth  = (int)(floor((float)(inputWidth  - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
-  }
-
-  if (outputTime < 1 || outputHeight < 1 || outputWidth < 1)
-    THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
-            inputSlices,inputTime,inputHeight,inputWidth,inputSlices,outputTime,outputHeight,outputWidth);
-
-  if (padT || padW || padH)
-  {
-    if ((outputTime - 1)*dT >= inputTime + padT)
-      --outputTime;
-    if ((outputHeight - 1)*dH >= inputHeight + padH)
-      --outputHeight;
-    if ((outputWidth  - 1)*dW >= inputWidth  + padW)
-      --outputWidth;
-  }
-
-  if (input->nDimension == 4) /* 4D */
-  {
-    /* resize output */
-    THCudaTensor_resize4d(state, output, inputSlices,
-                          outputTime, outputHeight, outputWidth);
-    /* indices pack ti,i,j locations for each output point as uchar into
-     each float of the tensor */
-    THCudaTensor_resize4d(state, indices, inputSlices,
-                          outputTime, outputHeight, outputWidth);
-  }
-  else
-  { /* 5D */
-    THCudaTensor_resize5d(state, output, batchSize, inputSlices,
-                          outputTime, outputHeight, outputWidth);
-    // Index tensor packs index offsets as uchars into floats
-    THCudaTensor_resize5d(state, indices, batchSize, inputSlices,
-                          outputTime, outputHeight, outputWidth);
-  }
-
-  input = THCudaTensor_newContiguous(state, input);
-
-  // Collapse batch and feature dimensions
-  THCDeviceTensor<float, 4> cudaInput;
-  THCDeviceTensor<float, 4> cudaOutput;
-  if (THCudaTensor_nDimension(state, input) == 4)
-  {
-    cudaInput  = toDeviceTensor<float, 4>(state, input);
-    cudaOutput = toDeviceTensor<float, 4>(state, output);
-  }
-  else
-  {
-    cudaInput  = toDeviceTensor<float, 5>(state, input).downcastOuter<4>();
-    cudaOutput = toDeviceTensor<float, 5>(state, output).downcastOuter<4>();
-  }
-
-  THLongStorage *indicesSize = THLongStorage_newWithSize(4);
-  long indicesSizeRaw[4] = { batchSize * inputSlices,
-                            outputTime, outputHeight, outputWidth };
-  THLongStorage_rawCopy(indicesSize, indicesSizeRaw);
-
-  THCudaTensor *indices1 = THCudaTensor_newWithStorage(
-    state, THCudaTensor_storage(state, indices),
-    THCudaTensor_storageOffset(state, indices),
-    indicesSize, NULL);
-
-  THLongStorage_free(indicesSize);
-
-  THCDeviceTensor<float, 4> cudaIndices =
-    toDeviceTensor<float, 4>(state, indices1);
-
-  int totalZ = outputTime * inputSlices * batchSize;
-  int offsetZ = 0;
-  dim3 block(32, 8);
-
-  while (totalZ > 0) {
-    dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)),
-              THCCeilDiv(outputHeight, static_cast<int>(block.y)),
-              totalZ > 65535 ? 65535 : totalZ);
-
-    switch (kW)
-      {
-        UPDATE_OUTPUT_KERNEL_WIDTH(1);
-        UPDATE_OUTPUT_KERNEL_WIDTH(2);
-        UPDATE_OUTPUT_KERNEL_WIDTH(3);
-        UPDATE_OUTPUT_KERNEL_WIDTH(4);
-        UPDATE_OUTPUT_KERNEL_WIDTH(5);
-        UPDATE_OUTPUT_KERNEL_WIDTH(6);
-        UPDATE_OUTPUT_KERNEL_WIDTH(7);
-      default:
-        cuda_VolumetricDilatedMaxPooling_updateOutput<<<grid, block,
-          0, THCState_getCurrentStream(state)>>>(
-                             cudaInput, cudaIndices, cudaOutput,
-                             kT, kH, kW, dT, dH, dW,
-                             padT, padH, padW, dilationT, dilationH, dilationW, offsetZ);
-      }
-    THCudaCheck(cudaGetLastError());
-    totalZ -= 65535;
-    offsetZ += 65535;
-  }
-
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, indices1);
-}
-
-#undef UPDATE_OUTPUT_KERNEL_WIDTH
-
+template <typename Dtype>
 __global__ void cuda_VolumetricDilatedMaxPooling_updateGradInput(
-  THCDeviceTensor<float, 4> gradOutput,
-  THCDeviceTensor<float, 4> indices,
-  THCDeviceTensor<float, 4> gradInput,
+  THCDeviceTensor<Dtype, 4> gradOutput,
+  THCDeviceTensor<THCIndex_t, 4> indices,
+  THCDeviceTensor<Dtype, 4> gradInput,
   int dT, int dH, int dW,
-  int padT, int padH, int padW, 
+  int padT, int padH, int padW,
   int dilationT, int dilationH, int dilationW,
   int offsetZ)
 {
@@ -331,7 +155,7 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateGradInput(
 
   if (oRow < gradOutput.getSize(2) && oColumn < gradOutput.getSize(3))
   {
-    float *idx = &indices[slice][oFrame][oRow][oColumn];
+    THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn];
     int iFrame  = ((unsigned char*)(idx))[0] * dilationT + oFrame  * dT - padT;
     int iRow    = ((unsigned char*)(idx))[1] * dilationH + oRow    * dH - padH;
     int iColumn = ((unsigned char*)(idx))[2] * dilationW + oColumn * dW - padW;
@@ -340,98 +164,5 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateGradInput(
   }
 }
 
-void THNN_CudaVolumetricDilatedMaxPooling_updateGradInput(
-  THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput,
-  THCudaTensor *indices,
-  int dT, int dW, int dH,
-  int padT, int padW, int padH,
-  int dilationT, int dilationW, int dilationH)
-{
-  // Resize and initialize result tensor.
-  THCudaTensor_resizeAs(state, gradInput, input);
-  THCudaTensor_zero(state, gradInput);
-
-  int batchSize;
-  int inputSlices;
-
-  int outputTime;
-  int outputHeight;
-  int outputWidth;
-
-  THCUNN_assertSameGPU(state, 4, input, indices, gradOutput, gradInput);
-
-  if (THCudaTensor_nDimension(state, input) == 4) /* 4D */
-  {
-    batchSize = 1;
-    inputSlices  = THCudaTensor_size(state, input, 0);
-
-    outputTime   = THCudaTensor_size(state, gradOutput, 1);
-    outputHeight = THCudaTensor_size(state, gradOutput, 2);
-    outputWidth  = THCudaTensor_size(state, gradOutput, 3);
-  }
-  else
-  {
-    batchSize    = THCudaTensor_size(state, input, 0);
-    inputSlices  = THCudaTensor_size(state, input, 1);
-
-    outputTime   = THCudaTensor_size(state, gradOutput, 2);
-    outputHeight = THCudaTensor_size(state, gradOutput, 3);
-    outputWidth  = THCudaTensor_size(state, gradOutput, 4);
-  }
-
-  gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-
-  // Collapse batch and feature dimensions
-  THCDeviceTensor<float, 4> cudaGradInput;
-  THCDeviceTensor<float, 4> cudaGradOutput;
-  if (THCudaTensor_nDimension(state, input) == 4)
-  {
-    cudaGradInput  = toDeviceTensor<float, 4>(state, gradInput);
-    cudaGradOutput = toDeviceTensor<float, 4>(state, gradOutput);
-  }
-  else
-  {
-    cudaGradInput =
-      toDeviceTensor<float, 5>(state, gradInput).downcastOuter<4>();
-    cudaGradOutput =
-      toDeviceTensor<float, 5>(state, gradOutput).downcastOuter<4>();
-  }
-
-  THLongStorage *indicesSize = THLongStorage_newWithSize(4);
-  long indicesSizeRaw[4] = { batchSize * inputSlices,
-                           outputTime, outputHeight, outputWidth };
-  THLongStorage_rawCopy(indicesSize, indicesSizeRaw);
-  THCudaTensor *indices1 = THCudaTensor_newWithStorage(
-    state, THCudaTensor_storage(state, indices),
-    THCudaTensor_storageOffset(state, indices), indicesSize, NULL);
-  THLongStorage_free(indicesSize);
-
-  THCDeviceTensor<float, 4> cudaIndices =
-    toDeviceTensor<float, 4>(state, indices1);
-
-  int totalZ = outputTime * inputSlices * batchSize;
-  int offsetZ = 0;
-  dim3 block(32, 8);
-
-  while (totalZ > 0) {
-    dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)),
-              THCCeilDiv(outputHeight, static_cast<int>(block.y)),
-              totalZ > 65535 ? 65535 : totalZ);
-
-    cuda_VolumetricDilatedMaxPooling_updateGradInput<<<grid, block,
-      0, THCState_getCurrentStream(state)>>>(
-                                             cudaGradOutput,
-                                             cudaIndices,
-                                             cudaGradInput,
-                                             dT, dH, dW,
-                                             padT, padH, padW, 
-                                             dilationT, dilationH, dilationW, offsetZ);
-    THCudaCheck(cudaGetLastError());
-    totalZ -= 65535;
-    offsetZ += 65535;
-  }
-
-  // cleanup
-  THCudaTensor_free(state, gradOutput);
-  THCudaTensor_free(state, indices1);
-}
+#include "generic/VolumetricDilatedMaxPooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/VolumetricFullConvolution.cu b/lib/THCUNN/VolumetricFullConvolution.cu
index ae414b7..93c4c0f 100644
--- a/lib/THCUNN/VolumetricFullConvolution.cu
+++ b/lib/THCUNN/VolumetricFullConvolution.cu
@@ -1,356 +1,8 @@
 #include "THCUNN.h"
 #include "common.h"
 #include "vol2col.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
-
-void THNN_CudaVolumetricFullConvolution_updateOutput(
-    THCState *state,
-    THCudaTensor *input,
-    THCudaTensor *output,
-    THCudaTensor *weight,
-    THCudaTensor *bias,
-    THCudaTensor *finput,
-    THCudaTensor *fgradInput,
-    int dT, int dW, int dH,
-    int padT, int padW, int padH,
-    int adjT, int adjW, int adjH)
-{
-
-  THCudaTensor *columns = finput;
-  THCudaTensor *ones    = fgradInput;
-
-  int nInputPlane = THCudaTensor_size(state, weight, 0);
-  int nOutputPlane = THCudaTensor_size(state, weight, 1);
-  const int kT           = (int)weight->size[2];
-  const int kH           = (int)weight->size[3];
-  const int kW           = (int)weight->size[4];
-
-  THCUNN_assertSameGPU(state, 6, input, output, weight,
-                                 bias, columns, ones);
-  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
-
-  int batch = 1;
-  if (input->nDimension == 4) {
-    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
-    // Force batch
-    batch = 0;
-    THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
-  } else {
-    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
-  }
-
-  long inputWidth   = input->size[4];
-  long inputHeight  = input->size[3];
-  long inputDepth  = input->size[2];
-  long outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
-  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
-  long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT;
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
-
-  // Resize output
-  THCudaTensor_resize5d(state, output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth);
-
-  // Resize temporary columns
-  THCudaTensor_resize2d(state, columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
-
-  // Define a buffer of ones, for bias accumulation
-  // Note: this buffer can be shared with other modules, it only ever gets increased,
-  // and always contains ones.
-  if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
-    // Resize plane and fill with ones...
-    THCudaTensor_resize3d(state, ones, outputDepth, outputHeight, outputWidth);
-    THCudaTensor_fill(state, ones, 1);
-  }
-
-  // Helpers
-  THCudaTensor *input_n = THCudaTensor_new(state);
-  THCudaTensor *output_n = THCudaTensor_new(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    // Matrix mulitply per output:
-    THCudaTensor_select(state, input_n, input, 0, elt);
-    THCudaTensor_select(state, output_n, output, 0, elt);
-
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
-    long n = columns->size[1];
-    long k = weight->size[0];
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THCudaBlas_Sgemm(
-        state,
-        'n', 't',
-        n, m, k,
-        1,
-        THCudaTensor_data(state, input_n), n,
-        THCudaTensor_data(state, weight), m,
-        0,
-        THCudaTensor_data(state, columns), n
-    );
-
-    // Unpack columns back into input:
-    col2vol(
-      THCState_getCurrentStream(state),
-      THCudaTensor_data(state, columns),
-      nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
-      1,1,1,
-      THCudaTensor_data(state, output_n)
-    );
-
-    // Do Bias after:
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m_ = nOutputPlane;
-    long n_ = outputDepth * outputHeight * outputWidth;
-    long k_ = 1;
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THCudaBlas_Sgemm(
-        state,
-        't', 'n',
-        n_, m_, k_,
-        1,
-        THCudaTensor_data(state, ones), k_,
-        THCudaTensor_data(state, bias), k_,
-        1,
-        THCudaTensor_data(state, output_n), n_
-    );
-
-  }
-
-  // Free
-  THCudaTensor_free(state, input_n);
-  THCudaTensor_free(state, output_n);
-
-  // Resize output
-  if (batch == 0) {
-    THCudaTensor_resize4d(state, output, nOutputPlane, outputDepth, outputHeight, outputWidth);
-    THCudaTensor_resize4d(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
-  }
-}
-
-void THNN_CudaVolumetricFullConvolution_updateGradInput(
-    THCState *state,
-    THCudaTensor *input,
-    THCudaTensor *gradOutput,
-    THCudaTensor *gradInput,
-    THCudaTensor *weight,
-    THCudaTensor *finput,
-    THCudaTensor *fgradInput,
-    int dT, int dW, int dH,
-    int padT, int padW, int padH,
-    int adjT, int adjW, int adjH)
-{
-  THCudaTensor *gradColumns = finput;
-
-  int nInputPlane = THCudaTensor_size(state, weight, 0);
-  int nOutputPlane = THCudaTensor_size(state, weight, 1);
-  const int kT           = (int)weight->size[2];
-  const int kH           = (int)weight->size[3];
-  const int kW           = (int)weight->size[4];
-
-  THCUNN_assertSameGPU(state, 5, input, gradOutput, weight,
-                                 gradColumns, gradInput);
-  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
-
-  int batch = 1;
-  if (input->nDimension == 4) {
-    // Force batch
-    batch = 0;
-    THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
-    THCudaTensor_resize5d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
-  }
-
-  long inputWidth   = input->size[4];
-  long inputHeight  = input->size[3];
-  long inputDepth   = input->size[2];
-  long outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
-  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
-  long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT;
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
-
-  // Resize output
-  THCudaTensor_resize5d(state, gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
-
-  // Resize temporary columns
-  THCudaTensor_resize2d(state, gradColumns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
-
-  // Helpers
-  THCudaTensor *gradInput_n = THCudaTensor_new(state);
-  THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    // Matrix mulitply per sample:
-    THCudaTensor_select(state, gradInput_n, gradInput, 0, elt);
-    THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
-    // Extract columns:
-    vol2col(
-      THCState_getCurrentStream(state),
-      THCudaTensor_data(state, gradOutput_n),
-      nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
-      1,1,1,
-      THCudaTensor_data(state, gradColumns)
-    );
-
-
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m = weight->size[0];
-    long n = gradColumns->size[1];
-    long k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THCudaBlas_Sgemm(
-        state,
-        'n', 'n',
-        n, m, k,
-        1,
-        THCudaTensor_data(state, gradColumns), n,
-        THCudaTensor_data(state, weight), k,
-        0,
-        THCudaTensor_data(state, gradInput_n), n
-    );
-  }
-
-
-  // Free
-  THCudaTensor_free(state, gradInput_n);
-  THCudaTensor_free(state, gradOutput_n);
-
-  // Resize output
-  if (batch == 0) {
-    THCudaTensor_resize4d(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
-    THCudaTensor_resize4d(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
-    THCudaTensor_resize4d(state, gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
-  }
-}
-
-
-void THNN_CudaVolumetricFullConvolution_accGradParameters(
-    THCState *state,
-    THCudaTensor *input,
-    THCudaTensor *gradOutput,
-    THCudaTensor *gradWeight,
-    THCudaTensor *gradBias,
-    THCudaTensor *finput,
-    THCudaTensor *fgradInput,
-    int dT, int dW, int dH,
-    int padT, int padW, int padH,
-    int adjT, int adjW, int adjH,
-    float scale)
-{
-  THCudaTensor *columns = finput;
-  THCudaTensor *ones = fgradInput;
-
-  int nInputPlane = THCudaTensor_size(state, gradWeight, 0);
-  int nOutputPlane = THCudaTensor_size(state, gradWeight, 1);
-  const int kT           = (int)gradWeight->size[2];
-  const int kH           = (int)gradWeight->size[3];
-  const int kW           = (int)gradWeight->size[4];
-
-  THCUNN_assertSameGPU(state, 6, input, gradOutput, gradWeight,
-                                 gradBias, columns, ones);
-  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
-
-  int batch = 1;
-  if (input->nDimension == 4) {
-    // Force batch
-    batch = 0;
-    THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
-    THCudaTensor_resize5d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
-  }
-
-  long inputWidth   = input->size[4];
-  long inputHeight  = input->size[3];
-  long inputDepth   = input->size[2];
-  long outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
-  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
-  long outputDepth  = (inputDepth - 1) * dT - 2*padT + kT + adjT;
-
-  // Batch size + input planes
-  long batchSize = input->size[0];
-
-  // Define a buffer of ones, for bias accumulation
-  if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
-    // Resize plane and fill with ones...
-    THCudaTensor_resize3d(state, ones, outputDepth, outputHeight, outputWidth);
-    THCudaTensor_fill(state, ones, 1);
-  }
-
-  // Resize temporary columns
-  THCudaTensor_resize2d(state, columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
-
-  // Helpers
-  THCudaTensor *input_n = THCudaTensor_new(state);
-  THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
-  // For each elt in batch, do:
-  for (int elt = 0; elt < batchSize; elt ++) {
-    // Matrix mulitply per output:
-    THCudaTensor_select(state, input_n, input, 0, elt);
-    THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
-    // Extract columns:
-    vol2col(
-      THCState_getCurrentStream(state),
-      THCudaTensor_data(state, gradOutput_n),
-      nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
-      1,1,1,
-      THCudaTensor_data(state, columns)
-    );
-
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long n = columns->size[0];   // nOutputPlane * kt * kh * kw
-    long m = input_n->size[0];   // nInputPlane
-    long k = columns->size[1];   // inputHeight * inputWidth
-
-    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
-    THCudaBlas_Sgemm(
-        state,
-        't', 'n',
-        n, m, k,
-        scale,
-        THCudaTensor_data(state, columns), k,
-        THCudaTensor_data(state, input_n), k,
-        1,
-        THCudaTensor_data(state, gradWeight), n
-    );
-
-    // Do Bias:
-    // M,N,K are dims of matrix A and B
-    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
-    long m_ = nOutputPlane;
-    long k_ = outputDepth * outputHeight * outputWidth;
-
-    // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
-    THCudaBlas_Sgemv(
-        state,
-        't',
-        k_, m_,
-        scale,
-        THCudaTensor_data(state, gradOutput_n), k_,
-        THCudaTensor_data(state, ones), 1,
-        1,
-        THCudaTensor_data(state, gradBias), 1
-    );
-  }
-
-  // Free
-  THCudaTensor_free(state, input_n);
-  THCudaTensor_free(state, gradOutput_n);
-
-  // Resize
-  if (batch == 0) {
-    THCudaTensor_resize4d(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
-    THCudaTensor_resize4d(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
-  }
-}
+#include "generic/VolumetricFullConvolution.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/VolumetricMaxPooling.cu b/lib/THCUNN/VolumetricMaxPooling.cu
index 892bef9..2f7de7b 100644
--- a/lib/THCUNN/VolumetricMaxPooling.cu
+++ b/lib/THCUNN/VolumetricMaxPooling.cu
@@ -6,27 +6,5 @@
 
 #include <cfloat>
 
-void THNN_CudaVolumetricMaxPooling_updateOutput(
-  THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices,
-  int kT, int kW, int kH,
-  int dT, int dW, int dH,
-  int padT, int padW, int padH,
-  bool ceilMode)
-{
-  THNN_CudaVolumetricDilatedMaxPooling_updateOutput(
-    state, input, output, indices,
-    kT, kW, kH, dT, dW, dH, padT, padW, padH, 1, 1, 1, ceilMode);
-
-}
-
-void THNN_CudaVolumetricMaxPooling_updateGradInput(
-  THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput,
-  THCudaTensor *indices,
-  int dT, int dW, int dH,
-  int padT, int padW, int padH)
-{
-  THNN_CudaVolumetricDilatedMaxPooling_updateGradInput(
-    state, input, gradOutput, gradInput, indices,
-    dT, dW, dH, padT, padW, padH, 1, 1, 1);
-
-}
+#include "generic/VolumetricMaxPooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/VolumetricMaxUnpooling.cu b/lib/THCUNN/VolumetricMaxUnpooling.cu
index b583211..5d2b8b4 100644
--- a/lib/THCUNN/VolumetricMaxUnpooling.cu
+++ b/lib/THCUNN/VolumetricMaxUnpooling.cu
@@ -3,13 +3,16 @@
 #include "THCDeviceTensor.cuh"
 #include "THCDeviceTensorUtils.cuh"
 #include "THCDeviceUtils.cuh"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
 
 #include <cfloat>
 
+template <typename Dtype>
 __global__ void cuda_VolumetricMaxUnpooling_updateOutput(
-  THCDeviceTensor<float, 4> input,
-  THCDeviceTensor<float, 4> indices,
-  THCDeviceTensor<float, 4> output,
+  THCDeviceTensor<Dtype, 4> input,
+  THCDeviceTensor<THCIndex_t, 4> indices,
+  THCDeviceTensor<Dtype, 4> output,
   int dT, int dH, int dW,
   int padT, int padH, int padW, int offsetZ)
 {
@@ -24,9 +27,9 @@ __global__ void cuda_VolumetricMaxUnpooling_updateOutput(
     long start_h = iRow * dH - padH;
     long start_w = iColumn * dW - padW;
 
-    float val = input[slice][iFrame][iRow][iColumn];
-    
-    float *idx = &indices[slice][iFrame][iRow][iColumn];
+    Dtype val = input[slice][iFrame][iRow][iColumn];
+
+    THCIndex_t *idx = &indices[slice][iFrame][iRow][iColumn];
     long maxz = ((unsigned char*)(idx))[0];
     long maxy = ((unsigned char*)(idx))[1];
     long maxx = ((unsigned char*)(idx))[2];
@@ -34,104 +37,11 @@ __global__ void cuda_VolumetricMaxUnpooling_updateOutput(
   }
 }
 
-void THNN_CudaVolumetricMaxUnpooling_updateOutput(
-  THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices,
-  int outputTime, int outputWidth, int outputHeight,
-  int dT, int dW, int dH,
-  int padT, int padW, int padH)
-{
-  int batchSize;
-  int inputSlices;
-  int inputTime;
-  int inputHeight;
-  int inputWidth;
-
-  THCUNN_assertSameGPU(state, 3, input, indices, output);
-
-  if (THCudaTensor_nDimension(state, input) == 4)
-  {
-    /* sizes */
-    batchSize   = 1;
-    inputSlices = THCudaTensor_size(state, input, 0);
-    inputTime   = THCudaTensor_size(state, input, 1);
-    inputHeight = THCudaTensor_size(state, input, 2);
-    inputWidth  = THCudaTensor_size(state, input, 3);
-  }
-  else if (THCudaTensor_nDimension(state, input) == 5)
-  {
-    /* sizes */
-    batchSize   = THCudaTensor_size(state, input, 0);
-    inputSlices = THCudaTensor_size(state, input, 1);
-    inputTime   = THCudaTensor_size(state, input, 2);
-    inputHeight = THCudaTensor_size(state, input, 3);
-    inputWidth  = THCudaTensor_size(state, input, 4);
-  }
-  else
-  {
-    THArgCheck(false, 2, "4D or 5D tensor expected");
-  }
-
-  if (input->nDimension == 4) /* 4D */
-  {
-    /* resize output */
-    THCudaTensor_resize4d(state, output, inputSlices,
-                          outputTime, outputHeight, outputWidth);
-  }
-  else
-  { /* 5D */
-    THCudaTensor_resize5d(state, output, batchSize, inputSlices,
-                          outputTime, outputHeight, outputWidth);
-  }
-
-  input = THCudaTensor_newContiguous(state, input);
-  indices = THCudaTensor_newContiguous(state, indices);
-  THCudaTensor_zero(state, output);
-  
-  // Collapse batch and feature dimensions
-  THCDeviceTensor<float, 4> cudaInput;
-  THCDeviceTensor<float, 4> cudaOutput;
-  THCDeviceTensor<float, 4> cudaIndices;
-
-  if (THCudaTensor_nDimension(state, input) == 4)
-  {
-    cudaInput  = toDeviceTensor<float, 4>(state, input);
-    cudaOutput = toDeviceTensor<float, 4>(state, output);
-    cudaIndices = toDeviceTensor<float, 4>(state, indices);
-  }
-  else
-  {
-    cudaInput  = toDeviceTensor<float, 5>(state, input).downcastOuter<4>();
-    cudaOutput = toDeviceTensor<float, 5>(state, output).downcastOuter<4>();
-    cudaIndices = toDeviceTensor<float, 5>(state, indices).downcastOuter<4>();
-  }
-
-  int totalZ = inputTime * inputSlices * batchSize;
-  int offsetZ = 0;
-  dim3 block(32, 8);
-
-  while (totalZ > 0) {
-    dim3 grid(THCCeilDiv(inputWidth, static_cast<int>(block.x)),
-              THCCeilDiv(inputHeight, static_cast<int>(block.y)),
-              totalZ > 65535 ? 65535 : totalZ);
-
-    cuda_VolumetricMaxUnpooling_updateOutput<<<grid, block,
-          0, THCState_getCurrentStream(state)>>>(
-                             cudaInput, cudaIndices, cudaOutput,
-                             dT, dH, dW,
-                             padT, padH, padW, offsetZ);
-    THCudaCheck(cudaGetLastError());
-    totalZ -= 65535;
-    offsetZ += 65535;
-  }
-
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, indices);
-}
-
+template <typename Dtype>
 __global__ void cuda_VolumetricMaxUnpooling_updateGradInput(
-  THCDeviceTensor<float, 4> gradOutput,
-  THCDeviceTensor<float, 4> indices,
-  THCDeviceTensor<float, 4> gradInput,
+  THCDeviceTensor<Dtype, 4> gradOutput,
+  THCDeviceTensor<THCIndex_t, 4> indices,
+  THCDeviceTensor<Dtype, 4> gradInput,
   int dT, int dH, int dW,
   int padT, int padH, int padW, int offsetZ)
 {
@@ -142,105 +52,21 @@ __global__ void cuda_VolumetricMaxUnpooling_updateGradInput(
 
   if (iRow < gradInput.getSize(2) && iColumn < gradInput.getSize(3))
   {
-    
+
     long start_t = iFrame * dT - padT;
     long start_h = iRow * dH - padH;
     long start_w = iColumn * dW - padW;
 
-    float *idx = &indices[slice][iFrame][iRow][iColumn];
+    THCIndex_t *idx = &indices[slice][iFrame][iRow][iColumn];
     long maxz = ((unsigned char*)(idx))[0];
     long maxy = ((unsigned char*)(idx))[1];
     long maxx = ((unsigned char*)(idx))[2];
 
-    float grad_val = gradOutput[slice][start_t + maxz][start_h + maxy][start_w + maxx];
+    Dtype grad_val = gradOutput[slice][start_t + maxz][start_h + maxy][start_w + maxx];
 
     gradInput[slice][iFrame][iRow][iColumn] = grad_val;
   }
 }
 
-void THNN_CudaVolumetricMaxUnpooling_updateGradInput(
-  THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput,
-  THCudaTensor *indices,
-  int outputTime, int outputWidth, int outputHeight,
-  int dT, int dW, int dH,
-  int padT, int padW, int padH)
-{
-  
-  int batchSize;
-  int inputSlices;
-  int inputTime;
-  int inputHeight;
-  int inputWidth;
-  
-  THCUNN_assertSameGPU(state, 4, input, indices, gradOutput, gradInput);
-
-  if (THCudaTensor_nDimension(state, input) == 4) /* 4D */
-  {
-    batchSize = 1;
-    inputSlices  = THCudaTensor_size(state, input, 0);
-    inputTime   = THCudaTensor_size(state, input, 1);
-    inputHeight = THCudaTensor_size(state, input, 2);
-    inputWidth  = THCudaTensor_size(state, input, 3);
-  }
-  else
-  {
-    batchSize    = THCudaTensor_size(state, input, 0);
-    inputSlices  = THCudaTensor_size(state, input, 1);
-    inputTime   = THCudaTensor_size(state, input, 2);
-    inputHeight = THCudaTensor_size(state, input, 3);
-    inputWidth  = THCudaTensor_size(state, input, 4);
-  }
-
-  input = THCudaTensor_newContiguous(state, input);
-  indices = THCudaTensor_newContiguous(state, indices);
-  gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-  THCudaTensor_resizeAs(state, gradInput, input);
-  THCudaTensor_zero(state, gradInput);
-
-  // Collapse batch and feature dimensions
-  THCDeviceTensor<float, 4> cudaGradInput;
-  THCDeviceTensor<float, 4> cudaGradOutput;
-  THCDeviceTensor<float, 4> cudaIndices;
-  
-  if (THCudaTensor_nDimension(state, input) == 4)
-  {
-    cudaGradInput  = toDeviceTensor<float, 4>(state, gradInput);
-    cudaGradOutput = toDeviceTensor<float, 4>(state, gradOutput);
-    cudaIndices = toDeviceTensor<float, 4>(state, indices);
-  }
-  else
-  {
-    cudaGradInput =
-      toDeviceTensor<float, 5>(state, gradInput).downcastOuter<4>();
-    cudaGradOutput =
-      toDeviceTensor<float, 5>(state, gradOutput).downcastOuter<4>();
-    cudaIndices =
-      toDeviceTensor<float, 5>(state, indices).downcastOuter<4>();
-  }
-
-  int totalZ = inputTime * inputSlices * batchSize;
-  int offsetZ = 0;
-  dim3 block(32, 8);
-
-  while (totalZ > 0) {
-    dim3 grid(THCCeilDiv(inputWidth, static_cast<int>(block.x)),
-              THCCeilDiv(inputHeight, static_cast<int>(block.y)),
-              totalZ > 65535 ? 65535 : totalZ);
-
-    cuda_VolumetricMaxUnpooling_updateGradInput<<<grid, block,
-      0, THCState_getCurrentStream(state)>>>(
-                                             cudaGradOutput,
-                                             cudaIndices,
-                                             cudaGradInput,
-                                             dT, dH, dW,
-                                             padT, padH, padW, offsetZ);
-    THCudaCheck(cudaGetLastError());
-    totalZ -= 65535;
-    offsetZ += 65535;
-  }
-
-  // cleanup
-  THCudaTensor_free(state, input);
-  THCudaTensor_free(state, gradOutput);
-  THCudaTensor_free(state, indices);
-}
-\ No newline at end of file
+#include "generic/VolumetricMaxUnpooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/VolumetricReplicationPadding.cu b/lib/THCUNN/VolumetricReplicationPadding.cu
index cca51d5..39c37b4 100644
--- a/lib/THCUNN/VolumetricReplicationPadding.cu
+++ b/lib/THCUNN/VolumetricReplicationPadding.cu
@@ -4,10 +4,14 @@
 #include "THCDeviceTensorUtils.cuh"
 #include "THCDeviceUtils.cuh"
 #include "THCReduceApplyUtils.cuh"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
 
+template <typename Dtype>
 __global__ void VolumetricReplicationPadding_updateOutput(
-  THCDeviceTensor<float, 5> input,
-  THCDeviceTensor<float, 5> output,
+  THCDeviceTensor<Dtype, 5> input,
+  THCDeviceTensor<Dtype, 5> output,
   int pfront, int pback, int ptop, int pbottom, int pleft, int pright) {
 
   int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
@@ -35,76 +39,15 @@ __global__ void VolumetricReplicationPadding_updateOutput(
   int inputPointZ = min(max(pfront, outputPointZ),
                         input.getSize(2) + pfront - 1) - oStartZ + iStartZ;
 
-  float valueToCopy =
+  Dtype valueToCopy =
       input[batch][plane][inputPointZ][inputPointY][inputPointX];
   output[batch][plane][outputPointZ][outputPointY][outputPointX] = valueToCopy;
 }
 
-void THNN_CudaVolumetricReplicationPadding_updateOutput(THCState *state,
-                                                        THCudaTensor *input,
-                                                        THCudaTensor *output,
-                                                        int pleft, int pright,
-                                                        int ptop, int pbottom,
-                                                        int pfront, int pback) {
-  THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input), 2,
-             "input tensor must fit into 32-bit index math");
-
-  int planeDim = 0;
-  int dimd = 1;
-  int dimh = 2;
-  int dimw = 3;
-  int numBatch = 1;
-
-  int numInputDims = THCudaTensor_nDimension(state, input);
-  THArgCheck(numInputDims == 4 || numInputDims == 5, 2,
-             "input must be 4 or 5-dimensional");
-
-  if (numInputDims == 5) {
-    numBatch = THCudaTensor_size(state, input, 0);
-    planeDim++;
-    dimd++;
-    dimh++;
-    dimw++;
-  }
-
-  int numPlanes = THCudaTensor_size(state, input, planeDim);
-  int inputD = THCudaTensor_size(state, input, dimd);
-  int inputH = THCudaTensor_size(state, input, dimh);
-  int inputW = THCudaTensor_size(state, input, dimw);
-  int outputD = inputD + pfront + pback;
-  int outputH = inputH + ptop + pbottom;
-  int outputW  = inputW + pleft + pright;
-
-  THCDeviceTensor<float, 5> devInput;
-  THCDeviceTensor<float, 5> devOutput;
-
-  if (numInputDims == 4) {
-    THCudaTensor_resize4d(state, output, numPlanes, outputD, outputH, outputW);
-
-    devInput = toDeviceTensor<float, 4>(state, input).upcastOuter<5>();
-    devOutput = toDeviceTensor<float, 4>(state, output).upcastOuter<5>();
-  } else {
-    THCudaTensor_resize5d(state, output, numBatch, numPlanes, outputD, outputH,
-                          outputW);
-
-    devInput = toDeviceTensor<float, 5>(state, input);
-    devOutput = toDeviceTensor<float, 5>(state, output);
-  }
-
-  int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3) *
-      devOutput.getSize(4);
-  dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
-            devOutput.getSize(1),
-            devOutput.getSize(0));
-  dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
-
-  VolumetricReplicationPadding_updateOutput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
-    devInput, devOutput, pfront, pback, ptop, pbottom, pleft, pright);
-}
-
+template <typename Dtype>
 __global__ void VolumetricReplicationPadding_updateGradInput(
-  THCDeviceTensor<float, 5> gradInput,
-  THCDeviceTensor<float, 5> gradOutput,
+  THCDeviceTensor<Dtype, 5> gradInput,
+  THCDeviceTensor<Dtype, 5> gradOutput,
   int pfront, int pback, int ptop, int pbottom, int pleft, int pright) {
   int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
   int plane = blockIdx.y;
@@ -134,56 +77,12 @@ __global__ void VolumetricReplicationPadding_updateGradInput(
   int inputPointZ = min(max(pfront, outputPointZ),
                         gradInput.getSize(2) + pfront - 1) - oStartZ + iStartZ;
 
-  float valueToCopy =
+  Dtype valueToCopy =
       gradOutput[batch][plane][outputPointZ][outputPointY][outputPointX];
   atomicAdd(&gradInput[batch][plane][inputPointZ][inputPointY][inputPointX],
             valueToCopy);
 }
 
-void THNN_CudaVolumetricReplicationPadding_updateGradInput(
-  THCState *state, THCudaTensor *input, THCudaTensor *gradOutput,
-  THCudaTensor *gradInput, int pleft, int pright, int ptop, int pbottom,
-  int pfront, int pback) {
-  THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input), 2,
-             "input tensor must fit into 32-bit index math");
-  THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, gradOutput),
-             3, "output gradient tensor must fit into 32-bit index math");
-
-  int planeDim = 0;
-  int dimd = 1;
-  int dimh = 2;
-  int dimw = 3;
-
-  int numInputDims = THCudaTensor_nDimension(state, input);
-  if (numInputDims == 5) {
-    planeDim++;
-    dimd++;
-    dimh++;
-    dimw++;
-  }
-
-  THCudaTensor_resizeAs(state, gradInput, input);
-  THCudaTensor_zero(state, gradInput);
 
-  THCDeviceTensor<float, 5> devGradInput;
-  THCDeviceTensor<float, 5> devGradOutput;
-
-  if (numInputDims == 4) {
-    devGradInput = toDeviceTensor<float, 4>(state, gradInput).upcastOuter<5>();
-    devGradOutput =
-        toDeviceTensor<float, 4>(state, gradOutput).upcastOuter<5>();
-  } else {
-    devGradInput = toDeviceTensor<float, 5>(state, gradInput);
-    devGradOutput = toDeviceTensor<float, 5>(state, gradOutput);
-  }
-
-  int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3) *
-      devGradOutput.getSize(4);
-  dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
-            devGradOutput.getSize(1),
-            devGradOutput.getSize(0));
-  dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
-
-  VolumetricReplicationPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
-    devGradInput, devGradOutput, pfront, pback, ptop, pbottom, pleft, pright);
-}
+#include "generic/VolumetricReplicationPadding.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/common.h b/lib/THCUNN/common.h
index e097535..12fbdd1 100644
--- a/lib/THCUNN/common.h
+++ b/lib/THCUNN/common.h
@@ -8,6 +8,10 @@
 #define THCUNN_assertSameGPU(...) THAssertMsg(THCudaTensor_checkGPU(__VA_ARGS__), \
   "Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
 
+// _generic can be removed once everything is genericized
+#define THCUNN_assertSameGPU_generic(...) THAssertMsg(THCTensor_(checkGPU)(__VA_ARGS__), \
+  "Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
+
 // Use 1024 threads per block, which requires cuda sm_2x or above
 const int CUDA_NUM_THREADS = 1024;
 
@@ -17,4 +21,12 @@ inline int GET_BLOCKS(const int N)
   return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
 }
 
+// Resizes index tensor I1 to the size of I2 if they differ; do/while(0) makes it one statement.
+#define THCUNN_resizeAs_indices(STATE, I1, I2)               \
+  do {                                                       \
+    THLongStorage *size2 = THCTensor_(newSizeOf)(STATE, I2); \
+    if (!THCudaLongTensor_isSize(STATE, I1, size2))          \
+      THCudaLongTensor_resize(STATE, I1, size2, NULL);       \
+    THLongStorage_free(size2);                               \
+  } while (0)
 #endif
diff --git a/lib/THCUNN/generic/Abs.cu b/lib/THCUNN/generic/Abs.cu
new file mode 100644
index 0000000..dc1e9a2
--- /dev/null
+++ b/lib/THCUNN/generic/Abs.cu
@@ -0,0 +1,28 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/Abs.cu"
+#else
+
+#include "../common.h"
+
+// Forward pass of Abs: output is resized to match input, then
+// absupdateOutput_functor is applied element-wise over (output, input).
+void THNN_(Abs_updateOutput)(THCState *state, THCTensor *input, THCTensor *output)
+{
+  // Both tensors must be on the same GPU before the functor is applied.
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+  THCTensor_(resizeAs)(state, output, input);
+  THC_pointwiseApply2(state, output, input, absupdateOutput_functor<real>());
+}
+
+// Backward pass of Abs: gradInput is resized to match input, then
+// absupdateGradInput_functor is applied element-wise over (gradInput, input, gradOutput).
+void THNN_(Abs_updateGradInput)(
+           THCState *state, THCTensor *input, THCTensor *gradOutput, THCTensor *gradInput)
+{
+  // All three tensors must be on the same GPU before the functor is applied.
+  THCUNN_assertSameGPU_generic(state, 3, input, gradOutput, gradInput);
+  THCTensor_(resizeAs)(state, gradInput, input);
+  THC_pointwiseApply3(state, gradInput, input, gradOutput, absupdateGradInput_functor<real>());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/AbsCriterion.cu b/lib/THCUNN/generic/AbsCriterion.cu
new file mode 100644
index 0000000..904fbba
--- /dev/null
+++ b/lib/THCUNN/generic/AbsCriterion.cu
@@ -0,0 +1,59 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/AbsCriterion.cu"
+#else
+
+// Computes the AbsCriterion loss: an inner_product reduction over (input, target) using
+// abs_functor and thrust::plus, optionally divided by the element count, stored in output[0].
+void THNN_(AbsCriterion_updateOutput)(
+           THCState *state, THCTensor *input, THCTensor *target,
+           THCTensor *output, bool sizeAverage)
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, target);
+  const long numel = THCTensor_(nElement)(state, input);
+
+  // Work on contiguous tensors; both are released again below.
+  input = THCTensor_(newContiguous)(state, input);
+  target = THCTensor_(newContiguous)(state, target);
+
+  thrust::device_ptr<real> inputBegin(THCTensor_(data)(state, input));
+  thrust::device_ptr<real> targetBegin(THCTensor_(data)(state, target));
+  accreal loss = thrust::inner_product(
+    inputBegin, inputBegin + numel, targetBegin,
+    (accreal)0, thrust::plus<accreal>(), abs_functor<real, accreal>());
+  if (sizeAverage) {
+    loss /= numel;
+  }
+
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, target);
+  THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(loss));
+}
+
+// Fills gradInput (resized to input's shape) by transforming (input, target) pairs with
+// abs_updateGradInput_functor, constructed with 1/size when sizeAverage is set, else 1.
+void THNN_(AbsCriterion_updateGradInput)(
+           THCState *state, THCTensor *input, THCTensor *target,
+           THCTensor *gradInput, bool sizeAverage)
+{
+  THCUNN_assertSameGPU_generic(state, 3, input, target, gradInput);
+
+  const long numel = THCTensor_(nElement)(state, input);
+  const double scale = sizeAverage ? 1./numel : 1.;
+  real norm = ScalarConvert<double, real>::to(scale);
+
+  // Work on contiguous tensors; both are released again below.
+  input = THCTensor_(newContiguous)(state, input);
+  target = THCTensor_(newContiguous)(state, target);
+  THCTensor_(resizeAs)(state, gradInput, input);
+
+  thrust::device_ptr<real> inputBegin(THCTensor_(data)(state, input));
+  thrust::device_ptr<real> targetBegin(THCTensor_(data)(state, target));
+  thrust::device_ptr<real> gradBegin(THCTensor_(data)(state, gradInput));
+  thrust::transform(inputBegin, inputBegin + numel, targetBegin, gradBegin,
+                    abs_updateGradInput_functor<real>(norm));
+
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, target);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/BCECriterion.cu b/lib/THCUNN/generic/BCECriterion.cu
new file mode 100644
index 0000000..a6f76d6
--- /dev/null
+++ b/lib/THCUNN/generic/BCECriterion.cu
@@ -0,0 +1,99 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/BCECriterion.cu"
+#else
+
+// Computes the BCE loss with a thrust::transform_reduce over zipped (input, target[, weights])
+// iterators, optionally divides it by the element count, and stores it in output[0].
+// weights may be NULL, in which case the unweighted functor is used.
+void THNN_(BCECriterion_updateOutput)(
+           THCState *state, THCTensor *input, THCTensor *target,
+           THCTensor *output, bool sizeAverage, THCTensor *weights)
+{
+  THCUNN_assertSameGPU_generic(state, 3, input, target, weights);
+
+  const long numel = THCTensor_(nElement)(state, input);
+
+  input = THCTensor_(newContiguous)(state, input);
+  target = THCTensor_(newContiguous)(state, target);
+
+  thrust::device_ptr<real> inputBegin(THCTensor_(data)(state, input));
+  thrust::device_ptr<real> targetBegin(THCTensor_(data)(state, target));
+
+  accreal loss;
+  if (!weights) {
+    loss = thrust::transform_reduce(
+      thrust::make_zip_iterator(thrust::make_tuple(inputBegin, targetBegin)),
+      thrust::make_zip_iterator(thrust::make_tuple(inputBegin + numel, targetBegin + numel)),
+      bce_functor<real, accreal>(),
+      (accreal) 0,
+      thrust::plus<accreal>()
+    );
+  } else {
+    // The contiguous weights tensor is acquired and released around the reduction.
+    weights = THCTensor_(newContiguous)(state, weights);
+    thrust::device_ptr<real> weightsBegin(THCTensor_(data)(state, weights));
+    loss = thrust::transform_reduce(
+      thrust::make_zip_iterator(thrust::make_tuple(inputBegin, targetBegin, weightsBegin)),
+      thrust::make_zip_iterator(thrust::make_tuple(inputBegin + numel, targetBegin + numel, weightsBegin + numel)),
+      bce_functor_weights<real, accreal>(),
+      (accreal) 0,
+      thrust::plus<accreal>()
+    );
+    THCTensor_(free)(state, weights);
+  }
+
+  if (sizeAverage) {
+    loss /= numel;
+  }
+
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, target);
+  THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(loss));
+}
+
+// Fills gradInput (resized to input's shape) with a thrust::transform over zipped
+// (input, target[, weights]) iterators. norm is 1/size when sizeAverage is set, else 1.
+// weights may be NULL, in which case the unweighted functor is used.
+void THNN_(BCECriterion_updateGradInput)(
+           THCState *state, THCTensor *input, THCTensor *target,
+           THCTensor *gradInput, bool sizeAverage, THCTensor *weights)
+{
+  THCUNN_assertSameGPU_generic(state, 4, input, target, gradInput, weights);
+
+  const long numel = THCTensor_(nElement)(state, input);
+  const accreal scale = sizeAverage ? accreal(1)/numel : accreal(1);
+  real norm = ScalarConvert<accreal, real>::to(scale);
+
+  input = THCTensor_(newContiguous)(state, input);
+  target = THCTensor_(newContiguous)(state, target);
+  THCTensor_(resizeAs)(state, gradInput, input);
+
+  thrust::device_ptr<real> inputBegin(THCTensor_(data)(state, input));
+  thrust::device_ptr<real> targetBegin(THCTensor_(data)(state, target));
+  thrust::device_ptr<real> gradBegin(THCTensor_(data)(state, gradInput));
+
+  if (!weights) {
+    thrust::transform(
+      thrust::make_zip_iterator(thrust::make_tuple(inputBegin, targetBegin)),
+      thrust::make_zip_iterator(thrust::make_tuple(inputBegin + numel, targetBegin + numel)),
+      gradBegin,
+      bce_updateGradInput_functor<real, accreal>(norm)
+    );
+  } else {
+    // The contiguous weights tensor is acquired and released around the transform.
+    weights = THCTensor_(newContiguous)(state, weights);
+    thrust::device_ptr<real> weightsBegin(THCTensor_(data)(state, weights));
+    thrust::transform(
+      thrust::make_zip_iterator(thrust::make_tuple(inputBegin, targetBegin, weightsBegin)),
+      thrust::make_zip_iterator(thrust::make_tuple(inputBegin + numel, targetBegin + numel, weightsBegin + numel)),
+      gradBegin,
+      bce_updateGradInput_functor_weights<real, accreal>(norm)
+    );
+    THCTensor_(free)(state, weights);
+  }
+
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, target);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/BatchNormalization.cu b/lib/THCUNN/generic/BatchNormalization.cu
new file mode 100644
index 0000000..80bb937
--- /dev/null
+++ b/lib/THCUNN/generic/BatchNormalization.cu
@@ -0,0 +1,97 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/BatchNormalization.cu"
+#else
+
+#define DeviceTensor3 THCDeviceTensor<real, 3>
+#define DeviceTensor1 THCDeviceTensor<real, 1>
+
+// Builds a Dim-dimensional device view of t (an empty device tensor when t is NULL).
+template <int Dim>
+static THCDeviceTensor<real, Dim> devicetensor(THCState *state, THCTensor *t) {
+  if (!t) {
+    return THCDeviceTensor<real, Dim>();
+  }
+
+  const int inDim = THCTensor_(nDimension)(state, t);
+  if (inDim == Dim) {
+    return toDeviceTensor<real, Dim>(state, t);
+  }
+
+  // Rank mismatch: reinterpret the contiguous data with a synthesized size array.
+  THAssert(THCTensor_(isContiguous)(state, t));
+  int size[Dim];
+  // Leading dimensions are copied from t; missing trailing ones are set to 1.
+  for (int d = 0; d < Dim; ++d) {
+    size[d] = (d < inDim) ? t->size[d] : 1;
+  }
+  // Dimensions of t beyond Dim are multiplied into the last output dimension.
+  for (int d = Dim; d < inDim; ++d) {
+    size[Dim - 1] *= t->size[d];
+  }
+  return THCDeviceTensor<real, Dim>(THCTensor_(data)(state, t), size);
+}
+
+// Views input/output as 3-D and the remaining tensors as 1-D device tensors, then launches
+// the inference kernel (train == false) or the training kernel on the current stream.
+void THNN_(BatchNormalization_updateOutput)(
+  THCState *state, THCTensor *input_, THCTensor *output_,
+  THCTensor *weight_, THCTensor *bias_, THCTensor *runningMean_,
+  THCTensor *runningVar_, THCTensor *saveMean_, THCTensor *saveStd_,
+  bool train, double momentum, double eps) {
+
+  DeviceTensor3 input = devicetensor<3>(state, input_);
+  DeviceTensor3 output = devicetensor<3>(state, output_);
+  DeviceTensor1 weight = devicetensor<1>(state, weight_);
+  DeviceTensor1 bias = devicetensor<1>(state, bias_);
+  DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_);
+  DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_);
+  DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_);
+  DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_);
+
+  cudaStream_t s = THCState_getCurrentStream(state);
+  // Same launch shape for both kernels: one block per index of input dimension 1.
+  dim3 blocks(input.getSize(1));
+  dim3 threads(getNumThreads(input.getSize(2)));
+
+  if (!train) {
+    BatchNormalizationUpdateOutputInference_kernel<real, accreal, DeviceTensor1, DeviceTensor3> <<<blocks, threads, 0, s>>>(
+      input, output, runningMean, runningVar, weight, bias, eps);
+  } else {
+    BatchNormalizationUpdateOutput_kernel<real, accreal, DeviceTensor1, DeviceTensor3> <<<blocks, threads, 0, s>>>(
+      input, output, weight, bias, eps, momentum, runningMean, runningVar,
+      saveMean, saveStd);
+  }
+  THCudaCheck(cudaGetLastError());
+}
+
+// Wraps each tensor as a device tensor and launches the backward kernel on the current stream.
+void THNN_(BatchNormalization_backward)(
+  THCState *state, THCTensor *input_, THCTensor *gradOutput_,
+  THCTensor *gradInput_, THCTensor *gradWeight_, THCTensor *gradBias_,
+  THCTensor *weight_, THCTensor *runningMean_, THCTensor *runningVar_,
+  THCTensor *saveMean_, THCTensor *saveStd_, bool train, float scale, double eps) {
+
+  DeviceTensor3 input = devicetensor<3>(state, input_);
+  DeviceTensor3 gradOutput = devicetensor<3>(state, gradOutput_);
+  DeviceTensor3 gradInput = devicetensor<3>(state, gradInput_);
+  DeviceTensor1 gradWeight = devicetensor<1>(state, gradWeight_);
+  DeviceTensor1 gradBias = devicetensor<1>(state, gradBias_);
+  DeviceTensor1 weight = devicetensor<1>(state, weight_);
+  DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_);
+  DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_);
+  DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_);
+  DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_);
+
+  dim3 grid(gradOutput.getSize(1));
+  dim3 block(getNumThreads(gradOutput.getSize(2)));
+  BatchNormalizationBackward_kernel<real, accreal, DeviceTensor1, DeviceTensor3>
+    <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+      input, gradOutput, gradInput, gradWeight, gradBias, weight, runningMean, runningVar,
+      saveMean, saveStd, train, scale, eps);
+  THCudaCheck(cudaGetLastError());
+}
+
+#undef DeviceTensor3
+#undef DeviceTensor1
+
+#endif
diff --git a/lib/THCUNN/generic/ClassNLLCriterion.cu b/lib/THCUNN/generic/ClassNLLCriterion.cu
new file mode 100644
index 0000000..62925a8
--- /dev/null
+++ b/lib/THCUNN/generic/ClassNLLCriterion.cu
@@ -0,0 +1,156 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/ClassNLLCriterion.cu"
+#else
+
+// Forward pass of ClassNLLCriterion. input must be a vector or matrix whose last dimension
+// has n_classes entries; target must be at most 1-D; weights may be NULL.
+// Raw data pointers of output and total_weight are handed to the kernels.
+void THNN_(ClassNLLCriterion_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCIndexTensor *target,
+           THCTensor *output,
+           bool sizeAverage,
+           THCTensor *weights,
+           THCTensor *total_weight) {
+  if (THCIndexTensor_(nDimension)(state, target) > 1) {
+    THError("multi-target not supported");
+  }
+
+  const int n_dims = THCTensor_(nDimension)(state, input);
+  const int n_classes = THCTensor_(size)(state, input, n_dims - 1);
+
+  // weights only takes part in the same-GPU check when it was supplied.
+  if (weights) {
+    THCUNN_assertSameGPU_generic(state, 5, input, target, weights, output, total_weight);
+  } else {
+    THCUNN_assertSameGPU_generic(state, 4, input, target, output, total_weight);
+  }
+
+  THArgCheck(n_dims <= 2, 2, "vector or matrix expected");
+  if (weights && THCTensor_(nElement)(state, weights) != n_classes) {
+    THError("weight tensor should be defined either for all or no classes");
+  }
+
+  // Contiguous tensors for raw pointer access; each is released at the end.
+  input = THCTensor_(newContiguous)(state, input);
+  weights = weights ? THCTensor_(newContiguous)(state, weights) : NULL;
+  target = THCIndexTensor_(newContiguous)(state, target);
+
+  real *input_data = THCTensor_(data)(state, input);
+  real *weights_data = weights ? THCTensor_(data)(state, weights) : NULL;
+  THCIndex_t  *target_data = THCIndexTensor_(data)(state, target);
+  real *output_data = THCTensor_(data)(state, output);
+  real *total_weight_data = THCTensor_(data)(state, total_weight);
+
+  if (n_dims == 1) {
+    // 1-D input: launched with a single thread.
+    cunn_ClassNLLCriterion_updateOutput_kernel1<real>
+      <<<1, 1, 0, THCState_getCurrentStream(state)>>>(
+        output_data,
+        total_weight_data,
+        input_data,
+        target_data,
+        weights_data,
+        sizeAverage,
+        n_classes
+    );
+  } else if (n_dims == 2) {
+    // 2-D input: one block of NTHREADS threads; both input sizes are passed along.
+    cunn_ClassNLLCriterion_updateOutput_kernel<real, accreal>
+      <<<1, NTHREADS, 0, THCState_getCurrentStream(state)>>>(
+        output_data,
+        total_weight_data,
+        input_data,
+        target_data,
+        weights_data,
+        sizeAverage,
+        THCTensor_(size)(state, input, 0),
+        THCTensor_(size)(state, input, 1),
+        n_classes
+    );
+  }
+  THCudaCheck(cudaGetLastError());
+
+  // Release the tensors obtained from newContiguous above.
+  if (weights) {
+    THCTensor_(free)(state, weights);
+  }
+  THCIndexTensor_(free)(state, target);
+  THCTensor_(free)(state, input);
+}
+
+// Backward pass of ClassNLLCriterion: hands gradInput's raw data to the backward kernel.
+// input is only inspected for its shape (vector or matrix; last dimension = n_classes).
+// gradInput must be contiguous and is not resized here; weights may be NULL.
+// target must be at most 1-D.
+void THNN_(ClassNLLCriterion_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCIndexTensor *target,
+           THCTensor *gradInput,
+           bool sizeAverage,
+           THCTensor *weights,
+           THCTensor *total_weight) {
+  if (THCIndexTensor_(nDimension)(state, target) > 1) {
+    THError("multi-target not supported");
+  }
+
+  int n_dims = THCTensor_(nDimension)(state, input);
+  int n_classes = THCTensor_(size)(state, input, n_dims - 1);
+
+  THArgCheck(THCTensor_(isContiguous)(state, gradInput), 4, "gradInput must be contiguous");
+
+  // Type-generic same-GPU check, matching ClassNLLCriterion_updateOutput.
+  if (weights) {
+    THCUNN_assertSameGPU_generic(state, 5, weights, input, target, gradInput, total_weight);
+  } else {
+    THCUNN_assertSameGPU_generic(state, 4, input, target, gradInput, total_weight);
+  }
+
+  if (THCTensor_(nDimension)(state, input) > 2) {
+    THArgCheck(0, 2, "vector or matrix expected");
+  }
+  if (weights && THCTensor_(nElement)(state, weights) != n_classes) {
+    THError("weight tensor should be defined either for all or no classes");
+  }
+
+  weights = weights ? THCTensor_(newContiguous)(state, weights) : NULL;
+  target = THCIndexTensor_(newContiguous)(state, target);
+
+  real *weights_data = weights ? THCTensor_(data)(state, weights) : NULL;
+  real *gradInput_data = THCTensor_(data)(state, gradInput);
+  THCIndex_t  *target_data = THCIndexTensor_(data)(state, target);
+  real *total_weight_data = THCTensor_(data)(state, total_weight);
+
+  if (THCTensor_(nDimension)(state, input) == 1) {
+    cunn_ClassNLLCriterion_updateGradInput_kernel1<real>
+      <<<1, 1, 0, THCState_getCurrentStream(state)>>>(
+        gradInput_data,
+        weights_data,
+        target_data,
+        total_weight_data,
+        sizeAverage,
+        n_classes
+    );
+  } else {
+    cunn_ClassNLLCriterion_updateGradInput_kernel<real>
+      <<<1, NTHREADS, 0, THCState_getCurrentStream(state)>>>(
+        gradInput_data,
+        target_data,
+        weights_data,
+        total_weight_data,
+        sizeAverage,
+        THCTensor_(size)(state, input, 0),
+        THCTensor_(size)(state, input, 1),
+        n_classes
+    );
+  }
+  THCudaCheck(cudaGetLastError());
+
+  if (weights) {
+    THCTensor_(free)(state, weights);
+  }
+  THCIndexTensor_(free)(state, target);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/DistKLDivCriterion.cu b/lib/THCUNN/generic/DistKLDivCriterion.cu
new file mode 100644
index 0000000..e06fe9e
--- /dev/null
+++ b/lib/THCUNN/generic/DistKLDivCriterion.cu
@@ -0,0 +1,67 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/DistKLDivCriterion.cu"
+#else
+
+// Computes the DistKLDiv loss: an inner_product reduction over (input, target) using
+// kl_functor and thrust::plus, optionally divided by the element count, stored in output[0].
+void THNN_(DistKLDivCriterion_updateOutput)(
+           THCState *state, THCTensor *input, THCTensor *target,
+           THCTensor *output, bool sizeAverage)
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, target);
+
+  const long numel = THCTensor_(nElement)(state, input);
+  THArgCheck(numel == THCTensor_(nElement)(state, target), 2,
+             "input and target need to have the same number of elements");
+
+  // Work on contiguous tensors; both are released again below.
+  input = THCTensor_(newContiguous)(state, input);
+  target = THCTensor_(newContiguous)(state, target);
+
+  thrust::device_ptr<real> inputBegin(THCTensor_(data)(state, input));
+  thrust::device_ptr<real> targetBegin(THCTensor_(data)(state, target));
+  accreal loss = thrust::inner_product(
+    inputBegin, inputBegin + numel, targetBegin,
+    (accreal) 0, thrust::plus<accreal>(), kl_functor<real, accreal>());
+
+  if (sizeAverage) {
+    loss /= numel;
+  }
+
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, target);
+
+  THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(loss));
+}
+
+// Fills gradInput (resized to input's shape) by transforming (input, target) pairs with
+// kl_updateGradInput_functor, constructed with 1/size when sizeAverage is set, else 1.
+void THNN_(DistKLDivCriterion_updateGradInput)(
+           THCState *state, THCTensor *input, THCTensor *target,
+           THCTensor *gradInput, bool sizeAverage)
+{
+  THCUNN_assertSameGPU_generic(state, 3, input, target, gradInput);
+
+  const long numel = THCTensor_(nElement)(state, input);
+  THArgCheck(numel == THCTensor_(nElement)(state, target), 2,
+             "input and target need to have the same number of elements");
+
+  real norm = ScalarConvert<int, real>::to(1);
+  if (sizeAverage) norm = ScalarConvert<accreal, real>::to(accreal(1)/numel);
+
+  // Work on contiguous tensors; both are released again below.
+  input = THCTensor_(newContiguous)(state, input);
+  target = THCTensor_(newContiguous)(state, target);
+  THCTensor_(resizeAs)(state, gradInput, input);
+
+  thrust::device_ptr<real> inputBegin(THCTensor_(data)(state, input));
+  thrust::device_ptr<real> targetBegin(THCTensor_(data)(state, target));
+  thrust::device_ptr<real> gradBegin(THCTensor_(data)(state, gradInput));
+  thrust::transform(inputBegin, inputBegin + numel, targetBegin, gradBegin,
+                    kl_updateGradInput_functor<real>(norm));
+
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, target);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/ELU.cu b/lib/THCUNN/generic/ELU.cu
new file mode 100644
index 0000000..291f26e
--- /dev/null
+++ b/lib/THCUNN/generic/ELU.cu
@@ -0,0 +1,53 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/ELU.cu"
+#else
+
+#include "../common.h"
+
+
+// ELU forward pass; alpha is forwarded to the functor.
+// In-place mode transforms input directly and then calls set(output, input);
+// otherwise output is resized to input's shape and computed from input.
+void THNN_(ELU_updateOutput)(
+           THCState *state, THCTensor *input, THCTensor *output,
+           real alpha, bool inplace)
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+
+  if (!inplace) {
+    // Out-of-place: input is left untouched.
+    THCTensor_(resizeAs)(state, output, input);
+    THC_pointwiseApply2(state, output, input, ELUupdateOutput_functor<real>(alpha));
+    return;
+  }
+
+  // In-place: apply the functor to input first, then set output to it.
+  THC_pointwiseApply1(state, input, ELUupdateOutputIP_functor<real>(alpha));
+  THCTensor_(set)(state, output, input);
+}
+
+
+// ELU backward pass, computed from output and gradOutput (input is not read here).
+// In-place mode rewrites gradOutput and then calls set(gradInput, gradOutput).
+void THNN_(ELU_updateGradInput)(
+           THCState *state, THCTensor *input, THCTensor *gradOutput,
+           THCTensor *gradInput, THCTensor *output,
+           real alpha, bool inplace)
+{
+  // Only output, gradOutput and gradInput take part in the same-GPU check.
+  THCUNN_assertSameGPU_generic(state, 3, output, gradOutput, gradInput);
+
+  if (!inplace) {
+    // Out-of-place: gradInput takes output's shape and is filled from (output, gradOutput).
+    THCTensor_(resizeAs)(state, gradInput, output);
+    THC_pointwiseApply3(state, gradInput, output, gradOutput, ELUupdateGradInput_functor<real>(alpha));
+    return;
+  }
+
+  // In-place: gradOutput is modified using output, then gradInput is set to it.
+  // alpha is forwarded to the functor in both modes.
+  THC_pointwiseApply2(state, gradOutput, output, ELUupdateGradInputIP_functor<real>(alpha));
+  THCTensor_(set)(state, gradInput, gradOutput);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/HardTanh.cu b/lib/THCUNN/generic/HardTanh.cu
new file mode 100644
index 0000000..8899476
--- /dev/null
+++ b/lib/THCUNN/generic/HardTanh.cu
@@ -0,0 +1,54 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/HardTanh.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(HardTanh_updateOutput)(  // HardTanh forward pass via hardtanhupdateOutput_functor
+           THCState *state,
+           THCTensor *input,         // source values
+           THCTensor *output,        // result tensor
+           real min_val,             // first bound handed to the functor
+           real max_val,             // second bound handed to the functor
+           bool inplace)             // true: `output` is set to `input` and transformed in place
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+  if(inplace)
+  {
+    THCTensor_(set)(state, output, input);
+    THC_pointwiseApply1(state, output, hardtanhupdateOutput_functor<real>(min_val, max_val));
+  }
+  else
+  {
+    THCTensor_(resizeAs)(state, output, input);
+    THC_pointwiseApply2(state, output, input,
+                               hardtanhupdateOutput_functor<real>(min_val, max_val));
+  }
+}
+
+void THNN_(HardTanh_updateGradInput)(  // HardTanh backward pass via hardtanhupdateGradInput_functor
+           THCState *state,
+           THCTensor *input,            // forward input, read by the gradient functor
+           THCTensor *gradOutput,       // incoming gradient
+           THCTensor *gradInput,        // resulting gradient
+           real min_val,                // first bound handed to the functor
+           real max_val,                // second bound handed to the functor
+           bool inplace)                // true: `gradInput` is set to `gradOutput` and updated in place
+{
+  THCUNN_assertSameGPU_generic(state, 3, input, gradOutput, gradInput);
+
+  if (inplace)
+  {
+    THCTensor_(set)(state, gradInput, gradOutput);
+    THC_pointwiseApply2(state, gradInput, input,
+                                 hardtanhupdateGradInput_functor<real>(min_val, max_val));
+  }
+  else
+  {
+    THCTensor_(resizeAs)(state, gradInput, input);
+    THC_pointwiseApply3(state, gradInput, input, gradOutput,
+                                 hardtanhupdateGradInput_functor<real>(min_val, max_val));
+  }
+}
+
+#endif
diff --git a/lib/THCUNN/generic/L1Cost.cu b/lib/THCUNN/generic/L1Cost.cu
new file mode 100644
index 0000000..de15189
--- /dev/null
+++ b/lib/THCUNN/generic/L1Cost.cu
@@ -0,0 +1,42 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/L1Cost.cu"
+#else
+
+void THNN_(L1Cost_updateOutput)(  // reduces `input` through l1cost_functor into one scalar
+           THCState *state,
+           THCTensor *input,       // values to reduce
+           THCTensor *output)      // element 0 receives the reduced sum
+{
+  THCUNN_assertSameGPU_generic(state, 1, input);
+  accreal sum;
+  long size = THCTensor_(nElement)(state, input);
+  input = THCTensor_(newContiguous)(state, input);  // contiguous handle, released below
+  thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+  sum = thrust::transform_reduce(input_data, input_data+size, l1cost_functor<real, accreal>(), accreal(0), thrust::plus<accreal>());
+  // the reduction above accumulates in accreal, starting from accreal(0)
+  THCTensor_(free)(state, input);
+  // convert the accumulator back to `real` when storing it
+  THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
+}
+
+void THNN_(L1Cost_updateGradInput)(  // maps `input` through l1cost_updateGradInput_functor
+           THCState *state,
+           THCTensor *input,          // forward input
+           THCTensor *gradOutput,     // not referenced in this body
+           THCTensor *gradInput)      // resized like `input`, receives the transformed values
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, gradInput);
+  long size = THCTensor_(nElement)(state, input);
+
+  input = THCTensor_(newContiguous)(state, input);  // contiguous handle, released below
+  THCTensor_(resizeAs)(state, gradInput, input);
+  // raw device pointers: gradInput is written linearly over `size` elements
+  thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+  thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
+
+  thrust::transform(input_data, input_data+size, gradInput_data, l1cost_updateGradInput_functor<real>());
+
+  THCTensor_(free)(state, input);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/LeakyReLU.cu b/lib/THCUNN/generic/LeakyReLU.cu
new file mode 100644
index 0000000..20fb778
--- /dev/null
+++ b/lib/THCUNN/generic/LeakyReLU.cu
@@ -0,0 +1,54 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/LeakyReLU.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(LeakyReLU_updateOutput)(  // LeakyReLU forward pass, implemented with pointwise functors
+           THCState *state,
+           THCTensor *input,          // source values; transformed in place when `inplace` is true
+           THCTensor *output,         // result tensor
+           real negval,               // forwarded to the LeakyReLU functors
+           bool inplace)              // true: transform `input` itself, then set `output` to it
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+
+  if (inplace)
+  {
+    THC_pointwiseApply1(state, input, LeakyReLUUpdateOutputIP<real>(negval));
+    THCTensor_(set)(state, output, input);
+  }
+  else
+  {
+    THCTensor_(resizeAs)(state, output, input);
+    THC_pointwiseApply2(state, output, input, LeakyReLUUpdateOutput<real>(negval));
+  }
+  // report any CUDA error raised by the work issued above
+  THCudaCheck(cudaGetLastError());
+}
+
+void THNN_(LeakyReLU_updateGradInput)(  // LeakyReLU backward pass, implemented with pointwise functors
+           THCState *state,
+           THCTensor *input,             // forward input, read by the gradient functors
+           THCTensor *gradOutput,        // incoming gradient; transformed in place when `inplace` is true
+           THCTensor *gradInput,         // resulting gradient
+           real negval,                  // forwarded to the LeakyReLU gradient functors
+           bool inplace)                 // true: transform `gradOutput` itself, then set `gradInput` to it
+{
+  THCUNN_assertSameGPU_generic(state, 3, input, gradInput, gradOutput);
+
+  if (inplace)
+  {
+    THC_pointwiseApply2(state, gradOutput, input, LeakyReLUUpdateGradInputIP<real>(negval));
+    THCTensor_(set)(state, gradInput, gradOutput);
+  }
+  else
+  {
+    THCTensor_(resizeAs)(state, gradInput, input);
+    THC_pointwiseApply3(state, gradInput, input, gradOutput, LeakyReLUUpdateGradInput<real>(negval));
+  }
+  // report any CUDA error raised by the work issued above
+  THCudaCheck(cudaGetLastError());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/LogSigmoid.cu b/lib/THCUNN/generic/LogSigmoid.cu
new file mode 100644
index 0000000..4a6a4c9
--- /dev/null
+++ b/lib/THCUNN/generic/LogSigmoid.cu
@@ -0,0 +1,30 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/LogSigmoid.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(LogSigmoid_updateOutput)(  // LogSigmoid forward pass via a pointwise functor
+           THCState *state,
+           THCTensor *input,           // source values
+           THCTensor *output,          // resized like `input`, receives the result
+           THCTensor *buffer)          // not referenced in this body
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+  THCTensor_(resizeAs)(state, output, input);
+  THC_pointwiseApply2(state, output, input, logSigmoid_updateOutput_functor<real>());
+}
+
+void THNN_(LogSigmoid_updateGradInput)(  // LogSigmoid backward pass via a pointwise functor
+           THCState *state,
+           THCTensor *input,              // forward input
+           THCTensor *gradOutput,         // incoming gradient
+           THCTensor *gradInput,          // resized like `input`, receives the result
+           THCTensor *buffer)             // not referenced in this body
+{
+  THCUNN_assertSameGPU_generic(state, 3, input, gradOutput, gradInput);
+  THCTensor_(resizeAs)(state, gradInput, input);
+  THC_pointwiseApply3(state, gradInput, input, gradOutput, logSigmoid_updateGradInput_functor<real>());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/LogSoftMax.cu b/lib/THCUNN/generic/LogSoftMax.cu
new file mode 100644
index 0000000..5755ea2
--- /dev/null
+++ b/lib/THCUNN/generic/LogSoftMax.cu
@@ -0,0 +1,238 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/LogSoftMax.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(LogSoftMax_updateOutput)(  // LogSoftMax forward for 1D/2D (plain) and 3D/4D (spatial) input
+          THCState *state,
+          THCTensor *input,            // source values; 1 to 4 dimensions accepted
+          THCTensor *output)           // resized like `input`, written by the kernel
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+
+  THCTensor_(resizeAs)(state, output, input);
+
+  bool spatial  = false;  // set for the 3D and 4D cases
+  int batchSize = 1;      // stays 1 when there is no batch dimension (1D, 3D)
+  int classSize = 0;
+  int height = 0;
+  int width = 0;
+
+  int ndims = THCTensor_(nDimension)(state, input);
+  // every accepted branch below rebinds `input` to a new contiguous tensor, freed at the end
+  if (ndims == 1)
+  {
+    classSize = THCTensor_(size)(state, input, 0);
+    input = THCTensor_(newContiguous)(state, input);
+  }
+  else if (ndims == 2)
+  {
+    batchSize = THCTensor_(size)(state, input, 0);
+    classSize = THCTensor_(size)(state, input, 1);
+    input = THCTensor_(newContiguous)(state, input);
+  }
+  else if (ndims == 3)
+  {
+    spatial = true;
+    classSize = THCTensor_(size)(state, input, 0);
+    height = THCTensor_(size)(state, input, 1);
+    width = THCTensor_(size)(state, input, 2);
+
+    // create contiguous tensor with cuda layout from tensor with torch layout
+    // C x H x W -> W x H x C
+    THCTensor_(transpose)(state, input, input, 0, 2);
+    // W x H x C -> H x W x C
+    THCTensor_(transpose)(state, input, input, 0, 1);
+    THCTensor *transposedInput = THCTensor_(newContiguous)(state, input);
+    THCTensor_(transpose)(state, input, input, 0, 1);  // undo both transposes on the caller's tensor
+    THCTensor_(transpose)(state, input, input, 0, 2);
+    input = transposedInput;
+  }
+  else if (ndims == 4)
+  {
+    spatial = true;
+    batchSize = THCTensor_(size)(state, input, 0);
+    classSize = THCTensor_(size)(state, input, 1);
+    height = THCTensor_(size)(state, input, 2);
+    width = THCTensor_(size)(state, input, 3);
+
+    // create contiguous tensor with cuda layout from tensor with torch layout
+    // B x C x H x W -> B x W x H x C
+    THCTensor_(transpose)(state, input, input, 1, 3);
+    // B x W x H x C -> B x H x W x C
+    THCTensor_(transpose)(state, input, input, 1, 2);
+    THCTensor *transposedInput = THCTensor_(newContiguous)(state, input);
+    THCTensor_(transpose)(state, input, input, 1, 2);  // undo both transposes on the caller's tensor
+    THCTensor_(transpose)(state, input, input, 1, 3);
+    input = transposedInput;
+  }
+  else
+  {
+    THError("1D, 2D, 3D or 4D Tensor expected");
+  }
+
+  if (!spatial)
+  {
+    dim3 grid(batchSize);  // one block per batch row
+    dim3 block(1024);
+
+    cunn_LogSoftMax_updateOutput_kernel<2, real, accreal>
+      <<<grid, block, block.x * sizeof(accreal), THCState_getCurrentStream(state)>>>(
+        THCTensor_(data)(state, output),
+        THCTensor_(data)(state, input),
+        classSize
+    );
+  }
+  else
+  {
+    dim3 grid(batchSize);  // one block per batch entry
+    dim3 block(1024);
+
+    cunn_SpatialLogSoftMax_updateOutput_kernel<real, accreal>
+      <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+        THCTensor_(data)(state, output),
+        THCTensor_(data)(state, input),
+        classSize, height, width
+    );
+  }
+  // surface a failed launch as a Torch error
+  cudaError errcode = cudaGetLastError();
+  if (errcode != cudaSuccess)
+  {
+    THError(cudaGetErrorString(errcode));
+  }
+  // release the contiguous tensor created in the branch above
+  THCTensor_(free)(state, input);
+}
+
+void THNN_(LogSoftMax_updateGradInput)(  // LogSoftMax backward for 1D/2D (plain) and 3D/4D (spatial) input
+           THCState *state,
+           THCTensor *input,              // only its dimension count and sizes are read
+           THCTensor *gradOutput,         // incoming gradient
+           THCTensor *gradInput,          // resized like `output`, written by the kernel
+           THCTensor *output)             // forward result
+{
+  THCUNN_assertSameGPU_generic(state, 3, output, gradOutput, gradInput);
+
+  THCTensor_(resizeAs)(state, gradInput, output);
+
+  bool spatial  = false;  // set for the 3D and 4D cases
+  int batchSize = 1;      // stays 1 when there is no batch dimension (1D, 3D)
+  int classSize = 0;
+  int height = 0;
+  int width = 0;
+
+  int ndims = THCTensor_(nDimension)(state, input);
+  // every accepted branch rebinds `output` and `gradOutput` to new contiguous tensors, freed at the end
+  if (ndims == 1)
+  {
+    classSize = THCTensor_(size)(state, gradInput, 0);
+    output = THCTensor_(newContiguous)(state, output);
+    gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+  }
+  else if (ndims == 2)
+  {
+    batchSize = THCTensor_(size)(state, gradInput, 0);
+    classSize = THCTensor_(size)(state, gradInput, 1);
+    output = THCTensor_(newContiguous)(state, output);
+    gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+  }
+  else if (ndims == 3)
+  {
+    spatial = true;
+    classSize = THCTensor_(size)(state, input, 0);
+    height = THCTensor_(size)(state, input, 1);
+    width = THCTensor_(size)(state, input, 2);
+
+    // create contiguous tensor with cuda layout from tensor with torch layout
+    // C x H x W -> W x H x C
+    THCTensor_(transpose)(state, output, output, 0, 2);
+    // W x H x C -> H x W x C
+    THCTensor_(transpose)(state, output, output, 0, 1);
+    THCTensor *transposedOutput = THCTensor_(newContiguous)(state, output);
+    THCTensor_(transpose)(state, output, output, 0, 1);  // undo both transposes on the caller's tensor
+    THCTensor_(transpose)(state, output, output, 0, 2);
+    output = transposedOutput;
+
+    // create contiguous tensor with cuda layout from tensor with torch layout
+    // C x H x W -> W x H x C
+    THCTensor_(transpose)(state, gradOutput, gradOutput, 0, 2);
+    // W x H x C -> H x W x C
+    THCTensor_(transpose)(state, gradOutput, gradOutput, 0, 1);
+    THCTensor *transposedGradOutput = THCTensor_(newContiguous)(state, gradOutput);
+    THCTensor_(transpose)(state, gradOutput, gradOutput, 0, 1);  // undo both transposes
+    THCTensor_(transpose)(state, gradOutput, gradOutput, 0, 2);
+    gradOutput = transposedGradOutput;
+  }
+  else if (ndims == 4)
+  {
+    spatial = true;
+    batchSize = THCTensor_(size)(state, gradInput, 0);
+    classSize = THCTensor_(size)(state, input, 1);
+    height = THCTensor_(size)(state, input, 2);
+    width = THCTensor_(size)(state, input, 3);
+
+    // create contiguous tensor with cuda layout from tensor with torch layout
+    // B x C x H x W -> B x W x H x C
+    THCTensor_(transpose)(state, output, output, 1, 3);
+    // B x W x H x C -> B x H x W x C
+    THCTensor_(transpose)(state, output, output, 1, 2);
+    THCTensor *transposedOutput = THCTensor_(newContiguous)(state, output);
+    THCTensor_(transpose)(state, output, output, 1, 2);  // undo both transposes on the caller's tensor
+    THCTensor_(transpose)(state, output, output, 1, 3);
+    output = transposedOutput;
+
+    // create contiguous tensor with cuda layout from tensor with torch layout
+    // B x C x H x W -> B x W x H x C
+    THCTensor_(transpose)(state, gradOutput, gradOutput, 1, 3);
+    // B x W x H x C -> B x H x W x C
+    THCTensor_(transpose)(state, gradOutput, gradOutput, 1, 2);
+    THCTensor *transposedGradOutput = THCTensor_(newContiguous)(state, gradOutput);
+    THCTensor_(transpose)(state, gradOutput, gradOutput, 1, 2);  // undo both transposes
+    THCTensor_(transpose)(state, gradOutput, gradOutput, 1, 3);
+    gradOutput = transposedGradOutput;
+  }
+  else
+  {
+    THError("1D, 2D, 3D or 4D Tensor expected");
+  }
+
+  if (!spatial)
+  {
+    dim3 grid(batchSize);  // one block per batch row
+    dim3 block(1024);
+
+    cunn_LogSoftMax_updateGradInput_kernel<2, real, accreal>
+      <<<grid, block, block.x * sizeof(accreal), THCState_getCurrentStream(state)>>>(
+        THCTensor_(data)(state, gradInput),
+        THCTensor_(data)(state, output),
+        THCTensor_(data)(state, gradOutput),
+        classSize
+    );
+  }
+  else
+  {
+    dim3 grid(batchSize);  // one block per batch entry
+    dim3 block(1024);
+
+    cunn_SpatialLogSoftMax_updateGradInput_kernel<real, accreal>
+      <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+        THCTensor_(data)(state, gradInput),
+        THCTensor_(data)(state, output),
+        THCTensor_(data)(state, gradOutput),
+        classSize, height, width
+    );
+  }
+  // surface a failed launch as a Torch error
+  cudaError errcode = cudaGetLastError();
+  if (errcode != cudaSuccess)
+  {
+    THError(cudaGetErrorString(errcode));
+  }
+  // release the contiguous tensors created in the branch above
+  THCTensor_(free)(state, gradOutput);
+  THCTensor_(free)(state, output);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/LookupTable.cu b/lib/THCUNN/generic/LookupTable.cu
new file mode 100644
index 0000000..2027425
--- /dev/null
+++ b/lib/THCUNN/generic/LookupTable.cu
@@ -0,0 +1,157 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/LookupTable.cu"
+#else
+
+void THNN_(LookupTable_accGradParameters)(  // accumulates gradOutput rows into gradWeight
+           THCState *state,
+           THCIndexTensor *input,            // indices; must be contiguous, 1D or 2D
+           THCTensor *gradOutput,            // must be contiguous
+           THCTensor *gradWeight,            // must be contiguous; updated by the kernels
+           THCIndexTensor *count,            // scratch, only touched when scaleGradByFreq is true
+           THCIndexTensor *sorted,           // scratch: receives the sorted indices
+           THCIndexTensor *indices,          // scratch: receives the sort permutation
+           bool scaleGradByFreq,             // true: per-index counts are computed and passed on
+           int paddingValue,                 // forwarded to the kernels
+           real scale)                       // forwarded to the kernels
+{
+  THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, gradWeight, sorted, indices);
+  if (!(THCIndexTensor_(isContiguous)(state, input) &&
+        THCTensor_(isContiguous)(state, gradOutput) &&
+        THCTensor_(isContiguous)(state, gradWeight)))
+  {
+    THError("Tensors must be contiguous");
+  }
+
+  int nDim = THCIndexTensor_(nDimension)(state, input);
+  if (nDim != 1 && nDim != 2)
+    THError("input must be a vector or matrix");
+
+  long numel = THCIndexTensor_(nElement)(state, input);
+  long stride = gradWeight->stride[0];
+
+  cudaStream_t stream = THCState_getCurrentStream(state);
+  // small inputs without frequency scaling take the by-feature kernel and skip the sort
+  if (numel <= 768 && !scaleGradByFreq) {
+    cunn_LookupTable_accGradParametersKernelByFeature<<<DIVUP(stride,4), 128, 0, stream>>>(
+      THCIndexTensor_(data)(state, input),
+      THCTensor_(data)(state, gradOutput),
+      THCTensor_(data)(state, gradWeight),
+      scale,
+      numel,
+      stride,
+      paddingValue);
+    THCudaCheck(cudaGetLastError());
+    return;
+  }
+  // size both scratch tensors like `input`
+  THLongStorage *inputSize = THCIndexTensor_(newSizeOf)(state, input);
+  THCIndexTensor_(resize)(state, sorted, inputSize, NULL);
+  THCIndexTensor_(resize)(state, indices, inputSize, NULL);
+  THLongStorage_free(inputSize);
+
+  // Sort the inputs into sorted with the corresponding indices
+  THCIndexTensor_(sort)(state, sorted, indices, input, 0, 0);
+
+  THCIndex_t *sorted_data = THCIndexTensor_(data)(state, sorted);
+  THCIndex_t  *indices_data = THCIndexTensor_(data)(state, indices);
+  THCIndex_t *count_data = NULL;  // stays NULL unless scaleGradByFreq is set
+
+  if (scaleGradByFreq)
+  {
+    THCIndexTensor_(resizeAs)(state, count, input);
+    count_data = THCIndexTensor_(data)(state, count);
+
+    thrust::device_ptr<THCIndex_t> sorted_ptr(sorted_data);
+    thrust::device_ptr<THCIndex_t> count_ptr(count_data);
+
+    // Compute an increasing sequence per unique item in sorted:
+    // sorted: 2 5 5 5 7 7 8 9 9
+    //  count: 1 1 2 3 1 2 1 1 2
+    thrust::inclusive_scan_by_key(
+#if CUDA_VERSION >= 7000
+      thrust::cuda::par.on(THCState_getCurrentStream(state)),
+#endif
+      sorted_ptr,
+      sorted_ptr + numel,
+      thrust::make_constant_iterator(1),
+      count_ptr
+    );
+
+    // Take the maximum of each count per unique key in reverse:
+    // sorted: 2 5 5 5 7 7 8 9 9
+    //  count: 1 3 3 3 2 2 1 2 2
+    thrust::inclusive_scan_by_key(
+#if CUDA_VERSION >= 7000
+      thrust::cuda::par.on(THCState_getCurrentStream(state)),
+#endif
+      thrust::make_reverse_iterator(sorted_ptr + numel),
+      thrust::make_reverse_iterator(sorted_ptr),
+      thrust::make_reverse_iterator(count_ptr + numel),
+      thrust::make_reverse_iterator(count_ptr + numel),
+      thrust::equal_to<long>(),
+      thrust::maximum<long>()
+    );
+  }
+  // 2D grid: DIVUP(numel,4) by DIVUP(stride,128) blocks of 32x4 threads
+  dim3 grid(DIVUP(numel,4), DIVUP(stride,128));
+  dim3 block(32, 4);
+  cunn_LookupTable_accGradParametersKernel<real, accreal><<<grid, block, 0, stream>>>(
+    sorted_data,
+    indices_data,
+    THCTensor_(data)(state, gradOutput),
+    THCTensor_(data)(state, gradWeight),
+    count_data,
+    scale,
+    numel,
+    stride,
+    paddingValue
+  );
+  THCudaCheck(cudaGetLastError());
+}
+
+void THNN_(LookupTable_renorm)(  // rescales the weight rows named in `idx` whose norm exceeds maxNorm
+           THCState *state,
+           THCIndexTensor *idx,   // contiguous 1D row indices; deduplicated in place by thrust::unique
+           THCTensor *weight,     // contiguous weights; rows are rescaled in place
+           real maxNorm,          // rows with a larger norm are scaled by maxNorm / (norm + 1e-7)
+           real normType)         // exponent p of the norm; must be positive
+{
+  THCUNN_assertSameGPU_generic(state, 2, idx, weight);
+  if (!(THCIndexTensor_(isContiguous)(state, idx) &&
+        THCTensor_(isContiguous)(state, weight)))
+  {
+    THError("Tensors must be contiguous");
+  }
+  if (THCIndexTensor_(nDimension)(state, idx) != 1)
+    THError("idx must be a vector");
+  if (normType <= 0)
+    THError("non-positive-norm not supported");
+
+  THCIndex_t numel = THCIndexTensor_(nElement)(state, idx);
+  long stride = weight->stride[0];
+
+  // get the unique indices
+  thrust::device_ptr<real> weight_ptr(THCTensor_(data)(state, weight));
+  thrust::device_ptr<THCIndex_t> idx_ptr(THCIndexTensor_(data)(state, idx));
+  thrust::device_ptr<THCIndex_t> end_ptr = thrust::unique(idx_ptr, idx_ptr+numel);
+  numel = end_ptr - idx_ptr;
+
+  pow_v<real, accreal> unary_pow(normType);
+  thrust::plus<accreal> binary_plus;
+  // numel << stride, since idx usually contains sparse row indices
+  for (long i = 0; i < numel; i++)
+  {
+    THCIndex_t k = idx_ptr[i] - TH_INDEX_BASE;
+    thrust::device_ptr<real> row_ptr = weight_ptr + k * stride;
+    accreal norm = thrust::transform_reduce(row_ptr, row_ptr + stride,
+      unary_pow, (accreal) 0, binary_plus);  // init must be accreal: a bare 0 makes the reduction run in int
+    norm = std::pow(norm, (accreal) (1.0 / normType));
+    if (norm > ScalarConvert<real, accreal>::to(maxNorm))
+    {
+      multiply_s<real> unary_mul(ScalarConvert<accreal, real>::to(maxNorm / (norm + 1e-7)));
+      thrust::transform(row_ptr, row_ptr + stride, row_ptr, unary_mul);
+    }
+  }
+}
+
+#endif
diff --git a/lib/THCUNN/generic/MSECriterion.cu b/lib/THCUNN/generic/MSECriterion.cu
new file mode 100644
index 0000000..f4a38de
--- /dev/null
+++ b/lib/THCUNN/generic/MSECriterion.cu
@@ -0,0 +1,75 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/MSECriterion.cu"
+#else
+
+void THNN_(MSECriterion_updateOutput)(  // reduces input/target pairs through mse_functor into one scalar
+           THCState *state,
+           THCTensor *input,             // predictions
+           THCTensor *target,            // must have as many elements as `input`
+           THCTensor *output,            // element 0 receives the loss
+           bool sizeAverage)             // true: divide the sum by the element count
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, target);
+  THArgCheck(THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2,
+    "input and target need to have the same number of elements"
+  );
+
+  long size = THCTensor_(nElement)(state, input);
+  // contiguous handles so both tensors can be walked linearly; released below
+  input = THCTensor_(newContiguous)(state, input);
+  target = THCTensor_(newContiguous)(state, target);
+
+  thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+  thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
+  accreal sum = thrust::inner_product(
+#if CUDA_VERSION >= 7000
+    thrust::cuda::par.on(THCState_getCurrentStream(state)),
+#endif
+    input_data, input_data+size, target_data, (accreal) 0,
+    thrust::plus<accreal>(), mse_functor<real, accreal>());
+
+  if (sizeAverage)
+    sum /= size;
+
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, target);
+  // convert the accumulator back to `real` when storing it
+  THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
+}
+
+void THNN_(MSECriterion_updateGradInput)(  // maps input/target pairs through mse_updateGradInput_functor
+           THCState *state,
+           THCTensor *input,                // predictions
+           THCTensor *target,               // must have as many elements as `input`
+           THCTensor *gradInput,            // resized like `input`, receives the result
+           bool sizeAverage)                // true: the factor 2 is divided by the element count
+{
+  THCUNN_assertSameGPU_generic(state, 3, input, target, gradInput);
+  THArgCheck(THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2,
+    "input and target need to have the same number of elements"
+  );
+
+  long size = THCTensor_(nElement)(state, input);
+  accreal norm = sizeAverage ? (accreal)(2)/size : (accreal)(2);
+  // contiguous handles so both tensors can be walked linearly; released below
+  input = THCTensor_(newContiguous)(state, input);
+  target = THCTensor_(newContiguous)(state, target);
+
+  THCTensor_(resizeAs)(state, gradInput, input);
+
+  thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+  thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
+  thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
+
+  thrust::transform(
+#if CUDA_VERSION >= 7000
+    thrust::cuda::par.on(THCState_getCurrentStream(state)),
+#endif
+    input_data, input_data+size, target_data, gradInput_data,
+    mse_updateGradInput_functor<real, accreal>(norm));
+
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, target);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/MarginCriterion.cu b/lib/THCUNN/generic/MarginCriterion.cu
new file mode 100644
index 0000000..4b7e85f
--- /dev/null
+++ b/lib/THCUNN/generic/MarginCriterion.cu
@@ -0,0 +1,64 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/MarginCriterion.cu"
+#else
+
+void THNN_(MarginCriterion_updateOutput)(  // reduces input/target pairs through margin_functor into one scalar
+           THCState *state,
+           THCTensor *input,                // predictions
+           THCTensor *target,               // read pairwise with `input`
+           THCTensor *output,               // element 0 receives the loss
+           bool sizeAverage,                // true: divide the sum by the element count
+           real margin)                     // converted to accreal and handed to margin_functor
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, target);
+
+  long size = THCTensor_(nElement)(state, input);
+  // contiguous handles so both tensors can be walked linearly; released below
+  input = THCTensor_(newContiguous)(state, input);
+  target = THCTensor_(newContiguous)(state, target);
+
+  thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+  thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
+  accreal sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal) 0, thrust::plus<accreal>(),
+      margin_functor<real, accreal>(ScalarConvert<real, accreal>::to(margin)));
+
+  if (sizeAverage)
+    sum /= size;
+
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, target);
+  // convert the accumulator back to `real` when storing it
+  THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
+}
+
+
+void THNN_(MarginCriterion_updateGradInput)(  // maps input/target pairs through margin_updateGradInput_functor
+           THCState *state,
+           THCTensor *input,                   // predictions
+           THCTensor *target,                  // read pairwise with `input`
+           THCTensor *gradInput,               // resized like `input`, receives the result
+           bool sizeAverage,                   // true: the scale factor is 1 / element count
+           real margin)                        // converted to accreal and handed to the functor
+{
+  THCUNN_assertSameGPU_generic(state, 3, input, target, gradInput);
+
+  long size = THCTensor_(nElement)(state, input);
+  accreal norm = sizeAverage ? (accreal)(1)/size : (accreal)(1);  // divide in accreal, not float
+  // contiguous handles so both tensors can be walked linearly; released below
+  input = THCTensor_(newContiguous)(state, input);
+  target = THCTensor_(newContiguous)(state, target);
+
+  THCTensor_(resizeAs)(state, gradInput, input);
+
+  thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+  thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
+  thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
+
+  thrust::transform(input_data, input_data+size, target_data, gradInput_data,
+      margin_updateGradInput_functor<real, accreal>(ScalarConvert<real, accreal>::to(margin), norm));
+
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, target);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/MultiLabelMarginCriterion.cu b/lib/THCUNN/generic/MultiLabelMarginCriterion.cu
new file mode 100644
index 0000000..c9a56c4
--- /dev/null
+++ b/lib/THCUNN/generic/MultiLabelMarginCriterion.cu
@@ -0,0 +1,111 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/MultiLabelMarginCriterion.cu"
+#else
+
+void THNN_(MultiLabelMarginCriterion_updateOutput)(  // multi-label margin loss for a vector or a matrix of rows
+           THCState *state,
+           THCTensor *input,       // 1D (single sample) or 2D (one sample per row)
+           THCIndexTensor *target, // target indices, handed to the kernel
+           THCTensor *output,      // resized to one element, receives the loss
+           THCTensor *istarget,    // its contiguous handle is resized like `input` and handed to the kernel
+           bool sizeaverage)       // forwarded to the kernel
+{
+  input = THCTensor_(newContiguous)(state, input);
+  target = THCIndexTensor_(newContiguous)(state, target);
+  istarget = THCTensor_(newContiguous)(state, istarget);
+  THCTensor_(resizeAs)(state, istarget, input);
+
+  if(input->nDimension == 1)
+  {
+    THCTensor_(resize1d)(state, output, 1);
+
+    dim3 blocks(1);
+    dim3 threads(MULTILABELMARGIN_THREADS);
+    // launch on THCState's current stream so the kernel is ordered with the other THC work
+    cunn_MultiLabelMarginCriterion_updateOutput_kernel<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(
+        THCTensor_(data)(state, output),
+        THCTensor_(data)(state, input),
+        THCIndexTensor_(data)(state, target),
+        THCTensor_(data)(state, istarget),
+        1, input->size[0],
+        sizeaverage
+        );
+    THCudaCheck(cudaGetLastError());
+  }
+  else if(input->nDimension == 2)
+  {
+    THCTensor *output_tmp = THCTensor_(newWithSize1d)(state, input->size[0]);  // one partial loss per row
+
+    dim3 blocks(input->size[0]);
+    dim3 threads(MULTILABELMARGIN_THREADS);
+    // launch on THCState's current stream so the kernel is ordered with the sumall below
+    cunn_MultiLabelMarginCriterion_updateOutput_kernel<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(
+        THCTensor_(data)(state, output_tmp),
+        THCTensor_(data)(state, input),
+        THCIndexTensor_(data)(state, target),
+        THCTensor_(data)(state, istarget),
+        input->size[0], input->size[1],
+        sizeaverage
+        );
+    THCudaCheck(cudaGetLastError());
+    THCTensor_(resize1d)(state, output, 1);
+    THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(THCTensor_(sumall)(state, output_tmp)));
+    THCTensor_(free)(state, output_tmp);
+  }
+  else
+    THError("vector or matrix expected");
+  // release the contiguous handles taken at the top
+  THCTensor_(free)(state, input);
+  THCIndexTensor_(free)(state, target);
+  THCTensor_(free)(state, istarget);
+}
+
+void THNN_(MultiLabelMarginCriterion_updateGradInput)(  // multi-label margin gradient for a vector or a matrix
+            THCState *state,
+            THCTensor *input,        // forward input
+            THCIndexTensor *target,  // target indices, handed to the kernel
+            THCTensor *gradInput,    // resized like `input`, written by the kernel
+            THCTensor *istarget,     // handed to the kernel through a contiguous handle
+            bool sizeaverage)        // forwarded to the kernel
+{
+  input = THCTensor_(newContiguous)(state, input);
+  target = THCIndexTensor_(newContiguous)(state, target);
+  istarget = THCTensor_(newContiguous)(state, istarget);
+  THCTensor_(resizeAs)(state, gradInput, input);
+
+  if(gradInput->nDimension == 1)
+  {
+    dim3 blocks(1);
+    dim3 threads(MULTILABELMARGIN_THREADS);
+    // launch on THCState's current stream rather than the default stream
+    cunn_MultiLabelMarginCriterion_updateGradInput_kernel<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(THCTensor_(data)(state, gradInput),
+        THCTensor_(data)(state, input),
+        THCIndexTensor_(data)(state, target),
+        THCTensor_(data)(state, istarget),
+        1, gradInput->size[0],
+        sizeaverage);
+
+  }
+  else if(gradInput->nDimension == 2)
+  {
+    dim3 blocks(gradInput->size[0]);  // one block per row
+    dim3 threads(MULTILABELMARGIN_THREADS);
+    // launch on THCState's current stream rather than the default stream
+    cunn_MultiLabelMarginCriterion_updateGradInput_kernel<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(THCTensor_(data)(state, gradInput),
+        THCTensor_(data)(state, input),
+        THCIndexTensor_(data)(state, target),
+        THCTensor_(data)(state, istarget),
+        gradInput->size[0], gradInput->size[1],
+        sizeaverage);
+  }
+  else
+    THError("vector or matrix expected");
+
+  THCudaCheck(cudaGetLastError());
+  // release the contiguous handles taken at the top
+  THCTensor_(free)(state, input);
+  THCIndexTensor_(free)(state, target);
+  THCTensor_(free)(state, istarget);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/MultiMarginCriterion.cu b/lib/THCUNN/generic/MultiMarginCriterion.cu
new file mode 100644
index 0000000..303c9a9
--- /dev/null
+++ b/lib/THCUNN/generic/MultiMarginCriterion.cu
@@ -0,0 +1,181 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/MultiMarginCriterion.cu"
+#else
+
+void THNN_(MultiMarginCriterion_updateOutput)(  // multi-class margin loss for a vector or a matrix of rows
+           THCState *state,
+           THCTensor *input,        // 1D (single sample) or 2D (one sample per row)
+           THCIndexTensor *target,  // target indices, handed to the kernel
+           THCTensor *output,       // receives the loss (written by the kernel in 1D, via set1d in 2D)
+           bool sizeAverage,        // forwarded to the kernel
+           int p,                   // kernel variant: 1 or 2; any other value launches nothing
+           THCTensor *weights,      // optional; NULL is passed to the kernel when absent
+           real margin)             // forwarded to the kernel
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, target);
+  input = THCTensor_(newContiguous)(state, input);
+  if(weights)
+    weights = THCTensor_(newContiguous)(state, weights);
+  if (input->nDimension == 1)
+  {
+    dim3 blocks(1);
+    dim3 threads(MULTIMARGIN_THREADS);
+    if (p == 1)
+    {
+      cunn_MultiMarginCriterion_updateOutput_kernel<1, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
+        THCTensor_(data)(state, output),
+        THCTensor_(data)(state, input),
+        THCIndexTensor_(data)(state, target),
+        weights ? THCTensor_(data)(state, weights) : NULL,
+        1, input->size[0],
+        sizeAverage,
+        margin
+      );
+    }
+    else if (p == 2)
+    {
+      cunn_MultiMarginCriterion_updateOutput_kernel<2, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
+        THCTensor_(data)(state, output),
+        THCTensor_(data)(state, input),
+        THCIndexTensor_(data)(state, target),
+        weights ? THCTensor_(data)(state, weights) : NULL,
+        1, input->size[0],
+        sizeAverage,
+        margin
+      );
+    }
+    THCudaCheck(cudaGetLastError());
+  }
+  else if (input->nDimension == 2)
+  {
+    THCTensor *output_ = THCTensor_(newWithSize1d)(state, input->size[0]);  // tmp output buffer
+    dim3 blocks(input->size[0]);
+    dim3 threads(MULTIMARGIN_THREADS);
+    if (p == 1)
+    {
+      cunn_MultiMarginCriterion_updateOutput_kernel<1, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
+        THCTensor_(data)(state, output_),
+        THCTensor_(data)(state, input),
+        THCIndexTensor_(data)(state, target),
+        weights ? THCTensor_(data)(state, weights) : NULL,
+        input->size[0], input->size[1],
+        sizeAverage,
+        margin
+      );
+    }
+    else if (p == 2)
+    {
+      cunn_MultiMarginCriterion_updateOutput_kernel<2, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
+        THCTensor_(data)(state, output_),
+        THCTensor_(data)(state, input),
+        THCIndexTensor_(data)(state, target),
+        weights ? THCTensor_(data)(state, weights) : NULL,
+        input->size[0], input->size[1],
+        sizeAverage,
+        margin
+      );
+    }
+    THCudaCheck(cudaGetLastError());
+    accreal sum = THCTensor_(sumall)(state, output_);  // keep accreal: float would narrow double results
+    THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
+    THCTensor_(free)(state, output_);
+  }
+  else
+  {
+    THError("vector or matrix expected");
+  }
+
+  THCTensor_(free)(state, input);
+  if(weights)
+    THCTensor_(free)(state, weights);
+}
+
+void THNN_(MultiMarginCriterion_updateGradInput)(  // multi-class margin gradient for a vector or a matrix
+           THCState *state,
+           THCTensor *input,        // 1D (single sample) or 2D (one sample per row)
+           THCIndexTensor *target,  // target indices, handed to the kernel
+           THCTensor *gradInput,    // resized like `input`, written by the kernel
+           bool sizeAverage,        // forwarded to the kernel
+           int p,                   // kernel variant: 1 or 2; any other value launches nothing
+           THCTensor *weights,      // optional; NULL is passed to the kernel when absent
+           real margin)             // forwarded to the kernel
+{
+  THCUNN_assertSameGPU_generic(state, 3, input, gradInput, target);
+  input = THCTensor_(newContiguous)(state, input);
+  THCTensor_(resizeAs)(state, gradInput, input);
+  if(weights)
+    weights = THCTensor_(newContiguous)(state, weights);
+
+  if (input->nDimension == 1)
+  {
+    dim3 blocks(1);
+    dim3 threads(MULTIMARGIN_THREADS);
+
+    if (p == 1)
+    {
+      cunn_MultiMarginCriterion_updateGradInput_kernel<1, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
+        THCTensor_(data)(state, gradInput),
+        THCTensor_(data)(state, input),
+        THCIndexTensor_(data)(state, target),
+        weights ? THCTensor_(data)(state, weights) : NULL,
+        1, gradInput->size[0],
+        sizeAverage,
+        margin
+      );
+    }
+    else if (p == 2)
+    {
+      cunn_MultiMarginCriterion_updateGradInput_kernel<2, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
+        THCTensor_(data)(state, gradInput),
+        THCTensor_(data)(state, input),
+        THCIndexTensor_(data)(state, target),
+        weights ? THCTensor_(data)(state, weights) : NULL,
+        1, gradInput->size[0],
+        sizeAverage,
+        margin
+      );
+    }
+    THCudaCheck(cudaGetLastError());
+  }
+  else if (input->nDimension == 2)
+  {
+    dim3 blocks(gradInput->size[0]);  // one block per row
+    dim3 threads(MULTIMARGIN_THREADS);
+
+    if (p == 1)
+    {
+      cunn_MultiMarginCriterion_updateGradInput_kernel<1, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
+        THCTensor_(data)(state, gradInput),
+        THCTensor_(data)(state, input),
+        THCIndexTensor_(data)(state, target),
+        weights ? THCTensor_(data)(state, weights) : NULL,
+        gradInput->size[0], gradInput->size[1],
+        sizeAverage,
+        margin
+      );
+    }
+    else if (p == 2)
+    {
+      cunn_MultiMarginCriterion_updateGradInput_kernel<2, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
+        THCTensor_(data)(state, gradInput),
+        THCTensor_(data)(state, input),
+        THCIndexTensor_(data)(state, target),
+        weights ? THCTensor_(data)(state, weights) : NULL,
+        gradInput->size[0], gradInput->size[1],
+        sizeAverage,
+        margin
+      );
+    }
+    THCudaCheck(cudaGetLastError());
+  }
+  else
+  {
+    THError("vector or matrix expected");
+  }
+  // release the contiguous handles taken at the top
+  THCTensor_(free)(state, input);
+  if(weights)
+    THCTensor_(free)(state, weights);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/PReLU.cu b/lib/THCUNN/generic/PReLU.cu
new file mode 100644
index 0000000..8f2f670
--- /dev/null
+++ b/lib/THCUNN/generic/PReLU.cu
@@ -0,0 +1,154 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/PReLU.cu"
+#else
+
+void THNN_(PReLU_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCTensor *weight,
+           long nOutputPlane)
+{ // PReLU forward: resizes output like input and fills it; nOutputPlane == 0 takes the pointwise path, otherwise the preluForward kernel.
+  THCTensor_(resizeAs)(state, output, input);
+
+  real *w = THCTensor_(data)(state, weight); // raw weight pointer handed to the functor / kernel below
+
+  if (nOutputPlane == 0)
+  {
+    THC_pointwiseApply2(state, output, input, PReLUUpdateOutput<real>(w)); // presumably one slope shared by all elements -- confirm against the functor
+  }
+  else
+  {
+    int ndim = THCTensor_(nDimension)(state, input);
+    input = THCTensor_(newContiguous)(state, input); // kernel reads raw data; released by the free() at the end of this branch
+
+    int n = THCTensor_(nElement)(state, input); // NOTE(review): element count is narrowed to int -- confirm very large tensors cannot overflow
+    int mapSize = 1; // stays 1 unless input is 3-D or 4-D
+    if (ndim == 3)
+      mapSize = (input->size[1] * input->size[2]); // product of the last two sizes
+    else if (ndim == 4)
+      mapSize = (input->size[2] * input->size[3]); // product of the last two sizes
+    int nElemsPerSample = nOutputPlane * mapSize;
+    preluForward<<<GET_BLOCKS(n), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>( // launched on the current THC stream
+      THCTensor_(data)(state, output), // NOTE(review): raw output pointer -- assumes output is contiguous after resizeAs, confirm
+      THCTensor_(data)(state, input),
+      w,
+      n, nElemsPerSample, mapSize
+    );
+    THCudaCheck(cudaGetLastError()); // surfaces launch errors
+    THCTensor_(free)(state, input); // drops the reference taken by newContiguous
+  }
+}
+
+void THNN_(PReLU_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *weight,
+           long nOutputPlane)
+{ // PReLU backward w.r.t. input: resizes gradInput like input and fills it from gradOutput, input and weight.
+  THCTensor_(resizeAs)(state, gradInput, input);
+
+  real *w = THCTensor_(data)(state, weight); // raw weight pointer handed to the functor / kernel below
+  if (nOutputPlane == 0)
+  {
+    THC_pointwiseApply3(state, gradInput, gradOutput, input, PReLUUpdateGradInput<real>(w)); // pointwise path; presumably a single shared slope -- confirm
+  }
+  else
+  {
+    int ndim = THCTensor_(nDimension)(state, input);
+    input = THCTensor_(newContiguous)(state, input); // both temporaries are released at the end of this branch
+    gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+    int n = THCTensor_(nElement)(state, input); // NOTE(review): element count is narrowed to int -- confirm very large tensors cannot overflow
+    int mapSize = 1; // stays 1 unless input is 3-D or 4-D
+    if (ndim == 3)
+      mapSize = (input->size[1] * input->size[2]); // product of the last two sizes
+    else if (ndim == 4)
+      mapSize = (input->size[2] * input->size[3]); // product of the last two sizes
+    int nElemsPerSample = nOutputPlane * mapSize;
+    preluBackward<<<GET_BLOCKS(n), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>( // launched on the current THC stream
+      THCTensor_(data)(state, gradInput), // NOTE(review): raw gradInput pointer -- assumes it is contiguous after resizeAs, confirm
+      THCTensor_(data)(state, input),
+      w,
+      THCTensor_(data)(state, gradOutput),
+      n, nElemsPerSample, mapSize
+    );
+    THCudaCheck(cudaGetLastError()); // surfaces launch errors
+    THCTensor_(free)(state, input);
+    THCTensor_(free)(state, gradOutput);
+  }
+}
+
+void THNN_(PReLU_accGradParameters)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *weight,
+           THCTensor *gradWeight,
+           THCTensor *gradWeightBuf,
+           THCTensor *gradWeightBuf2,
+           long nOutputPlane,
+           real scale)
+{ // Accumulates the PReLU weight gradient into gradWeight; gradInput is overwritten as scratch and recomputed afterwards.
+  // use grad input for temporary storage, then call updateGradInput again
+
+  if (nOutputPlane == 0)
+  {
+    THC_pointwiseApply3(state, gradInput, input, gradOutput, PReLUAccGradParametersShared<real>()); // per-element terms written into gradInput
+
+    // introduces a sync point
+    real sum = ScalarConvert<accreal, real>::to(THCTensor_(sumall)(state, gradInput));
+    real w = THCTensor_(get1d)(state, gradWeight, 0); // current accumulated value of the single weight gradient
+    THCTensor_(set1d)(state, gradWeight, 0, w + sum * scale);
+
+    // restore gradInput
+    THNN_(PReLU_updateGradInput)(state, input, gradOutput, gradInput, weight, nOutputPlane);
+  }
+  else
+  {
+    int ndim = THCTensor_(nDimension)(state, input);
+
+    if (ndim == 1)
+    {
+      THC_pointwiseApply3(state, gradWeight, input, gradOutput, PReLUAccGradParameters1to1<real>(scale)); // writes gradWeight directly; gradInput is untouched on this path
+    }
+    else
+    {
+      THC_pointwiseApply3(state, gradInput, input, gradOutput, PReLUAccGradParameters<real>(scale)); // NOTE(review): scale goes to the functor here and to cadd below -- confirm it is not applied twice
+      THCTensor *sumbuf = gradWeightBuf2; // alias only; used by the 4-D branch
+      THCTensor_(resizeAs)(state, gradWeightBuf, gradWeight);
+
+      if (ndim == 2)
+      {
+        THCTensor_(sum)(state, gradWeightBuf, gradInput, 0); // reduce over dimension 0
+        THCTensor_(cadd)(state, gradWeight, gradWeight, scale, gradWeightBuf); // gradWeight += scale * gradWeightBuf
+      }
+      else if (ndim == 3)
+      {
+        THCTensor *buffer = THCTensor_(newContiguous)(state, gradInput);
+        THCTensor_(resize2d)(state, buffer, nOutputPlane, input->size[1] * input->size[2]); // NOTE(review): if newContiguous returned gradInput itself this resizes gradInput -- confirm intended
+        THCTensor_(sum)(state, gradWeightBuf, buffer, 1); // reduce over the flattened last two dimensions
+        THCTensor_(cadd)(state, gradWeight, gradWeight, scale, gradWeightBuf);
+        THCTensor_(free)(state, buffer);
+      }
+      else if (ndim == 4)
+      {
+        THCTensor *buffer = THCTensor_(newContiguous)(state, gradInput);
+        THCTensor_(resize3d)(state, buffer, input->size[0], nOutputPlane, input->size[2] * input->size[3]);
+        THCTensor_(resize2d)(state, sumbuf, input->size[0], nOutputPlane);
+        THCTensor_(sum)(state, sumbuf, buffer, 2); // first reduce the flattened last dimension
+        THCTensor_(sum)(state, gradWeightBuf, sumbuf, 0); // then reduce over dimension 0
+        THCTensor_(cadd)(state, gradWeight, gradWeight, scale, gradWeightBuf);
+        THCTensor_(free)(state, buffer);
+      }
+
+      // restore gradInput
+      THNN_(PReLU_updateGradInput)(state, input, gradOutput, gradInput, weight, nOutputPlane);
+    }
+  }
+}
+
+#endif
diff --git a/lib/THCUNN/generic/RReLU.cu b/lib/THCUNN/generic/RReLU.cu
new file mode 100644
index 0000000..93ad99b
--- /dev/null
+++ b/lib/THCUNN/generic/RReLU.cu
@@ -0,0 +1,108 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/RReLU.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(RReLU_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCTensor *noise,
+           double lower,
+           double upper,
+           bool train,
+           bool inplace,
+           void *generator) // not referenced in this body; random states come from THCRandom_generatorStates
+{ // RReLU forward: train mode launches rreluUpdateOutputTrain (which also receives noise); eval mode applies the fixed slope (lower + upper) / 2.
+  THCUNN_assertSameGPU_generic(state, 3, input, output, noise);
+  struct curandStateMtgp32* gen_states = THCRandom_generatorStates(state);
+
+  if (train)
+  {
+    input = THCTensor_(newContiguous)(state, input); // released at the end of this branch
+    THCTensor_(resizeAs)(state, noise, input);
+    real *input_data = THCTensor_(data)(state, input);
+    real *noise_data = THCTensor_(data)(state, noise);
+    long n = THCTensor_(nElement)(state, input);
+    if (inplace)
+    {
+      rreluUpdateOutputTrain<<<NUM_BLOCKS(n), BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>( // input_data is both source and destination
+        n, gen_states, input_data, noise_data, input_data, lower, upper);
+      THCTensor_(set)(state, output, input); // NOTE(review): if the caller's input was non-contiguous, `input` here is presumably a copy -- confirm in-place semantics
+    }
+    else
+    {
+      THCTensor_(resizeAs)(state, output, input);
+      real *output_data = THCTensor_(data)(state, output); // NOTE(review): raw pointer -- assumes output is contiguous after resizeAs, confirm
+      rreluUpdateOutputTrain<<<NUM_BLOCKS(n), BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
+        n, gen_states, input_data, noise_data, output_data, lower, upper);
+    }
+    THCudaCheck(cudaGetLastError()); // surfaces launch errors from either branch
+    THCTensor_(free)(state, input);
+  }
+  else
+  {
+    const real negSlope = ScalarConvert<double, real>::to((lower + upper) / 2); // midpoint of [lower, upper]
+    if (inplace)
+    {
+      THC_pointwiseApply1(state, input, RReLUUpdateOutputEvalIP_functor<real>(negSlope)); // modifies input directly
+      THCTensor_(set)(state, output, input); // output then refers to input
+    }
+    else
+    {
+      THCTensor_(resizeAs)(state, output, input);
+      THC_pointwiseApply2(state, output, input, RReLUUpdateOutputEval_functor<real>(negSlope));
+    }
+  }
+}
+
+void THNN_(RReLU_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *noise,
+           double lower,
+           double upper,
+           bool train,
+           bool inplace)
+{ // RReLU backward: multiplies gradOutput by noise in train mode, otherwise applies the fixed slope (lower + upper) / 2 via the eval functors.
+  THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, gradInput, noise);
+
+  gradOutput = THCTensor_(newContiguous)(state, gradOutput); // released by the free() on the last line
+
+  if (train && upper - lower > 1E-6)    // e.g. if upper == lower, RReLU behaves like LeakyReLU
+  {
+    // multiply the gradient by the noise tensor
+    if (inplace)
+    {
+      THCTensor_(cmul)(state, gradOutput, gradOutput, noise); // overwrites gradOutput
+      THCTensor_(set)(state, gradInput, gradOutput); // gradInput then refers to gradOutput
+    }
+    else
+    {
+      THCTensor_(resizeAs)(state, gradInput, input);
+      THCTensor_(cmul)(state, gradInput, gradOutput, noise);
+    }
+  }
+  else
+  {
+    // use constant factor for negative input values
+    const real negSlope = ScalarConvert<double, real>::to((lower + upper) / 2);
+    if (inplace)
+    {
+      THC_pointwiseApply2(state, gradOutput, input, RReLUupdateGradInputEvalIP_functor<real>(negSlope)); // overwrites gradOutput
+      THCTensor_(set)(state, gradInput, gradOutput);
+    }
+    else
+    {
+      THCTensor_(resizeAs)(state, gradInput, input);
+      THC_pointwiseApply3(state, gradInput, gradOutput, input, RReLUupdateGradInputEval_functor<real>(negSlope));
+    }
+  }
+
+  THCTensor_(free)(state, gradOutput);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/Sigmoid.cu b/lib/THCUNN/generic/Sigmoid.cu
new file mode 100644
index 0000000..7bc5739
--- /dev/null
+++ b/lib/THCUNN/generic/Sigmoid.cu
@@ -0,0 +1,29 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/Sigmoid.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(Sigmoid_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output)
+{ // Sigmoid forward: resizes output like input and applies sigmoidupdateOutput_functor element-wise.
+  THCUNN_assertSameGPU_generic(state, 2, input, output); // device-consistency check for both tensors
+  THCTensor_(resizeAs)(state, output, input);
+  THC_pointwiseApply2(state, output, input, sigmoidupdateOutput_functor<real>());
+}
+
+void THNN_(Sigmoid_updateGradInput)(
+           THCState *state,
+           THCTensor *input, // not read in this body; the gradient is computed from output
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *output)
+{ // Sigmoid backward: resizes gradInput like output and applies sigmoidupdateGradInput_functor to (gradInput, output, gradOutput).
+  THCUNN_assertSameGPU_generic(state, 3, output, gradOutput, gradInput);
+  THCTensor_(resizeAs)(state, gradInput, output);
+  THC_pointwiseApply3(state, gradInput, output, gradOutput, sigmoidupdateGradInput_functor<real>());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SmoothL1Criterion.cu b/lib/THCUNN/generic/SmoothL1Criterion.cu
new file mode 100644
index 0000000..8bf6865
--- /dev/null
+++ b/lib/THCUNN/generic/SmoothL1Criterion.cu
@@ -0,0 +1,79 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SmoothL1Criterion.cu"
+#else
+
+void THNN_(SmoothL1Criterion_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *target,
+           THCTensor *output,
+           bool sizeAverage)
+{ // Smooth-L1 loss forward: reduces smoothl1_functor over (input, target) and stores the scalar in output[0].
+  THCUNN_assertSameGPU_generic(state, 2, input, target);
+  THArgCheck(
+    THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2,
+    "input and target need to have the same number of elements"
+  );
+
+  long size = THCTensor_(nElement)(state, input);
+
+  input = THCTensor_(newContiguous)(state, input); // thrust iterates over raw contiguous data; both are freed below
+  target = THCTensor_(newContiguous)(state, target);
+
+  thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+  thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
+  accreal sum = thrust::inner_product( // accumulates in accreal starting from 0
+#if CUDA_VERSION >= 7000
+    thrust::cuda::par.on(THCState_getCurrentStream(state)),
+#endif
+    input_data, input_data+size, target_data, (accreal) 0,
+    thrust::plus<accreal>(), smoothl1_functor<real, accreal>()
+  );
+
+  if (sizeAverage)
+    sum /= size; // mean over all elements instead of the plain sum
+
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, target);
+
+  THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum)); // assumes output already has at least one element -- TODO confirm
+}
+
+void THNN_(SmoothL1Criterion_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *target,
+           THCTensor *gradInput,
+           bool sizeAverage)
+{ // Smooth-L1 loss backward: writes smoothl1_updateGradInput_functor(norm) of (input, target) into gradInput.
+  THCUNN_assertSameGPU_generic(state, 3, input, target, gradInput);
+  THArgCheck(
+    THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2,
+    "input and target need to have the same number of elements"
+  );
+
+  long size = THCTensor_(nElement)(state, input);
+  real norm = ScalarConvert<accreal, real>::to(sizeAverage ? accreal(1)/size : accreal(1)); // 1/size when averaging, otherwise 1
+
+  input = THCTensor_(newContiguous)(state, input); // thrust iterates over raw contiguous data; both are freed below
+  target = THCTensor_(newContiguous)(state, target);
+
+  THCTensor_(resizeAs)(state, gradInput, input);
+
+  thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+  thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
+  thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput)); // NOTE(review): raw pointer -- assumes gradInput is contiguous after resizeAs, confirm
+
+  thrust::transform(
+#if CUDA_VERSION >= 7000
+    thrust::cuda::par.on(THCState_getCurrentStream(state)),
+#endif
+    input_data, input_data+size, target_data, gradInput_data,
+    smoothl1_updateGradInput_functor<real>(norm)
+  );
+
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, target);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SoftMarginCriterion.cu b/lib/THCUNN/generic/SoftMarginCriterion.cu
new file mode 100644
index 0000000..b5b5329
--- /dev/null
+++ b/lib/THCUNN/generic/SoftMarginCriterion.cu
@@ -0,0 +1,60 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SoftMarginCriterion.cu"
+#else
+
+void THNN_(SoftMarginCriterion_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *target,
+           THCTensor *output,
+           int sizeAverage)
+{ // Soft-margin loss forward: reduces softmargin_functor over (input, target) and stores the scalar in output[0].
+  THCUNN_assertSameGPU_generic(state, 2, input, target); // NOTE(review): no nElement(input) == nElement(target) check here, unlike SmoothL1Criterion -- confirm callers guarantee it
+  accreal sum;
+
+  long size = THCTensor_(nElement)(state, input);
+
+  input = THCTensor_(newContiguous)(state, input); // thrust iterates over raw contiguous data; both are freed below
+  target = THCTensor_(newContiguous)(state, target);
+
+  thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+  thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
+  sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal) 0, thrust::plus<accreal>(), softmargin_functor<real, accreal>()); // NOTE(review): no thrust::cuda::par.on(current stream) policy, unlike SmoothL1Criterion -- confirm stream ordering
+
+  if(sizeAverage)
+    sum /= size; // mean over all elements instead of the plain sum
+
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, target);
+
+  THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum)); // assumes output already has at least one element -- TODO confirm
+}
+
+void THNN_(SoftMarginCriterion_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *target,
+           THCTensor *gradInput,
+           int sizeAverage)
+{ // Soft-margin loss backward: writes softmargin_updateGradInput_functor(norm) of (input, target) into gradInput.
+  THCUNN_assertSameGPU_generic(state, 3, input, target, gradInput); // NOTE(review): no element-count check between input and target -- confirm callers guarantee it
+
+  long size = THCTensor_(nElement)(state, input);
+  accreal norm = (sizeAverage ? 1./size : 1.); // 1/size when averaging, otherwise 1
+
+  input = THCTensor_(newContiguous)(state, input); // thrust iterates over raw contiguous data; both are freed below
+  target = THCTensor_(newContiguous)(state, target);
+
+  THCTensor_(resizeAs)(state, gradInput, input);
+
+  thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+  thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
+  thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput)); // NOTE(review): raw pointer -- assumes gradInput is contiguous after resizeAs, confirm
+
+  thrust::transform(input_data, input_data+size, target_data, gradInput_data, softmargin_updateGradInput_functor<real, accreal>(norm)); // NOTE(review): no current-stream execution policy, unlike SmoothL1Criterion -- confirm
+
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, target);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SoftMax.cu b/lib/THCUNN/generic/SoftMax.cu
new file mode 100644
index 0000000..917e3f0
--- /dev/null
+++ b/lib/THCUNN/generic/SoftMax.cu
@@ -0,0 +1,153 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SoftMax.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(SoftMax_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output)
+{ // SoftMax forward for 1-D to 4-D input: derives grid and stride parameters from the shape and launches cunn_SoftMax_updateOutput_kernel.
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+
+  input = THCTensor_(newContiguous)(state, input); // kernel reads raw data; released on the last line
+  THCTensor_(resizeAs)(state, output, input);
+  long batchSize, dim, stride0, stride1 = 1; // only stride1 is initialised here; the others are set per case below
+  long blocksY = 1, blocksZ = 1;
+
+  if (input->nDimension == 1)
+  {
+    batchSize = 1;
+    dim = input->size[0];
+    stride0 = 1;
+  }
+  else if (input->nDimension == 2)
+  {
+    batchSize = input->size[0];
+    dim = input->size[1];
+    stride0 = 1;
+  }
+  else if (input->nDimension == 3)
+  {
+    batchSize = 1;
+    dim = input->size[0]; // dimension 0 is the one passed as `dim`
+    blocksY = input->size[1];
+    blocksZ = input->size[2];
+    stride0 = blocksY * blocksZ;
+    stride1 = blocksZ;
+  }
+  else if (input->nDimension == 4)
+  {
+    batchSize = input->size[0];
+    dim = input->size[1]; // dimension 1 is the one passed as `dim`
+    blocksY = input->size[2];
+    blocksZ = input->size[3];
+    stride0 = blocksY * blocksZ;
+    stride1 = blocksZ;
+  }
+  else
+  {
+    THError("1D, 2D, 3D or 4D tensor expected"); // presumably does not return -- otherwise batchSize/dim/stride0 would be used uninitialised
+  }
+
+  // when possible use only 2d grid of thread blocks to stay compatible with compute capability 2.X devices.
+  if (blocksY * blocksZ < 65536)
+  {
+    blocksY *= blocksZ; // fold the Z extent into Y
+    blocksZ = 1;
+    if (input->nDimension == 3 || input->nDimension == 4) {
+      stride0 = blocksY * blocksZ; // recomputed for the folded grid
+      stride1 = blocksZ;
+    }
+  }
+
+  dim3 blocks(batchSize, blocksY, blocksZ);
+  dim3 threads(SOFTMAX_THREADS);
+  cunn_SoftMax_updateOutput_kernel<real, accreal><<<blocks, threads, 0, THCState_getCurrentStream(state)>>>( // launched on the current THC stream
+    THCTensor_(data)(state, output), // NOTE(review): raw pointer -- assumes output is contiguous after resizeAs, confirm
+    THCTensor_(data)(state, input),
+    batchSize, dim, stride0, stride1
+  );
+  THCudaCheck(cudaGetLastError()); // surfaces launch errors
+
+  THCTensor_(free)(state, input);
+}
+
+void THNN_(SoftMax_updateGradInput)(
+           THCState *state,
+           THCTensor *input, // not read in this body; all shape decisions use gradInput (resized like output)
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *output)
+{ // SoftMax backward for 1-D to 4-D tensors: derives grid and stride parameters and launches cunn_SoftMax_updateGradInput_kernel.
+  THCUNN_assertSameGPU_generic(state, 3, output, gradOutput, gradInput);
+
+  output = THCTensor_(newContiguous)(state, output); // kernel reads raw data; both temporaries are released at the end
+  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+  THCTensor_(resizeAs)(state, gradInput, output);
+  long batchSize, dim, stride0, stride1 = 1; // only stride1 is initialised here; the others are set per case below
+  long blocksY = 1, blocksZ = 1;
+
+  if (gradInput->nDimension == 1)
+  {
+    batchSize = 1;
+    dim = gradInput->size[0];
+    stride0 = 1;
+  }
+  else if (gradInput->nDimension == 2)
+  {
+    batchSize = gradInput->size[0];
+    dim = gradInput->size[1];
+    stride0 = 1;
+  }
+  else if (gradInput->nDimension == 3)
+  {
+    batchSize = 1;
+    dim = gradInput->size[0]; // dimension 0 is the one passed as `dim`
+    blocksY = gradInput->size[1];
+    blocksZ = gradInput->size[2];
+    stride0 = blocksY * blocksZ;
+    stride1 = blocksZ;
+  }
+  else if (gradInput->nDimension == 4)
+  {
+    batchSize = gradInput->size[0];
+    dim = gradInput->size[1]; // dimension 1 is the one passed as `dim`
+    blocksY = gradInput->size[2];
+    blocksZ = gradInput->size[3];
+    stride0 = blocksY * blocksZ;
+    stride1 = blocksZ;
+  }
+  else
+  {
+    THError("1D, 2D, 3D or 4D tensor expected"); // presumably does not return -- otherwise batchSize/dim/stride0 would be used uninitialised
+  }
+
+  // when possible use only 2d grid of thread blocks to stay compatible with compute capability 2.X devices.
+  if (blocksY * blocksZ < 65536)
+  {
+    blocksY *= blocksZ; // fold the Z extent into Y
+    blocksZ = 1;
+    if (gradInput->nDimension == 3 || gradInput->nDimension == 4) { // use gradInput like every other shape test here; `input` is never validated in this function
+      stride0 = blocksY * blocksZ; // recomputed for the folded grid
+      stride1 = blocksZ;
+    }
+  }
+
+  dim3 blocks(batchSize, blocksY, blocksZ);
+  dim3 threads(SOFTMAX_THREADS);
+  cunn_SoftMax_updateGradInput_kernel<real, accreal><<<blocks, threads, 0, THCState_getCurrentStream(state)>>>( // launched on the current THC stream
+    THCTensor_(data)(state, gradInput), // NOTE(review): raw pointer -- assumes gradInput is contiguous after resizeAs, confirm
+    THCTensor_(data)(state, output),
+    THCTensor_(data)(state, gradOutput),
+    batchSize, dim, stride0, stride1
+  );
+  THCudaCheck(cudaGetLastError()); // surfaces launch errors
+
+  THCTensor_(free)(state, gradOutput);
+  THCTensor_(free)(state, output);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SoftPlus.cu b/lib/THCUNN/generic/SoftPlus.cu
new file mode 100644
index 0000000..39794b0
--- /dev/null
+++ b/lib/THCUNN/generic/SoftPlus.cu
@@ -0,0 +1,33 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SoftPlus.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(SoftPlus_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           real beta,
+           real threshold)
+{ // SoftPlus forward: resizes output like input and applies softPlusupdateOutput_functor element-wise.
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+  THCTensor_(resizeAs)(state, output, input);
+  THC_pointwiseApply2(state, output, input, softPlusupdateOutput_functor<real>(threshold, beta)); // note: functor takes (threshold, beta), the reverse of this function's parameter order
+}
+
+void THNN_(SoftPlus_updateGradInput)(
+           THCState *state,
+           THCTensor *input, // only used in the same-GPU check; the gradient is computed from output
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *output,
+           real beta,
+           real threshold)
+{ // SoftPlus backward: resizes gradInput like output and applies softPlusupdateGradInput_functor to (gradInput, output, gradOutput).
+  THCUNN_assertSameGPU_generic(state, 4, input, output, gradOutput, gradInput);
+  THCTensor_(resizeAs)(state, gradInput, output);
+  THC_pointwiseApply3(state, gradInput, output, gradOutput, softPlusupdateGradInput_functor<real>(threshold, beta)); // note: functor takes (threshold, beta), the reverse of this function's parameter order
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SoftShrink.cu b/lib/THCUNN/generic/SoftShrink.cu
new file mode 100644
index 0000000..aa530bb
--- /dev/null
+++ b/lib/THCUNN/generic/SoftShrink.cu
@@ -0,0 +1,32 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SoftShrink.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(SoftShrink_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           real lambda)
+{ // SoftShrink forward: resizes output like input and applies SoftShrinkUpdateOutput(lambda) element-wise.
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+  THCTensor_(resizeAs)(state, output, input);
+  THC_pointwiseApply2(state, output, input, SoftShrinkUpdateOutput<real>(lambda));
+  THCudaCheck(cudaGetLastError()); // surfaces errors from the pointwise launch
+}
+
+void THNN_(SoftShrink_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           real lambda)
+{ // SoftShrink backward: resizes gradInput like input and applies SoftShrinkUpdateGradInput(lambda) to (gradInput, input, gradOutput).
+  THCUNN_assertSameGPU_generic(state, 3, input, gradOutput, gradInput);
+  THCTensor_(resizeAs)(state, gradInput, input);
+  THC_pointwiseApply3(state, gradInput, input, gradOutput, SoftShrinkUpdateGradInput<real>(lambda));
+  THCudaCheck(cudaGetLastError()); // surfaces errors from the pointwise launch
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SparseLinear.cu b/lib/THCUNN/generic/SparseLinear.cu
new file mode 100644
index 0000000..438a8e4
--- /dev/null
+++ b/lib/THCUNN/generic/SparseLinear.cu
@@ -0,0 +1,269 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SparseLinear.cu"
+#else
+
+static bool checkInput(THCTensor* t) // true when t is 2-D with exactly 3 columns (callers describe this as "nnz x 3")
+{
+  return t->nDimension == 2 && t->size[1] == 3;
+}
+
+static bool checkSize2D(THCTensor* t, long size0, long size1) // true when t is exactly size0 x size1
+{
+  return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1;
+}
+
+static bool checkSize1D(THCTensor* t, long size0) // true when t is 1-D with size0 elements
+{
+  return t->nDimension == 1 && t->size[0] == size0;
+}
+
+static inline void copyCudaFloatingType(THCState *state, THCudaIntTensor *buf, THCTensor *t) { // copies t into the int tensor buf via the copy routine matching the current `real` type
+  #ifdef THC_REAL_IS_FLOAT
+  THCudaIntTensor_copyCudaFloat(state, buf, t);
+  #elif defined(THC_REAL_IS_DOUBLE)
+  THCudaIntTensor_copyCudaDouble(state, buf, t);
+  #elif defined(THC_REAL_IS_HALF)
+  THCudaIntTensor_copyCudaHalf(state, buf, t);
+  #endif
+} // if none of the three macros is defined, the body is empty and nothing is copied
+
+void THNN_(SparseLinear_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCTensor *weight,
+           THCTensor *bias)
+{ // SparseLinear forward: converts the nnz x 3 input to CSR, runs cusparse csrmm against weight on top of a bias-filled buffer, and copies the result into output.
+  THAssert(THCTensor_(checkGPU)(state, 4, input, output, weight, bias));
+
+  long h;
+  long outDim = THCTensor_(size)(state, weight, 0);
+  long inDim = THCTensor_(size)(state, weight, 1);
+
+  THArgCheck(checkInput(input), 2, "input size must be nnz x 3");
+  THArgCheck(THCTensor_(nDimension)(state, output) == 2, 3, "output must be batchsize x outputsize");
+  THArgCheck(checkSize1D(bias, outDim), 5, "bias size wrong");
+
+  long batchnum = THCTensor_(size)(state, output, 0); // batch size is taken from output, which the caller must have sized already
+  long nnz = THCTensor_(size)(state, input, 0);
+
+  THCTensor *buffer = THCTensor_(new)(state); // all six temporaries are freed at the end of the function
+  THCTensor *sel = THCTensor_(new)(state);
+  THCTensor *values = THCTensor_(new)(state);
+  THCudaIntTensor *rowbuf = THCudaIntTensor_new(state);
+  THCudaIntTensor *csrPtrs = THCudaIntTensor_new(state);
+  THCudaIntTensor *colInds = THCudaIntTensor_new(state);
+
+  THCTensor_(resize1d)(state, values, nnz);
+  THCudaIntTensor_resize1d(state, rowbuf, nnz);
+  THCudaIntTensor_resize1d(state, colInds, nnz);
+  THCudaIntTensor_resize1d(state, csrPtrs, batchnum+1);
+
+  // Get data ready for cusparse, need CudaInt buffers
+  // We do not need to sort, since rows are already in order
+  // If rows might get out of order in future implementations, or if cusparse
+  //    complains with an illegal memory access, sort like we do in AccGradParameters
+  THCTensor_(select)(state, sel, input, 1, 0); // column 0 -> row indices
+  copyCudaFloatingType(state, rowbuf, sel);
+  THCTensor_(select)(state, sel, input, 1, 1); // column 1 -> column indices
+  copyCudaFloatingType(state, colInds, sel);
+  THCTensor_(select)(state, sel, input, 1, 2); // column 2 -> values
+  THCTensor_(copyCuda)(state, values, sel);
+
+  init_cusparse();
+  cusparseXcoo2csr(cusparse_handle, // NOTE(review): the cusparse status return values in this function are not checked
+      THCudaIntTensor_data(state, rowbuf), nnz, batchnum,
+      THCudaIntTensor_data(state, csrPtrs), CUSPARSE_INDEX_BASE_ONE); // indices are treated as one-based
+
+  // output = bias
+  THCTensor_(resize2d)(state, buffer, outDim, batchnum);
+  THCTensor_(zero)(state, buffer);
+  for (h=0; h<batchnum; h++) {
+    THCTensor_(select)(state, sel, buffer, 1, h); // column h of the outDim x batchnum buffer
+    THCTensor_(copy)(state, sel, bias);
+  }
+
+  // output = W * x
+  real one = ScalarConvert<int, real>::to(1); // passed as both scaling factors, so the product is added onto the bias already in buffer
+  cusparseMatDescr_t descr = 0;
+  cusparseCreateMatDescr(&descr);
+  cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
+  cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
+  #ifdef THC_REAL_IS_FLOAT
+  cusparseScsrmm(cusparse_handle,
+  #elif defined(THC_REAL_IS_DOUBLE)
+  cusparseDcsrmm(cusparse_handle,
+  #endif
+      CUSPARSE_OPERATION_NON_TRANSPOSE, // NOTE(review): no call name is emitted when real is neither float nor double -- presumably this file is only instantiated for those types, confirm
+      batchnum, outDim, inDim, nnz,
+      &one,
+      descr,
+      THCTensor_(data)(state, values),
+      THCudaIntTensor_data(state, csrPtrs),
+      THCudaIntTensor_data(state, colInds),
+      THCTensor_(data)(state, weight), inDim, // NOTE(review): raw weight pointer with leading dimension inDim -- assumes weight is contiguous, confirm
+      &one, THCTensor_(data)(state, buffer), batchnum
+  );
+  THCTensor_(transpose)(state, buffer, NULL, 0, 1); // in-place transpose (NULL source) so buffer reads as batchnum x outDim
+
+  // We do work in the buffer to keep the output contiguous
+  THCTensor_(copy)(state, output, buffer);
+
+  cusparseDestroyMatDescr(descr);
+  descr = 0;
+  THCTensor_(free)(state, buffer);
+  THCTensor_(free)(state, sel);
+  THCTensor_(free)(state, values);
+  THCudaIntTensor_free(state, rowbuf);
+  THCudaIntTensor_free(state, colInds);
+  THCudaIntTensor_free(state, csrPtrs);
+}
+
+void THNN_(SparseLinear_accGradParameters)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradWeight,
+           THCTensor *gradBias,
+           THCTensor *weight,
+           THCTensor *bias,
+           double weightDecay,
+           double scale)
+{ // SparseLinear backward w.r.t. parameters: accumulates into gradWeight (via cusparse csrmm) and gradBias, plus optional weight decay.
+  long outDim = THCTensor_(size)(state, weight, 0);
+  long inDim = THCTensor_(size)(state, weight, 1);
+
+  THArgCheck(checkInput(input), 2, "input size must be nnz x 3");
+  THArgCheck(checkSize2D(gradWeight, outDim, inDim), 4, "gradWeight size wrong");
+  THArgCheck(checkSize1D(gradBias, outDim), 5, "gradBias size wrong");
+  THArgCheck(THCTensor_(isContiguous)(state, gradOutput), 3,
+             "gradOutput must be contiguous");
+
+  long nnz = THCTensor_(size)(state, input, 0);
+  long batchnum = THCTensor_(size)(state, gradOutput, 0);
+
+  THCTensor *buf = THCTensor_(new)(state); // all eight temporaries are freed at the end of the function
+  THCTensor *cols = THCTensor_(new)(state);
+  THCTensor *sel = THCTensor_(new)(state);
+  THCudaLongTensor *inds = THCudaLongTensor_new(state);
+  THCTensor *values = THCTensor_(new)(state);
+  THCudaIntTensor *colbuf = THCudaIntTensor_new(state);
+  THCudaIntTensor *colPtrs = THCudaIntTensor_new(state);
+  THCudaIntTensor *rowInds = THCudaIntTensor_new(state);
+
+  THCTensor_(select)(state, sel, input, 1, 0); // rowInds
+  THCTensor_(select)(state, cols, input, 1, 1); // colInds
+  THCTensor_(cadd)(state, buf, sel, batchnum, cols); // colInds * batchnum + rowInds
+  THCTensor_(sort)(state, buf, inds, buf, 0, 0); // Sort permutation (indices) is now in inds
+  THCTensor_(indexSelect)(state, buf, input, 0, inds); // buf = rows of input reordered by that permutation
+
+  THCTensor_(resize1d)(state, values, nnz);
+  THCudaIntTensor_resize1d(state, colbuf, nnz);
+  THCudaIntTensor_resize1d(state, rowInds, nnz);
+  THCudaIntTensor_resize1d(state, colPtrs, inDim+1);
+
+  // Get data ready for cusparse, need CudaInt buffers
+  THCTensor_(select)(state, sel, buf, 1, 0);
+  copyCudaFloatingType(state, rowInds, sel);
+  THCTensor_(select)(state, sel, buf, 1, 1);
+  copyCudaFloatingType(state, colbuf, sel);
+  THCTensor_(select)(state, sel, buf, 1, 2);
+  THCTensor_(copyCuda)(state, values, sel);
+
+  init_cusparse();
+  // Secretly coo2csc
+  cusparseXcoo2csr(cusparse_handle, // NOTE(review): the cusparse status return values in this function are not checked
+      THCudaIntTensor_data(state, colbuf), nnz, inDim,
+      THCudaIntTensor_data(state, colPtrs), CUSPARSE_INDEX_BASE_ONE);
+
+  // FORTRAN expects contiguous col-major matrices
+  THCTensor_(transpose)(state, gradOutput, NULL, 0, 1); // temporarily transposes the caller's gradOutput in place
+  THCTensor_(resize2d)(state, buf, batchnum, outDim);
+  THCTensor_(copy)(state, buf, gradOutput);
+  THCTensor_(transpose)(state, gradOutput, NULL, 0, 1); // Restore gradOutput
+
+  real one = ScalarConvert<int, real>::to(1); // NOTE(review): csrmm below uses `one` for both factors, so `scale` does not affect gradWeight -- confirm intended
+  cusparseMatDescr_t descr = 0; // NOTE(review): descr is created here but no cusparseDestroyMatDescr call appears in this function -- confirm
+  cusparseCreateMatDescr(&descr);
+  cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
+  cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
+  #ifdef THC_REAL_IS_FLOAT
+  cusparseScsrmm(cusparse_handle,
+  #elif defined(THC_REAL_IS_DOUBLE)
+  cusparseDcsrmm(cusparse_handle,
+  #endif
+      CUSPARSE_OPERATION_NON_TRANSPOSE, // NOTE(review): no call name is emitted when real is neither float nor double -- presumably only those types are instantiated, confirm
+      inDim, outDim, batchnum, nnz,
+      &one,
+      descr,
+      THCTensor_(data)(state, values),
+      THCudaIntTensor_data(state, colPtrs),
+      THCudaIntTensor_data(state, rowInds),
+      THCTensor_(data)(state, buf), batchnum,
+      &one, THCTensor_(data)(state, gradWeight), inDim // accumulates straight into gradWeight's storage
+  );
+
+  THCTensor_(sum)(state, buf, gradOutput, 0); // reduce gradOutput over dimension 0
+  THCTensor_(resize1d)(state, buf, outDim);
+  THCTensor_(cadd)(state, gradBias, gradBias, scale, buf); // gradBias += scale * buf
+
+  if (weightDecay != 0)
+  {
+    THCTensor_(cadd)(state, gradWeight, gradWeight, weightDecay, weight); // gradWeight += weightDecay * weight
+    THCTensor_(cadd)(state, gradBias, gradBias, weightDecay, bias); // gradBias += weightDecay * bias
+  }
+
+  THCTensor_(free)(state, buf);
+  THCTensor_(free)(state, sel);
+  THCTensor_(free)(state, cols);
+  THCudaLongTensor_free(state, inds);
+  THCTensor_(free)(state, values);
+  THCudaIntTensor_free(state, colbuf);
+  THCudaIntTensor_free(state, rowInds);
+  THCudaIntTensor_free(state, colPtrs);
+}
+
+void THNN_(SparseLinear_legacyUpdateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCTensor *weight,
+           THCTensor *bias) { // stub: the legacy input format is rejected on CUDA; no argument is read
+  THError("CUDA does not support legacy input format, please use a table of nnz x 2 vectors");
+}
+void THNN_(SparseLinear_legacyAccGradParameters)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradWeight,
+           THCTensor *gradBias,
+           THCTensor *weight,
+           THCTensor *bias,
+           double weightDecay,
+           double scale) { // stub: the legacy input format is rejected on CUDA; no argument is read
+  THError("CUDA does not support legacy input format, please use a table of nnz x 2 vectors");
+}
+
+// Dense updates are pretty fast on the GPU
+void THNN_(SparseLinear_zeroGradParameters)(
+           THCState *state,
+           THCTensor *gradWeight,
+           THCTensor *gradBias,
+           THCTensor *lastInput) { // zeroes both gradient tensors in full; lastInput is not read
+  THCTensor_(zero)(state, gradWeight);
+  THCTensor_(zero)(state, gradBias);
+}
+
+void THNN_(SparseLinear_updateParameters)(
+           THCState *state,
+           THCTensor *weight,
+           THCTensor *bias,
+           THCTensor *gradWeight,
+           THCTensor *gradBias,
+           THCTensor *lastInput,
+           double learningRate) { // dense gradient step on weight and bias; lastInput is not read
+  THCTensor_(cadd)(state, weight, weight, -learningRate, gradWeight); // weight -= learningRate * gradWeight
+  THCTensor_(cadd)(state, bias, bias, -learningRate, gradBias); // bias -= learningRate * gradBias
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialAdaptiveMaxPooling.cu b/lib/THCUNN/generic/SpatialAdaptiveMaxPooling.cu
new file mode 100644
index 0000000..fc6fe6d
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialAdaptiveMaxPooling.cu
@@ -0,0 +1,188 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialAdaptiveMaxPooling.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( // forward pass: sizes output/indices, then launches adaptivemaxpool
+           THCState *state,
+           THCTensor *input,          // 3D, or 4D with a leading batch dimension (checked below)
+           THCTensor *output,         // resized here to [nbatch x] nInputPlane x nOutputRows x nOutputCols
+           THCIndexTensor *indices,   // resized here to 2 x (shape of output); each half is passed to the kernel separately
+           int nOutputCols,
+           int nOutputRows)
+{
+  THCUNN_assertSameGPU_generic(state, 3, input, output, indices);
+
+  THCIndex_t *indices_data;
+  real *output_data;
+  real *input_data;
+
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
+
+  if (input->nDimension == 3) {
+    long nInputCols = input->size[2];
+    long nInputRows = input->size[1];
+    long nInputPlane = input->size[0];
+
+    long istride_d = input->stride[0];
+    long istride_h = input->stride[1];
+    long istride_w = input->stride[2];
+
+    input_data = THCTensor_(data)(state, input); // no contiguous copy here: the tensor's own strides are forwarded
+
+    THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols);
+    THCIndexTensor_(resize4d)(state, indices, 2, nInputPlane, nOutputRows, nOutputCols);
+
+    indices_data = THCIndexTensor_(data)(state, indices);
+    output_data = THCTensor_(data)(state, output);
+
+    // launch geometry: one block column per plane, at least one block row, 32x8 threads
+    int yblocks = (int)(16L / nInputPlane);
+    yblocks = yblocks < 1 ? 1 : yblocks;
+    dim3 blocks(nInputPlane,yblocks);
+    dim3 threads(32,8);
+
+    // run maxpool kernel
+    adaptivemaxpool <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (input_data, output_data,
+                                   indices_data+nInputPlane*nOutputCols*nOutputRows, indices_data,
+                                   nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+                                   istride_h, istride_w, istride_d);
+    THCudaCheck(cudaGetLastError());
+
+  } else {
+    long nInputCols = input->size[3];
+    long nInputRows = input->size[2];
+    long nInputPlane = input->size[1];
+    long nbatch = input->size[0];
+
+    // Make the contiguous copy BEFORE reading strides: they must describe the buffer the kernel is given
+    input = THCTensor_(newContiguous)(state, input);
+
+    long istride_d = input->stride[1];
+    long istride_h = input->stride[2];
+    long istride_w = input->stride[3];
+    input_data = THCTensor_(data)(state, input);
+
+    THCTensor_(resize4d)(state, output, nbatch, nInputPlane, nOutputRows, nOutputCols);
+    THCIndexTensor_(resize5d)(state, indices, 2, nbatch, nInputPlane, nOutputRows, nOutputCols);
+
+    indices_data = THCIndexTensor_(data)(state, indices);
+    output_data = THCTensor_(data)(state, output);
+
+    // launch geometry: one block column per (batch, plane) pair, at least one block row, 32x8 threads
+    int yblocks = (int)(16L / nInputPlane);
+    yblocks = yblocks < 1 ? 1 : yblocks;
+    dim3 blocks(nInputPlane*nbatch,yblocks);
+    dim3 threads(32,8);
+
+    // run maxpool kernel
+    adaptivemaxpool <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (input_data, output_data,
+                                   indices_data+nbatch*nInputPlane*nOutputCols*nOutputRows, indices_data,
+                                   nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+                                   istride_h, istride_w, istride_d);
+    THCudaCheck(cudaGetLastError());
+    // clean: pairs with the newContiguous call above
+    THCTensor_(free)(state, input);
+  }
+}
+
+void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)( // backward pass: zeroes gradInput, then scatters gradOutput through indices
+           THCState *state,
+           THCTensor *input,        // only its dimensionality and sizes are read
+           THCTensor *gradOutput,
+           THCTensor *gradInput,    // resized to match input and zero-filled here
+           THCIndexTensor *indices) // expected to hold two index planes, laid out as in updateOutput
+{
+  bool atomic = true; // suboptimal, but without atomic it doesn't pass the tests
+
+  THCUNN_assertSameGPU_generic(state, 4, input, indices, gradOutput, gradInput);
+
+  THCIndex_t *indices_data;
+  real *gradInput_data;
+  real *gradOutput_data;
+
+  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+  if (input->nDimension == 3) {
+    long nInputCols = input->size[2];
+    long nInputRows = input->size[1];
+    long nInputPlane = input->size[0];
+    long nOutputCols = gradOutput->size[2];
+    long nOutputRows = gradOutput->size[1];
+
+    //bool atomic = (nInputCols%nOutputCols != 0) || (nInputRows%nOutputRows != 0);
+
+    THCTensor_(resizeAs)(state, gradInput, input);
+    THCTensor_(zero)(state, gradInput);
+
+    indices_data = THCIndexTensor_(data)(state, indices);
+    gradOutput_data = THCTensor_(data)(state, gradOutput);
+    gradInput_data = THCTensor_(data)(state, gradInput);
+
+    // launch geometry: one block column per plane, at least one block row, 32x8 threads
+    int yblocks = (int)(16L / nInputPlane);
+    yblocks = yblocks < 1 ? 1 : yblocks;
+    dim3 blocks(nInputPlane,yblocks);
+    dim3 threads(32,8);
+
+    if(atomic)
+    {
+      // run updateGradInput kernel, accumulate gradients atomically
+      atomicadaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data,
+                                          indices_data+nInputPlane*nOutputCols*nOutputRows, indices_data,
+                                          nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
+    }
+    else
+    {
+      // run updateGradInput kernel (non-atomic variant, matching the batched branch below)
+      adaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data,
+                                          indices_data+nInputPlane*nOutputCols*nOutputRows, indices_data,
+                                          nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
+    }
+    THCudaCheck(cudaGetLastError());
+  } else {
+    long nInputCols = input->size[3];
+    long nInputRows = input->size[2];
+    long nInputPlane = input->size[1];
+    long nbatch = input->size[0];
+    long nOutputCols = gradOutput->size[3];
+    long nOutputRows = gradOutput->size[2];
+
+    //bool atomic = //(nInputCols%nOutputCols != 0) || (nInputRows%nOutputRows != 0);
+
+    THCTensor_(resizeAs)(state, gradInput, input);
+    THCTensor_(zero)(state, gradInput);
+
+    indices_data = THCIndexTensor_(data)(state, indices);
+    gradOutput_data = THCTensor_(data)(state, gradOutput);
+    gradInput_data = THCTensor_(data)(state, gradInput);
+
+    // launch geometry: one block column per (batch, plane) pair, at least one block row, 32x8 threads
+    int yblocks = (int)(16L / nInputPlane);
+    yblocks = yblocks < 1 ? 1 : yblocks;
+    dim3 blocks(nInputPlane*nbatch,yblocks);
+    dim3 threads(32,8);
+
+    if(atomic)
+    {
+      // run updateGradInput kernel, accumulate gradients atomically
+      atomicadaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data,
+                                          indices_data+nbatch*nInputPlane*nOutputCols*nOutputRows, indices_data,
+                                          nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
+    }
+    else
+    {
+      // run updateGradInput kernel (non-atomic variant)
+      adaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data,
+                                          indices_data+nbatch*nInputPlane*nOutputCols*nOutputRows, indices_data,
+                                          nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
+    }
+    THCudaCheck(cudaGetLastError());
+  }
+
+  // clean: pairs with the newContiguous call on gradOutput above
+  THCTensor_(free)(state,gradOutput);
+
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialAveragePooling.cu b/lib/THCUNN/generic/SpatialAveragePooling.cu
new file mode 100644
index 0000000..09f539e
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialAveragePooling.cu
@@ -0,0 +1,164 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialAveragePooling.cu"
+#else
+
+void THNN_(SpatialAveragePooling_updateOutput)( // forward pass: computes the output extent, then launches AvePoolForward
+           THCState *state,
+           THCTensor *input,   // 3D, or 4D with a leading batch dimension (checked below)
+           THCTensor *output,  // resized here
+           int kW, int kH,     // kernel width / height
+           int dW, int dH,     // stride width / height
+           int padW, int padH, // padding width / height
+           bool ceil_mode,     // selects ceil instead of floor when computing the output extent
+           bool count_include_pad) // forwarded as the kernel's boolean template argument
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
+
+  long nInputCols, nInputRows, nInputPlane, batchSize;
+  long nOutputCols, nOutputRows;
+
+  if (input->nDimension == 3) {
+    nInputCols = input->size[2];
+    nInputRows = input->size[1];
+    nInputPlane = input->size[0];
+    batchSize = 1; // a 3D input is processed as a batch of one
+  }
+  else
+  {
+    nInputCols = input->size[3];
+    nInputRows = input->size[2];
+    nInputPlane = input->size[1];
+    batchSize = input->size[0];
+  }
+
+  THArgCheck(nInputCols >= kW - 2*padW && nInputRows >= kH - 2*padH, 2, "input image smaller than kernel size");
+  THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");
+
+  if(ceil_mode) { // output extent = round((in - k + 2*pad) / stride) + 1, evaluated in single precision
+    nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
+    nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
+  }
+  else {
+    nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
+    nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
+  }
+  if (padW || padH)
+  {
+    // ensure that the last pooling starts inside the image
+    // needed to avoid problems in ceil mode
+    if ((nOutputRows - 1)*dH >= nInputRows + padH)
+      --nOutputRows;
+    if ((nOutputCols  - 1)*dW >= nInputCols  + padW)
+      --nOutputCols;
+  }
+
+  input = THCTensor_(newContiguous)(state, input); // rebinding only; paired with the free at the end
+  real* input_data = THCTensor_(data)(state, input);
+
+  THCTensor_(resize4d)(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols); // always 4D for the kernel launch
+
+  real* output_data = THCTensor_(data)(state, output);
+
+  int count = THCTensor_(nElement)(state, output); // number of output elements; sizes the launch via GET_BLOCKS
+
+  if(count_include_pad)
+    AvePoolForward<real, accreal, true>
+      <<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>(
+        count, input_data,
+        batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+        kH, kW, dH, dW, padH, padW, output_data);
+  else
+    AvePoolForward<real, accreal, false>
+      <<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>(
+        count, input_data,
+        batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+        kH, kW, dH, dW, padH, padW, output_data);
+  THCudaCheck(cudaGetLastError());
+
+  if(input->nDimension == 3) // drop the batch dimension of 1 introduced by resize4d above
+    THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols);
+
+  THCTensor_(free)(state, input);
+
+}
+
+void THNN_(SpatialAveragePooling_updateGradInput)( // backward pass: recomputes the output extent, then launches AvePoolBackward
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput, // resized here to match input
+           int kW, int kH,       // kernel width / height
+           int dW, int dH,       // stride width / height
+           int padW, int padH,   // padding width / height
+           bool ceil_mode,       // selects ceil instead of floor when computing the output extent
+           bool count_include_pad) // forwarded as the kernel's boolean template argument
+{
+  THCUNN_assertSameGPU_generic(state, 3, input, gradOutput, gradInput);
+
+  input = THCTensor_(newContiguous)(state, input); // both rebinding calls are paired with the frees at the end
+  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+  long nInputCols, nInputRows, nInputPlane, batchSize;
+  long nOutputCols, nOutputRows;
+
+  if (input->nDimension == 3) {
+    nInputCols = input->size[2];
+    nInputRows = input->size[1];
+    nInputPlane = input->size[0];
+    batchSize = 1; // a 3D input is processed as a batch of one
+  }
+  else
+  {
+    nInputCols = input->size[3];
+    nInputRows = input->size[2];
+    nInputPlane = input->size[1];
+    batchSize = input->size[0];
+  }
+
+  if(ceil_mode) { // same output-extent computation as in updateOutput; gradOutput's own sizes are not consulted
+    nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
+    nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
+  }
+  else {
+    nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
+    nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
+  }
+  if (padW || padH)
+  {
+    // ensure that the last pooling starts inside the image
+    // needed to avoid problems in ceil mode
+    if ((nOutputRows - 1)*dH >= nInputRows + padH)
+      --nOutputRows;
+    if ((nOutputCols  - 1)*dW >= nInputCols  + padW)
+      --nOutputCols;
+  }
+
+  THCTensor_(resizeAs)(state, gradInput, input);
+
+  int count = THCTensor_(nElement)(state, input); // number of input elements; sizes the launch via GET_BLOCKS
+
+  if(count_include_pad)
+    AvePoolBackward<real, accreal, true>
+      <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
+        (count,
+        THCTensor_(data)(state, gradOutput),
+        batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+        kH, kW, dH, dW, padH, padW,
+        THCTensor_(data)(state, gradInput));
+  else
+    AvePoolBackward<real, accreal, false>
+      <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
+        (count,
+        THCTensor_(data)(state, gradOutput),
+        batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+        kH, kW, dH, dW, padH, padW,
+        THCTensor_(data)(state, gradInput));
+  THCudaCheck(cudaGetLastError());
+
+  // clean: pairs with the two newContiguous calls at the top
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, gradOutput);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialClassNLLCriterion.cu b/lib/THCUNN/generic/SpatialClassNLLCriterion.cu
new file mode 100644
index 0000000..edb3a0c
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialClassNLLCriterion.cu
@@ -0,0 +1,126 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialClassNLLCriterion.cu"
+#else
+
+void THNN_(SpatialClassNLLCriterion_updateOutput)( // forward pass: zero-fills output/total_weight, then launches the updateOutput kernel
+           THCState *state,
+           THCTensor *input,        // must be 4D (checked below)
+           THCIndexTensor *target,  // must be 3D (checked below)
+           THCTensor *output,       // filled with 0 before the launch
+           bool sizeAverage,        // forwarded to the kernel unchanged
+           THCTensor *weights,      // optional (may be NULL); otherwise needs input->size[1] elements
+           THCTensor *total_weight) // filled with 0 before the launch
+{
+  THArgCheck(THCIndexTensor_(nDimension)(state, target) == 3, 1,
+               "only batches of spatial targets supported (3D tensors)");
+  THArgCheck(THCTensor_(nDimension)(state, input) == 4, 2,
+               "only batches of spatial inputs supported (4D tensors)");
+  if (weights && THCTensor_(nElement)(state, weights) != THCTensor_(size)(state, input, 1)) {
+    THError("weight tensor should be defined either for all or no classes");
+  }
+
+  if (weights) // the same-GPU check only includes weights when it was supplied
+    THCUNN_assertSameGPU_generic(state, 5, input, target, weights, output, total_weight);
+  else
+    THCUNN_assertSameGPU_generic(state, 4, input, target, output, total_weight);
+
+  input = THCTensor_(newContiguous)(state, input); // the three rebinding calls are paired with the frees at the end
+  weights = weights ? THCTensor_(newContiguous)(state, weights) : NULL;
+  target = THCIndexTensor_(newContiguous)(state, target);
+
+  real *input_data = THCTensor_(data)(state, input);
+  real *weights_data = weights ? THCTensor_(data)(state, weights) : NULL;
+  THCIndex_t  *target_data = THCIndexTensor_(data)(state, target);
+  real *output_data = THCTensor_(data)(state, output);
+  real *total_weight_data = THCTensor_(data)(state, total_weight);
+
+  THCIndex_t batch_size = THCIndexTensor_(size)(state, target, 0);
+  THCIndex_t map_nelem = THCIndexTensor_(nElement)(state, target) / batch_size; // target elements per sample
+  int blocks_per_sample = GET_BLOCKS(map_nelem) / 128;
+  blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample; // at least one block per sample
+  int total_blocks = blocks_per_sample * batch_size;
+
+  THCTensor_(fill)(state, output, ScalarConvert<int, real>::to(0));
+  THCTensor_(fill)(state, total_weight, ScalarConvert<int, real>::to(0));
+
+  cunn_SpatialClassNLLCriterion_updateOutput_kernel<real, accreal>
+    <<<total_blocks, CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
+      output_data,
+      total_weight_data,
+      input_data,
+      target_data,
+      weights_data,
+      sizeAverage,
+      THCTensor_(size)(state, input, 0),
+      THCTensor_(size)(state, input, 1),
+      THCTensor_(size)(state, input, 2) * THCTensor_(size)(state, input, 3),
+      blocks_per_sample
+  );
+  THCudaCheck(cudaGetLastError());
+
+  if (weights)
+    THCTensor_(free)(state, weights);
+  THCIndexTensor_(free)(state, target);
+  THCTensor_(free)(state, input);
+}
+
+void THNN_(SpatialClassNLLCriterion_updateGradInput)( // backward pass: launches the updateGradInput kernel on gradInput's buffer
+           THCState *state,
+           THCTensor *input,        // must be 4D (checked below); only its sizes are passed to the kernel
+           THCIndexTensor *target,  // must be 3D (checked below)
+           THCTensor *gradInput,    // must be contiguous; not resized or cleared in this body
+           bool sizeAverage,        // forwarded to the kernel unchanged
+           THCTensor *weights,      // optional (may be NULL); otherwise needs input->size[1] elements
+           THCTensor *total_weight) // passed to the kernel by data pointer
+{
+  THArgCheck(THCIndexTensor_(nDimension)(state, target) == 3, 1,
+               "only batches of spatial targets supported (3D tensors)");
+  THArgCheck(THCTensor_(nDimension)(state, input) == 4, 2,
+               "only batches of spatial inputs supported (4D tensors)");
+  THArgCheck(THCTensor_(isContiguous)(state, gradInput), 4,
+               "gradInput must be contiguous");
+  if (weights && THCTensor_(nElement)(state, weights) != THCTensor_(size)(state, input, 1)) {
+    THError("weight tensor should be defined either for all or no classes");
+  }
+
+  if (weights) // the same-GPU check only includes weights when it was supplied
+    THCUNN_assertSameGPU_generic(state, 5, weights, input, target, gradInput, total_weight);
+  else
+    THCUNN_assertSameGPU_generic(state, 4, input, target, gradInput, total_weight);
+
+  input = THCTensor_(newContiguous)(state, input); // the three rebinding calls are paired with the frees at the end
+  weights = weights ? THCTensor_(newContiguous)(state, weights) : NULL;
+  target = THCIndexTensor_(newContiguous)(state, target);
+
+  real *weights_data = weights ? THCTensor_(data)(state, weights) : NULL;
+  real *gradInput_data = THCTensor_(data)(state, gradInput);
+  THCIndex_t *target_data = THCIndexTensor_(data)(state, target);
+  real *total_weight_data = THCTensor_(data)(state, total_weight);
+
+  THCIndex_t batch_size = THCIndexTensor_(size)(state, target, 0);
+  THCIndex_t map_nelem = THCIndexTensor_(nElement)(state, target) / batch_size; // target elements per sample
+  int blocks_per_sample = GET_BLOCKS(map_nelem) / 128;
+  blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample; // at least one block per sample
+  int total_blocks = blocks_per_sample * batch_size;
+
+  cunn_SpatialClassNLLCriterion_updateGradInput_kernel
+    <<<total_blocks, CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
+      gradInput_data,
+      target_data,
+      weights_data,
+      total_weight_data,
+      sizeAverage,
+      THCTensor_(size)(state, input, 0),
+      THCTensor_(size)(state, input, 1),
+      THCTensor_(size)(state, input, 2) *THCTensor_(size)(state, input, 3),
+      blocks_per_sample
+  );
+  THCudaCheck(cudaGetLastError());
+
+  if (weights)
+    THCTensor_(free)(state, weights);
+  THCIndexTensor_(free)(state, target);
+  THCTensor_(free)(state, input);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialConvolutionLocal.cu b/lib/THCUNN/generic/SpatialConvolutionLocal.cu
new file mode 100644
index 0000000..d317fe2
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialConvolutionLocal.cu
@@ -0,0 +1,324 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialConvolutionLocal.cu"
+#else
+
+void THNN_(SpatialConvolutionLocal_updateOutput)( // forward pass: im2col per sample, then one addmm per output location
+           THCState *state,
+           THCTensor *input,      // 3D or 4D; a 3D input is temporarily reshaped to 4D in place
+           THCTensor *output,     // resized here
+           THCTensor *weight,     // indexed as [output location][nOutputPlane][nInputPlane*kH*kW]
+           THCTensor *bias,       // copied into every sample's output before accumulation
+           THCTensor *finput,     // column buffer, resized here
+           THCTensor *fgradInput, // not read in this body
+           int kW, int kH,
+           int dW, int dH,
+           int padW, int padH,
+           long inputWidth, long inputHeight,
+           long outputWidth, long outputHeight)
+{
+  THCUNN_assertSameGPU_generic(state, 5, input, output, weight,
+                                 bias, finput);
+
+  long nInputPlane = THCTensor_(size)(state,weight,2)/(kW*kH);
+  long nOutputPlane = THCTensor_(size)(state,weight,1);
+
+  int batch = 1;
+  if (input->nDimension == 3) {
+    // Force batch
+    batch = 0;
+    THCTensor_(resize4d)(state, input, 1, nInputPlane, inputHeight, inputWidth); // reshapes the caller's tensor; undone at the end
+  }
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Resize output
+  THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth);
+
+  // Augment the input
+  THCTensor_(resize3d)(state, finput, batchSize, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+  // Helpers
+  THCTensor *input_n = THCTensor_(new)(state);
+  THCTensor *finput_n = THCTensor_(new)(state);
+  THCTensor *output_n = THCTensor_(new)(state);
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    THCTensor *finput3d, *output3d;
+    THCTensor *wslice = THCTensor_(new)(state);
+    THCTensor *islice = THCTensor_(new)(state);
+    THCTensor *oslice = THCTensor_(new)(state);
+
+    // Matrix multiply per output:
+    THCTensor_(select)(state, input_n, input, 0, elt);
+    THCTensor_(select)(state, finput_n, finput, 0, elt);
+    THCTensor_(select)(state, output_n, output, 0, elt);
+
+    // Extract columns:
+    im2col(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, input_n),
+      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+      1, 1, THCTensor_(data)(state, finput_n)
+    );
+
+    output3d = THCTensor_(newWithStorage3d)(state, output_n->storage, output_n->storageOffset, // view over output_n's storage, output location first
+                                             outputHeight*outputWidth, 1,
+                                             nOutputPlane, outputHeight*outputWidth,
+                                             1, nOutputPlane*outputHeight*outputWidth);
+
+    finput3d = THCTensor_(newWithStorage3d)(state, finput_n->storage, finput_n->storageOffset, // view over finput_n's storage, output location first
+                                             outputHeight*outputWidth, 1,
+                                             kW*kH*nInputPlane, outputHeight*outputWidth,
+                                             1, kW*kH*nInputPlane*outputHeight*outputWidth);
+
+    THCTensor_(copy)(state, output_n, bias); // start from the bias; the addmm calls below write into the same storage
+
+    for (int i = 0; i < outputHeight; i++) {
+      for(int j = 0; j < outputWidth; j++) {
+        int sliceidx = i * outputWidth + j; // flat output-location index; selects that location's weight matrix
+        THCTensor_(select)(state, wslice, weight, 0, sliceidx);
+        THCTensor_(select)(state, islice, finput3d, 0, sliceidx);
+        THCTensor_(select)(state, oslice, output3d, 0, sliceidx);
+        THCTensor_(addmm)(state, oslice, ScalarConvert<int, real>::to(1), oslice, ScalarConvert<int, real>::to(1), wslice, islice);
+      }
+    }
+
+
+    // weight:    oH*oW x nOutputPlane x nInputPlane*kH*kW
+    // finput3d:  oH*oW x nInputPlane*kH*kW x 1
+    // THCTensor_(baddbmm)(state, output3d, 1.0, output3d, 1.0, weight, finput3d);
+    // output3d:  oH*oW x nOutputPlane x 1
+
+    THCTensor_(free)(state, output3d);
+    THCTensor_(free)(state, finput3d);
+    THCTensor_(free)(state, wslice);
+    THCTensor_(free)(state, islice);
+    THCTensor_(free)(state, oslice);
+  }
+
+  // Free
+  THCTensor_(free)(state, input_n);
+  THCTensor_(free)(state, finput_n);
+  THCTensor_(free)(state, output_n);
+
+  // Resize output (and restore the 3D shape of input) when the batch dimension was forced
+  if (batch == 0) {
+    THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth);
+    THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+  }
+}
+
+void THNN_(SpatialConvolutionLocal_updateGradInput)( // backward pass w.r.t. input: one addmm per output location, then col2im per sample
+           THCState *state,
+           THCTensor *input,      // 3D or 4D; a 3D input is temporarily reshaped to 4D in place
+           THCTensor *gradOutput, // likewise reshaped in place for the 3D case
+           THCTensor *gradInput,  // resized here
+           THCTensor *weight,     // transposed in place (dims 1 and 2) for the duration of the call
+           THCTensor *finput,     // not read in this body
+           THCTensor *fgradInput, // column buffer, resized here
+           int kW, int kH,
+           int dW, int dH,
+           int padW, int padH,
+           long inputWidth, long inputHeight,
+           long outputWidth, long outputHeight)
+{
+  THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight,
+                                 fgradInput, gradInput);
+
+  long nInputPlane = THCTensor_(size)(state,weight,2)/(kW*kH);
+  long nOutputPlane = THCTensor_(size)(state,weight,1);
+
+  int batch = 1;
+  if (input->nDimension == 3) {
+    // Force batch
+    batch = 0;
+    THCTensor_(resize4d)(state, input, 1, nInputPlane, inputHeight, inputWidth);
+    THCTensor_(resize4d)(state, gradOutput, 1, nOutputPlane, outputHeight, outputWidth);
+  }
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Resize output
+  THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
+
+  // Resize temporary columns
+  THCTensor_(resize3d)(state, fgradInput, batchSize, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+  // Helpers
+  THCTensor *gradInput_n = THCTensor_(new)(state);
+  THCTensor *fgradInput_n = THCTensor_(new)(state);
+  THCTensor *gradOutput_n = THCTensor_(new)(state);
+
+  THCTensor_(transpose)(state, weight, weight, 1, 2); // in place; reverted by the matching transpose at the end
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    THCTensor *gradOutput3d, *fgradInput3d;
+    THCTensor *wslice = THCTensor_(new)(state);
+    THCTensor *gislice = THCTensor_(new)(state);
+    THCTensor *goslice = THCTensor_(new)(state);
+
+    // Matrix multiply per sample:
+    THCTensor_(select)(state, gradInput_n, gradInput, 0, elt);
+    THCTensor_(select)(state, fgradInput_n, fgradInput, 0, elt);
+    THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+    gradOutput3d = THCTensor_(newWithStorage3d)(state, gradOutput_n->storage, gradOutput_n->storageOffset, // view, output location first
+                                               outputHeight*outputWidth, 1,
+                                               nOutputPlane, outputHeight*outputWidth,
+                                               1, nOutputPlane*outputHeight*outputWidth);
+    fgradInput3d = THCTensor_(newWithStorage3d)(state, fgradInput_n->storage, fgradInput_n->storageOffset, // view, output location first
+                                               outputHeight*outputWidth, 1,
+                                               kW*kH*nInputPlane, outputHeight*outputWidth,
+                                               1, kW*kH*nInputPlane*outputHeight*outputWidth);
+
+    for (int i = 0; i < outputHeight; i++) {
+      for(int j = 0; j < outputWidth; j++) {
+        int sliceidx = i * outputWidth + j; // flat output-location index; selects that location's weight matrix
+        THCTensor_(select)(state, wslice, weight, 0, sliceidx);
+        THCTensor_(select)(state, gislice, fgradInput3d, 0, sliceidx);
+        THCTensor_(select)(state, goslice, gradOutput3d, 0, sliceidx);
+        THCTensor_(addmm)(state, gislice, ScalarConvert<int, real>::to(0), gislice, ScalarConvert<int, real>::to(1), wslice, goslice);
+      }
+    }
+
+    // weight:        oH*oW x nInputPlane*kH*kW x nOutputPlane
+    // gradOutput3d:  oH*oW x nOutputPlane x 1
+    //THCTensor_(baddbmm)(state, fgradInput3d, 0.0, fgradInput3d, 1.0, weight, gradOutput3d);
+    // fgradInput3d:  oH*oW x nInputPlane*kH*kW x 1
+
+    // Unpack columns back into input:
+    col2im<real, accreal>(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, fgradInput_n),
+      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+      1, 1, THCTensor_(data)(state, gradInput_n)
+    );
+
+    THCTensor_(free)(state, gradOutput3d);
+    THCTensor_(free)(state, fgradInput3d);
+    THCTensor_(free)(state, wslice);
+    THCTensor_(free)(state, gislice);
+    THCTensor_(free)(state, goslice);
+  }
+
+  // Free
+  THCTensor_(free)(state, gradInput_n);
+  THCTensor_(free)(state, fgradInput_n);
+  THCTensor_(free)(state, gradOutput_n);
+
+  // Restore 3D shapes when the batch dimension was forced
+  if (batch == 0) {
+    THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
+    THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+    THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth);
+  }
+
+  THCTensor_(transpose)(state, weight, weight, 1, 2); // undo the in-place transpose done before the loop
+}
+
+void THNN_(SpatialConvolutionLocal_accGradParameters)( // accumulates scaled gradients into gradWeight and gradBias
+           THCState *state,
+           THCTensor *input,      // 3D or 4D; a 3D input is temporarily reshaped to 4D in place
+           THCTensor *gradOutput, // likewise reshaped in place for the 3D case
+           THCTensor *gradWeight, // accumulated into, one slice per output location
+           THCTensor *gradBias,   // accumulated into, once per sample
+           THCTensor *finput,     // column buffer; NOTE(review): not resized here, appears to rely on a prior updateOutput -- confirm
+           THCTensor *fgradInput, // not read in this body
+           int kW, int kH,
+           int dW, int dH,
+           int padW, int padH,
+           long inputWidth, long inputHeight,
+           long outputWidth, long outputHeight,
+           real scale)            // multiplier applied to both gradient contributions
+{
+  THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, gradWeight,
+                                 gradBias, finput);
+
+  long nInputPlane = THCTensor_(size)(state,gradWeight,2)/(kW*kH);
+  long nOutputPlane = THCTensor_(size)(state,gradWeight,1);
+
+  int batch = 1;
+  if (input->nDimension == 3) {
+    // Force batch
+    batch = 0;
+    THCTensor_(resize4d)(state, input, 1, nInputPlane, inputHeight, inputWidth);
+    THCTensor_(resize4d)(state, gradOutput, 1, nOutputPlane, outputHeight, outputWidth);
+  }
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Helpers
+  THCTensor *input_n = THCTensor_(new)(state);
+  THCTensor *finput_n = THCTensor_(new)(state);
+  THCTensor *gradOutput_n = THCTensor_(new)(state);
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    THCTensor *gradOutput3d, *finput3d;
+    THCTensor *gwslice = THCTensor_(new)(state);
+    THCTensor *islice = THCTensor_(new)(state);
+    THCTensor *goslice = THCTensor_(new)(state);
+
+    // Matrix multiply per output:
+    THCTensor_(select)(state, input_n, input, 0, elt);
+    THCTensor_(select)(state, finput_n, finput, 0, elt);
+    THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+    gradOutput3d = THCTensor_(newWithStorage3d)(state, gradOutput_n->storage, gradOutput_n->storageOffset, // view, output location first
+                                                 outputHeight*outputWidth, 1,
+                                                 nOutputPlane, outputHeight*outputWidth,
+                                                 1, nOutputPlane*outputHeight*outputWidth);
+    finput3d = THCTensor_(newWithStorage3d)(state, finput_n->storage, finput_n->storageOffset, // view, output location first; columns as a row vector
+                                             outputHeight*outputWidth, 1,
+                                             1, kW*kH*nInputPlane*outputHeight*outputWidth,
+                                             kW*kH*nInputPlane, outputHeight*outputWidth);
+
+    // Extract columns (written into finput_n's storage, which finput3d views):
+    im2col(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, input_n),
+      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+      1, 1, THCTensor_(data)(state, finput_n)
+    );
+
+    for (int i = 0; i < outputHeight; i++) {
+      for(int j = 0; j < outputWidth; j++) {
+        int sliceidx = i * outputWidth + j; // flat output-location index; selects that location's gradWeight matrix
+        THCTensor_(select)(state, gwslice, gradWeight, 0, sliceidx);
+        THCTensor_(select)(state, goslice, gradOutput3d, 0, sliceidx);
+        THCTensor_(select)(state, islice, finput3d, 0, sliceidx);
+        THCTensor_(addmm)(state, gwslice, ScalarConvert<int, real>::to(1), gwslice, scale, goslice, islice);
+      }
+    }
+    // gradOutput3d:  oH*oW x nOutputPlane x 1
+    // finput3d:      oH*oW x 1 x kW*kH*nInputPlane
+    //THCTensor_(baddbmm)(state, gradWeight, 1.0, gradWeight, scale, gradOutput3d, finput3d);
+    // gradWeight:    oH*oW x nOutputPlane x kW*kH*nInputPlane
+
+    THCTensor_(cadd)(state, gradBias, gradBias, scale, gradOutput_n); // presumably gradBias += scale * gradOutput_n -- confirm cadd semantics
+
+    THCTensor_(free)(state, gradOutput3d);
+    THCTensor_(free)(state, finput3d);
+    THCTensor_(free)(state, gwslice);
+    THCTensor_(free)(state, goslice);
+    THCTensor_(free)(state, islice);
+  }
+
+  // Free
+  THCTensor_(free)(state, input_n);
+  THCTensor_(free)(state, finput_n);
+  THCTensor_(free)(state, gradOutput_n);
+
+  // Restore 3D shapes when the batch dimension was forced
+  if (batch == 0) {
+    THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
+    THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+  }
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialConvolutionMM.cu b/lib/THCUNN/generic/SpatialConvolutionMM.cu
new file mode 100644
index 0000000..46a62fb
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialConvolutionMM.cu
@@ -0,0 +1,427 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialConvolutionMM.cu"
+#else
+
+void THNN_(SpatialConvolutionMM_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCTensor *weight,
+           THCTensor *bias,
+           THCTensor *columns,
+           THCTensor *ones,
+           int kW, int kH,
+           int dW, int dH,
+           int padW, int padH) {
+  // Forward pass: for every sample, output_n = bias (broadcast, or zero) + weight * im2col(input_n).
+  THCUNN_assertSameGPU_generic(state, 5, input, output, weight, columns, ones);
+  if (bias) {
+    THCUNN_assertSameGPU_generic(state, 2, weight, bias);
+  }
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+  THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
+  THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+  THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
+  THArgCheck(weight->nDimension == 2 || weight->nDimension == 4, 4, "weight tensor should be 2D or 4D");
+
+  int freeWeight = 0;
+
+  // Params: a 2D weight stores nInputPlane*kH*kW in dim 1, a 4D weight stores nInputPlane there
+  int nInputPlane = weight->nDimension == 2 ? weight->size[1]/(kH*kW) : weight->size[1];
+  int nOutputPlane = weight->size[0];
+
+  int batch = 1;
+  if (input->nDimension == 3) {
+    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
+    // Force batch: a 3D input is temporarily viewed as a batch of one (undone at the end)
+    batch = 0;
+    THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
+  } else {
+    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
+  }
+
+  long inputWidth   = input->size[3];
+  long inputHeight  = input->size[2];
+  long outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
+  long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+
+  if (outputWidth < 1 || outputHeight < 1)  // plane counts are int, spatial sizes are long: %d vs %ld
+    THError("Given input size: (%dx%ldx%ld). Calculated output size: (%dx%ldx%ld). Output size is too small",
+        nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
+
+  if (weight->nDimension == 4) {  // 2D view is created only after all validation, so a failed check cannot leak it
+    long s1 = weight->size[0];
+    long s2 = weight->size[1] * weight->size[2] * weight->size[3];
+    weight = THCTensor_(newWithStorage2d)(state, weight->storage, weight->storageOffset, s1, -1, s2, -1);
+    freeWeight = 1;
+  }
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Resize output
+  THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth);
+
+  // Resize temporary columns
+  THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+  // Define a buffer of ones, for bias accumulation
+  // Note: this buffer can be shared with other modules, it only ever gets increased,
+  // and always contains ones.
+  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+    // Resize plane and fill with ones...
+    THCTensor_(resize2d)(state, ones, outputHeight, outputWidth);
+    THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+  }
+
+  // Helpers: reusable handles that are pointed at one sample per iteration
+  THCTensor *input_n = THCTensor_(new)(state);
+  THCTensor *output_n = THCTensor_(new)(state);
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per output:
+    THCTensor_(select)(state, input_n, input, 0, elt);
+    THCTensor_(select)(state, output_n, output, 0, elt);
+
+    // Do Bias first:
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m_ = nOutputPlane;
+    long n_ = outputHeight * outputWidth;
+    long k_ = 1;
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    if (bias) {
+      #ifdef THC_REAL_IS_FLOAT
+      THCudaBlas_Sgemm(
+      #elif defined(THC_REAL_IS_HALF)
+      THCudaBlas_Hgemm(
+      #elif defined(THC_REAL_IS_DOUBLE)
+      THCudaBlas_Dgemm(
+      #endif
+          state,
+          't', 'n',
+          n_, m_, k_,
+          ScalarConvert<int, real>::to(1),
+          THCTensor_(data)(state, ones), k_,
+          THCTensor_(data)(state, bias), k_,
+          ScalarConvert<int, real>::to(0),  // beta = 0: output_n is overwritten with bias * ones
+          THCTensor_(data)(state, output_n), n_
+      );
+    } else {
+      THCTensor_(zero)(state, output_n);
+    }
+
+    // Extract columns (the trailing 1, 1 are presumably unit dilation - TODO confirm against im2col):
+    im2col(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, input_n),
+      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+      1, 1, THCTensor_(data)(state, columns)
+    );
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m = nOutputPlane;
+    long n = columns->size[1];
+    long k = nInputPlane*kH*kW;
+
+    // Do GEMM: in row-major terms output_n (m x n) += weight (m x k) * columns (k x n)
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemm(
+    #elif defined(THC_REAL_IS_HALF)
+    THCudaBlas_Hgemm(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemm(
+    #endif
+        state,
+        'n', 'n',
+        n, m, k,
+        ScalarConvert<int, real>::to(1),
+        THCTensor_(data)(state, columns), n,
+        THCTensor_(data)(state, weight), k,
+        ScalarConvert<int, real>::to(1),  // beta = 1: accumulate on top of the bias (or zero) written above
+        THCTensor_(data)(state, output_n), n
+    );
+  }
+
+  // Free
+  THCTensor_(free)(state, input_n);
+  THCTensor_(free)(state, output_n);
+  if (freeWeight)
+    THCTensor_(free)(state, weight);
+
+  // Resize output: undo the temporary batch-of-one view for 3D input
+  if (batch == 0) {
+    THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth);
+    THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+  }
+}
+
+void THNN_(SpatialConvolutionMM_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *weight,
+           THCTensor *gradColumns,
+           THCTensor *ones,
+           int kW, int kH,
+           int dW, int dH,
+           int padW, int padH) {
+  // Backward pass w.r.t. the input: gradInput_n = col2im(weight^T * gradOutput_n). `ones` is not read here.
+  THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight,
+                                 gradColumns, gradInput);
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+  THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+  THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
+  THArgCheck(weight->nDimension == 2 || weight->nDimension == 4, 5, "weight tensor should be 2D or 4D");
+
+  // Params (argument numbers above count `state` as #1, as in updateOutput)
+  int nInputPlane = weight->nDimension == 2 ? weight->size[1]/(kW*kH) : weight->size[1];
+  int nOutputPlane = weight->size[0];
+
+  int freeWeight = 0;
+  if (weight->nDimension == 4) {  // view a 4D weight as nOutputPlane x (nInputPlane*kH*kW); freed below
+    long s1 = weight->size[0];
+    long s2 = weight->size[1] * weight->size[2] * weight->size[3];
+    weight = THCTensor_(newWithStorage2d)(state, weight->storage, weight->storageOffset, s1, -1, s2, -1);
+    freeWeight = 1;
+  }
+
+  int batch = 1;
+  if (input->nDimension == 3) {
+    // Force batch: 3D tensors are temporarily viewed as a batch of one (undone at the end)
+    batch = 0;
+    THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
+    THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+  }
+
+  long inputWidth   = input->size[3];
+  long inputHeight  = input->size[2];
+  long outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
+  long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Resize output
+  THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
+
+  // Resize temporary columns
+  THCTensor_(resize2d)(state, gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+  // Helpers: reusable handles that are pointed at one sample per iteration
+  THCTensor *gradInput_n = THCTensor_(new)(state);
+  THCTensor *gradOutput_n = THCTensor_(new)(state);
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per sample:
+    THCTensor_(select)(state, gradInput_n, gradInput, 0, elt);
+    THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m = nInputPlane*kW*kH;
+    long n = gradColumns->size[1];
+    long k = nOutputPlane;
+
+    // Do GEMM: in row-major terms gradColumns (m x n) = weight^T (m x k) * gradOutput_n (k x n)
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemm(
+    #elif defined(THC_REAL_IS_HALF)
+    THCudaBlas_Hgemm(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemm(
+    #endif
+        state,
+        'n', 't',
+        n, m, k,
+        ScalarConvert<int, real>::to(1),
+        THCTensor_(data)(state, gradOutput_n), n,
+        THCTensor_(data)(state, weight), m,
+        ScalarConvert<int, real>::to(0),  // beta = 0: gradColumns is overwritten for every sample
+        THCTensor_(data)(state, gradColumns), n
+    );
+
+    // Unpack columns back into input:
+    col2im<real, accreal>(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, gradColumns),
+      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+      1, 1, THCTensor_(data)(state, gradInput_n)
+    );
+  }
+
+  // Free
+  THCTensor_(free)(state, gradInput_n);
+  THCTensor_(free)(state, gradOutput_n);
+  if (freeWeight)
+    THCTensor_(free)(state, weight);
+
+  // Resize output: undo the temporary batch-of-one view for 3D input
+  if (batch == 0) {
+    THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
+    THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+    THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth);
+  }
+}
+
+void THNN_(SpatialConvolutionMM_accGradParameters)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradWeight,
+           THCTensor *gradBias,
+           THCTensor *columns,
+           THCTensor *ones,
+           int kW, int kH,
+           int dW, int dH,
+           int padW, int padH,
+           real scale) {
+  // Accumulates scale-weighted parameter gradients into gradWeight and, when non-NULL, gradBias.
+  THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, gradWeight, columns, ones);
+  if (gradBias) {
+   THCUNN_assertSameGPU_generic(state, 2, gradWeight, gradBias);
+  }
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+  THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias");
+  THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+  THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
+  THArgCheck(gradWeight->nDimension == 2 || gradWeight->nDimension == 4, 4, "gradWeight tensor should be 2D or 4D");
+
+  // Params: a 2D gradWeight stores nInputPlane*kH*kW in dim 1, a 4D one stores nInputPlane there
+  int nInputPlane = gradWeight->nDimension == 2 ? gradWeight->size[1]/(kW*kH) : gradWeight->size[1];
+  int nOutputPlane = gradWeight->size[0];
+
+  int freeWeight = 0;
+  if (gradWeight->nDimension == 4) {  // view a 4D gradWeight as nOutputPlane x (nInputPlane*kH*kW); freed below
+    long s1 = gradWeight->size[0];
+    long s2 = gradWeight->size[1] * gradWeight->size[2] * gradWeight->size[3];
+    gradWeight = THCTensor_(newWithStorage2d)(state, gradWeight->storage, gradWeight->storageOffset, s1, -1, s2, -1);
+    freeWeight = 1;
+  }
+
+  int batch = 1;
+  if (input->nDimension == 3) {
+    // Force batch: 3D tensors are temporarily viewed as a batch of one (undone at the end)
+    batch = 0;
+    THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
+    THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+  }
+
+  long inputWidth   = input->size[3];
+  long inputHeight  = input->size[2];
+  long outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
+  long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Define a buffer of ones, for bias accumulation (only regrown when it is too small)
+  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+    // Resize plane and fill with ones...
+    THCTensor_(resize2d)(state, ones, outputHeight, outputWidth);
+    THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+  }
+
+  // Resize temporary columns
+  THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+  // Helpers: reusable handles that are pointed at one sample per iteration
+  THCTensor *input_n = THCTensor_(new)(state);
+  THCTensor *gradOutput_n = THCTensor_(new)(state);
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per output:
+    THCTensor_(select)(state, input_n, input, 0, elt);
+    THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+    // Extract columns (the trailing 1, 1 are presumably unit dilation - TODO confirm against im2col):
+    im2col(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, input_n),
+      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+      1, 1, THCTensor_(data)(state, columns)
+    );
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m = nOutputPlane;
+    long n = nInputPlane*kW*kH;
+    long k = columns->size[1];
+
+    // Do GEMM: in row-major terms gradWeight (m x n) += scale * gradOutput_n (m x k) * columns^T (k x n)
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemm(
+    #elif defined(THC_REAL_IS_HALF)
+    THCudaBlas_Hgemm(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemm(
+    #endif
+        state,
+        't', 'n',
+        n, m, k,
+        scale,
+        THCTensor_(data)(state, columns), k,
+        THCTensor_(data)(state, gradOutput_n), k,
+        ScalarConvert<int, real>::to(1),  // beta = 1: accumulate across samples and across calls
+        THCTensor_(data)(state, gradWeight), n
+    );
+
+    // Do Bias:
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m_ = nOutputPlane;
+    long k_ = outputHeight * outputWidth;
+
+    // Do GEMV: gradBias (m_) += scale * gradOutput_n (m_ x k_) * ones (k_), i.e. a per-plane sum
+    if (gradBias) {
+      #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+      #ifdef THC_REAL_IS_FLOAT
+      THCudaBlas_Sgemv(
+      #elif defined(THC_REAL_IS_DOUBLE)
+      THCudaBlas_Dgemv(
+      #endif
+          state,
+          't',
+          k_, m_,
+          scale,
+          THCTensor_(data)(state, gradOutput_n), k_,
+          THCTensor_(data)(state, ones), 1,
+          ScalarConvert<int, real>::to(1),
+          THCTensor_(data)(state, gradBias), 1
+      );
+      #endif
+      #ifdef THC_REAL_IS_HALF
+      THCudaBlas_Hgemm(  // half precision issues the same product as a GEMM with a single output column
+          state,
+          't', 'n',
+          m_, 1, k_,
+          scale,
+          THCTensor_(data)(state, gradOutput_n), k_,
+          THCTensor_(data)(state, ones), k_,
+          ScalarConvert<int, real>::to(1),
+          THCTensor_(data)(state, gradBias), m_
+      );
+      #endif
+    }
+  }
+
+  // Free
+  THCTensor_(free)(state, input_n);
+  THCTensor_(free)(state, gradOutput_n);
+  if (freeWeight)
+    THCTensor_(free)(state, gradWeight);
+
+  // Resize: undo the temporary batch-of-one view for 3D input
+  if (batch == 0) {
+    THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
+    THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+  }
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialCrossMapLRN.cu b/lib/THCUNN/generic/SpatialCrossMapLRN.cu
new file mode 100644
index 0000000..a09ea0b
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialCrossMapLRN.cu
@@ -0,0 +1,114 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialCrossMapLRN.cu"
+#else
+
+// Forward pass of cross-map local response normalization.
+// Resizes `output` and `scale` to match `input`, fills `scale` with the
+// LRNFillScale kernel, then writes `output` with the LRNComputeOutput kernel.
+// The kernels receive alpha / local_size, k and -beta rather than the raw values.
+void LRNforward(THCState* state, THCTensor* input, THCTensor* output,
+    THCTensor* scale, int local_size, real alpha, real beta, real k)
+{
+  THCTensor_(resizeAs)(state, output, input);
+  THCTensor_(resizeAs)(state, scale, input);
+
+  // A 3D input is handled as a single sample; any other rank is read as
+  // (batch, plane, height, width) without further validation.
+  const bool singleSample = (input->nDimension == 3);
+  const int planeDim = singleSample ? 0 : 1;
+  const int batchSize = singleSample ? 1 : input->size[0];
+  const int planes = input->size[planeDim];
+  const int height = input->size[planeDim + 1];
+  const int width = input->size[planeDim + 2];
+
+  // The kernels are handed raw data pointers, so work on a contiguous handle
+  // of the input; it is released at the end of the function.
+  THCTensor *src = THCTensor_(newContiguous)(state, input);
+
+  // First launch: sized for batchSize * height * width work items.
+  const int nPositions = batchSize * height * width;
+  LRNFillScale<real, accreal> <<<GET_BLOCKS(nPositions), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
+      nPositions, THCTensor_(data)(state, src), batchSize, planes, height, width, local_size,
+      alpha / local_size, k, THCTensor_(data)(state, scale));
+  THCudaCheck(cudaGetLastError());
+
+  // Second launch: the work-item count is additionally multiplied by the plane count.
+  const int nElements = nPositions * planes;
+  LRNComputeOutput<<<GET_BLOCKS(nElements), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
+    nElements, THCTensor_(data)(state, src), THCTensor_(data)(state, scale), -beta, THCTensor_(data)(state, output));
+  THCudaCheck(cudaGetLastError());
+
+  // Drop the contiguous handle taken above.
+  THCTensor_(free)(state, src);
+}
+
+
+// Backward pass of cross-map local response normalization.
+// Resizes `gradInput` to match `input` and fills it with the LRNComputeDiff
+// kernel, which is also handed `output`, `scale` and `gradOutput`.
+// `k` is not read anywhere in this function.
+void LRNbackward(THCState* state, THCTensor* input, THCTensor* output,
+    THCTensor* gradOutput, THCTensor* gradInput, THCTensor* scale,
+    int local_size, real alpha, real beta, real k)
+{
+  THCTensor_(resizeAs)(state, gradInput, input);
+
+  // A 3D input is handled as a single sample; any other rank is read as
+  // (batch, plane, height, width) without further validation.
+  const bool singleSample = (input->nDimension == 3);
+  const int planeDim = singleSample ? 0 : 1;
+  const int batchSize = singleSample ? 1 : input->size[0];
+  const int planes = input->size[planeDim];
+  const int height = input->size[planeDim + 1];
+  const int width = input->size[planeDim + 2];
+
+  // The kernel is handed raw data pointers, so take contiguous handles of
+  // the input and of the incoming gradient; both are released below.
+  THCTensor *src = THCTensor_(newContiguous)(state, input);
+  THCTensor *gradOut = THCTensor_(newContiguous)(state, gradOutput);
+
+  // Single launch sized for batchSize * height * width work items.
+  // The kernel receives -beta and the factor 2 * alpha * beta / local_size
+  // instead of the raw alpha and beta.
+  const int nPositions = batchSize * height * width;
+  LRNComputeDiff<real, accreal> <<<GET_BLOCKS(nPositions), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
+      nPositions, THCTensor_(data)(state, src), THCTensor_(data)(state, output),
+      THCTensor_(data)(state, scale), THCTensor_(data)(state, gradOut), batchSize, planes, height, width,
+      local_size, -beta, ScalarConvert<int, real>::to(2) * alpha * beta / local_size,
+      THCTensor_(data)(state, gradInput));
+  THCudaCheck(cudaGetLastError());
+
+  // Drop the contiguous handles taken above.
+  THCTensor_(free)(state, src);
+  THCTensor_(free)(state, gradOut);
+}
+
+void THNN_(SpatialCrossMapLRN_updateOutput)(
+    THCState *state,
+    THCTensor *input,
+    THCTensor *output,
+    THCTensor *scale,
+    int size,  // forwarded as LRNforward's local_size
+    real alpha,
+    real beta,
+    real k)
+{
+  LRNforward(state, input, output, scale, size, alpha, beta, k);  // THNN entry point: pure forwarding, no extra checks
+}
+
+void THNN_(SpatialCrossMapLRN_updateGradInput)(
+    THCState *state,
+    THCTensor *input,
+    THCTensor *gradOutput,
+    THCTensor *gradInput,
+    THCTensor *scale,
+    THCTensor *output,
+    int size,  // forwarded as LRNbackward's local_size
+    real alpha,
+    real beta,
+    real k)
+{
+  LRNbackward(state, input, output, gradOutput, gradInput, scale, size, alpha, beta, k);  // note: LRNbackward takes the tensors in a different order than this entry point
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialDilatedConvolution.cu b/lib/THCUNN/generic/SpatialDilatedConvolution.cu
new file mode 100644
index 0000000..fb5fe51
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialDilatedConvolution.cu
@@ -0,0 +1,402 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialDilatedConvolution.cu"
+#else
+
+void THNN_(SpatialDilatedConvolution_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCTensor *weight,
+           THCTensor *bias,
+           THCTensor *columns,
+           THCTensor *ones,
+           int kW, int kH,
+           int dW, int dH,
+           int padW, int padH,
+           int dilationW, int dilationH) {
+  // Forward pass: for every sample, output_n = bias (broadcast, or zero) + weight * im2col(input_n) with dilation.
+  THCUNN_assertSameGPU_generic(state, 5, input, output, weight, columns, ones);
+  if (bias) {
+    THCUNN_assertSameGPU_generic(state, 2, weight, bias);
+  }
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+  THArgCheck(weight->nDimension == 4, 4, "weight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
+  THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
+  THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+  THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
+  THArgCheck(dilationW > 0 && dilationH > 0, 14, "dilation should be greater than 0");
+
+  // Params:
+  int nInputPlane = weight->size[1];
+  int nOutputPlane = weight->size[0];
+
+  int batch = 1;
+  if (input->nDimension == 3) {
+    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
+    // Force batch: a 3D input is temporarily viewed as a batch of one (undone at the end)
+    batch = 0;
+    THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
+  } else {
+    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
+  }
+
+  long inputWidth   = input->size[3];
+  long inputHeight  = input->size[2];
+  long outputWidth  = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  if (outputWidth < 1 || outputHeight < 1)  // plane counts are int, spatial sizes are long: %d vs %ld
+    THError("Given input size: (%dx%ldx%ld). Calculated output size: (%dx%ldx%ld). Output size is too small",
+        nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Resize output
+  THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth);
+
+  // Resize temporary columns
+  THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+  // Define a buffer of ones, for bias accumulation
+  // Note: this buffer can be shared with other modules, it only ever gets increased,
+  // and always contains ones.
+  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+    // Resize plane and fill with ones...
+    THCTensor_(resize2d)(state, ones, outputHeight, outputWidth);
+    THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+  }
+
+  // Helpers: reusable handles that are pointed at one sample per iteration
+  THCTensor *input_n = THCTensor_(new)(state);
+  THCTensor *output_n = THCTensor_(new)(state);
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per output:
+    THCTensor_(select)(state, input_n, input, 0, elt);
+    THCTensor_(select)(state, output_n, output, 0, elt);
+
+    // Do Bias first:
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m_ = nOutputPlane;
+    long n_ = outputHeight * outputWidth;
+    long k_ = 1;
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    if (bias) {
+      #ifdef THC_REAL_IS_FLOAT
+      THCudaBlas_Sgemm(
+      #elif defined(THC_REAL_IS_HALF)
+      THCudaBlas_Hgemm(
+      #elif defined(THC_REAL_IS_DOUBLE)
+      THCudaBlas_Dgemm(
+      #endif
+          state,
+          't', 'n',
+          n_, m_, k_,
+          ScalarConvert<int, real>::to(1),
+          THCTensor_(data)(state, ones), k_,
+          THCTensor_(data)(state, bias), k_,
+          ScalarConvert<int, real>::to(0),  // beta = 0: output_n is overwritten with bias * ones
+          THCTensor_(data)(state, output_n), n_
+      );
+    } else {
+      THCTensor_(zero)(state, output_n);
+    }
+
+    // Extract columns:
+    im2col(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, input_n),
+      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+      dilationH, dilationW,
+      THCTensor_(data)(state, columns)
+    );
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m = nOutputPlane;
+    long n = columns->size[1];
+    long k = nInputPlane*kH*kW;
+
+    // Do GEMM: in row-major terms output_n (m x n) += weight (m x k) * columns (k x n)
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemm(
+    #elif defined(THC_REAL_IS_HALF)
+    THCudaBlas_Hgemm(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemm(
+    #endif
+        state,
+        'n', 'n',
+        n, m, k,
+        ScalarConvert<int, real>::to(1),
+        THCTensor_(data)(state, columns), n,
+        THCTensor_(data)(state, weight), k,
+        ScalarConvert<int, real>::to(1),  // beta = 1: accumulate on top of the bias (or zero) written above
+        THCTensor_(data)(state, output_n), n
+    );
+  }
+
+  // Free
+  THCTensor_(free)(state, input_n);
+  THCTensor_(free)(state, output_n);
+
+  // Resize output: undo the temporary batch-of-one view for 3D input
+  if (batch == 0) {
+    THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth);
+    THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+  }
+}
+
+void THNN_(SpatialDilatedConvolution_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *weight,
+           THCTensor *gradColumns,
+           int kW, int kH,
+           int dW, int dH,
+           int padW, int padH,
+           int dilationW, int dilationH) {
+  // Backward pass w.r.t. the input: gradInput_n = col2im(weight^T * gradOutput_n) with dilation.
+  THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight,
+                                 gradColumns, gradInput);
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+  THArgCheck(weight->nDimension == 4, 5, "weight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
+  THArgCheck(kW > 0 && kH > 0, 7, "kernel size should be greater than zero");
+  THArgCheck(dW > 0 && dH > 0, 9, "stride should be greater than zero");
+  THArgCheck(dilationW > 0 && dilationH > 0, 13, "dilation should be greater than 0");
+  // Params (argument numbers above count `state` as #1, as in updateOutput)
+  int nInputPlane = weight->size[1];
+  int nOutputPlane = weight->size[0];
+
+  int batch = 1;
+  if (input->nDimension == 3) {
+    // Force batch: 3D tensors are temporarily viewed as a batch of one (undone at the end)
+    batch = 0;
+    THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
+    THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+  }
+
+  long inputWidth   = input->size[3];
+  long inputHeight  = input->size[2];
+  long outputWidth  = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Resize output
+  THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
+
+  // Resize temporary columns
+  THCTensor_(resize2d)(state, gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+  // Helpers: reusable handles that are pointed at one sample per iteration
+  THCTensor *gradInput_n = THCTensor_(new)(state);
+  THCTensor *gradOutput_n = THCTensor_(new)(state);
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per sample:
+    THCTensor_(select)(state, gradInput_n, gradInput, 0, elt);
+    THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m = nInputPlane*kW*kH;
+    long n = gradColumns->size[1];
+    long k = nOutputPlane;
+
+    // Do GEMM: in row-major terms gradColumns (m x n) = weight^T (m x k) * gradOutput_n (k x n)
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemm(
+    #elif defined(THC_REAL_IS_HALF)
+    THCudaBlas_Hgemm(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemm(
+    #endif
+        state,
+        'n', 't',
+        n, m, k,
+        ScalarConvert<int, real>::to(1),
+        THCTensor_(data)(state, gradOutput_n), n,
+        THCTensor_(data)(state, weight), m,
+        ScalarConvert<int, real>::to(0),  // beta = 0: gradColumns is overwritten for every sample
+        THCTensor_(data)(state, gradColumns), n
+    );
+
+    // Unpack columns back into input:
+    col2im<real, accreal>(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, gradColumns),
+      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+      dilationH, dilationW,
+      THCTensor_(data)(state, gradInput_n)
+    );
+  }
+
+  // Free
+  THCTensor_(free)(state, gradInput_n);
+  THCTensor_(free)(state, gradOutput_n);
+
+  // Resize output: undo the temporary batch-of-one view for 3D input
+  if (batch == 0) {
+    THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
+    THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+    THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth);
+  }
+}
+
+void THNN_(SpatialDilatedConvolution_accGradParameters)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradWeight,
+           THCTensor *gradBias,
+           THCTensor *columns,
+           THCTensor *ones,
+           int kW, int kH,
+           int dW, int dH,
+           int padW, int padH,
+           int dilationW, int dilationH,
+           real scale) {
+  // Accumulates scale-weighted parameter gradients into gradWeight and, when non-NULL, gradBias.
+  THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, gradWeight, columns, ones);
+  if (gradBias) {
+   THCUNN_assertSameGPU_generic(state, 2, gradWeight, gradBias);
+  }
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+  THArgCheck(gradWeight->nDimension == 4, 4, "gradWeight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
+  THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias");
+  THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+  THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
+  THArgCheck(dilationW > 0 && dilationH > 0, 14, "dilation should be greater than 0");
+  // Params
+  int nInputPlane = gradWeight->size[1];
+  int nOutputPlane = gradWeight->size[0];
+
+  int batch = 1;
+  if (input->nDimension == 3) {
+    // Force batch: 3D tensors are temporarily viewed as a batch of one (undone at the end)
+    batch = 0;
+    THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
+    THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+  }
+
+  long inputWidth   = input->size[3];
+  long inputHeight  = input->size[2];
+  long outputWidth  = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Define a buffer of ones, for bias accumulation (only regrown when it is too small)
+  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+    // Resize plane and fill with ones...
+    THCTensor_(resize2d)(state, ones, outputHeight, outputWidth);
+    THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+  }
+
+  // Resize temporary columns
+  THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+  // Helpers: reusable handles that are pointed at one sample per iteration
+  THCTensor *input_n = THCTensor_(new)(state);
+  THCTensor *gradOutput_n = THCTensor_(new)(state);
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per output:
+    THCTensor_(select)(state, input_n, input, 0, elt);
+    THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+    // Extract columns:
+    im2col(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, input_n),
+      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+      dilationH, dilationW,
+      THCTensor_(data)(state, columns)
+    );
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m = nOutputPlane;
+    long n = nInputPlane*kW*kH;
+    long k = columns->size[1];
+
+    // Do GEMM: in row-major terms gradWeight (m x n) += scale * gradOutput_n (m x k) * columns^T (k x n)
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemm(
+    #elif defined(THC_REAL_IS_HALF)
+    THCudaBlas_Hgemm(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemm(
+    #endif
+        state,
+        't', 'n',
+        n, m, k,
+        scale,
+        THCTensor_(data)(state, columns), k,
+        THCTensor_(data)(state, gradOutput_n), k,
+        ScalarConvert<int, real>::to(1),  // beta = 1: accumulate across samples and across calls
+        THCTensor_(data)(state, gradWeight), n
+    );
+
+    // Do Bias:
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m_ = nOutputPlane;
+    long k_ = outputHeight * outputWidth;
+
+    // Do GEMV: gradBias (m_) += scale * gradOutput_n (m_ x k_) * ones (k_), i.e. a per-plane sum
+    if (gradBias) {
+      #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+      #ifdef THC_REAL_IS_FLOAT
+      THCudaBlas_Sgemv(
+      #elif defined(THC_REAL_IS_DOUBLE)
+      THCudaBlas_Dgemv(
+      #endif
+          state,
+          't',
+          k_, m_,
+          scale,
+          THCTensor_(data)(state, gradOutput_n), k_,
+          THCTensor_(data)(state, ones), 1,
+          ScalarConvert<int, real>::to(1),
+          THCTensor_(data)(state, gradBias), 1
+      );
+      #endif
+      #ifdef THC_REAL_IS_HALF
+      THCudaBlas_Hgemm(  // half precision issues the same product as a GEMM with a single output column
+          state,
+          't', 'n',
+          m_, 1, k_,
+          scale,
+          THCTensor_(data)(state, gradOutput_n), k_,
+          THCTensor_(data)(state, ones), k_,
+          ScalarConvert<int, real>::to(1),
+          THCTensor_(data)(state, gradBias), m_
+      );
+      #endif
+    }
+  }
+
+  // Free
+  THCTensor_(free)(state, input_n);
+  THCTensor_(free)(state, gradOutput_n);
+
+  // Resize: undo the temporary batch-of-one view for 3D input
+  if (batch == 0) {
+    THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
+    THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+  }
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialDilatedMaxPooling.cu b/lib/THCUNN/generic/SpatialDilatedMaxPooling.cu
new file mode 100644
index 0000000..b99b5e6
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialDilatedMaxPooling.cu
@@ -0,0 +1,155 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialDilatedMaxPooling.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(SpatialDilatedMaxPooling_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCIndexTensor *indices,
+           int kW, int kH,
+           int dW, int dH,
+           int padW, int padH,
+           int dilationW, int dilationH,
+           bool ceil_mode)
+{
+  // Forward pass: sizes output/indices from the pooling geometry, then launches MaxPoolForward.
+  THCUNN_assertSameGPU_generic(state, 3, input, output, indices);
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
+
+  long nInputCols, nInputRows, nInputPlane, batchSize;
+  long nOutputCols, nOutputRows;
+
+  if (input->nDimension == 3) {
+    nInputCols = input->size[2];
+    nInputRows = input->size[1];
+    nInputPlane = input->size[0];
+    batchSize = 1;
+  }
+  else
+  {
+    nInputCols = input->size[3];
+    nInputRows = input->size[2];
+    nInputPlane = input->size[1];
+    batchSize = input->size[0];
+  }
+
+  THArgCheck(nInputCols >= kW - padW && nInputRows >= kH - padH, 2, "input image smaller than kernel size");
+  THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");
+  THArgCheck(dilationW > 0 && dilationH > 0, 11, "dilation should be greater than 0");
+  if(ceil_mode) {
+    nOutputCols = ceil(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1;
+    nOutputRows = ceil(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1;
+  }
+  else {
+    nOutputCols = floor(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1;
+    nOutputRows = floor(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1;
+  }
+  // All six reported sizes are long, so the format uses %ld (a %d would mismatch the vararg type).
+  if (nOutputCols < 1 || nOutputRows < 1)
+    THError("Given input size: (%ldx%ldx%ld). Calculated output size: (%ldx%ldx%ld). Output size is too small",
+            nInputPlane,nInputRows,nInputCols,nInputPlane,nOutputRows,nOutputCols);
+
+  if (padW || padH)
+  {
+    // ensure that the last pooling starts inside the image
+    if ((nOutputRows - 1)*dH >= nInputRows + padH)
+      --nOutputRows;
+    if ((nOutputCols  - 1)*dW >= nInputCols  + padW)
+      --nOutputCols;
+  }
+  // Work on a contiguous version of the input; it is freed before returning.
+  input = THCTensor_(newContiguous)(state, input);
+  real* input_data = THCTensor_(data)(state, input);
+
+  THCTensor_(resize4d)(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols);
+  THCUNN_resizeAs_indices(state, indices, output);
+
+  THCIndex_t* indices_data = THCIndexTensor_(data)(state, indices);
+  real* output_data = THCTensor_(data)(state, output);
+
+  int count = THCTensor_(nElement)(state, output);
+
+  MaxPoolForward<real, accreal> <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
+      (count, input_data,
+      batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+      kH, kW, dH, dW, padH, padW, dilationH, dilationW, output_data, indices_data);
+  THCudaCheck(cudaGetLastError());
+  // A 3D input has no batch dimension: drop the leading size-1 dimension from output again.
+  if(input->nDimension == 3)
+    THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols);
+
+  THCTensor_(free)(state, input);
+}
+
+void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCIndexTensor *indices,
+           int kW, int kH,
+           int dW, int dH,
+           int padW, int padH,
+           int dilationW, int dilationH,
+           bool ceil_mode)
+{
+  // Backward pass: recomputes the output size, then launches MaxPoolBackward into gradInput.
+  THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, indices, gradInput);
+  // Contiguous versions of both tensors; each is freed exactly once before returning.
+  input = THCTensor_(newContiguous)(state, input);
+  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+  long nInputCols, nInputRows, nInputPlane, batchSize;
+  long nOutputCols, nOutputRows;
+
+  if (input->nDimension == 3) {
+    nInputCols = input->size[2];
+    nInputRows = input->size[1];
+    nInputPlane = input->size[0];
+    batchSize = 1;
+  }
+  else
+  {
+    nInputCols = input->size[3];
+    nInputRows = input->size[2];
+    nInputPlane = input->size[1];
+    batchSize = input->size[0];
+  }
+
+  if(ceil_mode) {
+    nOutputCols = ceil(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1;
+    nOutputRows = ceil(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1;
+  }
+  else {
+    nOutputCols = floor(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1;
+    nOutputRows = floor(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1;
+  }
+
+  // All six reported sizes are long, so the format uses %ld (a %d would mismatch the vararg type).
+  if (nOutputCols < 1 || nOutputRows < 1)
+    THError("Given input size: (%ldx%ldx%ld). Calculated output size: (%ldx%ldx%ld). Output size is too small",
+            nInputPlane,nInputRows,nInputCols,nInputPlane,nOutputRows,nOutputCols);
+
+  THCTensor_(resizeAs)(state, gradInput, input);
+
+  int count = THCTensor_(nElement)(state, input);
+
+  // Launch sized by the number of input elements, on the current THC stream.
+  MaxPoolBackward<real, accreal> <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
+      (count,
+      THCTensor_(data)(state, gradOutput),
+      THCIndexTensor_(data)(state, indices),
+      batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+      kH, kW, dH, dW, padH, padW, dilationH, dilationW,
+      THCTensor_(data)(state, gradInput));
+  THCudaCheck(cudaGetLastError());
+
+  // clean
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, gradOutput);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialFractionalMaxPooling.cu b/lib/THCUNN/generic/SpatialFractionalMaxPooling.cu
new file mode 100644
index 0000000..6e7fc99
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialFractionalMaxPooling.cu
@@ -0,0 +1,155 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialFractionalMaxPooling.cu"
+#else
+
+void THNN_(SpatialFractionalMaxPooling_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           int outputW, int outputH,
+           int poolSizeW, int poolSizeH,
+           THCIndexTensor *indices,
+           THCTensor *randomSamples)
+{ // Forward pass: validates sizes, resizes output/indices, then launches the updateOutput kernel.
+  int planeDim = 0;
+  int dimh = 1;
+  int dimw = 2;
+  long numBatch = 1;
+
+  long numInputDims = THCTensor_(nDimension)(state, input);
+  THArgCheck(numInputDims == 3 || numInputDims == 4, 2,
+                "3D or 4D (batch mode) tensor expected");
+  // Batch mode: dimension 0 is the batch, so the plane/height/width indices shift by one.
+  if (numInputDims == 4) {
+    numBatch = THCTensor_(size)(state, input, 0);
+    planeDim++;
+    dimh++;
+    dimw++;
+  }
+
+  /* sizes */
+  long numPlanes = THCTensor_(size)(state, input, planeDim);
+  long inputH = THCTensor_(size)(state, input, dimh);
+  long inputW = THCTensor_(size)(state, input, dimw);
+
+  THArgCheck(outputH + poolSizeH - 1 < inputH, 6,
+                "poolSizeH too large relative to input height");
+  THArgCheck(outputW + poolSizeW - 1 < inputW, 5,
+                "poolSizeW too large relative to input width");
+
+  THCDeviceTensor<real, 4> devInput;
+  THCDeviceTensor<real, 4> devOutput;
+  THCDeviceTensor<THCIndex_t, 4> devIndices;
+  THCDeviceTensor<real, 3> devSamples =
+    toDeviceTensor<real, 3>(state, randomSamples);
+  // 3D tensors are wrapped as 4D device tensors via upcastOuter<4>(), so one launch path serves both.
+  if (numInputDims == 3) {
+    /* resize output */
+    THCTensor_(resize3d)(state, output, numPlanes, outputH, outputW);
+    /* indices will contain the locations for each output point */
+    THCIndexTensor_(resize3d)(state, indices, numPlanes, outputH, outputW);
+
+    devInput = toDeviceTensor<real, 3>(state, input).upcastOuter<4>();
+    devOutput = toDeviceTensor<real, 3>(state, output).upcastOuter<4>();
+    devIndices = toDeviceTensor<THCIndex_t, 3>(state, indices).upcastOuter<4>();
+  } else {
+    THCTensor_(resize4d)(state, output, numBatch, numPlanes, outputH, outputW);
+    /* indices will contain the locations for each output point */
+    THCIndexTensor_(resize4d)(state, indices, numBatch, numPlanes, outputH, outputW);
+
+    devInput = toDeviceTensor<real, 4>(state, input);
+    devOutput = toDeviceTensor<real, 4>(state, output);
+    devIndices = toDeviceTensor<THCIndex_t, 4>(state, indices);
+  }
+
+  // block is limited to 4 warps
+  // grid handles overflow per each plane
+  int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3);
+  dim3 grid(THCCeilDiv(outputPlaneSize, 128),
+            devInput.getSize(1),
+            devInput.getSize(0));
+  dim3 block(outputPlaneSize > 128 ? 128 : outputPlaneSize);
+  // Launch helper: POOL_W is passed to the kernel as a compile-time template argument.
+#define SFMP_UPDATE_OUTPUT(POOL_W)                                      \
+  SpatialFractionalMaxPooling_updateOutput<POOL_W, real, accreal>       \
+    <<<grid, block, 0, THCState_getCurrentStream(state)>>>(             \
+      devInput, devOutput, devIndices, devSamples, poolSizeW, poolSizeH);
+  // Wraps the launch in a 'case POOL_W: ...; break' arm for the switch below.
+#define SFMP_UPDATE_OUTPUT_CASE(POOL_W)                 \
+  case POOL_W: SFMP_UPDATE_OUTPUT(POOL_W); break
+
+  switch (poolSizeW) {
+    SFMP_UPDATE_OUTPUT_CASE(2);
+    SFMP_UPDATE_OUTPUT_CASE(3);
+    SFMP_UPDATE_OUTPUT_CASE(4);
+    SFMP_UPDATE_OUTPUT_CASE(5);
+    SFMP_UPDATE_OUTPUT_CASE(6);
+    SFMP_UPDATE_OUTPUT_CASE(7);
+    default:
+      // dynamic pool width: the macro expands to 'case -1:', which shares this default arm
+      SFMP_UPDATE_OUTPUT_CASE(-1);
+  }
+  THCudaCheck(cudaGetLastError());
+}
+
+void THNN_(SpatialFractionalMaxPooling_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           int outputW, int outputH,
+           int poolSizeW, int poolSizeH,
+           THCIndexTensor *indices)
+{ // Backward pass: zeroes gradInput, then launches the updateGradInput kernel with gradOutput and indices.
+  int dimh = 1;
+  int dimw = 2;
+
+  long numInputDims = THCTensor_(nDimension)(state, input);
+  if (numInputDims == 4) {
+    dimh++;
+    dimw++;
+  }
+
+  /* sizes */
+  long inputH = THCTensor_(size)(state, input, dimh);  // NOTE(review): inputH/inputW are not read again in this function
+  long inputW = THCTensor_(size)(state, input, dimw);
+
+  THArgCheck(outputH == THCTensor_(size)(state, gradOutput, dimh), 3,
+                "gradOutput height unexpected");
+  THArgCheck(outputW == THCTensor_(size)(state, gradOutput, dimw), 3,
+                "gradOutput width unexpected");
+
+  /* resize */
+  THCTensor_(resizeAs)(state, gradInput, input);
+  THCTensor_(zero)(state, gradInput);
+
+  THCDeviceTensor<real, 4> devGradInput;
+  THCDeviceTensor<real, 4> devGradOutput;
+  THCDeviceTensor<THCIndex_t, 4> devIndices;
+
+  /* backprop: 3D tensors are wrapped as 4D device tensors via upcastOuter<4>() */
+  if (numInputDims == 3) {
+    devGradInput = toDeviceTensor<real, 3>(state, gradInput).upcastOuter<4>();
+    devGradOutput = toDeviceTensor<real, 3>(state, gradOutput).upcastOuter<4>();
+    devIndices = toDeviceTensor<THCIndex_t, 3>(state, indices).upcastOuter<4>();
+  } else {
+    devGradInput = toDeviceTensor<real, 4>(state, gradInput);
+    devGradOutput = toDeviceTensor<real, 4>(state, gradOutput);
+    devIndices = toDeviceTensor<THCIndex_t, 4>(state, indices);
+  }
+
+  // block is limited to 4 warps
+  // grid handles overflow per each plane
+  int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3);
+  dim3 grid(THCCeilDiv(outputPlaneSize, 128),
+            devGradInput.getSize(1),
+            devGradInput.getSize(0));
+  dim3 block(outputPlaneSize > 128 ? 128 : outputPlaneSize);
+
+  SpatialFractionalMaxPooling_updateGradInput
+    <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+      devGradInput, devGradOutput, devIndices);
+  THCudaCheck(cudaGetLastError());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialFullConvolution.cu b/lib/THCUNN/generic/SpatialFullConvolution.cu
new file mode 100644
index 0000000..7f0ebc7
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialFullConvolution.cu
@@ -0,0 +1,381 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialFullConvolution.cu"
+#else
+
+void THNN_(SpatialFullConvolution_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCTensor *weight,
+           THCTensor *bias,
+           THCTensor *columns,
+           THCTensor *ones,
+           int kW, int kH,
+           int dW, int dH,
+           int padW, int padH,
+           int adjW, int adjH)
+{
+  // Forward pass: per batch element, GEMM(input_n, weight) -> columns, col2im(columns) -> output_n, then optional bias GEMM.
+  int nInputPlane = THCTensor_(size)(state, weight, 0);
+  int nOutputPlane = THCTensor_(size)(state, weight, 1);
+
+  THCUNN_assertSameGPU_generic(state, 6, input, output, weight,
+                                 bias, columns, ones);
+
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+
+  int batch = 1;
+  if (input->nDimension == 3) {
+    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
+    // Force batch: resize the 3D input in place to a batch of one (undone at the end of the function)
+    batch = 0;
+    THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
+  } else {
+    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
+  }
+
+  long inputWidth   = input->size[3];
+  long inputHeight  = input->size[2];
+  long outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+
+  // Batch size
+  long batchSize = input->size[0];
+
+  // Resize output
+  THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth);
+
+  // Resize temporary columns
+  THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
+
+  // Define a buffer of ones, for bias accumulation
+  // Note: this buffer can be shared with other modules, it only ever gets increased,
+  // and always contains ones.
+  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+    // Resize plane and fill with ones...
+    THCTensor_(resize2d)(state, ones, outputHeight, outputWidth);
+    THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+  }
+
+  // Helpers: tensor headers that select() points at one batch element per iteration
+  THCTensor *input_n = THCTensor_(new)(state);
+  THCTensor *output_n = THCTensor_(new)(state);
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per output:
+    THCTensor_(select)(state, input_n, input, 0, elt);
+    THCTensor_(select)(state, output_n, output, 0, elt);
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m = weight->size[1] * weight->size[2] * weight->size[3];
+    long n = columns->size[1];
+    long k = weight->size[0];
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemm(
+    #elif defined(THC_REAL_IS_HALF)
+    THCudaBlas_Hgemm(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemm(
+    #endif
+        state,
+        'n', 't',
+        n, m, k,
+        ScalarConvert<int, real>::to(1),
+        THCTensor_(data)(state, input_n), n,
+        THCTensor_(data)(state, weight), m,
+        ScalarConvert<int, real>::to(0),
+        THCTensor_(data)(state, columns), n
+    );
+
+    // Unpack columns into this element's output (output_n):
+    col2im<real, accreal>(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, columns),
+      nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
+      1, 1, THCTensor_(data)(state, output_n)
+    );
+
+    // Do Bias after:
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m_ = nOutputPlane;
+    long n_ = outputHeight * outputWidth;
+    long k_ = 1;
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    // beta is 1 here, so the ones*bias product is added onto what col2im wrote to output_n.
+    if (bias) {
+      #ifdef THC_REAL_IS_FLOAT
+      THCudaBlas_Sgemm(
+      #elif defined(THC_REAL_IS_HALF)
+      THCudaBlas_Hgemm(
+      #elif defined(THC_REAL_IS_DOUBLE)
+      THCudaBlas_Dgemm(
+      #endif
+          state,
+          't', 'n',
+          n_, m_, k_,
+          ScalarConvert<int, real>::to(1),
+          THCTensor_(data)(state, ones), k_,
+          THCTensor_(data)(state, bias), k_,
+          ScalarConvert<int, real>::to(1),
+          THCTensor_(data)(state, output_n), n_
+      );
+    }
+
+  }
+
+  // Free
+  THCTensor_(free)(state, input_n);
+  THCTensor_(free)(state, output_n);
+
+  // Non-batch mode: restore the 3D shapes of output and input
+  if (batch == 0) {
+    THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth);
+    THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+  }
+}
+
+void THNN_(SpatialFullConvolution_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *weight,
+           THCTensor *gradColumns,
+           int kW, int kH,
+           int dW, int dH,
+           int padW, int padH,
+           int adjW, int adjH)
+{ // Backward pass w.r.t. input: per batch element, im2col(gradOutput_n) -> gradColumns, then GEMM with weight -> gradInput_n.
+  int nInputPlane = THCTensor_(size)(state, weight, 0);
+  int nOutputPlane = THCTensor_(size)(state, weight, 1);
+
+  THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight,
+                                 gradColumns, gradInput);
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+
+  int batch = 1;
+  if (input->nDimension == 3) {
+    // Force batch: resize input and gradOutput in place to a batch of one (undone at the end of the function)
+    batch = 0;
+    THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
+    THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+  }
+
+  long inputWidth   = input->size[3];
+  long inputHeight  = input->size[2];
+  long outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+
+  // Batch size
+  long batchSize = input->size[0];
+
+  // Resize gradInput
+  THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
+
+  // Resize temporary columns
+  THCTensor_(resize2d)(state, gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth);
+
+  // Helpers: tensor headers that select() points at one batch element per iteration
+  THCTensor *gradInput_n = THCTensor_(new)(state);
+  THCTensor *gradOutput_n = THCTensor_(new)(state);
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per sample:
+    THCTensor_(select)(state, gradInput_n, gradInput, 0, elt);
+    THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+    // Extract columns:
+    im2col(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, gradOutput_n),
+      nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
+      1, 1, THCTensor_(data)(state, gradColumns)
+    );
+
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m = weight->size[0];
+    long n = gradColumns->size[1];
+    long k = weight->size[1] * weight->size[2] * weight->size[3];
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    // beta is 0 here, so gradInput_n is overwritten rather than accumulated into.
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemm(
+    #elif defined(THC_REAL_IS_HALF)
+    THCudaBlas_Hgemm(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemm(
+    #endif
+        state,
+        'n', 'n',
+        n, m, k,
+        ScalarConvert<int, real>::to(1),
+        THCTensor_(data)(state, gradColumns), n,
+        THCTensor_(data)(state, weight), k,
+        ScalarConvert<int, real>::to(0),
+        THCTensor_(data)(state, gradInput_n), n
+    );
+  }
+
+
+  // Free
+  THCTensor_(free)(state, gradInput_n);
+  THCTensor_(free)(state, gradOutput_n);
+
+  // Non-batch mode: restore the 3D shapes of gradOutput, input and gradInput
+  if (batch == 0) {
+    THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
+    THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+    THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth);
+  }
+}
+
+
+void THNN_(SpatialFullConvolution_accGradParameters)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradWeight,
+           THCTensor *gradBias,
+           THCTensor *columns,
+           THCTensor *ones,
+           int kW, int kH,
+           int dW, int dH,
+           int padW, int padH,
+           int adjW, int adjH,
+           real scale)
+{ // Accumulates parameter gradients per batch element: scaled GEMM into gradWeight, optional GEMV/GEMM into gradBias.
+  int nInputPlane = THCTensor_(size)(state, gradWeight, 0);
+  int nOutputPlane = THCTensor_(size)(state, gradWeight, 1);
+
+  THCUNN_assertSameGPU_generic(state, 6, input, gradOutput, gradWeight,
+                                 gradBias, columns, ones);
+
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+
+  int batch = 1;
+  if (input->nDimension == 3) {
+    // Force batch: resize input and gradOutput in place to a batch of one (undone at the end of the function)
+    batch = 0;
+    THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
+    THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+  }
+
+  long inputWidth   = input->size[3];
+  long inputHeight  = input->size[2];
+  long outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+
+  // Batch size
+  long batchSize = input->size[0];
+
+  // Define a buffer of ones, for bias accumulation
+  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+    // Resize plane and fill with ones...
+    THCTensor_(resize2d)(state, ones, outputHeight, outputWidth);
+    THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+  }
+
+  // Resize temporary columns
+  THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
+
+  // Helpers: tensor headers that select() points at one batch element per iteration
+  THCTensor *input_n = THCTensor_(new)(state);
+  THCTensor *gradOutput_n = THCTensor_(new)(state);
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per output:
+    THCTensor_(select)(state, input_n, input, 0, elt);
+    THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+    // Extract columns:
+    im2col(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, gradOutput_n),
+      nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
+      1, 1, THCTensor_(data)(state, columns)
+    );
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long n = columns->size[0];   // nOutputPlane * kh * kw
+    long m = input_n->size[0];   // nInputPlane
+    long k = columns->size[1];   // inputHeight * inputWidth
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    // alpha is 'scale' and beta is 1, so the result is accumulated into gradWeight.
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemm(
+    #elif defined(THC_REAL_IS_HALF)
+    THCudaBlas_Hgemm(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemm(
+    #endif
+        state,
+        't', 'n',
+        n, m, k,
+        scale,
+        THCTensor_(data)(state, columns), k,
+        THCTensor_(data)(state, input_n), k,
+        ScalarConvert<int, real>::to(1),
+        THCTensor_(data)(state, gradWeight), n
+    );
+
+    // Do Bias:
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m_ = nOutputPlane;
+    long k_ = outputHeight * outputWidth;
+
+    // Do GEMV (gemv assumes column-major matrices); the half build calls Hgemm with n == 1 instead
+    if (gradBias) {
+      #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+      #ifdef THC_REAL_IS_FLOAT
+      THCudaBlas_Sgemv(
+      #elif defined(THC_REAL_IS_DOUBLE)
+      THCudaBlas_Dgemv(
+      #endif
+          state,
+          't',
+          k_, m_,
+          scale,
+          THCTensor_(data)(state, gradOutput_n), k_,
+          THCTensor_(data)(state, ones), 1,
+          ScalarConvert<int, real>::to(1),
+          THCTensor_(data)(state, gradBias), 1
+      );
+      #endif
+      #ifdef THC_REAL_IS_HALF
+      THCudaBlas_Hgemm(
+          state,
+          't', 'n',
+          m_, 1, k_,
+          scale,
+          THCTensor_(data)(state, gradOutput_n), k_,
+          THCTensor_(data)(state, ones), k_,
+          ScalarConvert<int, real>::to(1),
+          THCTensor_(data)(state, gradBias), m_
+      );
+      #endif
+    }
+  }
+
+  // Free
+  THCTensor_(free)(state, input_n);
+  THCTensor_(free)(state, gradOutput_n);
+
+  // Non-batch mode: restore the 3D shapes of gradOutput and input
+  if (batch == 0) {
+    THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
+    THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+  }
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialMaxPooling.cu b/lib/THCUNN/generic/SpatialMaxPooling.cu
new file mode 100644
index 0000000..6be838d
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialMaxPooling.cu
@@ -0,0 +1,40 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialMaxPooling.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(SpatialMaxPooling_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCIndexTensor *indices,
+           int kW, int kH,
+           int dW, int dH,
+           int padW, int padH,
+           bool ceil_mode)
+{
+  // Plain max pooling forwards to the dilated variant with a dilation of 1 in both directions.
+  const int dilationW = 1, dilationH = 1;
+  THNN_(SpatialDilatedMaxPooling_updateOutput)(state, input, output, indices, kW, kH,
+    dW, dH, padW, padH, dilationW, dilationH, ceil_mode);
+}
+
+void THNN_(SpatialMaxPooling_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCIndexTensor *indices,
+           int kW, int kH,
+           int dW, int dH,
+           int padW, int padH,
+           bool ceil_mode)
+{
+  // Plain max pooling forwards to the dilated variant with a dilation of 1 in both directions.
+  const int dilationW = 1, dilationH = 1;
+  THNN_(SpatialDilatedMaxPooling_updateGradInput)(state, input, gradOutput, gradInput, indices,
+    kW, kH, dW, dH, padW, padH, dilationW, dilationH, ceil_mode);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialMaxUnpooling.cu b/lib/THCUNN/generic/SpatialMaxUnpooling.cu
new file mode 100644
index 0000000..8b0c15e
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialMaxUnpooling.cu
@@ -0,0 +1,93 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialMaxUnpooling.cu"
+#else
+
+void THNN_(SpatialMaxUnpooling_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCIndexTensor *indices,
+           int owidth, int oheight)
+{
+  // Forward pass: zeroes an (oheight x owidth) output per plane, then launches MaxUnpoolForward with input and indices.
+  THCUNN_assertSameGPU_generic(state, 3, input, output, indices);
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
+  long nInputCols, nInputRows, nInputPlane, batchSize;
+
+  if (input->nDimension == 3) {
+    nInputCols = input->size[2];
+    nInputRows = input->size[1];
+    nInputPlane = input->size[0];
+    batchSize = 1;
+  }
+  else
+  {
+    nInputCols = input->size[3];
+    nInputRows = input->size[2];
+    nInputPlane = input->size[1];
+    batchSize = input->size[0];
+  }
+  // Contiguous versions of input and indices; both are freed before returning.
+  input = THCTensor_(newContiguous)(state, input);
+  indices = THCIndexTensor_(newContiguous)(state, indices);
+  THCTensor_(resize4d)(state, output, batchSize, nInputPlane, oheight, owidth);
+  THCTensor_(zero)(state, output);
+
+  int count = THCTensor_(nElement)(state, input);
+
+  MaxUnpoolForward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
+      (count, THCTensor_(data)(state, input), THCIndexTensor_(data)(state, indices),
+      batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCTensor_(data)(state, output));
+  THCudaCheck(cudaGetLastError());
+
+  if(input->nDimension == 3)
+    THCTensor_(resize3d)(state, output, nInputPlane, oheight, owidth);
+
+  THCTensor_(free)(state, input);
+  THCIndexTensor_(free)(state, indices);
+}
+
+void THNN_(SpatialMaxUnpooling_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCIndexTensor *indices,
+           int owidth, int oheight)
+{
+  // Backward pass: resizes gradInput like input, then launches MaxUnpoolBackward with gradOutput and indices.
+  THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, indices, gradInput);
+  long nInputCols, nInputRows, nInputPlane, batchSize;
+
+  if (input->nDimension == 3) {
+    nInputCols = input->size[2];
+    nInputRows = input->size[1];
+    nInputPlane = input->size[0];
+    batchSize = 1;
+  }
+  else
+  {
+    nInputCols = input->size[3];
+    nInputRows = input->size[2];
+    nInputPlane = input->size[1];
+    batchSize = input->size[0];
+  }
+
+  input = THCTensor_(newContiguous)(state, input);
+  indices = THCIndexTensor_(newContiguous)(state, indices);
+  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+  THCTensor_(resizeAs)(state, gradInput, input);
+
+  int count = THCTensor_(nElement)(state, input);
+  MaxUnpoolBackward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
+      (count, THCTensor_(data)(state, gradOutput), THCIndexTensor_(data)(state, indices),
+      batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCTensor_(data)(state, gradInput));
+  THCudaCheck(cudaGetLastError());
+
+  // clean: release every tensor obtained from newContiguous above
+  THCTensor_(free)(state, input);
+  THCIndexTensor_(free)(state, indices);
+  THCTensor_(free)(state, gradOutput);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialReflectionPadding.cu b/lib/THCUNN/generic/SpatialReflectionPadding.cu
new file mode 100644
index 0000000..e4efa91
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialReflectionPadding.cu
@@ -0,0 +1,110 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialReflectionPadding.cu"
+#else
+
+void THNN_(SpatialReflectionPadding_updateOutput)(THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           int padL, int padR,
+           int padT, int padB) { // Forward pass: resizes output to the padded size and launches the updateOutput kernel.
+  THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2,
+             "input tensor must fit into 32-bit index math");
+  // Dimension indices for the 3D (non-batch) layout; shifted by one below for 4D input.
+  int planeDim = 0;
+  int dimh = 1;
+  int dimw = 2;
+  int numBatch = 1;
+
+  int numInputDims = THCTensor_(nDimension)(state, input);
+  THArgCheck(numInputDims == 3 || numInputDims == 4, 2,
+                "input must be 3 or 4-dimensional");
+
+  if (numInputDims == 4) {
+    numBatch = THCTensor_(size)(state, input, 0);
+    planeDim++;
+    dimh++;
+    dimw++;
+  }
+
+  int numPlanes = THCTensor_(size)(state, input, planeDim);
+  int inputH = THCTensor_(size)(state, input, dimh);
+  int inputW = THCTensor_(size)(state, input, dimw);
+  int outputH = inputH + padT + padB;
+  int outputW  = inputW + padL + padR;
+
+  THCDeviceTensor<real, 4> devInput;
+  THCDeviceTensor<real, 4> devOutput;
+  // 3D tensors are wrapped as 4D device tensors via upcastOuter<4>(), so one launch path serves both.
+  if (numInputDims == 3) {
+    THCTensor_(resize3d)(state, output, numPlanes, outputH, outputW);
+
+    devInput = toDeviceTensor<real, 3>(state, input).upcastOuter<4>();
+    devOutput = toDeviceTensor<real, 3>(state, output).upcastOuter<4>();
+  } else {
+    THCTensor_(resize4d)(state, output, numBatch, numPlanes, outputH, outputW);
+
+    devInput = toDeviceTensor<real, 4>(state, input);
+    devOutput = toDeviceTensor<real, 4>(state, output);
+  }
+  // Grid: x covers one output plane in chunks of 256, y = size of dim 1, z = size of dim 0.
+  int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3);
+  dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
+            devOutput.getSize(1),
+            devOutput.getSize(0));
+  dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
+
+  SpatialReflectionPadding_updateOutput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
+    devInput, devOutput, padT, padB, padL, padR);
+  THCudaCheck(cudaGetLastError());
+}
+
+void THNN_(SpatialReflectionPadding_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           int padL, int padR,
+           int padT, int padB) {
+  // Backward pass: zeroes gradInput (shaped like input) and launches the updateGradInput kernel.
+  THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2,
+                "input tensor must fit into 32-bit index math");
+  THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, gradOutput), 3,
+                "output gradient tensor must fit into 32-bit index math");
+
+  int planeDim = 0;  // NOTE(review): planeDim/dimh/dimw are updated below but not read afterwards in this function
+  int dimh = 1;
+  int dimw = 2;
+
+  int numInputDims = THCTensor_(nDimension)(state, input);
+  if (numInputDims == 4) {
+    planeDim++;
+    dimh++;
+    dimw++;
+  }
+
+  THCTensor_(resizeAs)(state, gradInput, input);
+  THCTensor_(zero)(state, gradInput);
+
+  THCDeviceTensor<real, 4> devGradInput;
+  THCDeviceTensor<real, 4> devGradOutput;
+  // 3D tensors are wrapped as 4D device tensors via upcastOuter<4>(), so one launch path serves both.
+  if (numInputDims == 3) {
+    devGradInput = toDeviceTensor<real, 3>(state, gradInput).upcastOuter<4>();
+    devGradOutput = toDeviceTensor<real, 3>(state, gradOutput).upcastOuter<4>();
+  } else {
+    devGradInput = toDeviceTensor<real, 4>(state, gradInput);
+    devGradOutput = toDeviceTensor<real, 4>(state, gradOutput);
+  }
+  // Grid: x covers one gradOutput plane in chunks of 256, y = size of dim 1, z = size of dim 0.
+  int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3);
+  dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
+            devGradOutput.getSize(1),
+            devGradOutput.getSize(0));
+  dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
+
+  SpatialReflectionPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
+    devGradInput, devGradOutput, padT, padB, padL, padR);
+  THCudaCheck(cudaGetLastError());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialReplicationPadding.cu b/lib/THCUNN/generic/SpatialReplicationPadding.cu
new file mode 100644
index 0000000..cd775d3
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialReplicationPadding.cu
@@ -0,0 +1,111 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialReplicationPadding.cu"
+#else
+
+void THNN_(SpatialReplicationPadding_updateOutput)(  // forward entry point: sizes `output` and launches the padding kernel
+           THCState *state,
+           THCTensor *input,   // must be 3-D or 4-D (checked below)
+           THCTensor *output,  // resized here to the padded shape
+           int padL, int padR,
+           int padT, int padB) {
+  THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2,
+             "input tensor must fit into 32-bit index math");
+
+  int planeDim = 0;  // dimension indices for the 3-D layout; shifted by one below when a batch dimension is present
+  int dimh = 1;
+  int dimw = 2;
+  int numBatch = 1;
+
+  int numInputDims = THCTensor_(nDimension)(state, input);
+  THArgCheck(numInputDims == 3 || numInputDims == 4, 2,
+             "input must be 3 or 4-dimensional");
+
+  if (numInputDims == 4) {
+    numBatch = THCTensor_(size)(state, input, 0);
+    planeDim++;
+    dimh++;
+    dimw++;
+  }
+
+  int numPlanes = THCTensor_(size)(state, input, planeDim);
+  int inputH = THCTensor_(size)(state, input, dimh);
+  int inputW = THCTensor_(size)(state, input, dimw);
+  int outputH = inputH + padT + padB;  // padded height
+  int outputW  = inputW + padL + padR;  // padded width
+
+  THCDeviceTensor<real, 4> devInput;
+  THCDeviceTensor<real, 4> devOutput;
+
+  if (numInputDims == 3) {
+    THCTensor_(resize3d)(state, output, numPlanes, outputH, outputW);
+
+    devInput = toDeviceTensor<real, 3>(state, input).upcastOuter<4>();  // view the 3-D tensors as 4-D for the kernel
+    devOutput = toDeviceTensor<real, 3>(state, output).upcastOuter<4>();
+  } else {
+    THCTensor_(resize4d)(state, output, numBatch, numPlanes, outputH, outputW);
+
+    devInput = toDeviceTensor<real, 4>(state, input);
+    devOutput = toDeviceTensor<real, 4>(state, output);
+  }
+
+  int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3);  // elements in one output plane
+  dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),  // grid.x: chunks of 256 threads covering one plane
+            devOutput.getSize(1),                  // grid.y: size of dim 1 of the 4-D view
+            devOutput.getSize(0));                 // grid.z: size of dim 0 of the 4-D view
+  dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);  // at most 256 threads per block
+
+  SpatialReplicationPadding_updateOutput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
+    devInput, devOutput, padT, padB, padL, padR);  // note: pads are passed top/bottom first, unlike the parameter order
+  THCudaCheck(cudaGetLastError());  // previously missing: a failed launch (e.g. an empty plane giving a 0-thread block) went unreported
+}
+
+void THNN_(SpatialReplicationPadding_updateGradInput)(
+           THCState *state,
+           THCTensor *input,       // used for the index-math check, its dimension count, and as the shape template for gradInput
+           THCTensor *gradOutput,  // passed to the kernel as its second tensor argument
+           THCTensor *gradInput,   // resized like `input`, zeroed, then handed to the kernel
+           int padL, int padR,
+           int padT, int padB) {
+  // Backward entry point: prepares gradInput and launches the SpatialReplicationPadding_updateGradInput kernel on the current stream.
+  THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2,
+                "input tensor must fit into 32-bit index math");
+  THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, gradOutput), 3,
+                "output gradient tensor must fit into 32-bit index math");
+
+  int planeDim = 0;  // NOTE(review): planeDim/dimh/dimw are updated below but never read in this function
+  int dimh = 1;
+  int dimw = 2;
+
+  int numInputDims = THCTensor_(nDimension)(state, input);
+  if (numInputDims == 4) {
+    planeDim++;
+    dimh++;
+    dimw++;
+  }
+
+  THCTensor_(resizeAs)(state, gradInput, input);
+  THCTensor_(zero)(state, gradInput);  // cleared before the kernel runs; presumably the kernel accumulates into it
+
+  THCDeviceTensor<real, 4> devGradInput;
+  THCDeviceTensor<real, 4> devGradOutput;
+
+  if (numInputDims == 3) {  // 3-D tensors are viewed as 4-D via upcastOuter<4>(); any other rank is treated as 4-D
+    devGradInput = toDeviceTensor<real, 3>(state, gradInput).upcastOuter<4>();
+    devGradOutput = toDeviceTensor<real, 3>(state, gradOutput).upcastOuter<4>();
+  } else {
+    devGradInput = toDeviceTensor<real, 4>(state, gradInput);
+    devGradOutput = toDeviceTensor<real, 4>(state, gradOutput);
+  }
+
+  int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3);  // elements in one plane of gradOutput
+  dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),  // grid.x: chunks of 256 threads covering one plane
+            devGradOutput.getSize(1),              // grid.y: size of dim 1 of the 4-D view
+            devGradOutput.getSize(0));             // grid.z: size of dim 0 of the 4-D view
+  dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);  // at most 256 threads per block
+
+  SpatialReplicationPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
+    devGradInput, devGradOutput, padT, padB, padL, padR);  // note: pads are passed top/bottom first, unlike the parameter order
+  THCudaCheck(cudaGetLastError());  // previously missing: a failed launch went unreported
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialSubSampling.cu b/lib/THCUNN/generic/SpatialSubSampling.cu
new file mode 100644
index 0000000..4b5f238
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialSubSampling.cu
@@ -0,0 +1,230 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialSubSampling.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(SpatialSubSampling_updateOutput)(  // forward entry point: resizes `output` and launches the `subsample` kernel
+           THCState *state,
+           THCTensor *input,   // 3-D, or 4-D with a leading batch dimension (checked below)
+           THCTensor *output,  // resized here from the input size, kernel size and stride
+           THCTensor *weight,  // size(0) is taken as the number of input planes
+           THCTensor *bias,
+           int kW, int kH,
+           int dW, int dH)
+{
+  real *weight_data = THCTensor_(data)(state, weight);  // NOTE(review): raw pointers; weight and bias are not made contiguous here
+  real *bias_data = THCTensor_(data)(state, bias);
+  real *output_data;
+  real *input_data;
+
+  int nInputPlane = THCTensor_(size)(state, weight, 0);
+
+  THCUNN_assertSameGPU_generic(state, 4, input, output, weight, bias);
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
+
+  if (input->nDimension == 3) {  // single sample: dims are (plane, row, col)
+    long nInputCols = input->size[2];
+    long nInputRows = input->size[1];
+    long nOutputCols = (nInputCols - kW) / dW + 1;
+    long nOutputRows = (nInputRows - kH) / dH + 1;
+
+    THArgCheck(input->size[0] == nInputPlane, 2, "invalid number of input planes");
+    THArgCheck(nInputCols >= kW && nInputRows >= kH, 2, "input image smaller than kernel size");
+
+    input = THCTensor_(newContiguous)(state, input);  // released by the free() at the end of the function
+    input_data = THCTensor_(data)(state, input);
+
+    THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols);
+    output_data = THCTensor_(data)(state, output);
+
+    // launch geometry: grid is (nInputPlane, yblocks), block is 32x8 threads
+    int yblocks = (int)(16L / nInputPlane);  // 16 / planes, clamped to at least 1 on the next line
+    yblocks = yblocks < 1 ? 1 : yblocks;
+    dim3 blocks(nInputPlane,yblocks);
+    dim3 threads(32,8);
+
+    // launch the subsample kernel on the current stream
+    subsample<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+      input_data, output_data, weight_data, bias_data,
+      nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
+    THCudaCheck(cudaGetLastError());
+  } else {  // batched input: dims are (batch, plane, row, col)
+    long nInputCols = input->size[3];
+    long nInputRows = input->size[2];
+    long nbatch = input->size[0];
+    long nOutputCols = (nInputCols - kW) / dW + 1;
+    long nOutputRows = (nInputRows - kH) / dH + 1;
+
+    THArgCheck(input->size[1] == nInputPlane, 2, "invalid number of input planes");
+    THArgCheck(nInputCols >= kW && nInputRows >= kH, 2, "input image smaller than kernel size");
+
+    input = THCTensor_(newContiguous)(state, input);  // released by the free() at the end of the function
+    input_data = THCTensor_(data)(state, input);
+
+    THCTensor_(resize4d)(state, output, nbatch, nInputPlane, nOutputRows, nOutputCols);
+    output_data = THCTensor_(data)(state, output);
+
+    // launch geometry: grid.x spans every (sample, plane) pair, block is 32x8 threads
+    int yblocks = (int)(16L / nInputPlane);  // 16 / planes, clamped to at least 1 on the next line
+    yblocks = yblocks < 1 ? 1 : yblocks;
+    dim3 blocks(nInputPlane*nbatch,yblocks);
+    dim3 threads(32,8);
+
+    // launch the subsample kernel on the current stream
+    subsample<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+      input_data, output_data, weight_data, bias_data,
+      nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
+    THCudaCheck(cudaGetLastError());
+  }
+
+  // drop the reference obtained from newContiguous in whichever branch ran
+  THCTensor_(free)(state, input);
+
+}
+
+void THNN_(SpatialSubSampling_updateGradInput)(  // backward entry point: zeroes `gradInput` and launches a subgradinput kernel
+           THCState *state,
+           THCTensor *input,       // only its shape is read here (sizes and resizeAs)
+           THCTensor *gradOutput,  // NOTE(review): read through a raw pointer without newContiguous; presumably assumed contiguous
+           THCTensor *gradInput,   // resized like `input` and zeroed before the launch
+           THCTensor *weight,      // size(0) is taken as the number of input planes
+           int kW, int kH,
+           int dW, int dH)
+{
+  THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, weight, gradInput);
+
+  int nInputPlane = THCTensor_(size)(state, weight, 0);
+
+  if (input->nDimension == 3) {  // single sample: dims are (plane, row, col)
+    long nInputCols = input->size[2];
+    long nInputRows = input->size[1];
+
+    real *weight_data = THCTensor_(data)(state, weight);
+    real *gradOutput_data = THCTensor_(data)(state, gradOutput);
+    real *gradInput_data;
+
+    THCTensor_(resizeAs)(state, gradInput, input);
+    THCTensor_(zero)(state, gradInput);
+    gradInput_data = THCTensor_(data)(state, gradInput);
+
+    // launch geometry: grid is (nInputPlane, yblocks), block is 32x8 threads
+    int yblocks = (int)(16L / nInputPlane);  // 16 / planes, clamped to at least 1 on the next line
+    yblocks = yblocks < 1 ? 1 : yblocks;
+    dim3 blocks(nInputPlane,yblocks);
+    dim3 threads(32,8);
+
+    // kernel no larger than the stride on both axes uses the plain kernel, otherwise the atomic one
+    if (kH <= dH && kW <= dW) {  // presumably windows cannot overlap in this case, so no atomics are needed
+      subgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+        gradInput_data, gradOutput_data, weight_data,
+        nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
+    } else {
+      subgradinputAtomic <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+        gradInput_data, gradOutput_data, weight_data,
+        nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
+    }
+    THCudaCheck(cudaGetLastError());
+  } else {  // any other rank is handled as batched (batch, plane, row, col); no rank check is made here
+    long nInputCols = input->size[3];
+    long nInputRows = input->size[2];
+    long nbatch = input->size[0];
+
+    real *weight_data = THCTensor_(data)(state, weight);
+    real *gradOutput_data = THCTensor_(data)(state, gradOutput);
+    real *gradInput_data;
+
+    THCTensor_(resizeAs)(state, gradInput, input);
+    THCTensor_(zero)(state, gradInput);
+    gradInput_data = THCTensor_(data)(state, gradInput);
+
+    // launch geometry: grid.x spans every (sample, plane) pair, block is 32x8 threads
+    int yblocks = (int)(16L / nInputPlane);  // 16 / planes, clamped to at least 1 on the next line
+    yblocks = yblocks < 1 ? 1 : yblocks;
+    dim3 blocks(nInputPlane*nbatch,yblocks);
+    dim3 threads(32,8);
+
+    // kernel no larger than the stride on both axes uses the plain kernel, otherwise the atomic one
+    if (kH <= dH && kW <= dW) {  // presumably windows cannot overlap in this case, so no atomics are needed
+      subgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+        gradInput_data, gradOutput_data, weight_data,
+        nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
+    } else {
+      subgradinputAtomic <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+        gradInput_data, gradOutput_data, weight_data,
+        nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
+    }
+    THCudaCheck(cudaGetLastError());
+  }
+}
+
+void THNN_(SpatialSubSampling_accGradParameters)(  // launches `subgradweight` to update gradWeight and gradBias
+           THCState *state,
+           THCTensor *input,       // made contiguous before use
+           THCTensor *gradOutput,  // NOTE(review): read through a raw pointer (plus stride[0] when batched) without newContiguous
+           THCTensor *gradWeight,  // size(0) is taken as the number of input planes
+           THCTensor *gradBias,
+           int kW, int kH,
+           int dW, int dH,
+           float scale)            // forwarded unchanged to the kernel; declared float regardless of `real`
+{
+  THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, gradWeight, gradBias);
+
+  int nInputPlane = THCTensor_(size)(state, gradWeight, 0);
+
+  if (input->nDimension == 3) {  // single sample: dims are (plane, row, col)
+    long nInputCols = input->size[2];
+    long nInputRows = input->size[1];
+
+    real *gradWeight_data = THCTensor_(data)(state, gradWeight);
+    real *gradBias_data = THCTensor_(data)(state, gradBias);
+    real *gradOutput_data = THCTensor_(data)(state, gradOutput);
+    real *input_data;
+
+    input = THCTensor_(newContiguous)(state, input);  // released by the free() at the end of the function
+    input_data = THCTensor_(data)(state, input);
+
+    // launch geometry: one block per input plane, 32x8 threads each
+    dim3 blocks(nInputPlane);
+    dim3 threads(32,8);
+
+    // launch the gradweight kernel on the current stream
+    subgradweight<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+      input_data, gradOutput_data, gradWeight_data, gradBias_data,
+      nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale);
+    THCudaCheck(cudaGetLastError());
+  } else {  // any other rank is handled as batched (batch, plane, row, col); no rank check is made here
+    long nInputCols = input->size[3];
+    long nInputRows = input->size[2];
+    long nbatch = input->size[0];
+
+    real *gradWeight_data = THCTensor_(data)(state, gradWeight);
+    real *gradBias_data = THCTensor_(data)(state, gradBias);
+    real *gradOutput_data = THCTensor_(data)(state, gradOutput);
+    real *input_data;
+
+    input = THCTensor_(newContiguous)(state, input);  // released by the free() at the end of the function
+    input_data = THCTensor_(data)(state, input);
+
+    // launch geometry: one block per input plane, 32x8 threads each
+    dim3 blocks(nInputPlane);
+    dim3 threads(32,8);
+
+    // one kernel launch per sample, each offset by that tensor's stride[0]
+    long sl;
+    for (sl=0; sl<nbatch; sl++) {
+      subgradweight<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+        input_data + sl*input->stride[0],
+        gradOutput_data + sl*gradOutput->stride[0],
+        gradWeight_data, gradBias_data,
+        nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale);
+    }
+    THCudaCheck(cudaGetLastError());  // checked once, after all launches have been issued
+  }
+
+  // drop the reference obtained from newContiguous in whichever branch ran
+  THCTensor_(free)(state, input);
+
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialUpSamplingBilinear.cu b/lib/THCUNN/generic/SpatialUpSamplingBilinear.cu
new file mode 100644
index 0000000..e21f269
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialUpSamplingBilinear.cu
@@ -0,0 +1,72 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialUpSamplingBilinear.cu"
+#else
+
+void THNN_(SpatialUpSamplingBilinear_updateOutput)(  // forward entry point: launches caffe_gpu_interp2_kernel into `output`
+           THCState *state,
+           THCTensor *input,    // accessed as a 4-D tensor
+           THCTensor *output,   // accessed as a 4-D tensor; its existing dims 2 and 3 define the target size
+           int outputHeight,    // not read in this function
+           int outputWidth)     // not read in this function
+{
+  input = THCTensor_(newContiguous)(state, input);
+  THCTensor *outputContig = THCTensor_(newContiguous)(state, output);  // keep the caller's tensor so results can be copied back
+  THCUNN_assertSameGPU_generic(state, 2, input, outputContig);
+  THCTensor_(zero)(state, outputContig);
+  THCDeviceTensor<real, 4> idata = toDeviceTensor<real, 4>(state, input);
+  THCDeviceTensor<real, 4> odata = toDeviceTensor<real, 4>(state, outputContig);
+  int height1 = idata.getSize(2);
+  int width1 = idata.getSize(3);
+  int height2 = odata.getSize(2);
+  int width2 = odata.getSize(3);
+  assert( height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0);
+  const accreal rheight= (height2 > 1) ? (accreal)(height1 - 1)/(height2 - 1) : accreal(0);  // input/output step ratio; 0 for a single output row
+  const accreal rwidth = (width2 > 1) ? (accreal)(width1 - 1)/(width2 - 1) : accreal(0);    // same for columns
+  const int num_kernels = height2 * width2;  // one work item per output (row, col) position
+  const int num_threads =
+    THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock;
+  cudaStream_t stream = THCState_getCurrentStream(state);
+  caffe_gpu_interp2_kernel<real, accreal> <<<THCCeilDiv(num_kernels, num_threads), num_threads ,
+   0 , stream>>>(num_kernels, rheight, rwidth, idata, odata);
+  THCudaCheck(cudaGetLastError());
+  THCTensor_(free)(state, input);
+  THCTensor_(freeCopyTo)(state, outputContig, output);  // copies back only if a temporary was made, then releases it; previously that result was discarded
+}
+
+
+void THNN_(SpatialUpSamplingBilinear_updateGradInput)(  // backward entry point: launches caffe_gpu_interp2_kernel_backward
+           THCState *state,
+           THCTensor *gradOutput,  // accessed as a 4-D tensor
+           THCTensor *gradInput,   // accessed as a 4-D tensor; zeroed here, its existing shape is used as-is
+           int nbatch,             // the six size arguments below are not read in this function
+           int nchannels,
+           int inputHeight,
+           int inputWidth,
+           int outputHeight,
+           int outputWidth)
+{
+  THCTensor *gradInputContig = THCTensor_(newContiguous)(state, gradInput);  // keep the caller's tensor so results can be copied back
+  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+  THCUNN_assertSameGPU_generic(state, 2, gradOutput, gradInputContig);
+  THCTensor_(zero)(state, gradInputContig);
+  THCDeviceTensor<real, 4> data1 = toDeviceTensor<real, 4>(state, gradInputContig);
+  THCDeviceTensor<real, 4> data2 = toDeviceTensor<real, 4>(state, gradOutput);
+  int height1 = data1.getSize(2);
+  int width1 = data1.getSize(3);
+  int height2 = data2.getSize(2);
+  int width2 = data2.getSize(3);
+  assert(height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0);
+  const accreal rheight= (height2 > 1) ? (accreal)(height1 - 1)/(height2 - 1) : accreal(0);  // same ratios as the forward pass
+  const accreal rwidth = (width2 > 1) ? (accreal)(width1 - 1) / (width2 - 1) : accreal(0);
+  const int num_kernels = height2 * width2;  // one work item per gradOutput (row, col) position
+  const int num_threads =
+    THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock;
+  cudaStream_t stream = THCState_getCurrentStream(state);
+  caffe_gpu_interp2_kernel_backward<real ,accreal> <<<THCCeilDiv(num_kernels, num_threads),
+  num_threads, 0, stream>>>(num_kernels, rheight, rwidth, data1, data2);
+  THCudaCheck(cudaGetLastError());
+  THCTensor_(freeCopyTo)(state, gradInputContig, gradInput);  // copies back only if a temporary was made, then releases it; previously that result was discarded
+  THCTensor_(free)(state, gradOutput);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialUpSamplingNearest.cu b/lib/THCUNN/generic/SpatialUpSamplingNearest.cu
new file mode 100644
index 0000000..5bc4ff9
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialUpSamplingNearest.cu
@@ -0,0 +1,115 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialUpSamplingNearest.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(SpatialUpSamplingNearest_updateOutput)(  // forward entry point: launches the `upscale` kernel
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,   // not resized here; presumably already sized by the caller
+           int scale_factor)
+{
+  THCTensor_(zero)(state, output);
+
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+
+  input = THCTensor_(newContiguous)(state, input);  // released at the end of the function
+  // Number of output elements: product of the input sizes times scale_factor squared
+  long no_elements = 1;
+  for(int i = 0; i < input->nDimension; i++){
+    no_elements *= input->size[i];
+  }
+  no_elements *= scale_factor * scale_factor;
+
+  int d1;  // d1..d3: the last three output sizes (a leading batch dimension is skipped for non-3-D input)
+  int d2;
+  int d3;
+
+  if (input->nDimension == 3) {
+    d1 = output->size[0];
+    d2 = output->size[1];
+    d3 = output->size[2];
+  } else {
+    d1 = output->size[1];
+    d2 = output->size[2];
+    d3 = output->size[3];
+  }
+
+  real *input_data = THCTensor_(data)(state, input);
+  real *output_data = THCTensor_(data)(state, output);
+
+  // cuda blocks & threads:
+  long nthreads = 256;
+  // Max number of blocks: http://en.wikipedia.org/wiki/CUDA
+  // 65535 for SM 2.x, 2^32 -1 for >= 3.0
+  // TODO: When we move to SM 3.5 we should update this
+  long n_xblocks = min(max((int)((no_elements + nthreads - 1) / nthreads), 1), 65535);  // integer ceil-div: float rounding could under-count blocks above 2^24 elements
+  long n_yblocks = (no_elements + n_xblocks * nthreads - 1) / (n_xblocks * nthreads);  // integer ceil-div, same reason
+  if (n_yblocks > 65535) {
+    THError("Input size is too large!  aborting");
+  }
+  dim3 blocks(n_xblocks, n_yblocks);
+  dim3 threads(nthreads);
+
+  // kernel:
+  upscale<<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (input_data, output_data, no_elements, scale_factor, d1, d2, d3);
+  THCudaCheck(cudaGetLastError());
+
+  // release the reference obtained from newContiguous
+  THCTensor_(free)(state, input);
+}
+
+void THNN_(SpatialUpSamplingNearest_updateGradInput)(  // backward entry point: zeroes `gradInput` and launches the `downscale` kernel
+           THCState *state,
+           THCTensor *input,       // not read in this function
+           THCTensor *gradOutput,  // NOTE(review): read through a raw pointer without newContiguous; presumably assumed contiguous
+           THCTensor *gradInput,   // not resized here; presumably already sized by the caller
+           int scale_factor)
+{
+  THCUNN_assertSameGPU_generic(state, 2, gradOutput, gradInput);
+
+  THCTensor_(zero)(state, gradInput);
+
+  real *gradInput_data = THCTensor_(data)(state, gradInput);
+  real *gradOutput_data = THCTensor_(data)(state, gradOutput);
+
+  long no_elements = 1;  // number of gradInput elements
+  for(int i = 0; i < gradInput->nDimension; i++){
+    no_elements *= gradInput->size[i];
+  }
+
+  int d1;  // d1..d3: the last three gradInput sizes (a leading batch dimension is skipped for non-3-D tensors)
+  int d2;
+  int d3;
+
+  if (gradInput->nDimension == 3) {
+    d1 = gradInput->size[0];
+    d2 = gradInput->size[1];
+    d3 = gradInput->size[2];
+  } else {
+    d1 = gradInput->size[1];
+    d2 = gradInput->size[2];
+    d3 = gradInput->size[3];
+  }
+
+  // cuda blocks & threads:
+  long nthreads = 256;
+  // Max number of blocks: http://en.wikipedia.org/wiki/CUDA
+  // 65535 for SM 2.x, 2^32 -1 for >= 3.0
+  // TODO: When we move to SM 3.5 we should update this
+  long n_xblocks = min(max((int)((no_elements + nthreads - 1) / nthreads), 1), 65535);  // integer ceil-div: float rounding could under-count blocks above 2^24 elements
+  long n_yblocks = (no_elements + n_xblocks * nthreads - 1) / (n_xblocks * nthreads);  // integer ceil-div, same reason
+  if (n_yblocks > 65535) {
+    THError("Input size is too large!  aborting");
+  }
+  dim3 blocks(n_xblocks, n_yblocks);
+  dim3 threads(nthreads);
+
+  // kernel:
+  downscale<real ,accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data, no_elements,
+    scale_factor, d1, d2, d3);
+  THCudaCheck(cudaGetLastError());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/Sqrt.cu b/lib/THCUNN/generic/Sqrt.cu
new file mode 100644
index 0000000..a49798f
--- /dev/null
+++ b/lib/THCUNN/generic/Sqrt.cu
@@ -0,0 +1,30 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/Sqrt.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(Sqrt_updateOutput)(  // forward entry point of the Sqrt module
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,   // resized to match `input`
+           real eps)            // forwarded to the functor's constructor
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+  THCTensor_(resizeAs)(state, output, input);
+  THC_pointwiseApply2(state, output, input, sqrtupdateOutput_functor<real>(eps));  // pointwise apply over (output, input); the functor is defined outside this file
+}
+
+void THNN_(Sqrt_updateGradInput)(  // backward entry point of the Sqrt module
+           THCState *state,
+           THCTensor *input,       // not read in this function
+           THCTensor *gradOutput,
+           THCTensor *gradInput,   // resized to match `output`
+           THCTensor *output)      // the forward result is used here instead of `input`
+{
+  THCUNN_assertSameGPU_generic(state, 3, output, gradOutput, gradInput);
+  THCTensor_(resizeAs)(state, gradInput, output);
+  THC_pointwiseApply3(state, gradInput, output, gradOutput, sqrtupdateGradInput_functor<real>());  // pointwise apply over (gradInput, output, gradOutput)
+}
+
+#endif
diff --git a/lib/THCUNN/generic/Square.cu b/lib/THCUNN/generic/Square.cu
new file mode 100644
index 0000000..06018d7
--- /dev/null
+++ b/lib/THCUNN/generic/Square.cu
@@ -0,0 +1,28 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/Square.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(Square_updateOutput)(  // forward entry point of the Square module
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output)   // resized to match `input`
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+  THCTensor_(resizeAs)(state, output, input);
+  THC_pointwiseApply2(state, output, input, squareupdateOutput_functor<real>());  // pointwise apply over (output, input); the functor is defined outside this file
+}
+
+void THNN_(Square_updateGradInput)(  // backward entry point of the Square module
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput)   // resized to match `input`
+{
+  THCUNN_assertSameGPU_generic(state, 3, input, gradOutput, gradInput);
+  THCTensor_(resizeAs)(state, gradInput, input);
+  THC_pointwiseApply3(state, gradInput, input, gradOutput, squareupdateGradInput_functor<real>());  // pointwise apply over (gradInput, input, gradOutput)
+}
+
+#endif
diff --git a/lib/THCUNN/generic/THCUNN.h b/lib/THCUNN/generic/THCUNN.h
new file mode 100644
index 0000000..91d68ae
--- /dev/null
+++ b/lib/THCUNN/generic/THCUNN.h
@@ -0,0 +1,1171 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCUNN.h"
+#else
+
+TH_API void THNN_(Abs_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output);
+
+TH_API void THNN_(Abs_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput);
+
+TH_API void THNN_(AbsCriterion_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *target,
+                  THCTensor *output,
+                  bool sizeAverage);
+
+TH_API void THNN_(AbsCriterion_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *target,
+                  THCTensor *gradInput,
+                  bool sizeAverage);
+
+TH_API void THNN_(BatchNormalization_updateOutput)(
+                  THCState *state,
+                  THCTensor *input_,
+                  THCTensor *output_,
+                  THCTensor *weight_,        // [OPTIONAL]
+                  THCTensor *bias_,          // [OPTIONAL]
+                  THCTensor *runningMean_,
+                  THCTensor *runningVar_,
+                  THCTensor *saveMean_,
+                  THCTensor *saveStd_,
+                  bool train,
+                  double momentum,
+                  double eps);
+
+TH_API void THNN_(BatchNormalization_backward)(
+                  THCState *state,
+                  THCTensor *input_,
+                  THCTensor *gradOutput_,
+                  THCTensor *gradInput_,        // [OPTIONAL]
+                  THCTensor *gradWeight_,       // [OPTIONAL]
+                  THCTensor *gradBias_,         // [OPTIONAL]
+                  THCTensor *weight_,           // [OPTIONAL]
+                  THCTensor *runningMean_,
+                  THCTensor *runningVar_,
+                  THCTensor *saveMean_,
+                  THCTensor *saveStd_,
+                  bool train,
+                  float scale,
+                  double eps);
+
+TH_API void THNN_(BCECriterion_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *target,
+                  THCTensor *output,
+                  bool sizeAverage,
+                  THCTensor *weights);        // [OPTIONAL]
+
+TH_API void THNN_(BCECriterion_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *target,
+                  THCTensor *gradInput,
+                  bool sizeAverage,
+                  THCTensor *weights);        // [OPTIONAL]
+
+TH_API void THNN_(ClassNLLCriterion_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCIndexTensor *target,
+                  THCTensor *output,
+                  bool sizeAverage,
+                  THCTensor *weights,       // [OPTIONAL]
+                  THCTensor *total_weight);
+
+TH_API void THNN_(ClassNLLCriterion_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCIndexTensor *target,
+                  THCTensor *gradInput,
+                  bool sizeAverage,
+                  THCTensor *weights,       // [OPTIONAL]
+                  THCTensor *total_weight);
+
+TH_API void THNN_(DistKLDivCriterion_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *target,
+                  THCTensor *output,
+                  bool sizeAverage);
+
+TH_API void THNN_(DistKLDivCriterion_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *target,
+                  THCTensor *gradInput,
+                  bool sizeAverage);
+
+TH_API void THNN_(ELU_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  real alpha,
+                  bool inplace);
+
+TH_API void THNN_(ELU_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCTensor *output,
+                  real alpha,
+                  bool inplace);
+
+TH_API void THNN_(HardTanh_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  real min_val,
+                  real max_val,
+                  bool inplace);
+
+TH_API void THNN_(HardTanh_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  real min_val,
+                  real max_val,
+                  bool inplace);
+
+TH_API void THNN_(LeakyReLU_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  real negval,
+                  bool inplace);
+
+TH_API void THNN_(LeakyReLU_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  real negval,
+                  bool inplace);
+
+TH_API void THNN_(LogSigmoid_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  THCTensor *buffer);
+
+TH_API void THNN_(LogSigmoid_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCTensor *buffer);
+
+TH_API void THNN_(LogSoftMax_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output);
+
+TH_API void THNN_(LogSoftMax_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCTensor *output);
+
+TH_API void THNN_(LookupTable_accGradParameters)(
+                  THCState *state,
+                  THCIndexTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradWeight,
+                  THCIndexTensor *count,
+                  THCIndexTensor *sorted,       // [OPTIONAL]
+                  THCIndexTensor *indices,      // [OPTIONAL]
+                  bool scaleGradByFreq,
+                  int paddingValue,
+                  real scale);
+
+TH_API void THNN_(LookupTable_renorm)(
+                  THCState *state,
+                  THCIndexTensor *idx,
+                  THCTensor *weight,
+                  real maxNorm,
+                  real normType);
+
+TH_API void THNN_(L1Cost_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output);
+
+TH_API void THNN_(L1Cost_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,       // [OPTIONAL]
+                  THCTensor *gradInput);
+
+TH_API void THNN_(MarginCriterion_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *target,
+                  THCTensor *output,
+                  bool sizeAverage,
+                  real margin);
+
+TH_API void THNN_(MarginCriterion_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *target,
+                  THCTensor *gradInput,
+                  bool sizeAverage,
+                  real margin);
+
+TH_API void THNN_(MSECriterion_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *target,
+                  THCTensor *output,
+                  bool sizeAverage);
+
+TH_API void THNN_(MSECriterion_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *target,
+                  THCTensor *gradInput,
+                  bool sizeAverage);
+
+TH_API void THNN_(MultiLabelMarginCriterion_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCIndexTensor *target,
+                  THCTensor *output,
+                  THCTensor *istarget,
+                  bool sizeaverage);
+
+TH_API void THNN_(MultiLabelMarginCriterion_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCIndexTensor *target,
+                  THCTensor *gradInput,
+                  THCTensor *istarget,
+                  bool sizeaverage);
+
+TH_API void THNN_(MultiMarginCriterion_updateGradInput)(
+                  THCState *state,          // NOTE(review): this prototype is declared a second time after MultiMarginCriterion_updateOutput below
+                  THCTensor *input,
+                  THCIndexTensor *target,
+                  THCTensor *gradInput,
+                  bool sizeAverage,
+                  int p,
+                  THCTensor *weights,       // [OPTIONAL]
+                  real margin);
+
+TH_API void THNN_(MultiMarginCriterion_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCIndexTensor *target,
+                  THCTensor *output,
+                  bool sizeAverage,
+                  int p,
+                  THCTensor *weights,       // [OPTIONAL]
+                  real margin);
+
+TH_API void THNN_(MultiMarginCriterion_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCIndexTensor *target,
+                  THCTensor *gradInput,
+                  bool sizeAverage,
+                  int p,
+                  THCTensor *weights,       // [OPTIONAL]
+                  real margin);
+
+TH_API void THNN_(PReLU_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  THCTensor *weight,
+                  long nOutputPlane);
+
+TH_API void THNN_(PReLU_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCTensor *weight,
+                  long nOutputPlane);
+
+TH_API void THNN_(PReLU_accGradParameters)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCTensor *weight,
+                  THCTensor *gradWeight,
+                  THCTensor *gradWeightBuf,
+                  THCTensor *gradWeightBuf2,
+                  long nOutputPlane,
+                  real scale);
+
+TH_API void THNN_(SmoothL1Criterion_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *target,
+                  THCTensor *output,
+                  bool sizeAverage);
+
+TH_API void THNN_(SmoothL1Criterion_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *target,
+                  THCTensor *gradInput,
+                  bool sizeAverage);
+
+TH_API void THNN_(SparseLinear_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  THCTensor *weight,
+                  THCTensor *bias);
+
+TH_API void THNN_(SparseLinear_accGradParameters)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradWeight,
+                  THCTensor *gradBias,
+                  THCTensor *weight,
+                  THCTensor *bias,
+                  double weightDecay,
+                  double scale);
+
+TH_API void THNN_(SparseLinear_legacyUpdateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  THCTensor *weight,
+                  THCTensor *bias);
+
+TH_API void THNN_(SparseLinear_legacyAccGradParameters)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradWeight,
+                  THCTensor *gradBias,
+                  THCTensor *weight,
+                  THCTensor *bias,
+                  double weightDecay,
+                  double scale);
+
+TH_API void THNN_(SparseLinear_zeroGradParameters)(
+                  THCState *state,
+                  THCTensor *gradWeight,
+                  THCTensor *gradBias,
+                  THCTensor *lastInput);
+
+TH_API void THNN_(SparseLinear_updateParameters)(
+                  THCState *state,
+                  THCTensor *weight,
+                  THCTensor *bias,
+                  THCTensor *gradWeight,
+                  THCTensor *gradBias,
+                  THCTensor *lastInput,
+                  double learningRate);
+
+TH_API void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  THCIndexTensor *indices,
+                  int nOutputCols,
+                  int nOutputRows);
+
+TH_API void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCIndexTensor *indices);
+
+TH_API void THNN_(SpatialAveragePooling_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  int kW, int kH,
+                  int dW, int dH,
+                  int padW, int padH,
+                  bool ceil_mode,
+                  bool count_include_pad);
+
+TH_API void THNN_(SpatialAveragePooling_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  int kW, int kH,
+                  int dW, int dH,
+                  int padW, int padH,
+                  bool ceil_mode,
+                  bool count_include_pad);
+
+TH_API void THNN_(SpatialClassNLLCriterion_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCIndexTensor *target,
+                  THCTensor *output,
+                  bool sizeAverage,
+                  THCTensor *weights,       // [OPTIONAL]
+                  THCTensor *total_weight);
+
+TH_API void THNN_(SpatialClassNLLCriterion_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCIndexTensor *target,
+                  THCTensor *gradInput,
+                  bool sizeAverage,
+                  THCTensor *weights,       // [OPTIONAL]
+                  THCTensor *total_weight);
+
+TH_API void THNN_(SpatialConvolutionLocal_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  THCTensor *weight,
+                  THCTensor *bias,
+                  THCTensor *finput,
+                  THCTensor *fgradInput,
+                  int kW, int kH,
+                  int dW, int dH,
+                  int padW, int padH,
+                  long inputWidth, long inputHeight,
+                  long outputWidth, long outputHeight);
+
+TH_API void THNN_(SpatialConvolutionLocal_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCTensor *weight,
+                  THCTensor *finput,
+                  THCTensor *fgradInput,
+                  int kW, int kH,
+                  int dW, int dH,
+                  int padW, int padH,
+                  long inputWidth, long inputHeight,
+                  long outputWidth, long outputHeight);
+
+TH_API void THNN_(SpatialConvolutionLocal_accGradParameters)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradWeight,
+                  THCTensor *gradBias,
+                  THCTensor *finput,
+                  THCTensor *fgradInput,
+                  int kW, int kH,
+                  int dW, int dH,
+                  int padW, int padH,
+                  long inputWidth, long inputHeight,
+                  long outputWidth, long outputHeight,
+                  real scale);
+
+TH_API void THNN_(SpatialConvolutionMM_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  THCTensor *weight,
+                  THCTensor *bias,
+                  THCTensor *columns,
+                  THCTensor *ones,
+                  int kW, int kH,
+                  int dW, int dH,
+                  int padW, int padH);
+
+TH_API void THNN_(SpatialConvolutionMM_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCTensor *weight,
+                  THCTensor *gradColumns,
+                  THCTensor *ones,
+                  int kW, int kH,
+                  int dW, int dH,
+                  int padW, int padH);
+
+TH_API void THNN_(SpatialConvolutionMM_accGradParameters)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradWeight,
+                  THCTensor *gradBias,
+                  THCTensor *columns,
+                  THCTensor *ones,
+                  int kW, int kH,
+                  int dW, int dH,
+                  int padW, int padH,
+                  real scale);
+
+TH_API void THNN_(SpatialCrossMapLRN_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  THCTensor *scale,
+                  int size,
+                  real alpha,
+                  real beta,
+                  real k);
+
+TH_API void THNN_(SpatialCrossMapLRN_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCTensor *scale,
+                  THCTensor *output,
+                  int size,
+                  real alpha,
+                  real beta,
+                  real k);
+
+TH_API void THNN_(SpatialDilatedConvolution_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  THCTensor *weight,
+                  THCTensor *bias,            // [OPTIONAL]
+                  THCTensor *columns,
+                  THCTensor *ones,
+                  int kW, int kH,
+                  int dW, int dH,
+                  int padW, int padH,
+                  int dilationW, int dilationH);
+
+TH_API void THNN_(SpatialDilatedConvolution_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCTensor *weight,
+                  THCTensor *gradColumns,
+                  int kW, int kH,
+                  int dW, int dH,
+                  int padW, int padH,
+                  int dilationW, int dilationH);
+
+TH_API void THNN_(SpatialDilatedConvolution_accGradParameters)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradWeight,
+                  THCTensor *gradBias,        // [OPTIONAL]
+                  THCTensor *columns,
+                  THCTensor *ones,
+                  int kW, int kH,
+                  int dW, int dH,
+                  int padW, int padH,
+                  int dilationW, int dilationH,
+                  real scale);
+
+TH_API void THNN_(SpatialDilatedMaxPooling_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  THCIndexTensor *indices,
+                  int kW, int kH,
+                  int dW, int dH,
+                  int padW, int padH,
+                  int dilationW, int dilationH,
+                  bool ceil_mode);
+
+TH_API void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCIndexTensor *indices,
+                  int kW, int kH,
+                  int dW, int dH,
+                  int padW, int padH,
+                  int dilationW, int dilationH,
+                  bool ceil_mode);
+
+TH_API void THNN_(SpatialFractionalMaxPooling_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  int outputW, int outputH,
+                  int poolSizeW, int poolSizeH,
+                  THCIndexTensor *indices,
+                  THCTensor *randomSamples);
+
+TH_API void THNN_(SpatialFractionalMaxPooling_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  int outputW, int outputH,
+                  int poolSizeW, int poolSizeH,
+                  THCIndexTensor *indices);
+
+TH_API void THNN_(SpatialFullConvolution_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  THCTensor *weight,
+                  THCTensor *bias,          // [OPTIONAL]
+                  THCTensor *columns,
+                  THCTensor *ones,
+                  int kW, int kH,
+                  int dW, int dH,
+                  int padW, int padH,
+                  int adjW, int adjH);
+
+TH_API void THNN_(SpatialFullConvolution_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCTensor *weight,
+                  THCTensor *gradColumns,
+                  int kW, int kH,
+                  int dW, int dH,
+                  int padW, int padH,
+                  int adjW, int adjH);
+
+TH_API void THNN_(SpatialFullConvolution_accGradParameters)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradWeight,
+                  THCTensor *gradBias,     // [OPTIONAL]
+                  THCTensor *columns,
+                  THCTensor *ones,
+                  int kW, int kH,
+                  int dW, int dH,
+                  int padW, int padH,
+                  int adjW, int adjH,
+                  real scale);
+
+TH_API void THNN_(SpatialMaxPooling_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  THCIndexTensor *indices,
+                  int kW, int kH,
+                  int dW, int dH,
+                  int padW, int padH,
+                  bool ceil_mode);
+
+TH_API void THNN_(SpatialMaxPooling_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCIndexTensor *indices,
+                  int kW, int kH,
+                  int dW, int dH,
+                  int padW, int padH,
+                  bool ceil_mode);
+
+TH_API void THNN_(SpatialMaxUnpooling_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  THCIndexTensor *indices,
+                  int owidth, int oheight);
+
+TH_API void THNN_(SpatialMaxUnpooling_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCIndexTensor *indices,
+                  int owidth, int oheight);
+
+TH_API void THNN_(SpatialReflectionPadding_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  int padL, int padR,
+                  int padT, int padB);
+
+TH_API void THNN_(SpatialReflectionPadding_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  int padL, int padR,
+                  int padT, int padB);
+
+TH_API void THNN_(SpatialReplicationPadding_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  int padL, int padR,
+                  int padT, int padB);
+
+TH_API void THNN_(SpatialReplicationPadding_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  int padL, int padR,
+                  int padT, int padB);
+
+TH_API void THNN_(SpatialSubSampling_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  THCTensor *weight,
+                  THCTensor *bias,
+                  int kW, int kH,
+                  int dW, int dH);
+
+TH_API void THNN_(SpatialSubSampling_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCTensor *weight,
+                  int kW, int kH,
+                  int dW, int dH);
+
+TH_API void THNN_(SpatialSubSampling_accGradParameters)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradWeight,
+                  THCTensor *gradBias,
+                  int kW, int kH,
+                  int dW, int dH,
+                  float scale);  // NOTE(review): fixed float here, while the convolution, PReLU and TemporalConvolution accGradParameters prototypes in this header take real scale - confirm against the definition before changing
+
+TH_API void THNN_(SpatialUpSamplingBilinear_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  int outputHeight,
+                  int outputWidth);
+
+TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
+                  THCState *state,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  int nbatch,
+                  int nchannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int outputHeight,
+                  int outputWidth);
+
+TH_API void THNN_(SpatialUpSamplingNearest_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  int scale_factor);
+
+TH_API void THNN_(SpatialUpSamplingNearest_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  int scale_factor);
+
+TH_API void THNN_(RReLU_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  THCTensor *noise,
+                  double lower,
+                  double upper,
+                  bool train,
+                  bool inplace,
+                  void *generator);
+
+TH_API void THNN_(RReLU_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCTensor *noise,
+                  double lower,
+                  double upper,
+                  bool train,
+                  bool inplace);
+
+TH_API void THNN_(Sigmoid_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output);
+
+TH_API void THNN_(Sigmoid_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCTensor *output);
+
+TH_API void THNN_(SoftMarginCriterion_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *target,
+                  THCTensor *output,
+                  int sizeAverage);  // NOTE(review): declared int here, while the SmoothL1Criterion, MultiMarginCriterion and SpatialClassNLLCriterion prototypes declare this flag as bool - confirm intended
+
+TH_API void THNN_(SoftMarginCriterion_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *target,
+                  THCTensor *gradInput,
+                  int sizeAverage);  // NOTE(review): declared int here, while the SmoothL1Criterion, MultiMarginCriterion and SpatialClassNLLCriterion prototypes declare this flag as bool - confirm intended
+
+TH_API void THNN_(SoftMax_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output);
+
+TH_API void THNN_(SoftMax_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCTensor *output);
+
+TH_API void THNN_(SoftPlus_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  real beta,
+                  real threshold);
+
+TH_API void THNN_(SoftPlus_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCTensor *output,
+                  real beta,
+                  real threshold);
+
+TH_API void THNN_(SoftShrink_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  real lambda);
+
+TH_API void THNN_(SoftShrink_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  real lambda);
+
+TH_API void THNN_(Square_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output);
+
+TH_API void THNN_(Square_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput);
+
+TH_API void THNN_(Sqrt_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  real eps);
+
+TH_API void THNN_(Sqrt_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCTensor *output);
+
+TH_API void THNN_(Tanh_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output);
+
+TH_API void THNN_(Tanh_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCTensor *output);
+
+TH_API void THNN_(TemporalConvolution_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  THCTensor *weight,
+                  THCTensor *bias,
+                  int kW, int dW,
+                  int inputFrameSize,
+                  int outputFrameSize);
+
+TH_API void THNN_(TemporalConvolution_updateGradInput)(
+                  THCState* state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCTensor *weight,
+                  int kW, int dW);
+
+TH_API void THNN_(TemporalConvolution_accGradParameters)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradWeight,
+                  THCTensor *gradBias,
+                  int kW, int dW,
+                  real scale);
+
+TH_API void THNN_(TemporalMaxPooling_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  THCIndexTensor *indices,
+                  int kW, int dW);
+
+TH_API void THNN_(TemporalMaxPooling_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCIndexTensor *indices,
+                  int kW, int dW);
+
+TH_API void THNN_(Threshold_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  real threshold,
+                  real val,
+                  bool inplace);
+
+TH_API void THNN_(Threshold_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  real threshold,
+                  real val,
+                  bool inplace);
+
+TH_API void THNN_(VolumetricAveragePooling_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  int kT, int kW, int kH,
+                  int dT, int dW, int dH);
+
+TH_API void THNN_(VolumetricAveragePooling_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  int kT, int kW, int kH,
+                  int dT, int dW, int dH);
+
+TH_API void THNN_(VolumetricReplicationPadding_updateOutput)(  // NOTE(review): this prototype is declared a second time, with the same parameter list, at the end of this header after VolumetricReplicationPadding_updateGradInput - one copy is redundant
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  int pleft, int pright,
+                  int ptop, int pbottom,
+                  int pfront, int pback);
+
+TH_API void THNN_(VolumetricConvolution_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  THCTensor *weight,
+                  THCTensor *bias,
+                  THCTensor *finput,
+                  THCTensor *fgradInput,
+                  int dT, int dW, int dH,
+                  int padT, int padW, int padH);
+
+TH_API void THNN_(VolumetricConvolution_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCTensor *weight,
+                  THCTensor *finput,
+                  int dT, int dW, int dH,
+                  int padT, int padW, int padH);
+
+TH_API void THNN_(VolumetricConvolution_accGradParameters)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradWeight,
+                  THCTensor *gradBias,
+                  THCTensor *finput,
+                  THCTensor *fgradInput,
+                  int dT, int dW, int dH,
+                  int padT, int padW, int padH,
+                  real scale);
+
+TH_API void THNN_(VolumetricDilatedConvolution_updateOutput)(
+                  THCState *state,
+                  THCTensor  *input,
+                  THCTensor  *output,
+                  THCTensor  *weight,
+                  THCTensor  *bias,
+                  THCTensor  *columns,
+                  THCTensor  *ones,
+                  int kT, int kW, int kH,
+                  int dT, int dW, int dH,
+                  int padT, int padW, int padH,
+                  int dilationT, int dilationW, int dilationH);
+
+TH_API void THNN_(VolumetricDilatedConvolution_updateGradInput)(
+                  THCState *state,
+                  THCTensor  *input,
+                  THCTensor  *gradOutput,
+                  THCTensor  *gradInput,
+                  THCTensor  *weight,
+                  THCTensor  *gradColumns,
+                  int kT, int kW, int kH,
+                  int dT, int dW, int dH,
+                  int padT, int padW, int padH,
+                  int dilationT, int dilationW, int dilationH);
+
+TH_API void THNN_(VolumetricDilatedConvolution_accGradParameters)(
+                  THCState *state,
+                  THCTensor  *input,
+                  THCTensor  *gradOutput,
+                  THCTensor  *gradWeight,
+                  THCTensor  *gradBias,
+                  THCTensor  *columns,
+                  THCTensor  *ones,
+                  int kT, int kW, int kH,
+                  int dT, int dW, int dH,
+                  int padT, int padW, int padH,
+                  int dilationT, int dilationW, int dilationH,
+                  real scale);
+
+TH_API void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  THCIndexTensor *indices,
+                  int kT, int kW, int kH,
+                  int dT, int dW, int dH,
+                  int padT, int padW, int padH,
+                  int dilationT, int dilationW, int dilationH,
+                  bool ceilMode);
+
+TH_API void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCIndexTensor *indices,
+                  int dT, int dW, int dH,
+                  int padT, int padW, int padH,
+                  int dilationT, int dilationW, int dilationH);
+
+TH_API void THNN_(VolumetricFullConvolution_updateOutput)(
+                  THCState *state,
+                  THCTensor  *input,
+                  THCTensor  *output,
+                  THCTensor  *weight,
+                  THCTensor  *bias,
+                  THCTensor  *finput,
+                  THCTensor  *fgradInput,
+                  int dT, int dW, int dH,
+                  int padT, int padW, int padH,
+                  int adjT, int adjW, int adjH);
+
+TH_API void THNN_(VolumetricFullConvolution_updateGradInput)(
+                  THCState *state,
+                  THCTensor  *input,
+                  THCTensor  *gradOutput,
+                  THCTensor  *gradInput,
+                  THCTensor  *weight,
+                  THCTensor  *finput,
+                  THCTensor  *fgradInput,
+                  int dT, int dW, int dH,
+                  int padT, int padW, int padH,
+                  int adjT, int adjW, int adjH);
+
+TH_API void THNN_(VolumetricFullConvolution_accGradParameters)(
+                  THCState *state,
+                  THCTensor  *input,
+                  THCTensor  *gradOutput,
+                  THCTensor  *gradWeight,
+                  THCTensor  *gradBias,
+                  THCTensor  *finput,
+                  THCTensor  *fgradInput,
+                  int dT, int dW, int dH,
+                  int padT, int padW, int padH,
+                  int adjT, int adjW, int adjH,
+                  real scale);
+
+TH_API void THNN_(VolumetricMaxPooling_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  THCIndexTensor *indices,
+                  int kT, int kW, int kH,
+                  int dT, int dW, int dH,
+                  int padT, int padW, int padH,
+                  bool ceilMode);
+
+TH_API void THNN_(VolumetricMaxPooling_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCIndexTensor *indices,
+                  int dT, int dW, int dH,
+                  int padT, int padW, int padH);
+
+TH_API void THNN_(VolumetricMaxUnpooling_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  THCIndexTensor *indices,
+                  int outputTime, int outputWidth, int outputHeight,
+                  int dT, int dW, int dH,
+                  int padT, int padW, int padH);
+
+TH_API void THNN_(VolumetricMaxUnpooling_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  THCIndexTensor *indices,
+                  int outputTime, int outputWidth, int outputHeight,
+                  int dT, int dW, int dH,
+                  int padT, int padW, int padH);
+
+TH_API void THNN_(VolumetricReplicationPadding_updateGradInput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *gradOutput,
+                  THCTensor *gradInput,
+                  int pleft, int pright,
+                  int ptop, int pbottom,
+                  int pfront, int pback);
+
+TH_API void THNN_(VolumetricReplicationPadding_updateOutput)(
+                  THCState *state,
+                  THCTensor *input,
+                  THCTensor *output,
+                  int pleft, int pright,
+                  int ptop, int pbottom,
+                  int pfront, int pback);
+
+#endif
diff --git a/lib/THCUNN/generic/Tanh.cu b/lib/THCUNN/generic/Tanh.cu
new file mode 100644
index 0000000..3db4ce0
--- /dev/null
+++ b/lib/THCUNN/generic/Tanh.cu
@@ -0,0 +1,29 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/Tanh.cu"
+#else
+
+#include "../common.h"
+
+// Tanh forward: resizes output like input, then runs tanhupdateOutput_functor through THC_pointwiseApply2.
+void THNN_(Tanh_updateOutput)(THCState *state, THCTensor *input, THCTensor *output)
+{
+  // Same-GPU assertion covering both tensors, before anything is resized or launched.
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+  THCTensor_(resizeAs)(state, output, input);
+
+  THC_pointwiseApply2(state, output, input, tanhupdateOutput_functor<real>());
+}
+
+// Tanh backward: resizes gradInput like output, then combines output and gradOutput
+// through tanhupdateGradInput_functor. `input` is accepted but never read here.
+void THNN_(Tanh_updateGradInput)(THCState *state, THCTensor *input, THCTensor *gradOutput,
+                                 THCTensor *gradInput, THCTensor *output)
+{
+  // Only the three tensors actually touched below take part in the same-GPU assertion.
+  THCUNN_assertSameGPU_generic(state, 3, output, gradOutput, gradInput);
+  THCTensor_(resizeAs)(state, gradInput, output);
+
+  THC_pointwiseApply3(state, gradInput, output, gradOutput, tanhupdateGradInput_functor<real>());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/TemporalConvolution.cu b/lib/THCUNN/generic/TemporalConvolution.cu
new file mode 100644
index 0000000..f074cb7
--- /dev/null
+++ b/lib/THCUNN/generic/TemporalConvolution.cu
@@ -0,0 +1,348 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/TemporalConvolution.cu"
+#else
+
+void THNN_(TemporalConvolution_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCTensor *weight,
+           THCTensor *bias,
+           int kW, int dW,
+           int inputFrameSize,
+           int outputFrameSize) {
+  // Forward pass: copies bias into every output frame, then adds inputWindow x weight (transposed) with addmm.
+  THCTensor *outputWindow, *inputWindow;
+  int nInputFrame, nOutputFrame;
+  long k, i;
+
+  int dimS = 0; // sequence dimension
+  int dimF = 1; // feature dimension
+
+  THCUNN_assertSameGPU_generic(state, 4, input, output, weight, bias);
+  THArgCheck( input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");
+
+  if (input->nDimension == 3)  // batch mode: dimension 0 is the batch, so both indices move up by one
+  {
+    dimS = 1;
+    dimF = 2;
+  }
+  THArgCheck( input->size[dimF] == inputFrameSize, 2, "invalid input frame size");
+  THArgCheck( input->size[dimS] >= kW, 2, "input sequence smaller than kernel size");
+
+  input = THCTensor_(newContiguous)(state, input);  // released by the THCTensor_(free) at the end
+  outputWindow = THCTensor_(new)(state);            // scratch tensors re-pointed at sub-ranges below
+  inputWindow = THCTensor_(new)(state);
+
+  nInputFrame = input->size[dimS];
+  nOutputFrame = (nInputFrame - kW) / dW + 1;  // one output frame per kW-wide window taken every dW frames
+
+  if (input->nDimension == 2)
+  {
+    THCTensor_(resize2d)(state, output,
+                          nOutputFrame,
+                          outputFrameSize);
+
+    /* bias first */
+    for(k = 0; k < nOutputFrame; k++)
+    {
+      THCTensor_(select)(state, outputWindow, output, 0, k);
+      THCTensor_(copy)(state, outputWindow, bias);
+    }
+
+    // Pass k covers output frames k, k+outputFrameStride, ...; nOutputFrame counts the frames still to do.
+    /* ouch */
+    for(k = 0; nOutputFrame > 0; k++)
+    {
+      long outputFrameStride = (kW-1)/dW+1;
+      long inputFrameStride = outputFrameStride*dW;  // a multiple of dW that is >= kW when dW > 0
+      long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;  // frames handled by this pass
+      nOutputFrame -= nFrame;
+
+      THCTensor_(setStorage2d)(state, inputWindow, input->storage,
+                              input->storageOffset+k*dW*input->size[1],
+                              nFrame, inputFrameStride*input->size[1],
+                              kW*input->size[1], 1);  // nFrame rows, each spanning kW consecutive input frames
+
+      THCTensor_(setStorage2d)(state, outputWindow, output->storage,
+                              output->storageOffset + k*output->size[1],
+                              nFrame, outputFrameStride*output->size[1],
+                              output->size[1], 1);    // the matching nFrame output frames
+
+      THCTensor_(transpose)(state, weight, NULL, 0, 1);  // transpose weight in place for the product ...
+      THCTensor_(addmm)(state, outputWindow, ScalarConvert<int, real>::to(1), outputWindow, ScalarConvert<int, real>::to(1), inputWindow, weight);
+      THCTensor_(transpose)(state, weight, NULL, 0, 1);  // ... and transpose it back right after
+    }
+  }
+  else
+  {
+    THCTensor *outputSample = THCTensor_(new)(state);
+    THCTensor *inputSample = THCTensor_(new)(state);
+    int nBatchFrame = input->size[0];
+
+    THCTensor_(resize3d)(state, output,
+                          nBatchFrame,
+                          nOutputFrame,
+                          outputFrameSize);
+
+    for(i = 0; i < nBatchFrame; i++)  // same procedure as the 2D branch, applied to each slice of dimension 0
+    {
+      THCTensor_(select)(state, outputSample, output, 0, i);
+      THCTensor_(select)(state, inputSample, input, 0, i);
+      long nOutputSampleFrame = nOutputFrame;  // per-sample countdown; nOutputFrame itself stays intact
+
+      /* bias first */
+      for(k = 0; k < nOutputFrame; k++)
+      {
+        THCTensor_(select)(state, outputWindow, outputSample, 0, k);
+        THCTensor_(copy)(state, outputWindow, bias);
+      }
+
+      /* ouch */
+      for(k = 0; nOutputSampleFrame > 0; k++)
+      {
+        long outputFrameStride = (kW-1)/dW+1;
+        long inputFrameStride = outputFrameStride*dW;
+        long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+        nOutputSampleFrame -= nFrame;
+
+        THCTensor_(setStorage2d)(state, inputWindow, inputSample->storage,
+                                inputSample->storageOffset+k*dW*inputSample->size[1],
+                                nFrame, inputFrameStride*inputSample->size[1],
+                                kW*inputSample->size[1], 1);
+
+        THCTensor_(setStorage2d)(state, outputWindow, outputSample->storage,
+                                outputSample->storageOffset + k*outputSample->size[1],
+                                nFrame, outputFrameStride*outputSample->size[1],
+                                outputSample->size[1], 1);
+
+        THCTensor_(transpose)(state, weight, NULL, 0, 1);
+        THCTensor_(addmm)(state, outputWindow, ScalarConvert<int, real>::to(1), outputWindow, ScalarConvert<int, real>::to(1), inputWindow, weight);
+        THCTensor_(transpose)(state, weight, NULL, 0, 1);
+      }
+    }
+    THCTensor_(free)(state, outputSample);
+    THCTensor_(free)(state, inputSample);
+  }
+
+  THCTensor_(free)(state, outputWindow);
+  THCTensor_(free)(state, inputWindow);
+  THCTensor_(free)(state, input);  // pairs with the newContiguous above
+
+}
+
+// Gradient w.r.t. the input: gradInput is resized like input, zeroed, and then receives
+// gradOutputWindow x weight accumulated (addmm) into strided windows of its storage.
+void THNN_(TemporalConvolution_updateGradInput)(
+           THCState* state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *weight,
+           int kW, int dW) {
+  long nInputFrame;
+  long nOutputFrame;
+  THCTensor *gradOutputWindow;
+  THCTensor *gradInputWindow;
+  long k, i;
+  int dimS = 0; // sequence dimension
+
+  THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, weight, gradInput);
+
+  // The windows below address gradOutput's storage with strides computed from size[1],
+  // which is only right for a contiguous layout; the reference is released at the end.
+  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+  if (gradOutput->nDimension == 3)
+  {
+    dimS = 1;
+  }
+  nInputFrame = input->size[dimS];
+  nOutputFrame = gradOutput->size[dimS];
+
+  /* Not necessary with partial backprop: */
+  gradOutputWindow = THCTensor_(new)(state);
+  gradInputWindow = THCTensor_(new)(state);
+  THCTensor_(resizeAs)(state, gradInput, input);
+  THCTensor_(zero)(state, gradInput);
+
+  if (gradOutput->nDimension == 2)
+  {
+    /* pass k covers output frames k, k+outputFrameStride, ...; nOutputFrame counts what is left */
+    for(k = 0; nOutputFrame > 0; k++)
+    {
+      long outputFrameStride = (kW-1)/dW+1;
+      long inputFrameStride = outputFrameStride*dW;  // >= kW when dW > 0, so rows of one pass never alias
+      long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+      nOutputFrame -= nFrame;
+
+      THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutput->storage,
+                              gradOutput->storageOffset + k*gradOutput->size[1],
+                              nFrame, outputFrameStride*gradOutput->size[1],
+                              gradOutput->size[1], 1);
+
+      THCTensor_(setStorage2d)(state, gradInputWindow, gradInput->storage,
+                              gradInput->storageOffset+k*dW*gradInput->size[1],
+                              nFrame, inputFrameStride*gradInput->size[1],
+                              kW*gradInput->size[1], 1);
+
+      THCTensor_(addmm)(state, gradInputWindow, ScalarConvert<int, real>::to(1), gradInputWindow, ScalarConvert<int, real>::to(1), gradOutputWindow, weight);
+    }
+  }
+  else
+  {
+    THCTensor *gradOutputSample = THCTensor_(new)(state);
+    THCTensor *gradInputSample = THCTensor_(new)(state);
+    long nBatchFrame = input->size[0];
+    for(i = 0; i < nBatchFrame; i++)
+    {
+      THCTensor_(select)(state, gradOutputSample, gradOutput, 0, i);
+      THCTensor_(select)(state, gradInputSample, gradInput, 0, i);
+      long nOutputSampleFrame = nOutputFrame;  // per-sample countdown; nOutputFrame itself stays intact
+
+      /* same pass structure as the 2D branch, applied to one sample */
+      for(k = 0; nOutputSampleFrame > 0; k++)
+      {
+        long outputFrameStride = (kW-1)/dW+1;
+        long inputFrameStride = outputFrameStride*dW;
+        long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+        nOutputSampleFrame -= nFrame;
+
+        THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutputSample->storage,
+                                gradOutputSample->storageOffset + k*gradOutputSample->size[1],
+                                nFrame, outputFrameStride*gradOutputSample->size[1],
+                                gradOutputSample->size[1], 1);
+
+        THCTensor_(setStorage2d)(state, gradInputWindow, gradInputSample->storage,
+                                gradInputSample->storageOffset+k*dW*gradInputSample->size[1],
+                                nFrame, inputFrameStride*gradInputSample->size[1],
+                                kW*gradInputSample->size[1], 1);
+
+        THCTensor_(addmm)(state, gradInputWindow, ScalarConvert<int, real>::to(1), gradInputWindow, ScalarConvert<int, real>::to(1), gradOutputWindow, weight);
+      }
+    }
+    THCTensor_(free)(state, gradOutputSample);
+    THCTensor_(free)(state, gradInputSample);
+  }
+
+  THCTensor_(free)(state, gradOutputWindow);
+  THCTensor_(free)(state, gradInputWindow);
+  THCTensor_(free)(state, gradOutput);  // pairs with the newContiguous above
+}
+
+// Parameter gradients: adds scale * gradOutput frames into gradBias (cadd) and
+// scale * gradOutputWindow (transposed) x inputWindow into gradWeight (addmm).
+void THNN_(TemporalConvolution_accGradParameters)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradWeight,
+           THCTensor *gradBias,
+           int kW, int dW,
+           real scale) {
+  long nInputFrame;
+  long nOutputFrame;
+  THCTensor *gradOutputWindow;
+  THCTensor *inputWindow;
+  long k, i;
+  int dimS = 0; // sequence dimension
+
+  THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, gradWeight, gradBias);
+  if (gradOutput->nDimension == 3)
+  {
+    dimS = 1;
+  }
+  nInputFrame = input->size[dimS];
+  nOutputFrame = gradOutput->size[dimS];
+
+  // Both tensors are addressed through setStorage2d with strides computed from size[1], so both must be contiguous.
+  input = THCTensor_(newContiguous)(state, input);
+  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+  gradOutputWindow = THCTensor_(new)(state);
+  inputWindow = THCTensor_(new)(state);
+
+  if (input->nDimension == 2)
+  {
+    /* bias first */
+    for(k = 0; k < nOutputFrame; k++)
+    {
+      THCTensor_(select)(state, gradOutputWindow, gradOutput, 0, k);
+      THCTensor_(cadd)(state, gradBias, gradBias, scale, gradOutputWindow);
+    }
+
+    /* pass k covers output frames k, k+outputFrameStride, ...; nOutputFrame counts what is left */
+    for(k = 0; nOutputFrame > 0; k++)
+    {
+      long outputFrameStride = (kW-1)/dW+1;
+      long inputFrameStride = outputFrameStride*dW;
+      long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+      nOutputFrame -= nFrame;
+
+      THCTensor_(setStorage2d)(state, inputWindow, input->storage,
+                              input->storageOffset+k*dW*input->size[1],
+                              nFrame, inputFrameStride*input->size[1],
+                              kW*input->size[1], 1);
+
+      THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutput->storage,
+                              gradOutput->storageOffset + k*gradOutput->size[1],
+                              nFrame, outputFrameStride*gradOutput->size[1],
+                              gradOutput->size[1], 1);
+
+      THCTensor_(transpose)(state, gradOutputWindow, NULL, 0, 1);  // transposed in place for the product, restored below
+      THCTensor_(addmm)(state, gradWeight, ScalarConvert<int, real>::to(1), gradWeight, scale, gradOutputWindow, inputWindow);
+      THCTensor_(transpose)(state, gradOutputWindow, NULL, 0, 1);
+    }
+  }
+  else
+  {
+    THCTensor *gradOutputSample = THCTensor_(new)(state);
+    THCTensor *inputSample = THCTensor_(new)(state);
+    long nBatchFrame = input->size[0];
+
+    for(i = 0; i < nBatchFrame; i++)
+    {
+      THCTensor_(select)(state, gradOutputSample, gradOutput, 0, i);
+      THCTensor_(select)(state, inputSample, input, 0, i);
+      long nOutputSampleFrame = nOutputFrame;  // per-sample countdown; nOutputFrame itself stays intact
+
+      /* bias first */
+      for(k = 0; k < nOutputFrame; k++)
+      {
+        THCTensor_(select)(state, gradOutputWindow, gradOutputSample, 0, k);
+        THCTensor_(cadd)(state, gradBias, gradBias, scale, gradOutputWindow);
+      }
+
+      /* same pass structure as the 2D branch, applied to one sample */
+      for(k = 0; nOutputSampleFrame > 0; k++)
+      {
+        long outputFrameStride = (kW-1)/dW+1;
+        long inputFrameStride = outputFrameStride*dW;
+        long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+        nOutputSampleFrame -= nFrame;
+
+        THCTensor_(setStorage2d)(state, inputWindow, inputSample->storage,
+                                inputSample->storageOffset+k*dW*inputSample->size[1],
+                                nFrame, inputFrameStride*inputSample->size[1],
+                                kW*inputSample->size[1], 1);
+
+        THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutputSample->storage,
+                                gradOutputSample->storageOffset + k*gradOutputSample->size[1],
+                                nFrame, outputFrameStride*gradOutputSample->size[1],
+                                gradOutputSample->size[1], 1);
+
+        THCTensor_(transpose)(state, gradOutputWindow, NULL, 0, 1);
+        THCTensor_(addmm)(state, gradWeight, ScalarConvert<int, real>::to(1), gradWeight, scale, gradOutputWindow, inputWindow);
+        THCTensor_(transpose)(state, gradOutputWindow, NULL, 0, 1);
+      }
+    }
+    THCTensor_(free)(state, gradOutputSample);
+    THCTensor_(free)(state, inputSample);
+  }
+
+  THCTensor_(free)(state, gradOutputWindow);
+  THCTensor_(free)(state, inputWindow);
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, gradOutput);  // pairs with the newContiguous above
+}
+
+#endif
diff --git a/lib/THCUNN/generic/TemporalMaxPooling.cu b/lib/THCUNN/generic/TemporalMaxPooling.cu
new file mode 100644
index 0000000..d833c5a
--- /dev/null
+++ b/lib/THCUNN/generic/TemporalMaxPooling.cu
@@ -0,0 +1,151 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/TemporalMaxPooling.cu"
+#else
+
+// Temporal max pooling, forward pass. Accepts a 2D input or a 3D (batch mode) input,
+// resizes output and indices to output_w frames per sequence, and launches
+// cunn_TemporalMaxPooling_updateOutputKernel on the current THC stream.
+void THNN_(TemporalMaxPooling_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCIndexTensor *indices,
+           int kW, int dW) {
+
+  THCUNN_assertSameGPU_generic(state, 3, input, output, indices);
+  THArgCheck( input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");
+
+  // In batch mode dimension 0 indexes the batch, pushing time/feature to 1/2.
+  const bool batchMode = (input->nDimension == 3);
+  const int dimT = batchMode ? 1 : 0; // Temporal dimension
+  const int dimF = batchMode ? 2 : 1; // Feature dimension
+  int batch = 1;
+  if (batchMode)
+  {
+    batch = input->size[0];
+  }
+  THArgCheck( input->size[dimT] >= kW, 2, "input sequence smaller than kernel size");
+
+  // From here on `input` is the tensor returned by newContiguous (freed below).
+  input = THCTensor_(newContiguous)(state, input);
+
+  const int input_w = input->size[dimT];
+  const int input_n = input->size[dimF];
+  const int output_w = (input_w - kW) / dW + 1;
+
+  if (batchMode)
+  {
+    THCTensor_(resize3d)(state, output, batch, output_w, input->size[dimF]);
+    THCIndexTensor_(resize3d)(state, indices, batch, output_w, input->size[dimF]);
+  }
+  else
+  {
+    THCTensor_(resize2d)(state, output, output_w, input->size[dimF]);
+    THCIndexTensor_(resize2d)(state, indices, output_w, input->size[dimF]);
+  }
+
+  // Raw data pointers handed to the kernel.
+  real *input_data = THCTensor_(data)(state, input);
+  real *output_data = THCTensor_(data)(state, output);
+  THCIndex_t *indices_data = THCIndexTensor_(data)(state, indices);
+
+  // Launch geometry: blocks.x is the batch size; the thread count is output_w rounded
+  // up to a multiple of 32.
+  dim3 blocks(batch);
+  int nthreads = (output_w / 32) * 32;
+  if (output_w % 32 > 0) {
+    nthreads += 32;
+  }
+
+  // When that exceeds TEMPORAL_MAX_POOLING_THREADS, cap the block size and spread the
+  // remainder over blocks.y (rounded up).
+  if (nthreads > TEMPORAL_MAX_POOLING_THREADS) {
+    const int fullBlocks = nthreads / TEMPORAL_MAX_POOLING_THREADS;
+    const bool partialBlock = (nthreads % TEMPORAL_MAX_POOLING_THREADS > 0);
+    blocks.y = partialBlock ? fullBlocks + 1 : fullBlocks;
+    nthreads = TEMPORAL_MAX_POOLING_THREADS;
+  }
+  dim3 threads(nthreads);
+
+  cunn_TemporalMaxPooling_updateOutputKernel <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>(
+      input_data, output_data, indices_data, input_w, input_n, output_w, kW, dW);
+  THCudaCheck(cudaGetLastError());
+
+  // Drop the reference taken by newContiguous.
+  THCTensor_(free)(state, input);
+
+}
+
+// Temporal max pooling, backward pass. Zeroes gradInput (resized like input) and launches
+// an updateGradInput kernel with gradInput, gradOutput and indices on the current THC
+// stream; the Atomic kernel variant is selected when kW > dW.
+void THNN_(TemporalMaxPooling_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCIndexTensor *indices,
+           int kW, int dW) {
+
+  THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, gradInput, indices);
+  THArgCheck( input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");
+
+  THCTensor_(resizeAs)(state, gradInput, input);
+  THCTensor_(zero)(state, gradInput);
+
+  // In batch mode dimension 0 indexes the batch, pushing time/feature to 1/2.
+  const bool batchMode = (input->nDimension == 3);
+  const int dimT = batchMode ? 1 : 0; // Temporal dimension
+  const int dimF = batchMode ? 2 : 1; // Feature dimension
+  int batch = 1;
+  if (batchMode)
+  {
+    batch = input->size[0];
+  }
+  THArgCheck( input->size[dimT] >= kW, 2, "input sequence smaller than kernel size");
+
+  // From here on `gradOutput` is the tensor returned by newContiguous (freed below).
+  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+  const int input_w = input->size[dimT];
+  const int input_n = input->size[dimF];
+  const int output_w = (input_w - kW) / dW + 1;
+
+  // Raw data pointers handed to the kernel.
+  real *gradInput_data = THCTensor_(data)(state, gradInput);
+  real *gradOutput_data = THCTensor_(data)(state, gradOutput);
+  THCIndex_t *indices_data = THCIndexTensor_(data)(state, indices);
+
+  // Launch geometry: blocks.x is the batch size; the thread count is output_w rounded
+  // up to a multiple of 32.
+  dim3 blocks(batch);
+  int nthreads = (output_w / 32) * 32;
+  if (output_w % 32 > 0) {
+    nthreads += 32;
+  }
+
+  // When that exceeds TEMPORAL_MAX_POOLING_THREADS, cap the block size and spread the
+  // remainder over blocks.y (rounded up).
+  if (nthreads > TEMPORAL_MAX_POOLING_THREADS) {
+    const int fullBlocks = nthreads / TEMPORAL_MAX_POOLING_THREADS;
+    const bool partialBlock = (nthreads % TEMPORAL_MAX_POOLING_THREADS > 0);
+    blocks.y = partialBlock ? fullBlocks + 1 : fullBlocks;
+    nthreads = TEMPORAL_MAX_POOLING_THREADS;
+  }
+  dim3 threads(nthreads);
+
+  // NOTE(review): presumably pooling windows overlap when kW > dW, hence the atomic kernel -- confirm.
+  if (kW > dW) {
+    cunn_TemporalMaxPooling_updateGradInputKernelAtomic <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>(
+        gradInput_data, gradOutput_data, indices_data, input_w, input_n, output_w, kW, dW);
+  } else {
+    cunn_TemporalMaxPooling_updateGradInputKernel <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>(
+        gradInput_data, gradOutput_data, indices_data, input_w, input_n, output_w, kW, dW);
+  }
+  THCudaCheck(cudaGetLastError());
+
+  THCTensor_(free)(state, gradOutput);
+
+}
+
+#endif
diff --git a/lib/THCUNN/generic/Threshold.cu b/lib/THCUNN/generic/Threshold.cu
new file mode 100644
index 0000000..0ab84c1
--- /dev/null
+++ b/lib/THCUNN/generic/Threshold.cu
@@ -0,0 +1,64 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/Threshold.cu"
+#else
+
+#include "../common.h"
+
+// Threshold forward. Out of place: output is resized like input and filled through
+// ThresholdUpdateOutput. In place: input is rewritten through ThresholdUpdateOutputIP
+// and output is then set() from input.
+void THNN_(Threshold_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           real threshold,
+           real val,
+           bool inplace)
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+
+  if (!inplace)
+  {
+    THCTensor_(resizeAs)(state, output, input);
+    THC_pointwiseApply2(state, output, input, ThresholdUpdateOutput<real>(threshold, val));
+  }
+  else
+  {
+    THC_pointwiseApply1(state, input, ThresholdUpdateOutputIP<real>(threshold, val));
+    THCTensor_(set)(state, output, input);
+  }
+
+  // Check for a CUDA error left behind by whichever path ran.
+  THCudaCheck(cudaGetLastError());
+}
+
+// Threshold backward. Out of place: gradInput is resized like input and computed from
+// input and gradOutput through ThresholdUpdateGradInput. In place: gradOutput itself is
+// rewritten through ThresholdUpdateGradInputIP and gradInput is then set() from it.
+// `val` is part of the signature but is not used by either path.
+void THNN_(Threshold_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           real threshold,
+           real val,
+           bool inplace)
+{
+  THCUNN_assertSameGPU_generic(state, 3, input, gradInput, gradOutput);
+
+  if (!inplace)
+  {
+    THCTensor_(resizeAs)(state, gradInput, input);
+    THC_pointwiseApply3(state, gradInput, input, gradOutput, ThresholdUpdateGradInput<real>(threshold));
+  }
+  else
+  {
+    THC_pointwiseApply2(state, gradOutput, input, ThresholdUpdateGradInputIP<real>(threshold));
+    THCTensor_(set)(state, gradInput, gradOutput);
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/VolumetricAveragePooling.cu b/lib/THCUNN/generic/VolumetricAveragePooling.cu
new file mode 100644
index 0000000..62823a8
--- /dev/null
+++ b/lib/THCUNN/generic/VolumetricAveragePooling.cu
@@ -0,0 +1,237 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/VolumetricAveragePooling.cu"
+#else
+
+// Volumetric average pooling, forward pass, for a 4D input or a 5D (leading batch dimension)
+// input. Resizes output and launches the pooling kernel in chunks of at most 65535 along grid.z.
+void THNN_(VolumetricAveragePooling_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           int kT, int kW, int kH,
+           int dT, int dW, int dH)
+{
+  int batchSize;
+  int inputSlices;
+  int inputTime;
+  int inputHeight;
+  int inputWidth;
+
+  if (THCTensor_(nDimension)(state, input) == 4)
+  {
+    THArgCheck(
+      THCTensor_(size)(state, input, 1) >= kT &&
+      THCTensor_(size)(state, input, 2) >= kH &&
+      THCTensor_(size)(state, input, 3) >= kW, 2,
+      "input image smaller than kernel size"
+    );
+    /* sizes */
+    batchSize   = 1;
+    inputSlices = THCTensor_(size)(state, input, 0);
+    inputTime   = THCTensor_(size)(state, input, 1);
+    inputHeight = THCTensor_(size)(state, input, 2);
+    inputWidth  = THCTensor_(size)(state, input, 3);
+  }
+  else if (THCTensor_(nDimension)(state, input) == 5)
+  {
+    THArgCheck(
+      THCTensor_(size)(state, input, 2) >= kT &&
+      THCTensor_(size)(state, input, 3) >= kH &&
+      THCTensor_(size)(state, input, 4) >= kW, 2,
+      "input image smaller than kernel size"
+    );
+    /* sizes */
+    batchSize   = THCTensor_(size)(state, input, 0);
+    inputSlices = THCTensor_(size)(state, input, 1);
+    inputTime   = THCTensor_(size)(state, input, 2);
+    inputHeight = THCTensor_(size)(state, input, 3);
+    inputWidth  = THCTensor_(size)(state, input, 4);
+  }
+  else
+  {
+    THArgCheck(false, 2, "4D or 5D tensor expected");
+  }
+
+  int outputTime   = (inputTime   - kT) / dT + 1;
+  int outputHeight = (inputHeight - kH) / dH + 1;
+  int outputWidth  = (inputWidth  - kW) / dW + 1;
+
+  if (input->nDimension == 4) /* 4D */
+  {
+    /* resize output */
+    THCTensor_(resize4d)(state, output, inputSlices,
+                         outputTime, outputHeight, outputWidth);
+  }
+  else /* 5D */
+  {
+    THCTensor_(resize5d)(state, output, batchSize, inputSlices,
+                         outputTime, outputHeight, outputWidth);
+  }
+
+  input = THCTensor_(newContiguous)(state, input);  // released by the THCTensor_(free) at the end
+
+  // Collapse batch and feature dimensions
+  THCDeviceTensor<real, 4> cudaInput;
+  THCDeviceTensor<real, 4> cudaOutput;
+  if (THCTensor_(nDimension)(state, input) == 4)
+  {
+    cudaInput  = toDeviceTensor<real, 4>(state, input);
+    cudaOutput = toDeviceTensor<real, 4>(state, output);
+  }
+  else
+  {
+    cudaInput  = toDeviceTensor<real, 5>(state, input).downcastOuter<4>();
+    cudaOutput = toDeviceTensor<real, 5>(state, output).downcastOuter<4>();
+  }
+
+  int totalZ = outputTime * inputSlices * batchSize;  // total extent wanted along grid.z
+  int offsetZ = 0;                                    // z offset of the current chunk, passed to the kernel
+  dim3 block(32, 8);
+  while (totalZ > 0) {
+    dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)),
+              THCCeilDiv(outputHeight, static_cast<int>(block.y)),
+              totalZ > 65535 ? 65535 : totalZ);
+    // NOTE(review): LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH is defined elsewhere; confirm it also launches on the current stream.
+    accreal normFactor = ScalarConvert<int, accreal>::to(1) / static_cast<accreal>(kT * kH * kW);
+    switch (kW)
+      {
+        LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(1);
+        LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(2);
+        LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(3);
+        LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(4);
+        LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(5);
+        LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(6);
+        LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(7);
+      default:
+        cuda_VolumetricAveragePooling_updateOutput<real, accreal><<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+                                                                    cudaInput,
+                                                                    cudaOutput,
+                                                                    kT, kH, kW,
+                                                                    dT, dH, dW,
+                                                                    normFactor,
+                                                                    offsetZ
+                                                                    );
+        break;
+      }
+    totalZ -= 65535;
+    offsetZ += 65535;
+    THCudaCheck(cudaGetLastError());
+  }
+  THCTensor_(free)(state, input);
+}
+
+// Volumetric average pooling, backward pass. Zeroes gradInput (resized like input) and launches one
+// of three kernels (stride-1, atomicAdd, or plain) on the current THC stream, chunked along grid.z.
+void THNN_(VolumetricAveragePooling_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           int kT, int kW, int kH,
+           int dT, int dW, int dH)
+{
+  bool kernelsOverlap = (dT < kT) || (dH < kH) || (dW < kW);
+
+  // Resize and initialize result tensor.
+  THCTensor_(resizeAs)(state, gradInput, input);
+  THCTensor_(zero)(state, gradInput);
+
+  int batchSize;
+  int inputSlices;
+  int inputTime;
+  int inputHeight;
+  int inputWidth;
+  int outputTime;
+  int outputHeight;
+  int outputWidth;
+
+  if (THCTensor_(nDimension)(state, input) == 4) /* 4D */
+  {
+    batchSize = 1;
+    inputSlices  = THCTensor_(size)(state, input, 0);
+    inputTime    = THCTensor_(size)(state, input, 1);
+    inputHeight  = THCTensor_(size)(state, input, 2);
+    inputWidth   = THCTensor_(size)(state, input, 3);
+
+    outputTime   = THCTensor_(size)(state, gradOutput, 1);
+    outputHeight = THCTensor_(size)(state, gradOutput, 2);
+    outputWidth  = THCTensor_(size)(state, gradOutput, 3);
+  }
+  else
+  {
+    batchSize    = THCTensor_(size)(state, input, 0);
+    inputSlices  = THCTensor_(size)(state, input, 1);
+    inputTime    = THCTensor_(size)(state, input, 2);
+    inputHeight  = THCTensor_(size)(state, input, 3);
+    inputWidth   = THCTensor_(size)(state, input, 4);
+
+    outputTime   = THCTensor_(size)(state, gradOutput, 2);
+    outputHeight = THCTensor_(size)(state, gradOutput, 3);
+    outputWidth  = THCTensor_(size)(state, gradOutput, 4);
+  }
+
+  gradOutput = THCTensor_(newContiguous)(state, gradOutput);  // released by the THCTensor_(free) at the end
+
+  // Collapse batch and feature dimensions
+  THCDeviceTensor<real, 4> cudaGradInput;
+  THCDeviceTensor<real, 4> cudaGradOutput;
+  if (THCTensor_(nDimension)(state, input) == 4)
+  {
+    cudaGradInput  = toDeviceTensor<real, 4>(state, gradInput);
+    cudaGradOutput = toDeviceTensor<real, 4>(state, gradOutput);
+  }
+  else
+  {
+    cudaGradInput =
+      toDeviceTensor<real, 5>(state, gradInput).downcastOuter<4>();
+    cudaGradOutput =
+      toDeviceTensor<real, 5>(state, gradOutput).downcastOuter<4>();
+  }
+
+  dim3 block(32, 8);
+  // Optimizing for stride 1 is probably only of limited value, but this
+  // specialization yields 3x speedup over the atomicAdd implementation.
+  if (dT == 1 && dH == 1 && dW == 1)
+  {
+    int totalZ = inputTime * inputSlices * batchSize;  // this path sizes the grid by the input extent
+    int offsetZ = 0;
+    while (totalZ > 0) {
+      dim3 grid(THCCeilDiv(inputWidth, static_cast<int>(block.x)),
+                THCCeilDiv(inputHeight, static_cast<int>(block.y)),
+                totalZ > 65535 ? 65535 : totalZ);
+      cuda_VolumetricAveragePooling_updateGradInput_Stride1<real, accreal><<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+         cudaGradOutput, cudaGradInput, kT, kH, kW, 1.0f/(kT * kH * kW), offsetZ);
+      THCudaCheck(cudaGetLastError());
+      totalZ -= 65535;
+      offsetZ += 65535;
+    }
+  }
+  else
+  {
+    int totalZ = outputTime * inputSlices * batchSize;  // this path sizes the grid by the output extent
+    int offsetZ = 0;
+    while (totalZ > 0) {
+
+      dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)),
+                THCCeilDiv(outputHeight, static_cast<int>(block.y)),
+                totalZ > 65535 ? 65535 : totalZ);
+      if (kernelsOverlap)
+        {
+          cuda_VolumetricAveragePooling_updateGradInput_atomicAdd<real, accreal><<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+            cudaGradOutput, cudaGradInput, kT, kH, kW, dT, dH, dW, offsetZ);
+        }
+      else
+        {
+          cuda_VolumetricAveragePooling_updateGradInput<real, accreal><<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+             cudaGradOutput, cudaGradInput, kT, kH, kW, dT, dH, dW, offsetZ);
+        }
+      THCudaCheck(cudaGetLastError());
+      totalZ -= 65535;
+      offsetZ += 65535;
+    }
+  }
+
+  THCTensor_(free)(state, gradOutput);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/VolumetricConvolution.cu b/lib/THCUNN/generic/VolumetricConvolution.cu
new file mode 100644
index 0000000..8ed0bca
--- /dev/null
+++ b/lib/THCUNN/generic/VolumetricConvolution.cu
@@ -0,0 +1,416 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/VolumetricConvolution.cu"
+#else
+
+void THNN_(VolumetricConvolution_updateOutput)(
+           THCState *state,
+           THCTensor *input,       // 4D, or 5D in batch mode (checked below)
+           THCTensor *output,      // resized below; written one batch element at a time
+           THCTensor *weight,      // 5D: nOutputPlane x nInputPlane x kT x kH x kW
+           THCTensor *bias,        // read unconditionally by the bias GEMM below
+           THCTensor *finput,      // scratch buffer, used as `columns`
+           THCTensor *fgradInput,  // scratch buffer, used as `ones`
+           int dT, int dW, int dH,
+           int padT, int padW, int padH)
+{
+  THCTensor *columns = finput;
+  THCTensor *ones = fgradInput;
+  THCUNN_assertSameGPU_generic(state, 6, input, output, weight, bias, columns, ones);
+
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
+    "4D or 5D (batch mode) tensor is expected"
+  );
+
+  THArgCheck(weight->nDimension == 5, 4,
+    "5D weight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)"
+  );
+
+  int nOutputPlane = (int)weight->size[0];
+  int nInputPlane  = (int)weight->size[1];
+  int kT           = (int)weight->size[2];
+  int kH           = (int)weight->size[3];
+  int kW           = (int)weight->size[4];
+
+  int batch = 1;
+  if (input->nDimension == 4)
+  {
+    // Non-batch input: view it as a batch of one (the 4D shape is restored at the end)
+    batch = 0;
+    THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1],
+                          input->size[2], input->size[3]);
+  }
+
+  // NOTE(review): the Width/Height/Depth names do not line up with the kernel
+  // letters: size[2] pairs with kT, size[3] with kH, size[4] with kW.
+  long inputWidth   = input->size[3];
+  long inputHeight  = input->size[2];
+  long inputDepth   = input->size[4];
+  long outputWidth  = (inputWidth  + 2*padH - kH) / dH + 1;
+  long outputHeight = (inputHeight + 2*padT - kT) / dT + 1;
+  long outputDepth  = (inputDepth  + 2*padW - kW) / dW + 1;
+
+  // Batch size (dimension 0 of the 5D input)
+  long batchSize = input->size[0];
+
+  // Resize output
+  THCTensor_(resize5d)(state, output, batchSize, nOutputPlane,
+                        outputHeight, outputWidth, outputDepth);
+
+  // Resize temporary columns
+  THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH*kT, outputDepth*outputHeight*outputWidth);
+
+  // Define a buffer of ones, for bias accumulation
+  // Note: this buffer can be shared with other modules, it only ever gets increased,
+  // and always contains ones.
+  if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth)
+  {
+    // Resize plane and fill with ones...
+    THCTensor_(resize3d)(state, ones, outputHeight, outputWidth, outputDepth);
+    THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+  }
+
+  // Helpers: per-sample slices of input and output, set by select() in the loop
+  THCTensor *input_n = THCTensor_(new)(state);
+  THCTensor *output_n = THCTensor_(new)(state);
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++)
+  {
+    // Select the elt-th sample of input and output:
+    THCTensor_(select)(state, input_n, input, 0, elt);
+    THCTensor_(select)(state, output_n, output, 0, elt);
+
+    // Do Bias first:
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m_ = nOutputPlane;
+    long n_ = outputDepth * outputHeight * outputWidth;
+    long k_ = 1;
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemm(
+    #elif defined(THC_REAL_IS_HALF)
+    THCudaBlas_Hgemm(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemm(
+    #endif
+      state,
+      't', 'n',
+      n_, m_, k_,
+      ScalarConvert<int, real>::to(1),
+      THCTensor_(data)(state, ones), k_,
+      THCTensor_(data)(state, bias), k_,
+      ScalarConvert<int, real>::to(0),
+      THCTensor_(data)(state, output_n), n_
+    );
+
+    // Extract columns from this sample into the `columns` buffer:
+    im3d2col(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, input_n),
+      nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
+      THCTensor_(data)(state, columns)
+    );
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m = weight->size[0];
+    long n = columns->size[1];
+    long k = weight->size[1]*weight->size[2]*weight->size[3]*weight->size[4];
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemm(
+    #elif defined(THC_REAL_IS_HALF)
+    THCudaBlas_Hgemm(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemm(
+    #endif
+      state,
+      'n', 'n',
+      n, m, k,
+      ScalarConvert<int, real>::to(1),
+      THCTensor_(data)(state, columns), n,
+      THCTensor_(data)(state, weight), k,
+      ScalarConvert<int, real>::to(1),
+      THCTensor_(data)(state, output_n), n
+    );
+  }
+
+  // Free the per-sample helper tensors
+  THCTensor_(free)(state, input_n);
+  THCTensor_(free)(state, output_n);
+
+  // Non-batch input: restore the 4D shapes of output and input
+  if (batch == 0)
+  {
+    THCTensor_(resize4d)(state, output, nOutputPlane, outputHeight, outputWidth, outputDepth);
+    THCTensor_(resize4d)(state, input, nInputPlane, inputHeight, inputWidth, inputDepth);
+  }
+}
+
+void THNN_(VolumetricConvolution_updateGradInput)(
+           THCState *state,
+           THCTensor *input,       // 4D, or 5D in batch mode (checked below)
+           THCTensor *gradOutput,  // temporarily reshaped to 5D when input is 4D
+           THCTensor *gradInput,   // resized below from the input extents
+           THCTensor *weight,      // 5D: nOutputPlane x nInputPlane x kT x kH x kW
+           THCTensor *finput,      // scratch buffer, used as `gradColumns`
+           int dT, int dW, int dH,
+           int padT, int padW, int padH)
+{
+  THArgCheck(weight->nDimension == 5, 4,
+    "5D weight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)"
+  );
+
+  int nOutputPlane = (int)weight->size[0];
+  int nInputPlane  = (int)weight->size[1];
+  int kT           = (int)weight->size[2];
+  int kH           = (int)weight->size[3];
+  int kW           = (int)weight->size[4];
+
+  THCTensor *gradColumns = finput;
+
+  THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight, gradColumns, gradInput);
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
+    "4D or 5D (batch mode) tensor is expected"
+  );
+
+  int batch = 1;
+  if (input->nDimension == 4)
+  {
+    // Non-batch input: view input and gradOutput as batches of one (restored at the end)
+    batch = 0;
+    THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+    THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+  }
+
+  long inputWidth   = input->size[3];  // NOTE(review): paired with kH/dH/padH below
+  long inputHeight  = input->size[2];  // NOTE(review): paired with kT/dT/padT below
+  long inputDepth   = input->size[4];  // NOTE(review): paired with kW/dW/padW below
+  long outputWidth  = (inputWidth  + 2*padH - kH) / dH + 1;
+  long outputHeight = (inputHeight + 2*padT - kT) / dT + 1;
+  long outputDepth  = (inputDepth  + 2*padW - kW) / dW + 1;
+
+  // Batch size (dimension 0 of the 5D input)
+  long batchSize = input->size[0];
+
+  // Resize gradInput
+  THCTensor_(resize5d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth, inputDepth);
+
+  // Resize temporary columns
+  THCTensor_(resize2d)(state, gradColumns, nInputPlane*kH*kT*kW, outputDepth*outputHeight*outputWidth);
+
+  // Helpers: per-sample slices, set by select() in the loop
+  THCTensor *input_n = THCTensor_(new)(state);
+  THCTensor *gradInput_n = THCTensor_(new)(state);
+  THCTensor *gradOutput_n = THCTensor_(new)(state);
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++)
+  {
+    // Select the elt-th sample of each tensor:
+    THCTensor_(select)(state, input_n, input, 0, elt);  // input_n is not read again in this loop
+    THCTensor_(select)(state, gradInput_n, gradInput, 0, elt);
+    THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m = weight->size[1]*weight->size[2]*weight->size[3]*weight->size[4];
+    long n = gradColumns->size[1];
+    long k = weight->size[0];
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemm(
+    #elif defined(THC_REAL_IS_HALF)
+    THCudaBlas_Hgemm(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemm(
+    #endif
+      state,
+      'n', 't',
+      n, m, k,
+      ScalarConvert<int, real>::to(1),
+      THCTensor_(data)(state, gradOutput_n), n,
+      THCTensor_(data)(state, weight), m,
+      ScalarConvert<int, real>::to(0),
+      THCTensor_(data)(state, gradColumns), n
+    );
+
+    // Unpack columns back into this sample's gradInput:
+    col2im3d<real, accreal>(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, gradColumns),
+      nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
+      THCTensor_(data)(state, gradInput_n)
+    );
+  }
+
+  // Free the per-sample helper tensors
+  THCTensor_(free)(state, input_n);
+  THCTensor_(free)(state, gradInput_n);
+  THCTensor_(free)(state, gradOutput_n);
+
+  // Non-batch input: restore the 4D shapes of gradOutput, input and gradInput
+  if (batch == 0)
+  {
+    THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth, outputDepth);
+    THCTensor_(resize4d)(state, input, nInputPlane, inputHeight, inputWidth, inputDepth);
+    THCTensor_(resize4d)(state, gradInput, nInputPlane, inputHeight, inputWidth, inputDepth);
+  }
+}
+
+void THNN_(VolumetricConvolution_accGradParameters)(
+           THCState *state,
+           THCTensor *input,       // 4D, or 5D in batch mode (checked below)
+           THCTensor *gradOutput,  // temporarily reshaped to 5D when input is 4D
+           THCTensor *gradWeight,  // 5D: nOutputPlane x nInputPlane x kT x kH x kW; accumulated into
+           THCTensor *gradBias,    // accumulated into; read unconditionally below
+           THCTensor *finput,      // scratch buffer, used as `columns`
+           THCTensor *fgradInput,  // scratch buffer, used as `ones`
+           int dT, int dW, int dH,
+           int padT, int padW, int padH,
+           real scale)             // multiplier applied to both gradient contributions
+{
+  THCTensor *columns = finput;
+  THCTensor *ones = fgradInput;
+  THCUNN_assertSameGPU_generic(state, 6, input, gradOutput, gradWeight, gradBias, columns, ones);
+
+  THArgCheck(gradWeight->nDimension == 5, 4,
+    "5D gradWeight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)"
+  );
+
+  int nOutputPlane = (int)gradWeight->size[0];
+  int nInputPlane  = (int)gradWeight->size[1];
+  int kT           = (int)gradWeight->size[2];
+  int kH           = (int)gradWeight->size[3];
+  int kW           = (int)gradWeight->size[4];
+
+  THArgCheck(
+    input->nDimension == 4 || input->nDimension == 5, 2,
+    "4D or 5D (batch mode) tensor is expected"
+  );
+
+  int batch = 1;
+  if (input->nDimension == 4)
+  {
+    // Non-batch input: view input and gradOutput as batches of one (restored at the end)
+    batch = 0;
+    THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+    THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+  }
+
+  long inputWidth   = input->size[3];  // NOTE(review): paired with kH/dH/padH below
+  long inputHeight  = input->size[2];  // NOTE(review): paired with kT/dT/padT below
+  long inputDepth   = input->size[4];  // NOTE(review): paired with kW/dW/padW below
+  long outputWidth  = (inputWidth  + 2*padH - kH) / dH + 1;
+  long outputHeight = (inputHeight + 2*padT - kT) / dT + 1;
+  long outputDepth  = (inputDepth  + 2*padW - kW) / dW + 1;
+
+  // Batch size (dimension 0 of the 5D input)
+  long batchSize = input->size[0];
+
+  // Define a buffer of ones, for bias accumulation
+  if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth)
+  {
+    // Resize plane and fill with ones...
+    THCTensor_(resize3d)(state, ones, outputHeight, outputWidth, outputDepth);
+    THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+  }
+
+  // Resize temporary columns
+  THCTensor_(resize2d)(state, columns, nInputPlane*kH*kT*kW, outputDepth*outputHeight*outputWidth);
+
+  // Helpers: per-sample slices, set by select() in the loop
+  THCTensor *input_n = THCTensor_(new)(state);
+  THCTensor *gradOutput_n = THCTensor_(new)(state);
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++)
+  {
+    // Select the elt-th sample of input and gradOutput:
+    THCTensor_(select)(state, input_n, input, 0, elt);
+    THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+    // Extract columns from this sample into the `columns` buffer:
+    im3d2col(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, input_n),
+      nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
+      THCTensor_(data)(state, columns)
+    );
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m = gradWeight->size[0];
+    long n = gradWeight->size[1]*gradWeight->size[2]*gradWeight->size[3]*gradWeight->size[4];
+    long k = columns->size[1];
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemm(
+    #elif defined(THC_REAL_IS_HALF)
+    THCudaBlas_Hgemm(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemm(
+    #endif
+      state,
+      't', 'n',
+      n, m, k,
+      scale,
+      THCTensor_(data)(state, columns), k,
+      THCTensor_(data)(state, gradOutput_n), k,
+      ScalarConvert<int, real>::to(1),
+      THCTensor_(data)(state, gradWeight), n
+    );
+
+    // Do Bias:
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m_ = nOutputPlane;
+    long k_ = outputDepth * outputHeight * outputWidth;
+
+    // Do GEMV for float/double, or a GEMM with n = 1 for half (note: gemv/gemm assume column-major matrices)
+    #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemv(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemv(
+    #endif
+      state,
+      't',
+      k_, m_,
+      scale,
+      THCTensor_(data)(state, gradOutput_n), k_,
+      THCTensor_(data)(state, ones), 1,
+      ScalarConvert<int, real>::to(1),
+      THCTensor_(data)(state, gradBias), 1
+    );
+    #endif
+    #ifdef THC_REAL_IS_HALF
+    THCudaBlas_Hgemm(
+      state,
+      't', 'n',
+      m_, 1, k_,
+      scale,
+      THCTensor_(data)(state, gradOutput_n), k_,
+      THCTensor_(data)(state, ones), k_,
+      ScalarConvert<int, real>::to(1),
+      THCTensor_(data)(state, gradBias), m_
+    );
+    #endif
+  }
+
+  // Free the per-sample helper tensors
+  THCTensor_(free)(state, input_n);
+  THCTensor_(free)(state, gradOutput_n);
+
+  // Non-batch input: restore the 4D shapes of gradOutput and input
+  if (batch == 0)
+  {
+    THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth, outputDepth);
+    THCTensor_(resize4d)(state, input, nInputPlane, inputHeight, inputWidth, inputDepth);
+  }
+}
+
+#endif
diff --git a/lib/THCUNN/generic/VolumetricDilatedConvolution.cu b/lib/THCUNN/generic/VolumetricDilatedConvolution.cu
new file mode 100644
index 0000000..22536fe
--- /dev/null
+++ b/lib/THCUNN/generic/VolumetricDilatedConvolution.cu
@@ -0,0 +1,411 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/VolumetricDilatedConvolution.cu"
+#else
+
+void THNN_(VolumetricDilatedConvolution_updateOutput)(
+           THCState *state,
+           THCTensor  *input,    // 4D, or 5D in batch mode (checked below)
+           THCTensor  *output,   // resized below; written one batch element at a time
+           THCTensor  *weight,   // 5D: nOutputPlane,nInputPlane,kT,kH,kW
+           THCTensor  *bias,     // may be NULL; output then starts from zero
+           THCTensor  *columns,  // scratch buffer for the unfolded input
+           THCTensor  *ones,     // scratch buffer of ones for the bias GEMM
+           int kT, int kW, int kH,
+           int dT, int dW, int dH,
+           int padT, int padW, int padH,
+           int dilationT, int dilationW, int dilationH) {
+
+  THCUNN_assertSameGPU_generic(state, 5, input, output, weight, columns, ones);
+  if (bias) {
+    THCUNN_assertSameGPU_generic(state, 2, weight, bias);
+  }
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected, but got: %d", input->nDimension);
+  THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
+  THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
+  THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+  THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
+  THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 16, "dilation should be greater than 0");
+
+  // Params:
+  int nInputPlane = weight->size[1];
+  int nOutputPlane = weight->size[0];
+
+  int batch = 1;
+  if (input->nDimension == 4) {
+    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
+    // Non-batch input: view it as a batch of one (the 4D shape is restored at the end)
+    batch = 0;
+    THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+  } else {
+    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
+  }
+
+  long inputDepth  = input->size[2];
+  long inputHeight  = input->size[3];
+  long inputWidth   = input->size[4];
+  long outputDepth  = (inputDepth  + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+  long outputWidth  = (inputWidth  + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+
+  if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
+    THError("Given input size: (%dx%ldx%ldx%ld). Calculated output size: (%dx%ldx%ldx%ld). Output size is too small",
+            nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth);
+
+  // Batch size (dimension 0 of the 5D input)
+  long batchSize = input->size[0];
+
+  // Resize output
+  THCTensor_(resize5d)(state, output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth);
+
+  // Resize temporary columns
+  THCTensor_(resize2d)(state, columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
+
+  // Define a buffer of ones, for bias accumulation
+  // Note: this buffer can be shared with other modules, it only ever gets increased,
+  // and always contains ones. It must be 3D before size[0..2] are read.
+  if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
+    // Resize plane and fill with ones...
+    THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth);
+    THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+  }
+
+  // Helpers: per-sample slices of input and output, set by select() in the loop
+  THCTensor  *input_n = THCTensor_(new)(state);
+  THCTensor  *output_n = THCTensor_(new)(state);
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Select the elt-th sample of input and output:
+    THCTensor_(select)(state, input_n, input, 0, elt);
+    THCTensor_(select)(state, output_n, output, 0, elt);
+
+    // Do Bias first:
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m_ = nOutputPlane;
+    long n_ = outputDepth * outputHeight * outputWidth;
+    long k_ = 1;
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    if (bias) {
+      #ifdef THC_REAL_IS_FLOAT
+      THCudaBlas_Sgemm(
+      #elif defined(THC_REAL_IS_HALF)
+      THCudaBlas_Hgemm(
+      #elif defined(THC_REAL_IS_DOUBLE)
+      THCudaBlas_Dgemm(
+      #endif
+          state,
+          't', 'n',
+          n_, m_, k_,
+          ScalarConvert<int, real>::to(1),
+          THCTensor_(data)(state, ones), k_,
+          THCTensor_(data)(state, bias), k_,
+          ScalarConvert<int, real>::to(0),
+          THCTensor_(data)(state, output_n), n_
+      );
+    } else {
+      THCTensor_(zero)(state, output_n);
+    }
+
+    // Extract columns from this sample into the `columns` buffer:
+    vol2col(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, input_n),
+      nInputPlane, inputDepth, inputHeight, inputWidth,
+      kT, kH, kW, padT, padH, padW, dT, dH, dW,
+      dilationT, dilationH, dilationW,
+      THCTensor_(data)(state, columns)
+    );
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m = nOutputPlane;
+    long n = columns->size[1];
+    long k = nInputPlane*kT*kH*kW;
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemm(
+    #elif defined(THC_REAL_IS_HALF)
+    THCudaBlas_Hgemm(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemm(
+    #endif
+        state,
+        'n', 'n',
+        n, m, k,
+        ScalarConvert<int, real>::to(1),
+        THCTensor_(data)(state, columns), n,
+        THCTensor_(data)(state, weight), k,
+        ScalarConvert<int, real>::to(1),
+        THCTensor_(data)(state, output_n), n
+    );
+  }
+
+  // Free the per-sample helper tensors
+  THCTensor_(free)(state, input_n);
+  THCTensor_(free)(state, output_n);
+
+  // Non-batch input: restore the 4D shapes of output and input
+  if (batch == 0) {
+    THCTensor_(resize4d)(state, output, nOutputPlane, outputDepth, outputHeight, outputWidth);
+    THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
+  }
+}
+
+void THNN_(VolumetricDilatedConvolution_updateGradInput)(
+           THCState *state,
+           THCTensor  *input,        // 4D, or 5D in batch mode (checked below)
+           THCTensor  *gradOutput,   // 4D or 5D; temporarily reshaped to 5D when input is 4D
+           THCTensor  *gradInput,    // resized below from the input extents
+           THCTensor  *weight,       // 5D: nOutputPlane,nInputPlane,kT,kH,kW
+           THCTensor  *gradColumns,  // scratch buffer, resized below
+           int kT, int kW, int kH,
+           int dT, int dW, int dH,
+           int padT, int padW, int padH,
+           int dilationT, int dilationW, int dilationH) {
+
+  THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight,
+                                 gradColumns, gradInput);
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
+  THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected");
+  THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
+  THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+  THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
+
+  // Params
+  int nInputPlane = weight->size[1];
+  int nOutputPlane = weight->size[0];
+
+  int batch = 1;
+  if (input->nDimension == 4) {
+    // Non-batch input: view input and gradOutput as batches of one (restored at the end)
+    batch = 0;
+    THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+    THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+  }
+
+  long inputDepth  = input->size[2];
+  long inputWidth   = input->size[4];
+  long inputHeight  = input->size[3];
+  long outputDepth  = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+  long outputWidth  = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  // Batch size (dimension 0 of the 5D input)
+  long batchSize = input->size[0];
+
+  // Resize gradInput
+  THCTensor_(resize5d)(state, gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
+
+  // Resize temporary columns
+  THCTensor_(resize2d)(state, gradColumns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
+
+  // Helpers: per-sample slices, set by select() in the loop
+  THCTensor  *gradInput_n = THCTensor_(new)(state);
+  THCTensor  *gradOutput_n = THCTensor_(new)(state);
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Select the elt-th sample of gradInput and gradOutput:
+    THCTensor_(select)(state, gradInput_n, gradInput, 0, elt);
+    THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m = nInputPlane*kT*kW*kH;
+    long n = gradColumns->size[1];
+    long k = nOutputPlane;
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemm(
+    #elif defined(THC_REAL_IS_HALF)
+    THCudaBlas_Hgemm(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemm(
+    #endif
+        state,
+        'n', 't',
+        n, m, k,
+        ScalarConvert<int, real>::to(1),
+        THCTensor_(data)(state, gradOutput_n), n,
+        THCTensor_(data)(state, weight), m,
+        ScalarConvert<int, real>::to(0),
+        THCTensor_(data)(state, gradColumns), n
+    );
+
+    // Unpack columns back into this sample's gradInput:
+    col2vol<real, accreal>(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, gradColumns),
+      nInputPlane, inputDepth, inputHeight, inputWidth,
+      kT, kH, kW, padT, padH, padW, dT, dH, dW,
+      dilationT, dilationH, dilationW,
+      THCTensor_(data)(state, gradInput_n)
+    );
+  }
+
+  // Free the per-sample helper tensors
+  THCTensor_(free)(state, gradInput_n);
+  THCTensor_(free)(state, gradOutput_n);
+
+  // Non-batch input: restore the 4D shapes of gradOutput, input and gradInput
+  if (batch == 0) {
+    THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+    THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
+    THCTensor_(resize4d)(state, gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
+  }
+}
+
+void THNN_(VolumetricDilatedConvolution_accGradParameters)(
+           THCState *state,
+           THCTensor  *input,       // 4D, or 5D in batch mode (checked below)
+           THCTensor  *gradOutput,  // 4D or 5D; temporarily reshaped to 5D when input is 4D
+           THCTensor  *gradWeight,  // 5D: nOutputPlane,nInputPlane,kT,kH,kW; accumulated into
+           THCTensor  *gradBias,    // may be NULL; the bias step is then skipped
+           THCTensor  *columns,     // scratch buffer for the unfolded input
+           THCTensor  *ones,        // scratch buffer of ones for the bias reduction
+           int kT, int kW, int kH,
+           int dT, int dW, int dH,
+           int padT, int padW, int padH,
+           int dilationT, int dilationW, int dilationH,
+           real scale) {
+
+  THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, gradWeight, columns, ones);
+  if (gradBias) {
+   THCUNN_assertSameGPU_generic(state, 2, gradWeight, gradBias);
+  }
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
+  THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected");
+  THArgCheck(gradWeight->nDimension == 5, 4, "gradWeight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
+  THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+  THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
+
+  // Params
+  int nInputPlane = gradWeight->size[1];
+  int nOutputPlane = gradWeight->size[0];
+
+  int batch = 1;
+  if (input->nDimension == 4) {
+    // Non-batch input: view input and gradOutput as batches of one (restored at the end)
+    batch = 0;
+    THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+    THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+  }
+
+  long inputDepth  = input->size[2];
+  long inputWidth   = input->size[4];
+  long inputHeight  = input->size[3];
+  long outputDepth  = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+  long outputWidth  = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  // Batch size (dimension 0 of the 5D input)
+  long batchSize = input->size[0];
+
+  // Define a buffer of ones, for bias accumulation
+  if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
+    // Resize plane and fill with ones...
+    THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth);
+    THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+  }
+
+  // Resize temporary columns
+  THCTensor_(resize2d)(state, columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
+
+  // Helpers: per-sample slices, set by select() in the loop
+  THCTensor  *input_n = THCTensor_(new)(state);
+  THCTensor  *gradOutput_n = THCTensor_(new)(state);
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Select the elt-th sample of input and gradOutput:
+    THCTensor_(select)(state, input_n, input, 0, elt);
+    THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+    // Extract columns from this sample into the `columns` buffer:
+    vol2col(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, input_n),
+      nInputPlane, inputDepth, inputHeight, inputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
+      dilationT, dilationH, dilationW,
+      THCTensor_(data)(state, columns)
+    );
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m = nOutputPlane;
+    long n = nInputPlane*kT*kW*kH;
+    long k = columns->size[1];
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemm(
+    #elif defined(THC_REAL_IS_HALF)
+    THCudaBlas_Hgemm(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemm(
+    #endif
+        state,
+        't', 'n',
+        n, m, k,
+        scale,
+        THCTensor_(data)(state, columns), k,
+        THCTensor_(data)(state, gradOutput_n), k,
+        ScalarConvert<int, real>::to(1),
+        THCTensor_(data)(state, gradWeight), n
+    );
+
+    // Do Bias:
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m_ = nOutputPlane;
+    long k_ = outputDepth * outputHeight * outputWidth;
+
+    // Do GEMV for float/double, or a GEMM with n = 1 for half (note: gemv/gemm assume column-major matrices)
+    if (gradBias) {
+      #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+      #ifdef THC_REAL_IS_FLOAT
+      THCudaBlas_Sgemv(
+      #elif defined(THC_REAL_IS_DOUBLE)
+      THCudaBlas_Dgemv(
+      #endif
+          state,
+          't',
+          k_, m_,
+          scale,
+          THCTensor_(data)(state, gradOutput_n), k_,
+          THCTensor_(data)(state, ones), 1,
+          ScalarConvert<int, real>::to(1),
+          THCTensor_(data)(state, gradBias), 1
+      );
+      #endif
+      #ifdef THC_REAL_IS_HALF
+      THCudaBlas_Hgemm(
+          state,
+          't', 'n',
+          m_, 1, k_,
+          scale,
+          THCTensor_(data)(state, gradOutput_n), k_,
+          THCTensor_(data)(state, ones), k_,
+          ScalarConvert<int, real>::to(1),
+          THCTensor_(data)(state, gradBias), m_
+      );
+      #endif
+    }
+  }
+
+  // Free the per-sample helper tensors
+  THCTensor_(free)(state, input_n);
+  THCTensor_(free)(state, gradOutput_n);
+
+  // Non-batch input: restore the 4D shapes of gradOutput and input
+  if (batch == 0) {
+    THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+    THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
+  }
+}
+
+#endif
diff --git a/lib/THCUNN/generic/VolumetricDilatedMaxPooling.cu b/lib/THCUNN/generic/VolumetricDilatedMaxPooling.cu
new file mode 100644
index 0000000..0d6d084
--- /dev/null
+++ b/lib/THCUNN/generic/VolumetricDilatedMaxPooling.cu
@@ -0,0 +1,290 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/VolumetricDilatedMaxPooling.cu"
+#else
+// switch-case helper: emits `case KW:`, launches the updateOutput kernel with KW as its template argument (names such as grid, block, state come from the expansion site), then breaks.
+#define UPDATE_OUTPUT_KERNEL_WIDTH(KW) case KW:                         \
+  cuda_VolumetricDilatedMaxPooling_updateOutput<KW><<<grid, block,             \
+    0, THCState_getCurrentStream(state)>>>(                             \
+    cudaInput, cudaIndices, cudaOutput, kT, kH, dT, dH, dW, padT, padH, padW,\
+    dilationT, dilationH, dilationW, offsetZ); \
+    break
+// Forward pass: checks arguments, derives the pooled output size, resizes output/indices and launches the pooling kernel.
+void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCIndexTensor *indices,
+           int kT, int kW, int kH,
+           int dT, int dW, int dH,
+           int padT, int padW, int padH,
+           int dilationT, int dilationW, int dilationH,
+           bool ceilMode)
+{
+  int batchSize;
+  int inputSlices;
+  int inputTime;
+  int inputHeight;
+  int inputWidth;
+  int outputTime;
+  int outputHeight;
+  int outputWidth;
+
+  THCUNN_assertSameGPU_generic(state, 3, input, indices, output);
+
+  if (THCTensor_(nDimension)(state, input) == 4)
+  {
+    THArgCheck(
+      THCTensor_(size)(state, input, 1) >= kT &&
+      THCTensor_(size)(state, input, 2) >= kH &&
+      THCTensor_(size)(state, input, 3) >= kW, 2,
+      "input image smaller than kernel size"
+    );
+
+    /* sizes: 4D input is read as (slices, time, height, width) */
+    batchSize   = 1;
+    inputSlices = THCTensor_(size)(state, input, 0);
+    inputTime   = THCTensor_(size)(state, input, 1);
+    inputHeight = THCTensor_(size)(state, input, 2);
+    inputWidth  = THCTensor_(size)(state, input, 3);
+  }
+  else if (THCTensor_(nDimension)(state, input) == 5)
+  {
+    THArgCheck(
+      THCTensor_(size)(state, input, 4) >= kW &&
+      THCTensor_(size)(state, input, 3) >= kH &&
+      THCTensor_(size)(state, input, 2) >= kT, 2,
+      "input image smaller than kernel size"
+    );
+
+    /* sizes: 5D input is read as (batch, slices, time, height, width) */
+    batchSize   = THCTensor_(size)(state, input, 0);
+    inputSlices = THCTensor_(size)(state, input, 1);
+    inputTime   = THCTensor_(size)(state, input, 2);
+    inputHeight = THCTensor_(size)(state, input, 3);
+    inputWidth  = THCTensor_(size)(state, input, 4);
+  }
+  else
+  {
+    THArgCheck(false, 2, "4D or 5D tensor expected");
+  }
+
+  THArgCheck(kT/2 >= padT && kW/2 >= padW && kH/2 >= padH, 2,
+    "pad should be smaller than half of kernel size"
+  );
+  THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 14,
+    "dilation should be greater than 0"
+  );
+  // Output extent per dim: round((in - (dilation*(k-1)+1) + 2*pad) / stride) + 1, with round = ceil or floor per ceilMode
+  if (ceilMode)
+  {
+    outputTime   = (int)(ceil((float)(inputTime - (dilationT * (kT - 1) + 1) + 2*padT) / dT)) + 1;
+    outputHeight = (int)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+    outputWidth  = (int)(ceil((float)(inputWidth  - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
+  }
+  else
+  {
+    outputTime   = (int)(floor((float)(inputTime - (dilationT * (kT - 1) + 1) + 2*padT) / dT)) + 1;
+    outputHeight = (int)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+    outputWidth  = (int)(floor((float)(inputWidth  - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
+  }
+
+  if (outputTime < 1 || outputHeight < 1 || outputWidth < 1)
+    THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+            inputSlices,inputTime,inputHeight,inputWidth,inputSlices,outputTime,outputHeight,outputWidth);
+  // With any padding, shrink an output dim by one when (out-1)*stride >= in + pad
+  if (padT || padW || padH)
+  {
+    if ((outputTime - 1)*dT >= inputTime + padT)
+      --outputTime;
+    if ((outputHeight - 1)*dH >= inputHeight + padH)
+      --outputHeight;
+    if ((outputWidth  - 1)*dW >= inputWidth  + padW)
+      --outputWidth;
+  }
+
+  if (input->nDimension == 4) /* 4D */
+  {
+    /* resize output */
+    THCTensor_(resize4d)(state, output, inputSlices,
+                          outputTime, outputHeight, outputWidth);
+    /* indices is resized to the same shape as output. NOTE(review): the former "uchar packed
+     into float" wording predates THCIndexTensor; confirm the encoding against the kernel */
+    THCIndexTensor_(resize4d)(state, indices, inputSlices,
+                          outputTime, outputHeight, outputWidth);
+  }
+  else
+  { /* 5D */
+    THCTensor_(resize5d)(state, output, batchSize, inputSlices,
+                          outputTime, outputHeight, outputWidth);
+    // indices is resized to the same shape as output (NOTE(review): element encoding is set by the kernel, not visible here)
+    THCIndexTensor_(resize5d)(state, indices, batchSize, inputSlices,
+                          outputTime, outputHeight, outputWidth);
+  }
+
+  input = THCTensor_(newContiguous)(state, input);
+
+  // Collapse batch and feature dimensions
+  THCDeviceTensor<real, 4> cudaInput;
+  THCDeviceTensor<real, 4> cudaOutput;
+  if (THCTensor_(nDimension)(state, input) == 4)
+  {
+    cudaInput  = toDeviceTensor<real, 4>(state, input);
+    cudaOutput = toDeviceTensor<real, 4>(state, output);
+  }
+  else
+  {
+    cudaInput  = toDeviceTensor<real, 5>(state, input).downcastOuter<4>();
+    cudaOutput = toDeviceTensor<real, 5>(state, output).downcastOuter<4>();
+  }
+  // Build a 4D (batch*slices, time, height, width) tensor over the storage of indices
+  THLongStorage *indicesSize = THLongStorage_newWithSize(4);
+  long indicesSizeRaw[4] = { batchSize * inputSlices,
+                            outputTime, outputHeight, outputWidth };
+  THLongStorage_rawCopy(indicesSize, indicesSizeRaw);
+
+  THCIndexTensor *indices1 = THCIndexTensor_(newWithStorage)(
+    state, THCIndexTensor_(storage)(state, indices),
+    THCIndexTensor_(storageOffset)(state, indices),
+    indicesSize, NULL);
+
+  THLongStorage_free(indicesSize);
+
+  THCDeviceTensor<THCIndex_t, 4> cudaIndices =
+    toDeviceTensor<THCIndex_t, 4>(state, indices1);
+  // Grid z is clamped to 65535 per launch, so totalZ is covered in chunks; offsetZ (passed to the kernel) advances by the same step
+  int totalZ = outputTime * inputSlices * batchSize;
+  int offsetZ = 0;
+  dim3 block(32, 8);
+
+  while (totalZ > 0) {
+    dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)),
+              THCCeilDiv(outputHeight, static_cast<int>(block.y)),
+              totalZ > 65535 ? 65535 : totalZ);
+
+    switch (kW)
+      {
+        UPDATE_OUTPUT_KERNEL_WIDTH(1);
+        UPDATE_OUTPUT_KERNEL_WIDTH(2);
+        UPDATE_OUTPUT_KERNEL_WIDTH(3);
+        UPDATE_OUTPUT_KERNEL_WIDTH(4);
+        UPDATE_OUTPUT_KERNEL_WIDTH(5);
+        UPDATE_OUTPUT_KERNEL_WIDTH(6);
+        UPDATE_OUTPUT_KERNEL_WIDTH(7);
+      default:
+        cuda_VolumetricDilatedMaxPooling_updateOutput<<<grid, block,
+          0, THCState_getCurrentStream(state)>>>(
+                             cudaInput, cudaIndices, cudaOutput,
+                             kT, kH, kW, dT, dH, dW,
+                             padT, padH, padW, dilationT, dilationH, dilationW, offsetZ);
+      }
+    THCudaCheck(cudaGetLastError());
+    totalZ -= 65535;
+    offsetZ += 65535;
+  }
+  // Release the tensors created above (result of newContiguous and the 4D indices tensor)
+  THCTensor_(free)(state, input);
+  THCIndexTensor_(free)(state, indices1);
+}
+
+#undef UPDATE_OUTPUT_KERNEL_WIDTH
+// Backward pass: zeroes gradInput (sized like input) and launches the updateGradInput kernel with gradOutput, indices and gradInput.
+void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCIndexTensor *indices,
+           int dT, int dW, int dH,
+           int padT, int padW, int padH,
+           int dilationT, int dilationW, int dilationH)
+{
+  // Resize and initialize result tensor.
+  THCTensor_(resizeAs)(state, gradInput, input);
+  THCTensor_(zero)(state, gradInput);
+
+  int batchSize;
+  int inputSlices;
+
+  int outputTime;
+  int outputHeight;
+  int outputWidth;
+
+  THCUNN_assertSameGPU_generic(state, 4, input, indices, gradOutput, gradInput);
+
+  if (THCTensor_(nDimension)(state, input) == 4) /* 4D */
+  {
+    batchSize = 1;
+    inputSlices  = THCTensor_(size)(state, input, 0);
+
+    outputTime   = THCTensor_(size)(state, gradOutput, 1);
+    outputHeight = THCTensor_(size)(state, gradOutput, 2);
+    outputWidth  = THCTensor_(size)(state, gradOutput, 3);
+  }
+  else
+  {
+    batchSize    = THCTensor_(size)(state, input, 0);
+    inputSlices  = THCTensor_(size)(state, input, 1);
+
+    outputTime   = THCTensor_(size)(state, gradOutput, 2);
+    outputHeight = THCTensor_(size)(state, gradOutput, 3);
+    outputWidth  = THCTensor_(size)(state, gradOutput, 4);
+  }
+  // NOTE: any input that is not 4D is handled as 5D above; no dimension check is performed here
+  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+  // Collapse batch and feature dimensions
+  THCDeviceTensor<real, 4> cudaGradInput;
+  THCDeviceTensor<real, 4> cudaGradOutput;
+  if (THCTensor_(nDimension)(state, input) == 4)
+  {
+    cudaGradInput  = toDeviceTensor<real, 4>(state, gradInput);
+    cudaGradOutput = toDeviceTensor<real, 4>(state, gradOutput);
+  }
+  else
+  {
+    cudaGradInput =
+      toDeviceTensor<real, 5>(state, gradInput).downcastOuter<4>();
+    cudaGradOutput =
+      toDeviceTensor<real, 5>(state, gradOutput).downcastOuter<4>();
+  }
+  // Build a 4D (batch*slices, time, height, width) tensor over the storage of indices
+  THLongStorage *indicesSize = THLongStorage_newWithSize(4);
+  long indicesSizeRaw[4] = { batchSize * inputSlices,
+                           outputTime, outputHeight, outputWidth };
+  THLongStorage_rawCopy(indicesSize, indicesSizeRaw);
+  THCIndexTensor *indices1 = THCIndexTensor_(newWithStorage)(
+    state, THCIndexTensor_(storage)(state, indices),
+    THCIndexTensor_(storageOffset)(state, indices), indicesSize, NULL);
+  THLongStorage_free(indicesSize);
+
+  THCDeviceTensor<THCIndex_t, 4> cudaIndices =
+    toDeviceTensor<THCIndex_t, 4>(state, indices1);
+  // Grid z is clamped to 65535 per launch, so totalZ is covered in chunks; offsetZ (passed to the kernel) advances by the same step
+  int totalZ = outputTime * inputSlices * batchSize;
+  int offsetZ = 0;
+  dim3 block(32, 8);
+
+  while (totalZ > 0) {
+    dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)),
+              THCCeilDiv(outputHeight, static_cast<int>(block.y)),
+              totalZ > 65535 ? 65535 : totalZ);
+
+    cuda_VolumetricDilatedMaxPooling_updateGradInput<<<grid, block,
+      0, THCState_getCurrentStream(state)>>>(
+                                             cudaGradOutput,
+                                             cudaIndices,
+                                             cudaGradInput,
+                                             dT, dH, dW,
+                                             padT, padH, padW,
+                                             dilationT, dilationH, dilationW, offsetZ);
+    THCudaCheck(cudaGetLastError());
+    totalZ -= 65535;
+    offsetZ += 65535;
+  }
+
+  // cleanup: release the result of newContiguous and the 4D indices tensor
+  THCTensor_(free)(state, gradOutput);
+  THCIndexTensor_(free)(state, indices1);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/VolumetricFullConvolution.cu b/lib/THCUNN/generic/VolumetricFullConvolution.cu
new file mode 100644
index 0000000..7a1f788
--- /dev/null
+++ b/lib/THCUNN/generic/VolumetricFullConvolution.cu
@@ -0,0 +1,399 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/VolumetricFullConvolution.cu"
+#else
+// Forward pass: per sample, a GEMM of input and weight fills columns, col2vol writes them into output, then a k=1 GEMM adds bias.
+void THNN_(VolumetricFullConvolution_updateOutput)(
+           THCState *state,
+           THCTensor  *input,
+           THCTensor  *output,
+           THCTensor  *weight,
+           THCTensor  *bias,
+           THCTensor  *finput,
+           THCTensor  *fgradInput,
+           int dT, int dW, int dH,
+           int padT, int padW, int padH,
+           int adjT, int adjW, int adjH)
+{
+
+  THCTensor  *columns = finput;
+  THCTensor  *ones    = fgradInput;
+
+  int nInputPlane = THCTensor_(size)(state, weight, 0);
+  int nOutputPlane = THCTensor_(size)(state, weight, 1);
+  const int kT           = (int)weight->size[2];
+  const int kH           = (int)weight->size[3];
+  const int kW           = (int)weight->size[4];
+
+  THCUNN_assertSameGPU_generic(state, 6, input, output, weight,
+                                 bias, columns, ones);
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
+
+  int batch = 1;
+  if (input->nDimension == 4) {
+    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
+    // Force batch: batch = 0 marks a 4D input, which is resized to a batch of one here and restored at the end
+    batch = 0;
+    THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+  } else {
+    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
+  }
+
+  long inputWidth   = input->size[4];
+  long inputHeight  = input->size[3];
+  long inputDepth  = input->size[2];
+  long outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+  long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT;
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Resize output
+  THCTensor_(resize5d)(state, output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth);
+
+  // Resize temporary columns
+  THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
+
+  // Define a buffer of ones, for bias accumulation
+  // Note: this buffer can be shared with other modules, it only ever gets increased,
+  // and always contains ones.
+  if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
+    // Resize plane and fill with ones...
+    THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth);
+    THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+  }
+
+  // Helpers
+  THCTensor  *input_n = THCTensor_(new)(state);
+  THCTensor  *output_n = THCTensor_(new)(state);
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per output:
+    THCTensor_(select)(state, input_n, input, 0, elt);
+    THCTensor_(select)(state, output_n, output, 0, elt);
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
+    long n = columns->size[1];
+    long k = weight->size[0];
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemm(
+    #elif defined(THC_REAL_IS_HALF)
+    THCudaBlas_Hgemm(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemm(
+    #endif
+        state,
+        'n', 't',
+        n, m, k,
+        ScalarConvert<int, real>::to(1),
+        THCTensor_(data)(state, input_n), n,
+        THCTensor_(data)(state, weight), m,
+        ScalarConvert<int, real>::to(0),
+        THCTensor_(data)(state, columns), n
+    );
+
+    // Unpack columns into this sample's output (output_n):
+    col2vol<real, accreal>(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, columns),
+      nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
+      1,1,1,
+      THCTensor_(data)(state, output_n)
+    );
+
+    // Do Bias after:
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m_ = nOutputPlane;
+    long n_ = outputDepth * outputHeight * outputWidth;
+    long k_ = 1;
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemm(
+    #elif defined(THC_REAL_IS_HALF)
+    THCudaBlas_Hgemm(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemm(
+    #endif
+        state,
+        't', 'n',
+        n_, m_, k_,
+        ScalarConvert<int, real>::to(1),
+        THCTensor_(data)(state, ones), k_,
+        THCTensor_(data)(state, bias), k_,
+        ScalarConvert<int, real>::to(1),
+        THCTensor_(data)(state, output_n), n_
+    );
+
+  }
+
+  // Free
+  THCTensor_(free)(state, input_n);
+  THCTensor_(free)(state, output_n);
+
+  // Input was 4D: resize output and input back to 4D shapes
+  if (batch == 0) {
+    THCTensor_(resize4d)(state, output, nOutputPlane, outputDepth, outputHeight, outputWidth);
+    THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
+  }
+}
+// Backward pass w.r.t. input: per sample, vol2col of gradOutput fills gradColumns, then a GEMM with weight writes gradInput.
+void THNN_(VolumetricFullConvolution_updateGradInput)(
+           THCState *state,
+           THCTensor  *input,
+           THCTensor  *gradOutput,
+           THCTensor  *gradInput,
+           THCTensor  *weight,
+           THCTensor  *finput,
+           THCTensor  *fgradInput,
+           int dT, int dW, int dH,
+           int padT, int padW, int padH,
+           int adjT, int adjW, int adjH)
+{
+  THCTensor  *gradColumns = finput;
+
+  int nInputPlane = THCTensor_(size)(state, weight, 0);
+  int nOutputPlane = THCTensor_(size)(state, weight, 1);
+  const int kT           = (int)weight->size[2];
+  const int kH           = (int)weight->size[3];
+  const int kW           = (int)weight->size[4];
+
+  THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight,
+                                 gradColumns, gradInput);
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
+
+  int batch = 1;
+  if (input->nDimension == 4) {
+    // Force batch: batch = 0 marks a 4D input; input and gradOutput are resized to a batch of one and restored at the end
+    batch = 0;
+    THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+    THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+  }
+
+  long inputWidth   = input->size[4];
+  long inputHeight  = input->size[3];
+  long inputDepth   = input->size[2];
+  long outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+  long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT;
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Resize gradInput to the (batched) input shape
+  THCTensor_(resize5d)(state, gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
+
+  // Resize temporary columns
+  THCTensor_(resize2d)(state, gradColumns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
+
+  // Helpers
+  THCTensor  *gradInput_n = THCTensor_(new)(state);
+  THCTensor  *gradOutput_n = THCTensor_(new)(state);
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per sample:
+    THCTensor_(select)(state, gradInput_n, gradInput, 0, elt);
+    THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+    // Extract columns from this sample's gradOutput:
+    vol2col(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, gradOutput_n),
+      nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
+      1,1,1,
+      THCTensor_(data)(state, gradColumns)
+    );
+
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m = weight->size[0];
+    long n = gradColumns->size[1];
+    long k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemm(
+    #elif defined(THC_REAL_IS_HALF)
+    THCudaBlas_Hgemm(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemm(
+    #endif
+        state,
+        'n', 'n',
+        n, m, k,
+        ScalarConvert<int, real>::to(1),
+        THCTensor_(data)(state, gradColumns), n,
+        THCTensor_(data)(state, weight), k,
+        ScalarConvert<int, real>::to(0),
+        THCTensor_(data)(state, gradInput_n), n
+    );
+  }
+
+
+  // Free
+  THCTensor_(free)(state, gradInput_n);
+  THCTensor_(free)(state, gradOutput_n);
+
+  // Input was 4D: resize gradOutput, input and gradInput back to 4D shapes
+  if (batch == 0) {
+    THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+    THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
+    THCTensor_(resize4d)(state, gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
+  }
+}
+
+// Accumulates, per sample, scale-weighted gradients into gradWeight (GEMM) and gradBias (GEMV, or Hgemm for half); both BLAS calls use beta = 1.
+void THNN_(VolumetricFullConvolution_accGradParameters)(
+           THCState *state,
+           THCTensor  *input,
+           THCTensor  *gradOutput,
+           THCTensor  *gradWeight,
+           THCTensor  *gradBias,
+           THCTensor  *finput,
+           THCTensor  *fgradInput,
+           int dT, int dW, int dH,
+           int padT, int padW, int padH,
+           int adjT, int adjW, int adjH,
+           real scale)
+{
+  THCTensor  *columns = finput;
+  THCTensor  *ones = fgradInput;
+
+  int nInputPlane = THCTensor_(size)(state, gradWeight, 0);
+  int nOutputPlane = THCTensor_(size)(state, gradWeight, 1);
+  const int kT           = (int)gradWeight->size[2];
+  const int kH           = (int)gradWeight->size[3];
+  const int kW           = (int)gradWeight->size[4];
+
+  THCUNN_assertSameGPU_generic(state, 6, input, gradOutput, gradWeight,
+                                 gradBias, columns, ones);
+  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
+
+  int batch = 1;
+  if (input->nDimension == 4) {
+    // Force batch: batch = 0 marks a 4D input; input and gradOutput are resized to a batch of one and restored at the end
+    batch = 0;
+    THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+    THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+  }
+
+  long inputWidth   = input->size[4];
+  long inputHeight  = input->size[3];
+  long inputDepth   = input->size[2];
+  long outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+  long outputDepth  = (inputDepth - 1) * dT - 2*padT + kT + adjT;
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Define a buffer of ones, for bias accumulation
+  if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
+    // Resize plane and fill with ones...
+    THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth);
+    THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+  }
+
+  // Resize temporary columns
+  THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
+
+  // Helpers
+  THCTensor  *input_n = THCTensor_(new)(state);
+  THCTensor  *gradOutput_n = THCTensor_(new)(state);
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per output:
+    THCTensor_(select)(state, input_n, input, 0, elt);
+    THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+    // Extract columns from this sample's gradOutput:
+    vol2col(
+      THCState_getCurrentStream(state),
+      THCTensor_(data)(state, gradOutput_n),
+      nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
+      1,1,1,
+      THCTensor_(data)(state, columns)
+    );
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long n = columns->size[0];   // nOutputPlane * kt * kh * kw
+    long m = input_n->size[0];   // nInputPlane
+    long k = columns->size[1];   // inputDepth * inputHeight * inputWidth
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemm(
+    #elif defined(THC_REAL_IS_HALF)
+    THCudaBlas_Hgemm(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemm(
+    #endif
+        state,
+        't', 'n',
+        n, m, k,
+        scale,
+        THCTensor_(data)(state, columns), k,
+        THCTensor_(data)(state, input_n), k,
+        ScalarConvert<int, real>::to(1),
+        THCTensor_(data)(state, gradWeight), n
+    );
+
+    // Do Bias:
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m_ = nOutputPlane;
+    long k_ = outputDepth * outputHeight * outputWidth;
+    // NOTE(review): gradBias is used unconditionally below (no NULL check)
+    // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices); the HALF branch uses Hgemm with n = 1 instead
+    #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+    #ifdef THC_REAL_IS_FLOAT
+    THCudaBlas_Sgemv(
+    #elif defined(THC_REAL_IS_DOUBLE)
+    THCudaBlas_Dgemv(
+    #endif
+        state,
+        't',
+        k_, m_,
+        scale,
+        THCTensor_(data)(state, gradOutput_n), k_,
+        THCTensor_(data)(state, ones), 1,
+        ScalarConvert<int, real>::to(1),
+        THCTensor_(data)(state, gradBias), 1
+    );
+    #endif
+    #ifdef THC_REAL_IS_HALF
+    THCudaBlas_Hgemm(
+        state,
+        't', 'n',
+        m_, 1, k_,
+        scale,
+        THCTensor_(data)(state, gradOutput_n), k_,
+        THCTensor_(data)(state, ones), k_,
+        ScalarConvert<int, real>::to(1),
+        THCTensor_(data)(state, gradBias), m_
+    );
+    #endif
+  }
+
+  // Free
+  THCTensor_(free)(state, input_n);
+  THCTensor_(free)(state, gradOutput_n);
+
+  // Input was 4D: resize gradOutput and input back to 4D shapes
+  if (batch == 0) {
+    THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+    THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
+  }
+}
+
+#endif
diff --git a/lib/THCUNN/generic/VolumetricMaxPooling.cu b/lib/THCUNN/generic/VolumetricMaxPooling.cu
new file mode 100644
index 0000000..4a55a45
--- /dev/null
+++ b/lib/THCUNN/generic/VolumetricMaxPooling.cu
@@ -0,0 +1,36 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/VolumetricMaxPooling.cu"
+#else
+// Non-dilated max pooling forward: delegates to the dilated implementation with all three dilations set to 1.
+void THNN_(VolumetricMaxPooling_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCIndexTensor *indices,
+           int kT, int kW, int kH,
+           int dT, int dW, int dH,
+           int padT, int padW, int padH,
+           bool ceilMode)
+{
+  const int unitDilation = 1;
+  THNN_(VolumetricDilatedMaxPooling_updateOutput)(
+    state, input, output, indices, kT, kW, kH, dT, dW, dH,
+    padT, padW, padH, unitDilation, unitDilation, unitDilation, ceilMode);
+}
+// Non-dilated max pooling backward: delegates to the dilated implementation with all three dilations set to 1.
+void THNN_(VolumetricMaxPooling_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCIndexTensor *indices,
+           int dT, int dW, int dH,
+           int padT, int padW, int padH)
+{
+  const int unitDilation = 1;
+  THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
+    state, input, gradOutput, gradInput, indices, dT, dW, dH,
+    padT, padW, padH, unitDilation, unitDilation, unitDilation);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/VolumetricMaxUnpooling.cu b/lib/THCUNN/generic/VolumetricMaxUnpooling.cu
new file mode 100644
index 0000000..2a68094
--- /dev/null
+++ b/lib/THCUNN/generic/VolumetricMaxUnpooling.cu
@@ -0,0 +1,192 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/VolumetricMaxUnpooling.cu"
+#else
+// Forward pass of max unpooling: resizes and zeroes output, then launches the updateOutput kernel with input, indices and output.
+void THNN_(VolumetricMaxUnpooling_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCIndexTensor *indices,
+           int outputTime, int outputWidth, int outputHeight,
+           int dT, int dW, int dH,
+           int padT, int padW, int padH)
+{
+  int batchSize;
+  int inputSlices;
+  int inputTime;
+  int inputHeight;
+  int inputWidth;
+
+  THCUNN_assertSameGPU_generic(state, 3, input, indices, output);
+
+  if (THCTensor_(nDimension)(state, input) == 4)
+  {
+    /* sizes: 4D input is read as (slices, time, height, width) */
+    batchSize   = 1;
+    inputSlices = THCTensor_(size)(state, input, 0);
+    inputTime   = THCTensor_(size)(state, input, 1);
+    inputHeight = THCTensor_(size)(state, input, 2);
+    inputWidth  = THCTensor_(size)(state, input, 3);
+  }
+  else if (THCTensor_(nDimension)(state, input) == 5)
+  {
+    /* sizes: 5D input is read as (batch, slices, time, height, width) */
+    batchSize   = THCTensor_(size)(state, input, 0);
+    inputSlices = THCTensor_(size)(state, input, 1);
+    inputTime   = THCTensor_(size)(state, input, 2);
+    inputHeight = THCTensor_(size)(state, input, 3);
+    inputWidth  = THCTensor_(size)(state, input, 4);
+  }
+  else
+  {
+    THArgCheck(false, 2, "4D or 5D tensor expected");
+  }
+
+  if (input->nDimension == 4) /* 4D */
+  {
+    /* resize output to (slices, outputTime, outputHeight, outputWidth) */
+    THCTensor_(resize4d)(state, output, inputSlices,
+                          outputTime, outputHeight, outputWidth);
+  }
+  else
+  { /* 5D */
+    THCTensor_(resize5d)(state, output, batchSize, inputSlices,
+                          outputTime, outputHeight, outputWidth);
+  }
+
+  input = THCTensor_(newContiguous)(state, input);
+  indices = THCIndexTensor_(newContiguous)(state, indices);
+  THCTensor_(zero)(state, output);
+
+  // Collapse batch and feature dimensions
+  THCDeviceTensor<real, 4> cudaInput;
+  THCDeviceTensor<real, 4> cudaOutput;
+  THCDeviceTensor<THCIndex_t, 4> cudaIndices;
+
+  if (THCTensor_(nDimension)(state, input) == 4)
+  {
+    cudaInput  = toDeviceTensor<real, 4>(state, input);
+    cudaOutput = toDeviceTensor<real, 4>(state, output);
+    cudaIndices = toDeviceTensor<THCIndex_t, 4>(state, indices);
+  }
+  else
+  {
+    cudaInput  = toDeviceTensor<real, 5>(state, input).downcastOuter<4>();
+    cudaOutput = toDeviceTensor<real, 5>(state, output).downcastOuter<4>();
+    cudaIndices = toDeviceTensor<THCIndex_t, 5>(state, indices).downcastOuter<4>();
+  }
+  // Grid x/y tile the input width/height; z covers time*slices*batch in chunks of at most 65535, with offsetZ passed to the kernel
+  int totalZ = inputTime * inputSlices * batchSize;
+  int offsetZ = 0;
+  dim3 block(32, 8);
+
+  while (totalZ > 0) {
+    dim3 grid(THCCeilDiv(inputWidth, static_cast<int>(block.x)),
+              THCCeilDiv(inputHeight, static_cast<int>(block.y)),
+              totalZ > 65535 ? 65535 : totalZ);
+
+    cuda_VolumetricMaxUnpooling_updateOutput<<<grid, block,
+          0, THCState_getCurrentStream(state)>>>(
+                             cudaInput, cudaIndices, cudaOutput,
+                             dT, dH, dW,
+                             padT, padH, padW, offsetZ);
+    THCudaCheck(cudaGetLastError());
+    totalZ -= 65535;
+    offsetZ += 65535;
+  }
+  // Release the tensors obtained from newContiguous above
+  THCTensor_(free)(state, input);
+  THCIndexTensor_(free)(state, indices);
+}
+// Backward pass of max unpooling: zeroes gradInput (sized like input) and launches the updateGradInput kernel with gradOutput, indices and gradInput.
+void THNN_(VolumetricMaxUnpooling_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCIndexTensor *indices,
+           int outputTime, int outputWidth, int outputHeight,
+           int dT, int dW, int dH,
+           int padT, int padW, int padH)
+{
+
+  int batchSize;
+  int inputSlices;
+  int inputTime;
+  int inputHeight;
+  int inputWidth;
+  // NOTE: outputTime, outputWidth and outputHeight are not read anywhere in this function
+  THCUNN_assertSameGPU_generic(state, 4, input, indices, gradOutput, gradInput);
+
+  if (THCTensor_(nDimension)(state, input) == 4) /* 4D */
+  {
+    batchSize = 1;
+    inputSlices  = THCTensor_(size)(state, input, 0);
+    inputTime   = THCTensor_(size)(state, input, 1);
+    inputHeight = THCTensor_(size)(state, input, 2);
+    inputWidth  = THCTensor_(size)(state, input, 3);
+  }
+  else
+  {
+    batchSize    = THCTensor_(size)(state, input, 0);
+    inputSlices  = THCTensor_(size)(state, input, 1);
+    inputTime   = THCTensor_(size)(state, input, 2);
+    inputHeight = THCTensor_(size)(state, input, 3);
+    inputWidth  = THCTensor_(size)(state, input, 4);
+  }
+  // NOTE: any input that is not 4D is handled as 5D above; no dimension check is performed here
+  input = THCTensor_(newContiguous)(state, input);
+  indices = THCIndexTensor_(newContiguous)(state, indices);
+  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+  THCTensor_(resizeAs)(state, gradInput, input);
+  THCTensor_(zero)(state, gradInput);
+
+  // Collapse batch and feature dimensions
+  THCDeviceTensor<real, 4> cudaGradInput;
+  THCDeviceTensor<real, 4> cudaGradOutput;
+  THCDeviceTensor<THCIndex_t, 4> cudaIndices;
+
+  if (THCTensor_(nDimension)(state, input) == 4)
+  {
+    cudaGradInput  = toDeviceTensor<real, 4>(state, gradInput);
+    cudaGradOutput = toDeviceTensor<real, 4>(state, gradOutput);
+    cudaIndices = toDeviceTensor<THCIndex_t, 4>(state, indices);
+  }
+  else
+  {
+    cudaGradInput =
+      toDeviceTensor<real, 5>(state, gradInput).downcastOuter<4>();
+    cudaGradOutput =
+      toDeviceTensor<real, 5>(state, gradOutput).downcastOuter<4>();
+    cudaIndices =
+      toDeviceTensor<THCIndex_t, 5>(state, indices).downcastOuter<4>();
+  }
+  // Grid x/y tile the input width/height; z covers time*slices*batch in chunks of at most 65535, with offsetZ passed to the kernel
+  int totalZ = inputTime * inputSlices * batchSize;
+  int offsetZ = 0;
+  dim3 block(32, 8);
+
+  while (totalZ > 0) {
+    dim3 grid(THCCeilDiv(inputWidth, static_cast<int>(block.x)),
+              THCCeilDiv(inputHeight, static_cast<int>(block.y)),
+              totalZ > 65535 ? 65535 : totalZ);
+
+    cuda_VolumetricMaxUnpooling_updateGradInput<<<grid, block,
+      0, THCState_getCurrentStream(state)>>>(
+                                             cudaGradOutput,
+                                             cudaIndices,
+                                             cudaGradInput,
+                                             dT, dH, dW,
+                                             padT, padH, padW, offsetZ);
+    THCudaCheck(cudaGetLastError());
+    totalZ -= 65535;
+    offsetZ += 65535;
+  }
+
+  // cleanup: release the tensors obtained from newContiguous above
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, gradOutput);
+  THCIndexTensor_(free)(state, indices);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/VolumetricReplicationPadding.cu b/lib/THCUNN/generic/VolumetricReplicationPadding.cu
new file mode 100644
index 0000000..60bf36c
--- /dev/null
+++ b/lib/THCUNN/generic/VolumetricReplicationPadding.cu
@@ -0,0 +1,126 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/VolumetricReplicationPadding.cu"
+#else
+
+// Resizes `output` to the padded shape of `input` and launches the
+// VolumetricReplicationPadding_updateOutput kernel on the current stream.
+// `input` must be 4D (planes, depth, height, width) or 5D with a leading
+// batch dimension; each padded extent is the input extent plus both pads.
+void THNN_(VolumetricReplicationPadding_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           int pleft, int pright,
+           int ptop, int pbottom,
+           int pfront, int pback) {
+  THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2,
+             "input tensor must fit into 32-bit index math");
+
+  int planeDim = 0;
+  int dimd = 1;
+  int dimh = 2;
+  int dimw = 3;
+  int numBatch = 1;
+
+  int numInputDims = THCTensor_(nDimension)(state, input);
+  THArgCheck(numInputDims == 4 || numInputDims == 5, 2,
+             "input must be 4 or 5-dimensional");
+
+  // Batched input: every dimension index shifts right by one.
+  if (numInputDims == 5) {
+    numBatch = THCTensor_(size)(state, input, 0);
+    planeDim++;
+    dimd++;
+    dimh++;
+    dimw++;
+  }
+
+  int numPlanes = THCTensor_(size)(state, input, planeDim);
+  int inputD = THCTensor_(size)(state, input, dimd);
+  int inputH = THCTensor_(size)(state, input, dimh);
+  int inputW = THCTensor_(size)(state, input, dimw);
+  int outputD = inputD + pfront + pback;
+  int outputH = inputH + ptop + pbottom;
+  int outputW  = inputW + pleft + pright;
+
+  THCDeviceTensor<real, 5> devInput;
+  THCDeviceTensor<real, 5> devOutput;
+
+  if (numInputDims == 4) {
+    THCTensor_(resize4d)(state, output, numPlanes, outputD, outputH, outputW);
+
+    devInput = toDeviceTensor<real, 4>(state, input).upcastOuter<5>();
+    devOutput = toDeviceTensor<real, 4>(state, output).upcastOuter<5>();
+  } else {
+    THCTensor_(resize5d)(state, output, numBatch, numPlanes, outputD, outputH,
+                          outputW);
+
+    devInput = toDeviceTensor<real, 5>(state, input);
+    devOutput = toDeviceTensor<real, 5>(state, output);
+  }
+
+  // Grid: x covers one output plane in chunks of up to 256 threads,
+  // y takes the size of dimension 1 and z the size of dimension 0 of the 5D view.
+  int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3) *
+      devOutput.getSize(4);
+  dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
+            devOutput.getSize(1),
+            devOutput.getSize(0));
+  dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
+
+  VolumetricReplicationPadding_updateOutput<real><<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
+    devInput, devOutput, pfront, pback, ptop, pbottom, pleft, pright);
+  // Surface launch failures here instead of at a later, unrelated CUDA call.
+  THCudaCheck(cudaGetLastError());
+}
+
+// Resizes `gradInput` like `input`, zeroes it, and launches the
+// VolumetricReplicationPadding_updateGradInput kernel on the current stream
+// with `gradOutput` and the given padding amounts.
+void THNN_(VolumetricReplicationPadding_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           int pleft, int pright,
+           int ptop, int pbottom,
+           int pfront, int pback) {
+  THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2,
+             "input tensor must fit into 32-bit index math");
+  THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, gradOutput),
+             3, "output gradient tensor must fit into 32-bit index math");
+
+  int numInputDims = THCTensor_(nDimension)(state, input);
+  // Enforce the same rank requirement as updateOutput.
+  THArgCheck(numInputDims == 4 || numInputDims == 5, 2,
+             "input must be 4 or 5-dimensional");
+
+  THCTensor_(resizeAs)(state, gradInput, input);
+  THCTensor_(zero)(state, gradInput);
+
+  THCDeviceTensor<real, 5> devGradInput;
+  THCDeviceTensor<real, 5> devGradOutput;
+
+  if (numInputDims == 4) {
+    devGradInput = toDeviceTensor<real, 4>(state, gradInput).upcastOuter<5>();
+    devGradOutput =
+        toDeviceTensor<real, 4>(state, gradOutput).upcastOuter<5>();
+  } else {
+    devGradInput = toDeviceTensor<real, 5>(state, gradInput);
+    devGradOutput = toDeviceTensor<real, 5>(state, gradOutput);
+  }
+
+  int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3) *
+      devGradOutput.getSize(4);
+  dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
+            devGradOutput.getSize(1),
+            devGradOutput.getSize(0));
+  dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
+
+  VolumetricReplicationPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
+    devGradInput, devGradOutput, pfront, pback, ptop, pbottom, pleft, pright);
+  // Surface launch failures here instead of at a later, unrelated CUDA call.
+  THCudaCheck(cudaGetLastError());
+}
+
+#endif
diff --git a/lib/THCUNN/im2col.h b/lib/THCUNN/im2col.h
index a81dbd2..ba57263 100644
--- a/lib/THCUNN/im2col.h
+++ b/lib/THCUNN/im2col.h
@@ -2,6 +2,7 @@
 #define THCUNN_IM2COL_H
 
 #include "common.h"
+#include "THCNumerics.cuh"
 
 // Kernel for fast unfold+copy
 // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
@@ -29,7 +30,7 @@ __global__ void im2col_kernel(const int n, const Dtype* data_im,
         int h = h_in + i * dilation_h;
         int w = w_in + j * dilation_w;
         *data_col = (h >= 0 && w >= 0 && h < height && w < width) ?
-          data_im[i * dilation_h * width + j * dilation_w] : 0;
+          data_im[i * dilation_h * width + j * dilation_w] : ScalarConvert<int, Dtype>::to(0);
         data_col += height_col * width_col;
       }
     }
@@ -59,7 +60,7 @@ void im2col(cudaStream_t stream, const Dtype* data_im, const int channels,
   THCudaCheck(cudaGetLastError());
 }
 
-template <typename Dtype>
+template <typename Dtype, typename Acctype>
 __global__ void col2im_kernel(const int n, const Dtype* data_col,
                                   const int height, const int width, const int channels,
                                   const int kernel_h, const int kernel_w,
@@ -69,7 +70,7 @@ __global__ void col2im_kernel(const int n, const Dtype* data_col,
                                   const int height_col, const int width_col,
                                   Dtype* data_im) {
   CUDA_KERNEL_LOOP(index, n) {
-    Dtype val = 0;
+    Acctype val = Acctype(0);
     const int w_im = index % width + pad_w;
     const int h_im = (index / width) % height + pad_h;
     const int c_im = index / (width * height);
@@ -96,11 +97,11 @@ __global__ void col2im_kernel(const int n, const Dtype* data_col,
         }
       }
     }
-    data_im[index] = val;
+    data_im[index] = ScalarConvert<Acctype, Dtype>::to(val);
   }
 }
 
-template <typename Dtype>
+template <typename Dtype, typename Acctype>
 void col2im(cudaStream_t stream, const Dtype* data_col, const int channels,
             const int height, const int width,
             const int patch_h, const int patch_w, const int pad_h,
@@ -113,7 +114,7 @@ void col2im(cudaStream_t stream, const Dtype* data_col, const int channels,
   int num_kernels = channels * height * width;
   // To avoid involving atomic operations, we will launch one kernel per
   // bottom dimension, and then in the kernel add up the top dimensions.
-  col2im_kernel <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>> (
+  col2im_kernel<Dtype, Acctype> <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>> (
       num_kernels, data_col, height, width, channels,
       patch_h, patch_w, pad_h, pad_w, stride_h, stride_w,
       dilation_h, dilation_w,
diff --git a/lib/THCUNN/vol2col.h b/lib/THCUNN/vol2col.h
index add88e1..15b110e 100644
--- a/lib/THCUNN/vol2col.h
+++ b/lib/THCUNN/vol2col.h
@@ -2,6 +2,7 @@
 #define THCUNN_VOL2COL_H
 
 #include "common.h"
+#include "THCNumerics.cuh"
 
 // Kernel for fast unfold+copy on volumes
 template <typename Dtype>
@@ -33,7 +34,7 @@ CUDA_KERNEL_LOOP(index, n) {
           int h = h_in + j * dilation_h;
           int w = w_in + k * dilation_w;
           *data_col = (t >= 0 && h >= 0 && w >= 0 && t < depth && h < height && w < width) ?
-            data_vol[i * dilation_t * height * width + j * dilation_h * width + k * dilation_w] : 0;
+            data_vol[i * dilation_t * height * width + j * dilation_h * width + k * dilation_w] : ScalarConvert<int, Dtype>::to(0);
           data_col += depth_col * height_col * width_col;
         }
       }
@@ -65,7 +66,7 @@ void vol2col(cudaStream_t stream, const Dtype* data_vol, const int channels,
   THCudaCheck(cudaGetLastError());
 }
 
-template <typename Dtype>
+template <typename Dtype, typename Acctype>
 __global__ void vol2im_kernel(const int n, const Dtype* data_col,
     const int depth, const int height, const int width, const int channels,
     const int kernel_t, const int kernel_h, const int kernel_w,
@@ -75,7 +76,7 @@ __global__ void vol2im_kernel(const int n, const Dtype* data_col,
     const int depth_col, const int height_col, const int width_col,
     Dtype* data_vol) {
   CUDA_KERNEL_LOOP(index, n) {
-    Dtype val = 0;
+    Acctype val = Acctype(0);
     const int w_im = index % width + pad_w;
     const int h_im = (index / width) % height + pad_h;
     const int t_im = (index / width / height) % depth + pad_t;
@@ -112,11 +113,11 @@ __global__ void vol2im_kernel(const int n, const Dtype* data_col,
         }
       }
     }
-    data_vol[index] = val;
+    data_vol[index] = ScalarConvert<Acctype, Dtype>::to(val);
   }
 }
 
-template <typename Dtype>
+template <typename Dtype, typename Acctype>
 void col2vol(cudaStream_t stream, const Dtype* data_col, const int channels,
     const int depth, const int height, const int width,
     const int patch_t, const int patch_h, const int patch_w,
@@ -130,7 +131,7 @@ void col2vol(cudaStream_t stream, const Dtype* data_col, const int channels,
   int num_kernels = channels * depth * height * width;
   // To avoid involving atomic operations, we will launch one kernel per
   // bottom dimension, and then in the kernel add up the top dimensions.
-  vol2im_kernel <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>> (
+  vol2im_kernel<Dtype, Acctype> <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>> (
       num_kernels, data_col, depth, height, width, channels,
       patch_t, patch_h, patch_w, pad_t, pad_h, pad_w, stride_t, stride_h, stride_w,
       dilation_t, dilation_h, dilation_w,
diff --git a/test.lua b/test.lua
index 48beced..7cb2854 100644
--- a/test.lua
+++ b/test.lua
@@ -1,4 +1,5 @@
 local cunntest = torch.TestSuite()
+local ffi = require 'ffi'
 local precision_forward = 1e-4
 local precision_backward = 1e-2
 local nloop = 1
@@ -6,134 +7,183 @@ local times = {}
 
 --e.g.: th -lcunn -e "nn.testcuda{'Sigmoid_forward'}"
 
-local function pointwise_forward(proto_module, name, max_error)
-   local size = math.random(1,100)
+local typenames = {
+  'torch.CudaTensor',
+  'torch.CudaDoubleTensor',
+}
 
-   local tm = {}
-   local title = string.format(name..'.forward %d -> %d', size, size)
-   times[title] = tm
+local t2cpu = {
+  ['torch.CudaTensor'] = 'torch.FloatTensor',
+  ['torch.CudaDoubleTensor'] = 'torch.DoubleTensor',
 
-   local input = torch.randn(size)
-   if name == 'Sqrt' then input:abs() end
-   local sconv = proto_module
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
+}
+
+local function checkHalf()
+   if cutorch.hasHalf then
+       table.insert(typenames, 'torch.CudaHalfTensor')
+       t2cpu['torch.CudaHalfTensor'] = 'torch.FloatTensor'
    end
-   tm.cpu = a:time().real
+end
 
-   input = input:cuda()
-   local gconv = proto_module:clone():cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
+-- workarounds for non-existent functions
+function torch.CudaHalfTensor:mean()
+   return self:cuda():mean()
+end
+
+function torch.CudaDoubleTensor:mean()
+   return self:cuda():mean()
+end
+
+local function half_max_error(maxabs)
+  -- arbitrarily double the precision limit
+  return 2 * ((maxabs and (2^(math.floor(math.log(maxabs) / math.log(2)))) * (2^(-10))) or 0)
+end
+
+-- half has additional error on top of double/float
+local function precision_forward_type(precision_f, tensor_type, maxabs)
+   if (tensor_type == 'torch.CudaHalfTensor') then
+      return 1e-2 + precision_f + half_max_error(maxabs)
+   else
+      return precision_f
    end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+end
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), max_error, 'error on state (forward) ')
+local function precision_backward_type(precision_b, tensor_type, maxabs)
+   if (tensor_type == 'torch.CudaHalfTensor') then
+      return 1e-1 + precision_b + half_max_error(maxabs)
+   else
+      return precision_b
+   end
+end
+
+local function precision_backward_conv_weightbias(precision_b, tensor_type, maxabs)
+   if (tensor_type == 'torch.CudaHalfTensor') then
+      -- cudnn uses 8 here
+      return 2 + precision_b + half_max_error(maxabs)
+   else
+      return precision_b
+   end
+end
+
+local function pointwise_forward(proto_module, name, max_error)
+   local size = math.random(1,100)
+
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(size):type(typename)
+      local ctype = t2cpu[typename]
+      local input = input:type(ctype)
+      if name == 'Sqrt' then input:abs() end
+      local sconv = proto_module:type(ctype)
+      local groundtruth = sconv:forward(input)
+
+      input = input:type(typename)
+      local gconv = proto_module:clone():type(typename)
+      local rescuda = gconv:forward(input)
+
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(max_error, typename),
+        string.format('error on state (forward) with %s', typename))
+    end
 end
 
 local function pointwise_backward(proto_module, name, max_error)
    local size = math.random(1,100)
 
-   local tm = {}
-   local title = string.format(name..'.backward %d -> %d', size, size)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(size):type(typename)
+      local gradOutput = torch.randn(size):type(typename)
 
-   local input = torch.randn(size)
-   if name == 'Sqrt' then input:abs() end
-   local gradOutput = torch.randn(size)
-   local sconv = proto_module
-   sconv:forward(input)
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      if name == 'Sqrt' then input:abs() end
+      local sconv = proto_module:type(ctype)
+      sconv:forward(input)
+      local groundgrad = sconv:backward(input, gradOutput)
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = proto_module:clone():cuda()
-   gconv:forward(input)
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gconv = proto_module:clone():type(typename)
+      gconv:forward(input)
+      local rescuda = gconv:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double()
 
-   mytester:assertlt(error:abs():max(), max_error, 'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(max_error, typename),
+        string.format('error on state (backward) with %s', typename))
+    end
 end
 
 local function pointwise_backward_inplace(proto_module, name)
    local size = math.random(1,100)
 
-   local tm = {}
-   local title = string.format(name..'.backward_inplace %d -> %d', size, size)
-   times[title] = tm
-
-   local input = torch.randn(size)
-   if name == 'Sqrt' then input:abs() end
-   local gradOutput = torch.randn(size)
-   local sconv = proto_module
-   local groundgrad = sconv:backward(input, gradOutput)
-   mytester:assertTensorEq(groundgrad:float(),
-                           gradOutput:float(),
-                           0.000001, "inplace not respected")
-
-   local input = torch.randn(size):cuda()
-   if name == 'Sqrt' then input:abs() end
-   local gradOutput = torch.randn(size):cuda()
-   local sconv = proto_module:clone():cuda()
-   local groundgrad = sconv:backward(input, gradOutput)
-   mytester:assertTensorEq(groundgrad:float(),
-                           gradOutput:float(),
-                           0.000001, "cuda inplace not respected")
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(size):type(typename)
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      if name == 'Sqrt' then input:abs() end
+      local gradOutput = torch.randn(size)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = proto_module:type(ctype)
+      local groundgrad = sconv:backward(input, gradOutput)
+      mytester:assertTensorEq(groundgrad:double(),
+                              gradOutput:double(),
+                              0.000001,
+                              string.format("inplace not respected for %s", ctype))
+
+      input = torch.randn(size)
+      input = input:type(typename)
+      if name == 'Sqrt' then input:abs() end
+      gradOutput = torch.randn(size)
+      gradOutput = gradOutput:type(typename)
+      local sconv = proto_module:clone():type(typename)
+      local groundgrad = sconv:backward(input, gradOutput)
+      mytester:assertTensorEq(groundgrad:double(),
+                              gradOutput:double(),
+                              0.000001,
+                              string.format("cuda inplace not respected for %s", typename))
+    end
 end
 
 local function pointwise_transposed(proto_module, name, max_error)
    max_error = max_error or 1e-7
-   local tm = {}
-   local title = name .. '.transposed'
-   times[title] = tm
 
-   local input = torch.Tensor(11, 19):uniform(-1, 1)
-   if name == 'Sqrt' then
-      input:uniform(0.1, 1)
-   end
-   local inputCUDA = input:clone():cuda()
+   for k, typename in ipairs(typenames) do
+      local ctype = t2cpu[typename]
+      local input = torch.Tensor(11, 19):uniform(-1, 1):type(typename)
+      input = input:type(ctype)
+      local proto_module = proto_module:type(ctype)
+      if name == 'Sqrt' then
+        input:uniform(0.1, 1)
+      end
+      local inputCUDA = input:clone():type(typename)
 
-   local cuda_module = proto_module:clone():cuda()
+      local cuda_module = proto_module:clone():type(typename)
 
-   -- transpose the inputs and DON'T make contiguous
-   input = input:transpose(1, 2)
-   inputCUDA = inputCUDA:transpose(1, 2)
+      -- transpose the inputs and DON'T make contiguous
+      input = input:transpose(1, 2)
+      inputCUDA = inputCUDA:transpose(1, 2)
 
-   local output = proto_module:forward(input)
-   local outputCUDA = cuda_module:forward(inputCUDA)
+      local output = proto_module:forward(input)
+      local outputCUDA = cuda_module:forward(inputCUDA)
 
-   local error = outputCUDA:float() - output
-   mytester:assertlt(error:abs():max(), max_error, 'error on state (forward) ')
+      local error = outputCUDA:double() - output:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(max_error, typename),
+        string.format('error on state (forward) for %s', typename))
 
-   local gradOutput = torch.Tensor(11, 19):uniform(-1, 1)
-   local gradOutputCUDA = gradOutput:clone():cuda()
+      local gradOutput = torch.Tensor(11, 19):uniform(-1, 1):type(ctype)
+      local gradOutputCUDA = gradOutput:clone():type(typename)
 
-   gradOutput = gradOutput:transpose(1, 2)
-   gradOutputCUDA = gradOutputCUDA:transpose(1, 2)
+      gradOutput = gradOutput:transpose(1, 2)
+      gradOutputCUDA = gradOutputCUDA:transpose(1, 2)
 
-   local gradInput = proto_module:backward(input, gradOutput)
-   local gradInputCUDA  = cuda_module:backward(inputCUDA, gradOutputCUDA)
+      local gradInput = proto_module:backward(input, gradOutput)
+      local gradInputCUDA  = cuda_module:backward(inputCUDA, gradOutputCUDA)
 
-   local error = gradInputCUDA:float() - gradInput
-   mytester:assertlt(error:abs():max(), max_error,  'error on state (backward) ')
+      local error = gradInputCUDA:double() - gradInput:double()
+      mytester:assertlt(error:abs():max(), precision_backward_type(max_error, typename),
+        string.format('error on state (backward) for %s', typename))
+    end
 end
 
 function cunntest.Tanh_forward()
@@ -253,29 +303,32 @@ end
 function cunntest.Sqrt_zero()
    local size = math.random(1, 100)
 
-   -- Test zero inputs; we will avoid a div-by-zero by setting to zero
-   local module_gpu = nn.Sqrt():cuda()
-   local input_gpu = torch.CudaTensor(size, size):zero()
-   module_gpu:forward(input_gpu)
+   for k, typename in ipairs(typenames) do
+      -- Test zero inputs; we will avoid a div-by-zero by setting to zero
+      local module_gpu = nn.Sqrt():type(typename)
+      local input_gpu = torch.CudaTensor(size, size):zero():type(typename)
+      module_gpu:forward(input_gpu)
 
-   local gradOutput_gpu = torch.CudaTensor(size, size):fill(1)
-   local gradInput_gpu = module_gpu:backward(input_gpu, gradOutput_gpu)
+      local gradOutput_gpu = torch.CudaTensor(size, size):fill(1):type(typename)
+      local gradInput_gpu = module_gpu:backward(input_gpu, gradOutput_gpu)
 
-   mytester:assertTensorEq(gradInput_gpu:float(),
-                           torch.FloatTensor(size, size):zero(),
-                           0.000001, "error in sqrt backward singularity")
+      mytester:assertTensorEq(gradInput_gpu:double(),
+                              torch.DoubleTensor(size, size):zero(),
+                              0.000001, "error in sqrt backward singularity")
 
-   -- Verify CPU and GPU zero behavior equivalency
-   local module_cpu = nn.Sqrt()
-   local input_cpu = input_gpu:float()
-   module_cpu:forward(input_cpu)
+      -- Verify CPU and GPU zero behavior equivalency
+      local ctype = t2cpu[typename]
+      local module_cpu = nn.Sqrt():type(ctype)
+      local input_cpu = input_gpu:type(ctype)
+      module_cpu:forward(input_cpu)
 
-   local gradOutput_cpu = gradOutput_gpu:float()
-   local gradInput_cpu = module_cpu:backward(input_cpu, gradOutput_cpu)
+      local gradOutput_cpu = gradOutput_gpu:type(ctype)
+      local gradInput_cpu = module_cpu:backward(input_cpu, gradOutput_cpu)
 
-   mytester:assertTensorEq(gradInput_gpu:float(),
-                           gradInput_cpu:float(),
-                           0.000001, "Sqrt_zero CPU and GPU not equivalent")
+      mytester:assertTensorEq(gradInput_gpu:double(),
+                            gradInput_cpu:double(),
+                            0.000001, "Sqrt_zero CPU and GPU not equivalent")
+    end
 end
 
 function cunntest.Sqrt_transposed()
@@ -295,15 +348,18 @@ function cunntest.Square_transposed()
 end
 
 function cunntest.SoftShrink_forward()
-  pointwise_forward(nn.SoftShrink(math.random()), 'SoftShrink', precision_forward)
+  local r = ffi.C.THC_half2float(ffi.C.THC_float2half(math.random()))
+  pointwise_forward(nn.SoftShrink(r), 'SoftShrink', precision_forward)
 end
 
 function cunntest.SoftShrink_backward()
-  pointwise_backward(nn.SoftShrink(math.random()), 'SoftShrink', precision_backward)
+  local r = ffi.C.THC_half2float(ffi.C.THC_float2half(math.random()))
+  pointwise_backward(nn.SoftShrink(r), 'SoftShrink', precision_backward)
 end
 
 function cunntest.SoftShrink_transposed()
-  pointwise_transposed(nn.SoftShrink(math.random()), 'SoftShrink', precision_backward)
+  local r = ffi.C.THC_half2float(ffi.C.THC_float2half(math.random()))
+  pointwise_transposed(nn.SoftShrink(r), 'SoftShrink', precision_backward)
 end
 
 function cunntest.ELU_forward()
@@ -363,67 +419,48 @@ function cunntest.LogSoftMax_forward_batch()
    local size = math.random(1,256)
    local bs = math.random(32,256)
 
-   local tm = {}
-   local title = string.format('LogSoftMax forward batch %d x %d -> %d x %d', bs, size, bs, size)
-   times[title] = tm
-
-   local input = torch.randn(bs, size)
-   local sconv = nn.LogSoftMax()
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(bs, size):type(typename)
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sconv = nn.LogSoftMax():type(ctype)
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = nn.LogSoftMax():cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gconv = nn.LogSoftMax():type(typename)
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward*10, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward*10, typename),
+          string.format('error on state (forward) with %s', typename))
+    end
 end
 
 function cunntest.LogSoftMax_backward_batch()
    local size = math.random(1,256)
    local bs = math.random(32,256)
 
-   local tm = {}
-   local title = string.format('LogSoftMax.backward batch %d x %d -> %d x %d', bs, size, bs, size)
-   times[title] = tm
-
-   local input = torch.randn(bs, size)
-   local gradOutput = torch.randn(bs, size)
-   local sconv = nn.LogSoftMax()
-   sconv:forward(input)
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(bs, size):type(typename)
+      local gradOutput = torch.randn(bs, size):type(typename)
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.LogSoftMax():type(ctype)
+      sconv:forward(input)
+      local groundgrad = sconv:backward(input, gradOutput)
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = sconv:clone():cuda()
-   gconv:forward(input)
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gconv = sconv:clone():type(typename)
+      gconv:forward(input)
+      local rescuda = gconv:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double()
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on state (backward) with %s', typename))
+    end
 end
 
 function cunntest.SpatialLogSoftMax_forward()
@@ -431,31 +468,22 @@ function cunntest.SpatialLogSoftMax_forward()
    local ini = math.random(8,32)
    local inj = math.random(8,32)
 
-   local tm = {}
-   local title = string.format('SpatialLogSoftMax forward %d x %d x %d', size, inj, ini)
-   times[title] = tm
-
-   local input = torch.randn(size, inj, ini)
-   local sconv = nn.SpatialLogSoftMax()
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(size, inj, ini):type(typename)
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sconv = nn.SpatialLogSoftMax():type(ctype)
+      local groundtruth = sconv:forward(input):type(ctype)
 
-   input = input:cuda()
-   local gconv = nn.SpatialLogSoftMax():cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gconv = nn.SpatialLogSoftMax():type(typename)
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward*25, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(),
+          precision_forward_type(precision_forward*25, typename),
+          string.format('error on state (forward) with %s', typename))
+    end
 end
 
 function cunntest.SpatialLogSoftMax_backward()
@@ -463,36 +491,27 @@ function cunntest.SpatialLogSoftMax_backward()
    local ini = math.random(8,32)
    local inj = math.random(8,32)
 
-   local tm = {}
-   local title = string.format('SpatialLogSoftMax.backward %d x %d x %d', size, inj, ini)
-   times[title] = tm
-
-   local input = torch.randn(size, inj, ini)
-   local gradOutput = torch.randn(size, inj, ini)
-   local sconv = nn.SpatialLogSoftMax()
-   sconv:forward(input)
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(size, inj, ini):type(typename)
+      local gradOutput = torch.randn(size, inj, ini):type(typename)
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.SpatialLogSoftMax():type(ctype)
+      sconv:forward(input)
+      local groundgrad = sconv:backward(input, gradOutput)
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = sconv:clone():cuda()
-   gconv:forward(input)
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gconv = sconv:clone():type(typename)
+      gconv:forward(input)
+      local rescuda = gconv:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double()
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on state (backward) with %s', typename))
+    end
 end
 
 function cunntest.SpatialLogSoftMax_forward_batch()
@@ -501,31 +520,22 @@ function cunntest.SpatialLogSoftMax_forward_batch()
    local ini = math.random(8,32)
    local inj = math.random(8,32)
 
-   local tm = {}
-   local title = string.format('SpatialLogSoftMax forward batch %d x %d x %d x %d', bs, size, inj, ini)
-   times[title] = tm
-
-   local input = torch.randn(bs, size, inj, ini)
-   local sconv = nn.SpatialLogSoftMax()
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(bs, size, inj, ini):type(typename)
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sconv = nn.SpatialLogSoftMax():type(ctype)
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = nn.SpatialLogSoftMax():cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gconv = nn.SpatialLogSoftMax():type(typename)
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward*25, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(),
+          precision_forward_type(precision_forward*25, typename),
+          string.format('error on state (forward) with %s', typename))
+    end
 end
 
 function cunntest.SpatialLogSoftMax_backward_batch()
@@ -534,36 +544,27 @@ function cunntest.SpatialLogSoftMax_backward_batch()
    local ini = math.random(8,32)
    local inj = math.random(8,32)
 
-   local tm = {}
-   local title = string.format('SpatialLogSoftMax.backward batch %d x %d x %d x %d', bs, size, inj, ini)
-   times[title] = tm
-
-   local input = torch.randn(bs, size, inj, ini)
-   local gradOutput = torch.randn(bs, size, inj, ini)
-   local sconv = nn.SpatialLogSoftMax()
-   sconv:forward(input)
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(bs, size, inj, ini):type(typename)
+      local gradOutput = torch.randn(bs, size, inj, ini):type(typename)
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.SpatialLogSoftMax():type(ctype)
+      sconv:forward(input)
+      local groundgrad = sconv:backward(input, gradOutput)
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = sconv:clone():cuda()
-   gconv:forward(input)
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gconv = sconv:clone():type(typename)
+      gconv:forward(input)
+      local rescuda = gconv:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double()
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on state (backward) with %s', typename))
+    end
 end
 
 
@@ -732,44 +733,35 @@ function cunntest.SparseLinear_forward()
     local ini = math.random(50,100)
     local inj = math.random(5,10)
 
-    local module = nn.SparseLinear(ini,inj)
-    local sslin = module
-    local gslin = module:clone():cuda()
-
-    -- Create a random sparse vector
-    local input = {}
-    for i=1,inb do
-        local nnz = math.random(5, 10)
-        local inds = torch.randperm(ini)[{{1,nnz}}]
-        input[i] = torch.Tensor(nnz, 2)
-        input[i]:select(2,1):copy(inds)
-        input[i]:select(2,2):copy(torch.rand(nnz))
-    end
+    for k, typename in ipairs(typenames) do
+        if typename ~= "torch.CudaHalfTensor" then
+            local ctype = t2cpu[typename]
+            local module = nn.SparseLinear(ini,inj):type(ctype)
+            local sslin = module
+            local gslin = module:clone():type(typename)
+
+            -- Create a random sparse vector
+            local input = {}
+            for i=1,inb do
+                local nnz = math.random(5, 10)
+                local inds = torch.randperm(ini)[{{1,nnz}}]
+                input[i] = torch.Tensor(nnz, 2):type(ctype)
+                input[i]:select(2,1):copy(inds)
+                input[i]:select(2,2):copy(torch.rand(nnz):type(typename):type(ctype))
+            end
 
-    local tm = {}
-    local title = string.format('SparseLinear forward %d -> %d', ini, inj)
-    times[title] = tm
+            local groundtruth = sslin:forward(input)
+            sslin:zeroGradParameters()
 
-    local groundtruth = sslin:forward(input)
-    sslin:zeroGradParameters()
-    local a = torch.Timer()
-    for i = 1,nloop do
-        groundtruth = sslin:forward(input)
-    end
-    tm.cpu = a:time().real
-
-    for i,v in ipairs(input) do input[i] = input[i]:cuda() end
-    local rescuda = gslin:forward(input)
-    gslin:zeroGradParameters()
-    a:reset()
-    for i = 1,nloop do
-        rescuda = gslin:forward(input)
-    end
-    cutorch.synchronize()
-    tm.gpu = a:time().real
+            for i,v in ipairs(input) do input[i] = input[i]:type(typename) end
+            local rescuda = gslin:forward(input)
+            gslin:zeroGradParameters()
 
-    local error = rescuda:float() - groundtruth
-    mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+            local error = rescuda:double() - groundtruth:double()
+            mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+                string.format('error on state (forward) with %s', typename))
+        end
+    end
 end
 
 function cunntest.SparseLinear_backward()
@@ -777,201 +769,184 @@ function cunntest.SparseLinear_backward()
     local ini = math.random(50,100)
     local inj = math.random(5,10)
 
-    local gslin = nn.SparseLinear(ini,inj):cuda()
-    local sslin = nn.Linear(ini,inj)
-    gslin.weight = sslin.weight:clone():cuda()
-    gslin.bias = sslin.bias:clone():cuda()
-
-    -- Create a random sparse vector
-    local input = {}
-    local nonsparse = torch.zeros(inb, ini)
-    for i=1,inb do
-        local nnz = math.random(3, 5)
-        local inds = torch.randperm(ini)[{{1,nnz}}]
-        input[i] = torch.Tensor(nnz, 2)
-        input[i]:select(2,1):copy(inds)
-        input[i]:select(2,2):copy(torch.rand(nnz))
-        nonsparse[i]:scatter(1, input[i]:select(2,1):long(), input[i]:select(2,2))
-    end
+    for k, typename in ipairs(typenames) do
+        if typename ~= "torch.CudaHalfTensor" then
+            local ctype = t2cpu[typename]
+            local gslin = nn.SparseLinear(ini,inj):type(typename)
+            local sslin = nn.Linear(ini,inj):type(ctype)
+            gslin.weight = sslin.weight:clone():type(typename)
+            gslin.bias = sslin.bias:clone():type(typename)
+
+            -- Create a random sparse vector
+            local input = {}
+            local nonsparse = torch.zeros(inb, ini):type(ctype)
+            for i=1,inb do
+                local nnz = math.random(3, 5)
+                local inds = torch.randperm(ini)[{{1,nnz}}]
+                input[i] = torch.Tensor(nnz, 2):type(ctype)
+                input[i]:select(2,1):copy(inds)
+                input[i]:select(2,2):copy(torch.rand(nnz):type(typename):type(ctype))
+                nonsparse[i]:scatter(1, input[i]:select(2,1):long(), input[i]:select(2,2))
+            end
 
-    local tm = {}
-    local title = string.format('SparseLinear backward %d <- %d', ini, inj)
-    times[title] = tm
-
-    local gradOutput = torch.randn(inb, inj)
-    sslin:forward(nonsparse)
-    local groundgrad = sslin:backward(nonsparse, gradOutput)
-    sslin:zeroGradParameters()
-    local a = torch.Timer()
-    for i = 1,nloop do
-        sslin:backward(nonsparse, gradOutput)
-    end
-    tm.cpu = a:time().real
-    local groundweight = sslin.gradWeight
-    local groundbias = sslin.gradBias
-
-    for i,v in ipairs(input) do input[i] = input[i]:cuda() end
-    gradOutput = gradOutput:cuda()
-    gslin:forward(input)
-    local rescuda = gslin:backward(input, gradOutput)
-    gslin:zeroGradParameters()
-    a:reset()
-    for i = 1,nloop do
-        gslin:backward(input, gradOutput)
+            local gradOutput = torch.randn(inb, inj):type(typename):type(ctype)
+            sslin:forward(nonsparse)
+            local groundgrad = sslin:backward(nonsparse, gradOutput)
+            sslin:zeroGradParameters()
+            local groundweight = sslin.gradWeight
+            local groundbias = sslin.gradBias
+
+            for i,v in ipairs(input) do input[i] = input[i]:type(typename) end
+            gradOutput = gradOutput:type(typename)
+            gslin:forward(input)
+            local rescuda = gslin:backward(input, gradOutput)
+            gslin:zeroGradParameters()
+            local weightcuda = gslin.gradWeight
+            local biascuda = gslin.gradBias
+
+            local werror = weightcuda:double() - groundweight:double()
+            local berror = biascuda:double() - groundbias:double()
+
+            mytester:assertlt(werror:abs():max(), precision_backward_type(precision_backward, typename),
+                string.format('error on weight (backward) with %s', typename))
+            mytester:assertlt(berror:abs():max(), precision_backward_type(precision_backward, typename),
+                string.format('error on bias (backward) with %s', typename))
+
+            gslin:updateParameters(.1)
+            sslin:updateParameters(.1)
+            werror = gslin.weight:double() - sslin.weight:double()
+            berror = gslin.bias:double() - sslin.bias:double()
+
+            mytester:assertlt(werror:abs():max(), precision_backward_type(precision_backward, typename),
+                string.format('error on weight (update) with %s', typename))
+            mytester:assertlt(berror:abs():max(), precision_backward_type(precision_backward, typename),
+                string.format('error on bias (update) with %s', typename))
+
+            gslin:zeroGradParameters()
+        end
     end
-    local weightcuda = gslin.gradWeight
-    local biascuda = gslin.gradBias
-    cutorch.synchronize()
-    tm.gpu = a:time().real
-
-    local werror = weightcuda:float() - groundweight
-    local berror = biascuda:float() - groundbias
-
-    mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
-    mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
-
-    gslin:updateParameters(.1)
-    sslin:updateParameters(.1)
-    werror = gslin.weight:float() - sslin.weight
-    berror = gslin.bias:float() - sslin.bias
-
-    mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (update) ')
-    mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (update) ')
-
-    gslin:zeroGradParameters()
 end
 
 local function BatchNormalization_forward(moduleName, inputSize)
    local planes = inputSize[2]
-   local tm = {}
-   local title = moduleName .. '.forward ' .. table.concat(inputSize, 'x')
-   times[title] = tm
 
-   local input = torch.randn(table.unpack(inputSize))
-   local sbnorm = nn[moduleName](planes)
-   local groundtruth = sbnorm:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sbnorm:forward(input)
-   end
-   tm.cpu = a:time().real
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(table.unpack(inputSize)):type(typename)
 
-   input = input:cuda()
-   local gbnorm = nn[moduleName](planes):cuda()
-   gbnorm.weight = sbnorm.weight:cuda()
-   gbnorm.bias = sbnorm.bias:cuda()
-   local rescuda = gbnorm:forward(input)
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sbnorm = nn[moduleName](planes):type(ctype)
+      local groundtruth = sbnorm:forward(input)
 
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gbnorm:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gbnorm = nn[moduleName](planes):type(typename)
+      gbnorm.weight = sbnorm.weight:type(typename)
+      gbnorm.bias = sbnorm.bias:type(typename)
+      local rescuda = gbnorm:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward)')
-   mytester:assertlt((gbnorm.running_mean:float() - sbnorm.running_mean):abs():max(),
-      precision_forward, 'error on running_mean (forward)')
-   mytester:assertlt((gbnorm.running_var:float() - sbnorm.running_var):abs():max(),
-      precision_forward, 'error on running_var (forward)')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename, rescuda:abs():max()),
+         string.format('error on state (forward) with %s', typename))
+      mytester:assertlt((gbnorm.running_mean:double() - sbnorm.running_mean:double()):abs():max(),
+         precision_forward_type(precision_forward, typename, gbnorm.running_mean:abs():max()),
+         string.format('error on running_mean (forward) with %s', typename))
+      mytester:assertlt((gbnorm.running_var:double() - sbnorm.running_var:double()):abs():max(),
+         precision_forward_type(precision_forward, typename, gbnorm.running_var:abs():max()),
+         string.format('error on running_var (forward) with %s', typename))
+   end
 end
 
 local function BatchNormalization_forward_inference(moduleName, inputSize)
    local planes = inputSize[2]
-   local tm = {}
-   local title = moduleName .. '.forward (evaluate) ' .. table.concat(inputSize, 'x')
-   times[title] = tm
 
-   local input = torch.randn(table.unpack(inputSize))
-   local sbnorm = nn[moduleName](planes)
-   sbnorm.running_mean:normal(1, 2)
-   sbnorm.running_var:uniform(1e-3, 2)
-   sbnorm:evaluate()
-   local groundtruth = sbnorm:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sbnorm:forward(input)
-   end
-   tm.cpu = a:time().real
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(table.unpack(inputSize)):type(typename)
 
-   input = input:cuda()
-   local gbnorm = nn[moduleName](planes):cuda()
-   gbnorm:evaluate()
-   gbnorm.weight = sbnorm.weight:cuda()
-   gbnorm.bias = sbnorm.bias:cuda()
-   gbnorm.running_mean = sbnorm.running_mean:cuda()
-   gbnorm.running_var = sbnorm.running_var:cuda()
-   local rescuda = gbnorm:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gbnorm:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sbnorm = nn[moduleName](planes):type(ctype)
+      sbnorm.running_mean:normal(1, 2)
+      sbnorm.running_var:uniform(1e-3, 2)
+      sbnorm.running_var = sbnorm.running_var:type(typename):type(ctype)
+      sbnorm.running_mean = sbnorm.running_mean:type(typename):type(ctype)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward evaluate)')
+      sbnorm:evaluate()
+      local groundtruth = sbnorm:forward(input)
+
+      input = input:type(typename)
+      local gbnorm = nn[moduleName](planes):type(typename)
+      gbnorm:evaluate()
+      gbnorm.weight = sbnorm.weight:type(typename)
+      gbnorm.bias = sbnorm.bias:type(typename)
+      gbnorm.running_mean = sbnorm.running_mean:type(typename)
+      gbnorm.running_var = sbnorm.running_var:type(typename)
+      local rescuda = gbnorm:forward(input)
+
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename, rescuda:abs():max()),
+         string.format('error on state (forward evaluate) with %s', typename))
+   end
 end
 
 local function BatchNormalization_backward(moduleName, mode, inputSize, backwardFn)
    assert(mode == 'training' or mode == 'evaluation', 'invalid mode')
 
    local planes = inputSize[2]
-   local tm = {}
-   local title = moduleName .. '.backward ' .. table.concat(inputSize, 'x')
-   times[title] = tm
 
-   local input = torch.randn(table.unpack(inputSize))
-   local gradOutput = torch.randn(table.unpack(inputSize))
-   local sbnorm = nn[moduleName](planes)
-   if mode == 'training' then
-     sbnorm:training()
-   else
-     sbnorm:evaluate()
-   end
-   sbnorm:forward(input)
-   sbnorm:zeroGradParameters()
-   local groundgrad = backwardFn(sbnorm, input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      sbnorm:zeroGradParameters()
-      groundgrad = backwardFn(sbnorm, input, gradOutput)
-   end
-   local groundweight = sbnorm.gradWeight
-   local groundbias = sbnorm.gradBias
-   tm.cpu = a:time().real
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(table.unpack(inputSize)):type(typename)
+      local gradOutput = torch.randn(table.unpack(inputSize)):type(typename)
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gbnorm = nn[moduleName](planes):cuda()
-   if mode == 'training' then
-     gbnorm:training()
-   else
-     gbnorm:evaluate()
-   end
-   gbnorm.weight = sbnorm.weight:cuda()
-   gbnorm.bias = sbnorm.bias:cuda()
-   gbnorm:forward(input)
-   gbnorm:zeroGradParameters()
-   local rescuda = backwardFn(gbnorm, input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sbnorm = nn[moduleName](planes):type(ctype)
+      if mode == 'training' then
+        sbnorm:training()
+      else
+        sbnorm:evaluate()
+      end
+      sbnorm:forward(input)
+      sbnorm:zeroGradParameters()
+      local groundgrad = backwardFn(sbnorm, input, gradOutput)
+      local groundweight = sbnorm.gradWeight
+      local groundbias = sbnorm.gradBias
+
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gbnorm = nn[moduleName](planes):type(typename)
+      if mode == 'training' then
+        gbnorm:training()
+      else
+        gbnorm:evaluate()
+      end
+      gbnorm.weight = sbnorm.weight:type(typename)
+      gbnorm.bias = sbnorm.bias:type(typename)
+      gbnorm:forward(input)
       gbnorm:zeroGradParameters()
-      rescuda = backwardFn(gbnorm, input, gradOutput)
-   end
-   local weightcuda = gbnorm.gradWeight
-   local biascuda = gbnorm.gradBias
-   cutorch.synchronize()
-   tm.gpu = a:time().real
-
-   local error = rescuda:float() - groundgrad
-   local werror = weightcuda:float() - groundweight
-   local berror = biascuda:float() - groundbias
-
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
-   mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
-   mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+      local rescuda = backwardFn(gbnorm, input, gradOutput)
+      local weightcuda = gbnorm.gradWeight
+      local biascuda = gbnorm.gradBias
+
+      local error = rescuda:double() - groundgrad:double()
+      local werror = weightcuda:double() - groundweight:double()
+      local berror = biascuda:double() - groundbias:double()
+
+      local backerror = precision_backward_type(precision_backward, typename, rescuda:abs():max())
+      if typename == 'torch.CudaHalfTensor' and (mode == 'training') then
+        -- this correction is empirical; mean can be off by roughly 4e-4, multiplied by roughly stdval^2.
+        backerror = backerror + (sbnorm.save_std:max())^2 * 4e-4
+      end
+      mytester:assertlt(error:abs():max(),
+        backerror,
+        string.format('error on state (backward) with %s', typename))
+      mytester:assertlt(werror:abs():max(),
+        precision_backward_type(precision_backward, typename, weightcuda:abs():max()),
+        string.format('error on weight (backward) with %s', typename))
+      mytester:assertlt(berror:abs():max(),
+        precision_backward_type(precision_backward, typename, biascuda:abs():max()),
+        string.format('error on bias (backward) with %s', typename))
+    end
 end
 
 local function testBatchNormalization(name, dim, k)
@@ -1027,44 +1002,34 @@ function cunntest.SpatialConvolutionMM_forward_single()
    local ini = (outi-1)*si+ki-padW*2
    local inj = (outj-1)*sj+kj-padH*2
 
-   local tm = {}
-   local title = string.format('SpatialConvolutionMM.forward %dx%dx%d o %dx%d -> %dx%dx%d [s: %dx%d] [p: %dx%d]',
-                               from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW)
-   times[title] = tm
-
    local function jacTests(noBias)
       noBias = noBias or false
-      local input = torch.randn(from,inj,ini)
-      local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH)
-      if noBias then
-         sconv:noBias()
-      end
-      local groundtruth = sconv:forward(input)
-      local a = torch.Timer()
-      for i = 1,nloop do
-         groundtruth = sconv:forward(input)
-      end
-      tm.cpu = a:time().real
-
-      input = input:cuda()
-      local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):cuda()
-      if noBias then
-         gconv:noBias()
-      end
-      gconv.weight = sconv.weight:cuda()
-      if gconv.bias then
-         gconv.bias = sconv.bias:cuda()
-      end
-      local rescuda = gconv:forward(input)
-      a:reset()
-      for i = 1,nloop do
-         rescuda = gconv:forward(input)
+      for k, typename in ipairs(typenames) do
+         local input = torch.randn(from,inj,ini):type(typename)
+
+         local ctype = t2cpu[typename]
+         input = input:type(ctype)
+         local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(ctype)
+         if noBias then
+            sconv:noBias()
+         end
+         local groundtruth = sconv:forward(input)
+
+         input = input:type(typename)
+         local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(typename)
+         if noBias then
+            gconv:noBias()
+         end
+         gconv.weight = sconv.weight:type(typename)
+         if gconv.bias then
+            gconv.bias = sconv.bias:type(typename)
+         end
+         local rescuda = gconv:forward(input)
+
+         local error = rescuda:double() - groundtruth:double()
+         mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+            string.format('error on state (forward) with %s', typename))
       end
-      cutorch.synchronize()
-      tm.gpu = a:time().real
-
-      local error = rescuda:float() - groundtruth
-      mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
    end
 
    jacTests(false)
@@ -1086,44 +1051,34 @@ function cunntest.SpatialConvolutionMM_forward_batch()
    local ini = (outi-1)*si+ki-padW*2
    local inj = (outj-1)*sj+kj-padH*2
 
-   local tm = {}
-   local title = string.format('SpatialConvolutionMM.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d [s: %dx%d] [p: %dx%d]',
-                               bs, from, inj, ini, kj, ki, bs, to, outj, outi, sj, si, padH, padW)
-   times[title] = tm
-
    local function jacTests(noBias)
       noBias = noBias or false
-      local input = torch.randn(bs,from,inj,ini)
-      local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH)
-      if noBias then
-         sconv:noBias()
-      end
-      local groundtruth = sconv:forward(input)
-      local a = torch.Timer()
-      for i = 1,nloop do
-         groundtruth = sconv:forward(input)
+      for k, typename in ipairs(typenames) do
+         local input = torch.randn(bs,from,inj,ini):type(typename)
+
+         local ctype = t2cpu[typename]
+         input = input:type(ctype)
+         local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(ctype)
+         if noBias then
+            sconv:noBias()
+         end
+         local groundtruth = sconv:forward(input)
+
+         input = input:type(typename)
+         local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(typename)
+         if noBias then
+            gconv:noBias()
+         end
+         gconv.weight = sconv.weight:type(typename)
+         if gconv.bias then
+            gconv.bias = sconv.bias:type(typename)
+         end
+         local rescuda = gconv:forward(input)
+
+         local error = rescuda:double() - groundtruth:double()
+         mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+            string.format('error on state (forward) with %s', typename))
       end
-      tm.cpu = a:time().real
-
-      input = input:cuda()
-      local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):cuda()
-      if noBias then
-         gconv:noBias()
-      end
-      gconv.weight = sconv.weight:cuda()
-      if gconv.bias then
-         gconv.bias = sconv.bias:cuda()
-      end
-      local rescuda = gconv:forward(input)
-      a:reset()
-      for i = 1,nloop do
-         rescuda = gconv:forward(input)
-      end
-      cutorch.synchronize()
-      tm.gpu = a:time().real
-
-      local error = rescuda:float() - groundtruth
-      mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
    end
 
 
@@ -1143,62 +1098,56 @@ function cunntest.SpatialConvolutionMM_backward_single()
    local ini = (outi-1)*si+ki-padW*2
    local inj = (outj-1)*sj+kj-padH*2
 
-   local tm = {}
-   local title = string.format('SpatialConvolutionMM.backward %dx%dx%d o %dx%d -> %dx%dx%d [s: %dx%d] [p: %dx%d]',
-                               from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW)
-   times[title] = tm
-
    local function jacTests(noBias)
       noBias = noBias or false
-      local input = torch.randn(from,inj,ini)
-      local gradOutput = torch.randn(to,outj,outi)
-      local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH)
-      if noBias then
-         sconv:noBias()
-      end
-      sconv:forward(input)
-      sconv:zeroGradParameters()
-      local groundgrad = sconv:backward(input, gradOutput)
-      local a = torch.Timer()
-      for i = 1,nloop do
-         sconv:zeroGradParameters()
-         groundgrad = sconv:backward(input, gradOutput)
-      end
-      local groundweight = sconv.gradWeight
-      local groundbias = sconv.gradBias
-      tm.cpu = a:time().real
 
-      input = input:cuda()
-      gradOutput = gradOutput:cuda()
-      local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):cuda()
-      if noBias then
-         gconv:noBias()
-      end
-      gconv.weight = sconv.weight:cuda()
-      if gconv.bias then
-         gconv.bias = sconv.bias:cuda()
-      end
-      gconv:forward(input)
-      gconv:zeroGradParameters()
-      local rescuda = gconv:backward(input, gradOutput)
-      a:reset()
-      for i = 1,nloop do
+      for k, typename in ipairs(typenames) do
+         local input = torch.randn(from,inj,ini):type(typename)
+         local gradOutput = torch.randn(to,outj,outi):type(typename)
+
+         local ctype = t2cpu[typename]
+         input = input:type(ctype)
+         gradOutput = gradOutput:type(ctype)
+         local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(ctype)
+         if noBias then
+            sconv:noBias()
+         end
+         sconv:forward(input)
+         sconv:zeroGradParameters()
+         local groundgrad = sconv:backward(input, gradOutput)
+         local groundweight = sconv.gradWeight
+         local groundbias = sconv.gradBias
+
+         input = input:type(typename)
+         gradOutput = gradOutput:type(typename)
+         local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(typename)
+         if noBias then
+            gconv:noBias()
+         end
+         gconv.weight = sconv.weight:type(typename)
+         if gconv.bias then
+            gconv.bias = sconv.bias:type(typename)
+         end
+         gconv:forward(input)
          gconv:zeroGradParameters()
-         rescuda = gconv:backward(input, gradOutput)
-      end
-      local weightcuda = gconv.gradWeight
-      cutorch.synchronize()
-      tm.gpu = a:time().real
-
-      local error = rescuda:float() - groundgrad
-      local werror = weightcuda:float() - groundweight
-
-      mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
-      mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
-
-      if gconv.bias then
-         local berror = gconv.gradBias:float() - groundbias
-         mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+         local rescuda = gconv:backward(input, gradOutput)
+         local weightcuda = gconv.gradWeight
+
+         local error = rescuda:double() - groundgrad:double()
+         local werror = weightcuda:double() - groundweight:double()
+
+         mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+            string.format('error on state (backward) with %s', typename))
+         mytester:assertlt(werror:abs():max(),
+            precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+            string.format('error on weight (backward) with %s', typename))
+
+         if gconv.bias then
+            local berror = gconv.gradBias:double() - groundbias:double()
+            mytester:assertlt(berror:abs():max(),
+                precision_backward_conv_weightbias(precision_backward, typename, gconv.gradBias:abs():max()),
+                string.format('error on bias (backward) with %s', typename))
+         end
       end
    end
 
@@ -1221,61 +1170,55 @@ function cunntest.SpatialConvolutionMM_backward_batch()
    local ini = (outi-1)*si+ki-padW*2
    local inj = (outj-1)*sj+kj-padH*2
 
-   local tm = {}
-   local title = string.format('SpatialConvolutionMM.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d [s: %dx%d] [p: %dx%d]',
-                               bs, from, inj, ini, kj, ki, bs, to, outj, outi, sj, si, padH, padW)
-   times[title] = tm
-
    local function jacTests(noBias)
       noBias = noBias or false
-      local input = torch.randn(bs,from,inj,ini)
-      local gradOutput = torch.randn(bs,to,outj,outi)
-      local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH)
-      if noBias then
-         sconv:noBias()
-      end
-      sconv:forward(input)
-      sconv:zeroGradParameters()
-      local groundgrad = sconv:backward(input, gradOutput)
-      local a = torch.Timer()
-      for i = 1,nloop do
-         sconv:zeroGradParameters()
-         groundgrad = sconv:backward(input, gradOutput)
-      end
-      local groundweight = sconv.gradWeight
-      local groundbias = sconv.gradBias
-      tm.cpu = a:time().real
 
-      input = input:cuda()
-      gradOutput = gradOutput:cuda()
-      local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):cuda()
-      if noBias then
-         gconv:noBias()
-      end
-      gconv.weight = sconv.weight:cuda()
-      if gconv.bias then
-         gconv.bias = sconv.bias:cuda()
-      end
-      gconv:forward(input)
-      gconv:zeroGradParameters()
-      local rescuda = gconv:backward(input, gradOutput)
-      a:reset()
-      for i = 1,nloop do
+      for k, typename in ipairs(typenames) do
+         local input = torch.randn(bs,from,inj,ini)
+         local gradOutput = torch.randn(bs,to,outj,outi)
+
+         local ctype = t2cpu[typename]
+         input = input:type(ctype)
+         gradOutput = gradOutput:type(ctype)
+         local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(ctype)
+         if noBias then
+            sconv:noBias()
+         end
+         sconv:forward(input)
+         sconv:zeroGradParameters()
+         local groundgrad = sconv:backward(input, gradOutput)
+         local groundweight = sconv.gradWeight
+         local groundbias = sconv.gradBias
+
+         input = input:type(typename)
+         gradOutput = gradOutput:type(typename)
+         local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(typename)
+         if noBias then
+            gconv:noBias()
+         end
+         gconv.weight = sconv.weight:type(typename)
+         if gconv.bias then
+            gconv.bias = sconv.bias:type(typename)
+         end
+         gconv:forward(input)
          gconv:zeroGradParameters()
-         rescuda = gconv:backward(input, gradOutput)
-      end
-      local weightcuda = gconv.gradWeight
-      cutorch.synchronize()
-      tm.gpu = a:time().real
-
-      local error = rescuda:float() - groundgrad
-      local werror = weightcuda:float() - groundweight
-
-      mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
-      mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
-      if gconv.bias then
-         local berror = gconv.gradBias:float() - groundbias
-         mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+         local rescuda = gconv:backward(input, gradOutput)
+         local weightcuda = gconv.gradWeight
+
+         local error = rescuda:double() - groundgrad:double()
+         local werror = weightcuda:double() - groundweight:double()
+
+         mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+            string.format('error on state (backward) with %s', typename))
+         mytester:assertlt(werror:abs():max(),
+            precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+            string.format('error on weight (backward) with %s', typename))
+         if gconv.bias then
+            local berror = gconv.gradBias:double() - groundbias:double()
+            mytester:assertlt(berror:abs():max(),
+                precision_backward_conv_weightbias(precision_backward, typename, gconv.gradBias:abs():max()),
+                string.format('error on bias (backward) with %s', typename))
+         end
       end
    end
 
@@ -1297,34 +1240,24 @@ function cunntest.SpatialConvolutionLocal_forward_single()
    local ini = (outi-1)*si+ki-padW*2
    local inj = (outj-1)*sj+kj-padH*2
 
-   local tm = {}
-   local title = string.format('SpatialConvolutionLocal.forward %dx%dx%d o %dx%d -> %dx%dx%d [s: %dx%d] [p: %dx%d]',
-                               from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(from,inj,ini):type(typename)
 
-   local input = torch.randn(from,inj,ini)
-   local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH)
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(ctype)
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):cuda()
-   gconv.weight = sconv.weight:cuda()
-   gconv.bias = sconv.bias:cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(typename)
+      gconv.weight = sconv.weight:type(typename)
+      gconv.bias = sconv.bias:type(typename)
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+          string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialConvolutionLocal_forward_batch()
@@ -1342,34 +1275,24 @@ function cunntest.SpatialConvolutionLocal_forward_batch()
    local ini = (outi-1)*si+ki-padW*2
    local inj = (outj-1)*sj+kj-padH*2
 
-   local tm = {}
-   local title = string.format('SpatialConvolutionLocal.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d [s: %dx%d] [p: %dx%d]',
-                               bs, from, inj, ini, kj, ki, bs, to, outj, outi, sj, si, padH, padW)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(bs,from,inj,ini):type(typename)
 
-   local input = torch.randn(bs,from,inj,ini)
-   local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH)
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(ctype)
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):cuda()
-   gconv.weight = sconv.weight:cuda()
-   gconv.bias = sconv.bias:cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(typename)
+      gconv.weight = sconv.weight:type(typename)
+      gconv.bias = sconv.bias:type(typename)
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+          string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialConvolutionLocal_backward_single()
@@ -1386,51 +1309,44 @@ function cunntest.SpatialConvolutionLocal_backward_single()
    local ini = (outi-1)*si+ki-padW*2
    local inj = (outj-1)*sj+kj-padH*2
 
-   local tm = {}
-   local title = string.format('SpatialConvolutionLocal.backward %dx%dx%d o %dx%d -> %dx%dx%d [s: %dx%d] [p: %dx%d]',
-                               from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(from,inj,ini):type(typename)
+      local gradOutput = torch.randn(to,outj,outi):type(typename)
 
-   local input = torch.randn(from,inj,ini)
-   local gradOutput = torch.randn(to,outj,outi)
-   local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH)
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(ctype)
+      sconv:forward(input)
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   local groundweight = sconv.gradWeight
-   local groundbias = sconv.gradBias
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput)
+      local groundweight = sconv.gradWeight
+      local groundbias = sconv.gradBias
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):cuda()
-   gconv.weight = sconv.weight:cuda()
-   gconv.bias = sconv.bias:cuda()
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(typename)
+      gconv.weight = sconv.weight:type(typename)
+      gconv.bias = sconv.bias:type(typename)
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   local weightcuda = gconv.gradWeight
-   local biascuda = gconv.gradBias
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput)
+      local weightcuda = gconv.gradWeight
+      local biascuda = gconv.gradBias
 
-   local error = rescuda:float() - groundgrad
-   local werror = weightcuda:float() - groundweight
-   local berror = biascuda:float() - groundbias
+      local error = rescuda:double() - groundgrad:double()
+      local werror = weightcuda:double() - groundweight:double()
+      local berror = biascuda:double() - groundbias:double()
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
-   mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
-   mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on state (backward) with %s', typename))
+      mytester:assertlt(werror:abs():max(),
+          precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+          string.format('error on weight (backward) with %s', typename))
+      mytester:assertlt(berror:abs():max(),
+          precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()),
+          string.format('error on bias (backward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialConvolutionLocal_backward_batch()
@@ -1448,51 +1364,44 @@ function cunntest.SpatialConvolutionLocal_backward_batch()
    local ini = (outi-1)*si+ki-padW*2
    local inj = (outj-1)*sj+kj-padH*2
 
-   local tm = {}
-   local title = string.format('SpatialConvolutionLocal.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d [s: %dx%d] [p: %dx%d]',
-                               bs, from, inj, ini, kj, ki, bs, to, outj, outi, sj, si, padH, padW)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(bs,from,inj,ini):type(typename)
+      local gradOutput = torch.randn(bs,to,outj,outi):type(typename)
 
-   local input = torch.randn(bs,from,inj,ini)
-   local gradOutput = torch.randn(bs,to,outj,outi)
-   local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH)
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(ctype)
+      sconv:forward(input)
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   local groundweight = sconv.gradWeight
-   local groundbias = sconv.gradBias
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput)
+      local groundweight = sconv.gradWeight
+      local groundbias = sconv.gradBias
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):cuda()
-   gconv.weight = sconv.weight:cuda()
-   gconv.bias = sconv.bias:cuda()
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(typename)
+      gconv.weight = sconv.weight:type(typename)
+      gconv.bias = sconv.bias:type(typename)
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   local weightcuda = gconv.gradWeight
-   local biascuda = gconv.gradBias
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput)
+      local weightcuda = gconv.gradWeight
+      local biascuda = gconv.gradBias
 
-   local error = rescuda:float() - groundgrad
-   local werror = weightcuda:float() - groundweight
-   local berror = biascuda:float() - groundbias
+      local error = rescuda:double() - groundgrad:double()
+      local werror = weightcuda:double() - groundweight:double()
+      local berror = biascuda:double() - groundbias:double()
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
-   mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
-   mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on state (backward) with %s', typename))
+      mytester:assertlt(werror:abs():max(),
+          precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+          string.format('error on weight (backward) with %s', typename))
+      mytester:assertlt(berror:abs():max(),
+          precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()),
+          string.format('error on bias (backward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialFullConvolution_forward_single()
@@ -1511,46 +1420,34 @@ function cunntest.SpatialFullConvolution_forward_single()
    local ini = math.floor((outi + 2 * padW - ki) / si + 1)
    local inj = math.floor((outj + 2 * padH - kj) / sj + 1)
 
-   local tm = {}
-   local title =
-      string.format('SpatialFullConvolution.forward %dx%dx%d o %dx%d '
-                       .. '-> %dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]',
-                    from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW, adjH, adjW)
-   times[title] = tm
-
    local function jacTests(noBias)
       noBias = noBias or false
-      local input = torch.randn(from,inj,ini)
-      local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH)
-      if noBias then
-         sconv:noBias()
+      for k, typename in ipairs(typenames) do
+         local input = torch.randn(from,inj,ini):type(typename)
+
+         local ctype = t2cpu[typename]
+         input = input:type(ctype)
+         local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(ctype)
+         if noBias then
+            sconv:noBias()
+         end
+         local groundtruth = sconv:forward(input)
+
+         input = input:type(typename)
+         local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(typename)
+         if noBias then
+            gconv:noBias()
+         end
+         gconv.weight = sconv.weight:type(typename)
+         if gconv.bias then
+            gconv.bias = sconv.bias:type(typename)
+         end
+         local rescuda = gconv:forward(input)
+
+         local error = rescuda:double() - groundtruth:double()
+         mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+            string.format('error on state (forward) with %s', typename))
       end
-      local groundtruth = sconv:forward(input)
-      local a = torch.Timer()
-      for i = 1,nloop do
-        groundtruth = sconv:forward(input)
-      end
-      tm.cpu = a:time().real
-
-      input = input:cuda()
-      local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):cuda()
-      if noBias then
-         gconv:noBias()
-      end
-      gconv.weight = sconv.weight:cuda()
-      if gconv.bias then
-        gconv.bias = sconv.bias:cuda()
-      end
-      local rescuda = gconv:forward(input)
-      a:reset()
-      for i = 1,nloop do
-        rescuda = gconv:forward(input)
-      end
-      cutorch.synchronize()
-      tm.gpu = a:time().real
-
-      local error = rescuda:float() - groundtruth
-      mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
    end
 
    jacTests(false)
@@ -1574,44 +1471,34 @@ function cunntest.SpatialFullConvolution_forward_batch()
    local ini = math.floor((outi + 2 * padW - ki) / si + 1)
    local inj = math.floor((outj + 2 * padH - kj) / sj + 1)
 
-   local tm = {}
-   local title = string.format('SpatialFullConvolution.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]',
-                               bs, from, inj, ini, kj, ki, bs, to, outj, outi, sj, si, padH, padW, adjH, adjW)
-   times[title] = tm
-
    local function jacTests(noBias)
       noBias = noBias or false
-      local input = torch.randn(bs,from,inj,ini)
-      local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH)
-      if noBias then
-         sconv:noBias()
-      end
-      local groundtruth = sconv:forward(input)
-      local a = torch.Timer()
-      for i = 1,nloop do
-        groundtruth = sconv:forward(input)
-      end
-      tm.cpu = a:time().real
-
-      input = input:cuda()
-      local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):cuda()
-      if noBias then
-         gconv:noBias()
+      for k, typename in ipairs(typenames) do
+         local input = torch.randn(bs,from,inj,ini):type(typename)
+
+         local ctype = t2cpu[typename]
+         input = input:type(ctype)
+         local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(ctype)
+         if noBias then
+            sconv:noBias()
+         end
+         local groundtruth = sconv:forward(input)
+
+         input = input:type(typename)
+         local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(typename)
+         if noBias then
+            gconv:noBias()
+         end
+         gconv.weight = sconv.weight:type(typename)
+         if gconv.bias then
+            gconv.bias = sconv.bias:type(typename)
+         end
+         local rescuda = gconv:forward(input)
+
+         local error = rescuda:double() - groundtruth:double()
+         mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+              string.format('error on state (forward) with %s', typename))
       end
-      gconv.weight = sconv.weight:cuda()
-      if gconv.bias then
-         gconv.bias = sconv.bias:cuda()
-      end
-      local rescuda = gconv:forward(input)
-      a:reset()
-      for i = 1,nloop do
-        rescuda = gconv:forward(input)
-      end
-      cutorch.synchronize()
-      tm.gpu = a:time().real
-
-      local error = rescuda:float() - groundtruth
-      mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
    end
 
    jacTests(false)
@@ -1634,62 +1521,54 @@ function cunntest.SpatialFullConvolution_backward_single()
    local ini = math.floor((outi + 2 * padW - ki) / si + 1)
    local inj = math.floor((outj + 2 * padH - kj) / sj + 1)
 
-   local tm = {}
-   local title = string.format('SpatialFullConvolution.backward %dx%dx%d o %dx%d -> %dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]',
-                               from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW, adjH, adjW)
-   times[title] = tm
-
    local function jacTests(noBias)
       noBias = noBias or false
-      local input = torch.randn(from,inj,ini)
-      local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH)
-      if noBias then
-         sconv:noBias()
-      end
-      local output = sconv:forward(input)
-      local gradOutput = output:clone():normal()
-      sconv:zeroGradParameters()
-      local groundgrad = sconv:backward(input, gradOutput)
-      local a = torch.Timer()
-      for i = 1,nloop do
+      for k, typename in ipairs(typenames) do
+         local input = torch.randn(from,inj,ini):type(typename)
+
+         local ctype = t2cpu[typename]
+         input = input:type(ctype)
+         local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(ctype)
+         if noBias then
+            sconv:noBias()
+         end
+         local output = sconv:forward(input)
+         local gradOutput = output:clone():normal()
          sconv:zeroGradParameters()
-         groundgrad = sconv:backward(input, gradOutput)
-      end
-      local groundweight = sconv.gradWeight
-      local groundbias = sconv.gradBias
-      tm.cpu = a:time().real
-
-      input = input:cuda()
-      gradOutput = gradOutput:cuda()
-      local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):cuda()
-      if noBias then
-         gconv:noBias()
-      end
-      gconv.weight = sconv.weight:cuda()
-      if gconv.bias then
-         gconv.bias = sconv.bias:cuda()
-      end
-      gconv:forward(input)
-      gconv:zeroGradParameters()
-      local rescuda = gconv:backward(input, gradOutput)
-      a:reset()
-      for i = 1,nloop do
+         local groundgrad = sconv:backward(input, gradOutput)
+         local groundweight = sconv.gradWeight
+         local groundbias = sconv.gradBias
+
+         input = input:type(typename)
+         gradOutput = gradOutput:type(typename)
+         local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(typename)
+         if noBias then
+            gconv:noBias()
+         end
+         gconv.weight = sconv.weight:type(typename)
+         if gconv.bias then
+            gconv.bias = sconv.bias:type(typename)
+         end
+         gconv:forward(input)
          gconv:zeroGradParameters()
-         rescuda = gconv:backward(input, gradOutput)
-      end
-      local weightcuda = gconv.gradWeight
-      cutorch.synchronize()
-      tm.gpu = a:time().real
-
-      local error = rescuda:float() - groundgrad
-      local werror = weightcuda:float() - groundweight
-
-      mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
-      mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
-
-      if gconv.bias then
-        local berror = gconv.gradBias:float() - groundbias
-        mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+         local rescuda = gconv:backward(input, gradOutput)
+         local weightcuda = gconv.gradWeight
+
+         local error = rescuda:double() - groundgrad:double()
+         local werror = weightcuda:double() - groundweight:double()
+
+         mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+            string.format('error on state (backward) with %s', typename))
+         mytester:assertlt(werror:abs():max(),
+            precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+            string.format('error on weight (backward) with %s', typename))
+
+         if gconv.bias then
+            local berror = gconv.gradBias:double() - groundbias:double()
+            mytester:assertlt(berror:abs():max(),
+               precision_backward_conv_weightbias(precision_backward, typename, gconv.gradBias:abs():max()),
+               string.format('error on bias (backward) with %s', typename))
+         end
       end
    end
 
@@ -1714,63 +1593,54 @@ function cunntest.SpatialFullConvolution_backward_batch()
    local ini = math.floor((outi + 2 * padW - ki) / si + 1)
    local inj = math.floor((outj + 2 * padH - kj) / sj + 1)
 
-   local tm = {}
-   local title = string.format('SpatialFullConvolution.backward %dx%dx%dx%d o %dx%d '
-                                  .. '-> %dx%dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]',
-                               bs, from, inj, ini, kj, ki,
-                               bs, to, outj, outi, sj, si, padH, padW, adjH, adjW)
-   times[title] = tm
-
    local function jacTests(noBias)
       noBias = noBias or false
-      local input = torch.randn(bs,from,inj,ini)
-      local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH)
-      if noBias then
-         sconv:noBias()
-      end
-      local output = sconv:forward(input)
-      local gradOutput = output:clone():normal()
-      sconv:zeroGradParameters()
-      local groundgrad = sconv:backward(input, gradOutput)
-      local a = torch.Timer()
-      for i = 1,nloop do
-         sconv:zeroGradParameters()
-         groundgrad = sconv:backward(input, gradOutput)
-      end
-      local groundweight = sconv.gradWeight
-      local groundbias = sconv.gradBias
-      tm.cpu = a:time().real
 
-      input = input:cuda()
-      gradOutput = gradOutput:cuda()
-      local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):cuda()
-      if noBias then
-         gconv:noBias()
-      end
-      gconv.weight = sconv.weight:cuda()
-      if gconv.bias then
-         gconv.bias = sconv.bias:cuda()
-      end
-      gconv:forward(input)
-      gconv:zeroGradParameters()
-      local rescuda = gconv:backward(input, gradOutput)
-      a:reset()
-      for i = 1,nloop do
+      for k, typename in ipairs(typenames) do
+         local input = torch.randn(bs,from,inj,ini):type(typename)
+
+         local ctype = t2cpu[typename]
+         input = input:type(ctype)
+         local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(ctype)
+         if noBias then
+            sconv:noBias()
+         end
+         local output = sconv:forward(input)
+         local gradOutput = output:clone():normal()
+         sconv:zeroGradParameters()
+         local groundgrad = sconv:backward(input, gradOutput)
+         local groundweight = sconv.gradWeight
+         local groundbias = sconv.gradBias
+
+         input = input:type(typename)
+         gradOutput = gradOutput:type(typename)
+         local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(typename)
+         if noBias then
+            gconv:noBias()
+         end
+         gconv.weight = sconv.weight:type(typename)
+         if gconv.bias then
+            gconv.bias = sconv.bias:type(typename)
+         end
+         gconv:forward(input)
          gconv:zeroGradParameters()
-         rescuda = gconv:backward(input, gradOutput)
-      end
-      local weightcuda = gconv.gradWeight
-      cutorch.synchronize()
-      tm.gpu = a:time().real
-
-      local error = rescuda:float() - groundgrad
-      local werror = weightcuda:float() - groundweight
-
-      mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
-      mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
-      if gconv.bias then
-         local berror = gconv.gradBias:float() - groundbias
-         mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+         local rescuda = gconv:backward(input, gradOutput)
+         local weightcuda = gconv.gradWeight
+
+         local error = rescuda:double() - groundgrad:double()
+         local werror = weightcuda:double() - groundweight:double()
+
+         mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+            string.format('error on state (backward) with %s', typename))
+         mytester:assertlt(werror:abs():max(),
+            precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+            string.format('error on weight (backward) with %s', typename))
+         if gconv.bias then
+            local berror = gconv.gradBias:double() - groundbias:double()
+            mytester:assertlt(berror:abs():max(),
+               precision_backward_conv_weightbias(precision_backward, typename, gconv.gradBias:abs():max()),
+               string.format('error on bias (backward) with %s', typename))
+         end
       end
    end
 
@@ -1794,36 +1664,24 @@ function cunntest.SpatialDilatedConvolution_forward_single()
    local ini = (outi - 1) * si - 2 * padW + dilationW * (ki-1) + 1
    local inj = (outj - 1) * sj - 2 * padH + dilationH * (kj-1) + 1
 
-   local tm = {}
-   local title =
-      string.format('SpatialDilatedConvolution.forward %dx%dx%d o %dx%d '
-                       .. '-> %dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]',
-                    from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW, dilationH, dilationW)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(from,inj,ini):type(typename)
 
-   local input = torch.randn(from,inj,ini)
-   local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH)
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(ctype)
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):cuda()
-   gconv.weight = sconv.weight:cuda()
-   gconv.bias = sconv.bias:cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(typename)
+      gconv.weight = sconv.weight:type(typename)
+      gconv.bias = sconv.bias:type(typename)
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+         string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialDilatedConvolution_forward_batch()
@@ -1838,39 +1696,29 @@ function cunntest.SpatialDilatedConvolution_forward_batch()
    local padH = math.random(0,1)
    local outi = math.random(ki, 64)
    local outj = math.random(kj, 64)
-   local dilationW = math.random(0,10)
-   local dilationH = math.random(0,10)
+   local dilationW = math.random(1,10)
+   local dilationH = math.random(1,10)
    local ini = (outi - 1) * si - 2 * padW + dilationW * (ki-1) + 1
    local inj = (outj - 1) * sj - 2 * padH + dilationH * (kj-1) + 1
 
-   local tm = {}
-   local title = string.format('SpatialDilatedConvolution.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]',
-                               bs, from, inj, ini, kj, ki, bs, to, outj, outi, sj, si, padH, padW, dilationH, dilationW)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(bs,from,inj,ini):type(typename)
 
-   local input = torch.randn(bs,from,inj,ini)
-   local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH)
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(ctype)
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):cuda()
-   gconv.weight = sconv.weight:cuda()
-   gconv.bias = sconv.bias:cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(typename)
+      gconv.weight = sconv.weight:type(typename)
+      gconv.bias = sconv.bias:type(typename)
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+         string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialDilatedConvolution_backward_single()
@@ -1884,56 +1732,48 @@ function cunntest.SpatialDilatedConvolution_backward_single()
    local padH = math.random(0,1)
    local outi = math.random(ki, 64)
    local outj = math.random(kj, 64)
-   local dilationW = math.random(0,10)
-   local dilationH = math.random(0,10)
+   local dilationW = math.random(1,10) -- lower bound is 1 here (the removed line used 0)
+   local dilationH = math.random(1,10) -- same range as dilationW
    local ini = (outi - 1) * si - 2 * padW + dilationW * (ki-1) + 1
    local inj = (outj - 1) * sj - 2 * padH + dilationH * (kj-1) + 1
 
-   local tm = {}
-   local title = string.format('SpatialDilatedConvolution.backward %dx%dx%d o %dx%d -> %dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]',
-                               from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW, dilationH, dilationW)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do -- one full comparison per entry of typenames
+      local input = torch.randn(from,inj,ini):type(typename)
 
-   local input = torch.randn(from,inj,ini)
-   local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH)
-   local output = sconv:forward(input)
-   local gradOutput = output:clone():normal()
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+      local ctype = t2cpu[typename] -- type of the reference run, looked up in t2cpu
+      input = input:type(ctype)
+      local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(ctype)
+      local output = sconv:forward(input)
+      local gradOutput = output:clone():normal() -- same size as output, refilled by normal()
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   local groundweight = sconv.gradWeight
-   local groundbias = sconv.gradBias
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput) -- reference gradient w.r.t. input
+      local groundweight = sconv.gradWeight
+      local groundbias = sconv.gradBias
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):cuda()
-   gconv.weight = sconv.weight:cuda()
-   gconv.bias = sconv.bias:cuda()
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename) -- same data, converted to the type under test
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(typename)
+      gconv.weight = sconv.weight:type(typename) -- copy reference parameters so both modules compute the same function
+      gconv.bias = sconv.bias:type(typename)
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   local weightcuda = gconv.gradWeight
-   local biascuda = gconv.gradBias
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput)
+      local weightcuda = gconv.gradWeight
+      local biascuda = gconv.gradBias
 
-   local error = rescuda:float() - groundgrad
-   local werror = weightcuda:float() - groundweight
-   local berror = biascuda:float() - groundbias
+      local error = rescuda:double() - groundgrad:double() -- both sides widened to double before subtracting
+      local werror = weightcuda:double() - groundweight:double()
+      local berror = biascuda:double() - groundbias:double()
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
-   mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
-   mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+         string.format('error on state (backward) with %s', typename))
+      mytester:assertlt(werror:abs():max(), -- weight/bias bounds also receive the largest |gradient| of the tested module
+         precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+         string.format('error on weight (backward) with %s', typename))
+      mytester:assertlt(berror:abs():max(),
+         precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()),
+         string.format('error on bias (backward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialDilatedConvolution_backward_batch()
@@ -1948,58 +1788,48 @@ function cunntest.SpatialDilatedConvolution_backward_batch()
    local padH = math.random(0,1)
    local outi = math.random(ki, 64)
    local outj = math.random(kj, 64)
-   local dilationW = math.random(0,10)
-   local dilationH = math.random(0,10)
+   local dilationW = math.random(1,10) -- lower bound is 1 here (the removed line used 0)
+   local dilationH = math.random(1,10) -- same range as dilationW
    local ini = (outi - 1) * si - 2 * padW + dilationW * (ki-1) + 1
    local inj = (outj - 1) * sj - 2 * padH + dilationH * (kj-1) + 1
 
-   local tm = {}
-   local title = string.format('SpatialDilatedConvolution.backward %dx%dx%dx%d o %dx%d '
-                                  .. '-> %dx%dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]',
-                               bs, from, inj, ini, kj, ki,
-                               bs, to, outj, outi, sj, si, padH, padW, dilationH, dilationW)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do -- one full comparison per entry of typenames
+      local input = torch.randn(bs,from,inj,ini):type(typename) -- 4-D input: bs is the leading dimension
 
-   local input = torch.randn(bs,from,inj,ini)
-   local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH)
-   local output = sconv:forward(input)
-   local gradOutput = output:clone():normal()
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+      local ctype = t2cpu[typename] -- type of the reference run, looked up in t2cpu
+      input = input:type(ctype)
+      local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(ctype)
+      local output = sconv:forward(input)
+      local gradOutput = output:clone():normal() -- same size as output, refilled by normal()
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   local groundweight = sconv.gradWeight
-   local groundbias = sconv.gradBias
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput) -- reference gradient w.r.t. input
+      local groundweight = sconv.gradWeight
+      local groundbias = sconv.gradBias
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):cuda()
-   gconv.weight = sconv.weight:cuda()
-   gconv.bias = sconv.bias:cuda()
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename) -- same data, converted to the type under test
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(typename)
+      gconv.weight = sconv.weight:type(typename) -- copy reference parameters so both modules compute the same function
+      gconv.bias = sconv.bias:type(typename)
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   local weightcuda = gconv.gradWeight
-   local biascuda = gconv.gradBias
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput)
+      local weightcuda = gconv.gradWeight
+      local biascuda = gconv.gradBias
 
-   local error = rescuda:float() - groundgrad
-   local werror = weightcuda:float() - groundweight
-   local berror = biascuda:float() - groundbias
+      local error = rescuda:double() - groundgrad:double() -- both sides widened to double before subtracting
+      local werror = weightcuda:double() - groundweight:double()
+      local berror = biascuda:double() - groundbias:double()
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
-   mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
-   mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+         string.format('error on state (backward) with %s', typename))
+      mytester:assertlt(werror:abs():max(), -- weight/bias bounds also receive the largest |gradient| of the tested module
+         precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+         string.format('error on weight (backward) with %s', typename))
+      mytester:assertlt(berror:abs():max(),
+         precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()),
+         string.format('error on bias (backward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialSubSampling_forward()
@@ -2014,37 +1844,27 @@ function cunntest.SpatialSubSampling_forward()
    local ini = (outi-1)*si+ki
    local inj = (outj-1)*sj+kj
 
-   local tm = {}
-   local title = string.format('SpatialSubSampling.forward %dx%dx%d o %dx%d -> %dx%dx%d',
-                               from, inj, ini, kj, ki, to, outj, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do -- one full comparison per entry of typenames
+      local input = torch.randn(from,inj,ini):type(typename)
 
-   local input = torch.randn(from,inj,ini)
-   local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj)
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename] -- type of the reference run, looked up in t2cpu
+      input = input:type(ctype)
+      local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(ctype)
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):cuda()
-   gconv.weight = sconv.weight:cuda()
-   gconv.bias = sconv.bias:cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename) -- same data, converted to the type under test
+      local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(typename)
+      gconv.weight = sconv.weight:type(typename) -- copy reference parameters so both modules compute the same function
+      gconv.bias = sconv.bias:type(typename)
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double() -- both sides widened to double before subtracting
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+          string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialSubSampling_forward_batch()
    local bs = math.random(4,10)
    local from = math.random(1,64)
    local to = from
@@ -2057,34 +1877,24 @@ function cunntest.SpatialSubSampling_forward_batch()
    local ini = (outi-1)*si+ki
    local inj = (outj-1)*sj+kj
 
-   local tm = {}
-   local title = string.format('SpatialSubSampling.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
-                               bs, from, inj, ini, kj, ki, bs, to, outj, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do -- one full comparison per entry of typenames
+      local input = torch.randn(bs,from,inj,ini):type(typename) -- 4-D input: bs is the leading dimension
 
-   local input = torch.randn(bs,from,inj,ini)
-   local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj)
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename] -- type of the reference run, looked up in t2cpu
+      input = input:type(ctype)
+      local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(ctype)
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):cuda()
-   gconv.weight = sconv.weight:cuda()
-   gconv.bias = sconv.bias:cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename) -- same data, converted to the type under test
+      local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(typename)
+      gconv.weight = sconv.weight:type(typename) -- copy reference parameters so both modules compute the same function
+      gconv.bias = sconv.bias:type(typename)
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double() -- both sides widened to double before subtracting
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+          string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialSubSampling_backward()
@@ -2099,51 +1909,50 @@ function cunntest.SpatialSubSampling_backward()
    local ini = (outi-1)*si+ki
    local inj = (outj-1)*sj+kj
 
-   local tm = {}
-   local title = string.format('SpatialSubSampling.backward %dx%dx%d o %dx%d -> %dx%dx%d',
-                               from, inj, ini, kj, ki, to, outj, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do -- one full comparison per entry of typenames
+      -- FIXME: SpatialSubSampling accumulates directly to real, causes
+      -- precision issues with half
+      local precision_backward_old = precision_backward -- saved locally (no global leak); restored at the end of the iteration
+      if typename == 'torch.CudaHalfTensor' then
+          precision_backward = 0.4 -- temporary looser base tolerance for half only
+      end
+      local input = torch.randn(from,inj,ini):type(typename)
+      local gradOutput = torch.randn(to,outj,outi):type(typename)
 
-   local input = torch.randn(from,inj,ini)
-   local gradOutput = torch.randn(to,outj,outi)
-   local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj)
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+      local ctype = t2cpu[typename] -- type of the reference run, looked up in t2cpu
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(ctype)
+      sconv:forward(input)
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   local groundweight = sconv.gradWeight
-   local groundbias = sconv.gradBias
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput) -- reference gradient w.r.t. input
+      local groundweight = sconv.gradWeight
+      local groundbias = sconv.gradBias
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):cuda()
-   gconv.weight = sconv.weight:cuda()
-   gconv.bias = sconv.bias:cuda()
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename) -- same data, converted to the type under test
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(typename)
+      gconv.weight = sconv.weight:type(typename) -- copy reference parameters so both modules compute the same function
+      gconv.bias = sconv.bias:type(typename)
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   local weightcuda = gconv.gradWeight
-   local biascuda = gconv.gradBias
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput)
+      local weightcuda = gconv.gradWeight
+      local biascuda = gconv.gradBias
 
-   local error = rescuda:float() - groundgrad
-   local werror = weightcuda:float() - groundweight
-   local berror = biascuda:float() - groundbias
+      local error = rescuda:double() - groundgrad:double() -- both sides widened to double before subtracting
+      local werror = weightcuda:double() - groundweight:double()
+      local berror = biascuda:double() - groundbias:double()
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
-   mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
-   mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on state (backward) with %s', typename))
+      mytester:assertlt(werror:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on weight (backward) with %s', typename))
+      mytester:assertlt(berror:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on bias (backward) with %s', typename))
+
+      precision_backward = precision_backward_old -- undo the half-only override before the next typename
+   end
 end
 
 function cunntest.SpatialSubSampling_backward_batch()
@@ -2159,51 +1968,47 @@ function cunntest.SpatialSubSampling_backward_batch()
    local ini = (outi-1)*si+ki
    local inj = (outj-1)*sj+kj
 
-   local tm = {}
-   local title = string.format('SpatialSubSampling.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
-                               bs, from, inj, ini, kj, ki, bs, to, outj, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do -- one full comparison per entry of typenames
+      local input = torch.randn(bs,from,inj,ini):type(typename) -- 4-D input: bs is the leading dimension
+      local gradOutput = torch.randn(bs,to,outj,outi):type(typename)
 
-   local input = torch.randn(bs,from,inj,ini)
-   local gradOutput = torch.randn(bs,to,outj,outi)
-   local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj)
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+      local ctype = t2cpu[typename] -- type of the reference run, looked up in t2cpu
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(ctype)
+      sconv:forward(input)
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   local groundweight = sconv.gradWeight
-   local groundbias = sconv.gradBias
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput) -- reference gradient w.r.t. input
+      local groundweight = sconv.gradWeight
+      local groundbias = sconv.gradBias
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):cuda()
-   gconv.weight = sconv.weight:cuda()
-   gconv.bias = sconv.bias:cuda()
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename) -- same data, converted to the type under test
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(typename)
+      gconv.weight = sconv.weight:type(typename) -- copy reference parameters so both modules compute the same function
+      gconv.bias = sconv.bias:type(typename)
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   local weightcuda = gconv.gradWeight
-   local biascuda = gconv.gradBias
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput)
+      local weightcuda = gconv.gradWeight
+      local biascuda = gconv.gradBias
 
-   local error = rescuda:float() - groundgrad
-   local werror = weightcuda:float() - groundweight
-   local berror = biascuda:float() - groundbias
+      local error = rescuda:double() - groundgrad:double() -- both sides widened to double before subtracting
+      local werror = weightcuda:double() - groundweight:double()
+      local berror = biascuda:double() - groundbias:double()
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
-   mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
-   mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+      -- FIXME: SpatialSubSampling accumulates directly to real, causes
+      -- precision issues with half, so we double the error tolerance (the factor 2 below is applied for every typename, not only half)
+      mytester:assertlt(error:abs():max(),
+          2*precision_backward_type(precision_backward, typename, rescuda:abs():max()), -- NOTE(review): 3rd argument is the largest |value| of the tested result; confirm the helper uses it
+          string.format('error on state (backward) with %s', typename))
+      mytester:assertlt(werror:abs():max(),
+          2*precision_backward_type(precision_backward, typename, weightcuda:abs():max()),
+          string.format('error on weight (backward) with %s', typename))
+      mytester:assertlt(berror:abs():max(),
+          2*precision_backward_type(precision_backward, typename, biascuda:abs():max()),
+          string.format('error on bias (backward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialMaxPooling_forward()
@@ -2221,36 +2026,26 @@ function cunntest.SpatialMaxPooling_forward()
    local inj = (outj-1)*sj+kj - padj*2
    local ceil_mode = math.random(0,1) == 1
 
-   local tm = {}
-   local title = string.format('SpatialMaxPooling.forward %dx%dx%d o %dx%d -> %dx%dx%d',
-                               from, inj, ini, kj, ki, to, outj, outi)
-   times[title] = tm
-
-   local input = torch.randn(from,inj,ini)
-   local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj)
-   if ceil_mode then sconv:ceil() end
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+   for k, typename in ipairs(typenames) do -- one full comparison per entry of typenames
+      local input = torch.randn(from,inj,ini):type(typename)
+      local ctype = t2cpu[typename] -- type of the reference run, looked up in t2cpu
+      input = input:type(ctype)
+      local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(ctype)
+      if ceil_mode then sconv:ceil() end
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):cuda()
-   if ceil_mode then gconv:ceil() end
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename) -- same data, converted to the type under test
+      local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(typename)
+      if ceil_mode then gconv:ceil() end -- keep both modules in the same rounding mode
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
-   local error_ind = gconv.indices:float() - sconv.indices
-   mytester:asserteq(error_ind:max(), 0, 'error on indices (forward) ')
+      local error = rescuda:double() - groundtruth:double() -- both sides widened to double before subtracting
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+          string.format('error on state (forward) with %s', typename))
+      local error_ind = gconv.indices:long() - sconv.indices
+      mytester:asserteq(error_ind:abs():max(), 0, -- abs() so negative index differences are caught as well
+          string.format('error on indices (forward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialMaxPooling_forward_batch()
@@ -2269,34 +2064,23 @@ function cunntest.SpatialMaxPooling_forward_batch()
    local inj = (outj-1)*sj+kj - padj*2
    local ceil_mode = math.random(0,1) == 1
 
-   local tm = {}
-   local title = string.format('SpatialMaxPooling.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
-                               bs, from, inj, ini, kj, ki, bs, to, outj, outi)
-   times[title] = tm
-
-   local input = torch.randn(bs,from,inj,ini)
-   local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj)
-   if ceil_mode then sconv:ceil() end
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+   for k, typename in ipairs(typenames) do -- one full comparison per entry of typenames
+      local input = torch.randn(bs,from,inj,ini):type(typename) -- 4-D input: bs is the leading dimension
+      local ctype = t2cpu[typename] -- type of the reference run, looked up in t2cpu
+      input = input:type(ctype)
+      local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(ctype)
+      if ceil_mode then sconv:ceil() end
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):cuda()
-   if ceil_mode then gconv:ceil() end
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename) -- same data, converted to the type under test
+      local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(typename)
+      if ceil_mode then gconv:ceil() end -- keep both modules in the same rounding mode
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double() -- both sides widened to double before subtracting
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+          string.format('error on state (forward) with %s', typename))
+    end
 end
 
 function cunntest.SpatialMaxUnpooling_forward_batch()
@@ -2316,40 +2100,29 @@ function cunntest.SpatialMaxUnpooling_forward_batch()
    local ini = fun((outi + padi*2 - ki)/si) +1
    local inj = fun((outj + padj*2 - kj)/sj) +1
 
-   local tm = {}
-   local title = string.format('SpatialMaxUnpooling.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
-                               bs, from, inj, ini, kj, ki, bs, to, outj, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do -- one full comparison per entry of typenames
+      local ctype = t2cpu[typename] -- type of the reference run, looked up in t2cpu
+      local pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(ctype)
+      if ceil_mode then pooler:ceil() end
+      local sunpool = nn.SpatialMaxUnpooling(pooler):type(ctype) -- reference unpooling module built on the reference pooler
 
-   local pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj)
-   if ceil_mode then pooler:ceil() end
-   local sunpool = nn.SpatialMaxUnpooling(pooler)
-
-   local original = torch.randn(bs,from,outj,outi)
-   local input = pooler:forward(original)
-   local groundtruth = sunpool:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sunpool:forward(input)
-   end
-   tm.cpu = a:time().real
+      local original = torch.randn(bs,from,outj,outi):type(typename)
+      original = original:type(ctype)
+      local input = pooler:forward(original) -- the unpooling input is the pooler's output
+      local groundtruth = sunpool:forward(input)
 
-   original = original:cuda()
-   pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):cuda()
-   if ceil_mode then pooler:ceil() end
-   local gunpool = nn.SpatialMaxUnpooling(pooler):cuda()
+      original = original:type(typename) -- same data, converted to the type under test
+      pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(typename) -- fresh pooler for the tested type
+      if ceil_mode then pooler:ceil() end
+      local gunpool = nn.SpatialMaxUnpooling(pooler):type(typename)
 
-   input = pooler:forward(original)
-   local rescuda = gunpool:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gunpool:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = pooler:forward(original)
+      local rescuda = gunpool:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double() -- both sides widened to double before subtracting
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+          string.format('error on state (forward) with %s', typename))
+    end
 end
 
 function cunntest.SpatialMaxPooling_backward()
@@ -2365,45 +2138,34 @@ function cunntest.SpatialMaxPooling_backward()
    local padj = math.random(0,kj/2-1)
    local ini = (outi-1)*si+ki - padi*2
    local inj = (outj-1)*sj+kj - padj*2
    local ceil_mode = math.random(0,1) == 1
 
-   local tm = {}
-   local title = string.format('SpatialMaxPooling.backward %dx%dx%d o %dx%d -> %dx%dx%d',
-                               from, inj, ini, kj, ki, to, outj, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do -- one full comparison per entry of typenames
+      local input = torch.randn(from,inj,ini):type(typename)
+      local gradOutput = torch.randn(to,outj,outi):type(typename)
+      local ctype = t2cpu[typename] -- type of the reference run, looked up in t2cpu
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
 
-   local input = torch.randn(from,inj,ini)
-   local gradOutput = torch.randn(to,outj,outi)
-   local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj)
-   if ceil_mode then sconv:ceil() end
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+      local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(ctype)
+      if ceil_mode then sconv:ceil() end
+      sconv:forward(input)
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput) -- reference gradient w.r.t. input
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):cuda()
-   if ceil_mode then gconv:ceil() end
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename) -- same data, converted to the type under test
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(typename)
+      if ceil_mode then gconv:ceil() end -- keep both modules in the same rounding mode
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double() -- both sides widened to double before subtracting
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on state (backward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialMaxPooling_backward_batch()
@@ -2422,43 +2184,31 @@ function cunntest.SpatialMaxPooling_backward_batch()
    local inj = (outj-1)*sj+kj - padj*2
    local ceil_mode = math.random(0,1) == 1
 
-   local tm = {}
-   local title = string.format('SpatialMaxPooling.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
-                               bs, from, inj, ini, kj, ki, bs, to, outj, outi)
-   times[title] = tm
-
-   local input = torch.randn(bs,from,inj,ini)
-   local gradOutput = torch.randn(bs,to,outj,outi)
-   local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj)
-   if ceil_mode then sconv:ceil() end
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+   for k, typename in ipairs(typenames) do -- one full comparison per entry of typenames
+      local input = torch.randn(bs,from,inj,ini):type(typename) -- 4-D input: bs is the leading dimension
+      local gradOutput = torch.randn(bs,to,outj,outi):type(typename)
+      local ctype = t2cpu[typename] -- type of the reference run, looked up in t2cpu
+      input = input:type(ctype) -- reassign (no second `local`) so the variable above is not shadowed
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(ctype)
+      if ceil_mode then sconv:ceil() end
+      sconv:forward(input)
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput) -- reference gradient w.r.t. input
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):cuda()
-   if ceil_mode then gconv:ceil() end
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename) -- same data, converted to the type under test
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(typename)
+      if ceil_mode then gconv:ceil() end -- keep both modules in the same rounding mode
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double() -- both sides widened to double before subtracting
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on state (backward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialMaxUnpooling_backward_batch()
@@ -2478,50 +2228,38 @@ function cunntest.SpatialMaxUnpooling_backward_batch()
    local ini = fun((outi + padi*2 - ki)/si) +1
    local inj = fun((outj + padj*2 - kj)/sj) +1
 
-   local tm = {}
-   local title = string.format('SpatialMaxUnpooling.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
-                               bs, from, inj, ini, kj, ki, bs, to, outj, outi)
-   times[title] = tm
-
-   local pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj)
-   if ceil_mode then pooler:ceil() end
-   local sunpool = nn.SpatialMaxUnpooling(pooler)
-
-   local original = torch.randn(bs,from,outj,outi)
-   local input = pooler:forward(original)
-   local gradOutput = torch.randn(original:size())
-   sunpool:forward(input)
-   sunpool:zeroGradParameters()
-   local groundgrad = sunpool:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+   for k, typename in ipairs(typenames) do
+      local ctype = t2cpu[typename]
+      local pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(ctype)
+      if ceil_mode then pooler:ceil() end
+      local sunpool = nn.SpatialMaxUnpooling(pooler):type(ctype)
+
+      local original = torch.randn(bs,from,outj,outi):type(typename)
+      original = original:type(ctype)
+      local input = pooler:forward(original)
+      local gradOutput = torch.randn(original:size()):type(typename)
+      gradOutput = gradOutput:type(ctype)
+      sunpool:forward(input)
       sunpool:zeroGradParameters()
-      groundgrad = sunpool:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+      local groundgrad = sunpool:backward(input, gradOutput)
 
-   pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):cuda()
-   if ceil_mode then pooler:ceil() end
-   local gunpool = nn.SpatialMaxUnpooling(pooler):cuda()
+      pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(typename)
+      if ceil_mode then pooler:ceil() end
+      local gunpool = nn.SpatialMaxUnpooling(pooler):type(typename)
 
-   original = original:cuda()
-   input = pooler:forward(original)
-   gunpool:forward(input)
+      original = original:type(typename)
+      input = pooler:forward(original)
+      gunpool:forward(input)
 
-   gradOutput = gradOutput:cuda()
-   gunpool:zeroGradParameters()
-   local rescuda = gunpool:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      gradOutput = gradOutput:type(typename)
       gunpool:zeroGradParameters()
-      rescuda = gunpool:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gunpool:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double()
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on state (backward) with %s', typename))
+    end
 end
 
 function cunntest.SpatialDilatedMaxPooling_forward()
@@ -2541,36 +2279,26 @@ function cunntest.SpatialDilatedMaxPooling_forward()
    local inj = (outj-1)*sj+(dilationj*(kj-1)+1)-2*padj
    local ceil_mode = math.random(0,1) == 1
 
-   local tm = {}
-   local title = string.format('SpatialDilatedMaxPooling.forward %dx%dx%d o %dx%d -> %dx%dx%d',
-                               from, inj, ini, kj, ki, to, outj, outi)
-   times[title] = tm
-
-   local input = torch.randn(from,inj,ini)
-   local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj)
-   if ceil_mode then sconv:ceil() end
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(from,inj,ini):type(typename)
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(ctype)
+      if ceil_mode then sconv:ceil() end
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):cuda()
-   if ceil_mode then gconv:ceil() end
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(typename)
+      if ceil_mode then gconv:ceil() end
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
-   local error_ind = gconv.indices:float() - sconv.indices
-   mytester:asserteq(error_ind:max(), 0, 'error on indices (forward) ')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+          string.format('error on state (forward) with %s', typename))
+      local error_ind = gconv.indices:long() - sconv.indices
+      mytester:asserteq(error_ind:max(), 0,
+          string.format('error on indices (forward) with %s', typename))
+    end
 end
 
 function cunntest.SpatialDilatedMaxPooling_forward_batch()
@@ -2591,34 +2319,23 @@ function cunntest.SpatialDilatedMaxPooling_forward_batch()
    local inj = (outj-1)*sj+(dilationj*(kj-1)+1)-2*padj
    local ceil_mode = math.random(0,1) == 1
 
-   local tm = {}
-   local title = string.format('SpatialDilatedMaxPooling.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
-                               bs, from, inj, ini, kj, ki, bs, to, outj, outi)
-   times[title] = tm
-
-   local input = torch.randn(bs,from,inj,ini)
-   local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj)
-   if ceil_mode then sconv:ceil() end
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(bs,from,inj,ini):type(typename)
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(ctype)
+      if ceil_mode then sconv:ceil() end
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):cuda()
-   if ceil_mode then gconv:ceil() end
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(typename)
+      if ceil_mode then gconv:ceil() end
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+          string.format('error on state (forward) with %s', typename))
+    end
 end
 
 function cunntest.SpatialDilatedMaxPooling_backward()
@@ -2638,43 +2355,31 @@ function cunntest.SpatialDilatedMaxPooling_backward()
    local inj = (outj-1)*sj+(dilationj*(kj-1)+1)-2*padj
    local ceil_mode = math.random(0,1) == 1
 
-   local tm = {}
-   local title = string.format('SpatialDilatedMaxPooling.backward %dx%dx%d o %dx%d -> %dx%dx%d',
-                               from, inj, ini, kj, ki, to, outj, outi)
-   times[title] = tm
-
-   local input = torch.randn(from,inj,ini)
-   local gradOutput = torch.randn(to,outj,outi)
-   local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj)
-   if ceil_mode then sconv:ceil() end
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(from,inj,ini):type(typename)
+      local gradOutput = torch.randn(to,outj,outi):type(typename)
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(ctype)
+      if ceil_mode then sconv:ceil() end
+      sconv:forward(input)
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput)
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):cuda()
-   if ceil_mode then gconv:ceil() end
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(typename)
+      if ceil_mode then gconv:ceil() end
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double()
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on state (backward) with %s', typename))
+    end
 end
 
 function cunntest.SpatialDilatedMaxPooling_backward_batch()
@@ -2695,43 +2400,31 @@ function cunntest.SpatialDilatedMaxPooling_backward_batch()
    local inj = (outj-1)*sj+(dilationj*(kj-1)+1)-2*padj
    local ceil_mode = math.random(0,1) == 1
 
-   local tm = {}
-   local title = string.format('SpatialDilatedMaxPooling.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
-                               bs, from, inj, ini, kj, ki, bs, to, outj, outi)
-   times[title] = tm
-
-   local input = torch.randn(bs,from,inj,ini)
-   local gradOutput = torch.randn(bs,to,outj,outi)
-   local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj)
-   if ceil_mode then sconv:ceil() end
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(bs,from,inj,ini):type(typename)
+      local gradOutput = torch.randn(bs,to,outj,outi):type(typename)
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(ctype)
+      if ceil_mode then sconv:ceil() end
+      sconv:forward(input)
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput)
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):cuda()
-   if ceil_mode then gconv:ceil() end
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(typename)
+      if ceil_mode then gconv:ceil() end
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double()
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on state (backward) with %s', typename))
+    end
 end
 
 function cunntest.SpatialFractionalMaxPooling_forward()
@@ -2752,66 +2445,55 @@ function cunntest.SpatialFractionalMaxPooling_forward()
     local ratioW = outW / inW
     local ratioH = outH / inH
 
-    local tm = {}
-    local title =
-        string.format('SpatialFractionalMaxPooling.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
-                      batch, plane, inH, inW, poolSizeH, poolSizeW, batch, plane, outH, outW)
-    times[title] = tm
-
-    local input = nil
-    if batch == 1 then
-        input = torch.Tensor(plane, inH, inW):uniform()
-    else
-        input = torch.Tensor(batch, plane, inH, inW):uniform()
-    end
+    for k, typename in ipairs(typenames) do
+        local input = nil
+        if batch == 1 then
+            input = torch.Tensor(plane, inH, inW):uniform():type(typename)
+        else
+            input = torch.Tensor(batch, plane, inH, inW):uniform():type(typename)
+        end
 
-    local module = nil
-    if useRatio then
-        module =
-            nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, ratioW, ratioH)
-    else
-        module =
-            nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH)
-    end
+        local ctype = t2cpu[typename]
+        input = input:type(ctype)
+        local module = nil
+        if useRatio then
+            module =
+                nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, ratioW, ratioH):type(ctype)
+        else
+            module =
+                nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH):type(ctype)
+        end
 
-    module:fixPoolingRegions()
+        module:fixPoolingRegions()
 
-    local groundtruth = module:forward(input)
-    local a = torch.Timer()
-    for i = 1,nloop do
-        groundtruth = module:forward(input)
-    end
-    tm.cpu = a:time().real
+        local groundtruth = module:forward(input)
 
-    input = input:cuda()
+        input = input:type(typename)
 
-    local gmodule = nil
-    if useRatio then
-        gmodule =
-            nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, ratioW, ratioH)
-    else
-        gmodule =
-            nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH)
-    end
+        local gmodule = nil
+        if useRatio then
+            gmodule =
+                nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, ratioW, ratioH)
+        else
+            gmodule =
+                nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH)
+        end
 
-    gmodule = gmodule:fixPoolingRegions():cuda()
+        gmodule = gmodule:fixPoolingRegions():type(typename)
 
-    -- For comparison purposes, make sure we are using the same random pooling regions
-    -- as the CPU
-    gmodule.randomSamples = module.randomSamples:cuda()
+        -- For comparison purposes, make sure we are using the same random pooling regions
+        -- as the CPU
+        gmodule.randomSamples = module.randomSamples:type(typename)
 
-    local rescuda = gmodule:forward(input)
-    a:reset()
-    for i = 1,nloop do
-        rescuda = gmodule:forward(input)
-    end
-    cutorch.synchronize()
-    tm.gpu = a:time().real
+        local rescuda = gmodule:forward(input)
 
-    local error = rescuda:float() - groundtruth
-    mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
-    local error_ind = gmodule.indices:float() - module.indices
-    mytester:asserteq(error_ind:abs():max(), 0, 'error on indices (forward) ')
+        local error = rescuda:double() - groundtruth:double()
+        mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+            string.format('error on state (forward) with %s', typename))
+        local error_ind = gmodule.indices:long() - module.indices
+        mytester:asserteq(error_ind:abs():max(), 0,
+            string.format('error on indices (forward) with %s', typename))
+    end
 end
 
 function cunntest.SpatialFractionalMaxPooling_backward()
@@ -2828,78 +2510,71 @@ function cunntest.SpatialFractionalMaxPooling_backward()
     local inW = math.random(minInW, minInW + 6)
     local inH = math.random(minInH, minInH + 6)
 
-    local tm = {}
-    local title =
-        string.format('SpatialFractionalMaxPooling.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
-                      batch, plane, inH, inW, poolSizeH, poolSizeW, batch, plane, outH, outW)
-    times[title] = tm
-
-    local input = nil
-    local gradOutput = nil
-    if batch == 1 then
-        input = torch.Tensor(plane, inH, inW):uniform()
-        gradOutput = torch.Tensor(plane, outH, outW):uniform()
-    else
-        input = torch.Tensor(batch, plane, inH, inW):uniform()
-        gradOutput = torch.Tensor(batch, plane, outH, outW):uniform()
-    end
+    for k, typename in ipairs(typenames) do
+        local input = nil
+        local gradOutput = nil
+        if batch == 1 then
+            input = torch.Tensor(plane, inH, inW):uniform():type(typename)
+            gradOutput = torch.Tensor(plane, outH, outW):uniform():type(typename)
+        else
+            input = torch.Tensor(batch, plane, inH, inW):uniform():type(typename)
+            gradOutput = torch.Tensor(batch, plane, outH, outW):uniform():type(typename)
+        end
 
-    local module =
-        nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH)
-        :fixPoolingRegions()
+        local ctype = t2cpu[typename]
+        input = input:type(ctype)
+        gradOutput = gradOutput:type(ctype)
+        local module =
+            nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH)
+            :fixPoolingRegions():type(ctype)
 
-    module:forward(input)
-    module:zeroGradParameters()
-    local groundgrad = module:backward(input, gradOutput)
-    local a = torch.Timer()
-    for i = 1,nloop do
+        -- convert type of randomSamples and ensure we don't resample
+        module:initSampleBuffer_(input)
+        module:fixPoolingRegions()
+        module.randomSamples = module.randomSamples:type(typename):type(ctype)
+        module:forward(input)
         module:zeroGradParameters()
-        groundgrad = module:backward(input, gradOutput)
-    end
-    tm.cpu = a:time().real
-
-    input = input:cuda()
-    gradOutput = gradOutput:cuda()
-
-    local gmodule =
-        nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH)
-        :fixPoolingRegions():cuda()
-    -- For comparison purposes, make sure we are using the same random pooling regions
-    -- as the CPU
-    gmodule.randomSamples = module.randomSamples:cuda()
-
-    gmodule:forward(input)
-    gmodule:zeroGradParameters()
-    local rescuda = gmodule:backward(input, gradOutput)
-    a:reset()
-    for i = 1,nloop do
+        local groundgrad = module:backward(input, gradOutput)
+
+        input = input:type(typename)
+        gradOutput = gradOutput:type(typename)
+
+        local gmodule =
+            nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH)
+            :fixPoolingRegions():type(typename)
+        -- For comparison purposes, make sure we are using the same random pooling regions
+        -- as the CPU
+        gmodule.randomSamples = module.randomSamples:type(typename)
+
+        gmodule:forward(input)
         gmodule:zeroGradParameters()
-        rescuda = gmodule:backward(input, gradOutput)
-    end
-    cutorch.synchronize()
-    tm.gpu = a:time().real
+        local rescuda = gmodule:backward(input, gradOutput)
 
-    local error = rescuda:float() - groundgrad
-    mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+        local error = rescuda:double() - groundgrad:double()
+        mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+            string.format('error on state (backward) with %s', typename))
+    end
 end
 
 function cunntest.SpatialAveragePooling_includepad()
-   local net = nn.SpatialAveragePooling(2, 2, 1, 1, 1, 1):cuda()
-   local net_no_include_pad = net:clone()
-   net_no_include_pad:setCountExcludePad()
-   local net_include_pad = net:clone()
-   net_include_pad:setCountIncludePad()
+   for k, typename in ipairs(typenames) do
+      local net = nn.SpatialAveragePooling(2, 2, 1, 1, 1, 1):type(typename)
+      local net_no_include_pad = net:clone()
+      net_no_include_pad:setCountExcludePad()
+      local net_include_pad = net:clone()
+      net_include_pad:setCountIncludePad()
 
-   local input = torch.FloatTensor(1, 1, 1, 1):cuda()
-   input[1][1][1][1] = 3
-   local out_noinclude = net_no_include_pad:forward(input)
-   local out_include = net_include_pad:forward(input)
+      local input = torch.FloatTensor(1, 1, 1, 1):type(typename)
+      input[1][1][1][1] = 3
+      local out_noinclude = net_no_include_pad:forward(input)
+      local out_include = net_include_pad:forward(input)
 
-   local noinc_out = out_noinclude[1][1][1][1]
-   local inc_out = out_include[1][1][1][1]
-   mytester:assertne(noinc_out, inc_out)
-   mytester:asserteq(3, noinc_out)
-   mytester:asserteq(3/4, inc_out)
+      local noinc_out = out_noinclude[1][1][1][1]
+      local inc_out = out_include[1][1][1][1]
+      mytester:assertne(noinc_out, inc_out)
+      mytester:asserteq(3, noinc_out)
+      mytester:asserteq(3/4, inc_out)
+   end
 end
 
 function cunntest.SpatialAveragePooling_forward()
@@ -2918,36 +2593,26 @@ function cunntest.SpatialAveragePooling_forward()
    local ceil_mode = math.random(0,1) == 1
    local count_exclude_pad = math.random(0,1) == 1
 
-   local tm = {}
-   local title = string.format('SpatialAveragePooling.forward %dx%dx%d o %dx%d -> %dx%dx%d',
-                               from, inj, ini, kj, ki, to, outj, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(from,inj,ini):type(typename)
 
-   local input = torch.randn(from,inj,ini)
-   local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj)
-   if ceil_mode then sconv:ceil() end
-   if count_exclude_pad then sconv:setCountExcludePad() end
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(ctype)
+      if ceil_mode then sconv:ceil() end
+      if count_exclude_pad then sconv:setCountExcludePad() end
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):cuda()
-   if ceil_mode then gconv:ceil() end
-   if count_exclude_pad then gconv:setCountExcludePad() end
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(typename)
+      if ceil_mode then gconv:ceil() end
+      if count_exclude_pad then gconv:setCountExcludePad() end
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+          string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialAveragePooling_forward_batch()
@@ -2967,36 +2632,26 @@ function cunntest.SpatialAveragePooling_forward_batch()
    local ceil_mode = math.random(0,1) == 1
    local count_exclude_pad = math.random(0,1) == 1
 
-   local tm = {}
-   local title = string.format('SpatialAveragePooling.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
-                               bs, from, inj, ini, kj, ki, bs, to, outj, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(bs,from,inj,ini):type(typename)
+      local ctype = t2cpu[typename]
 
-   local input = torch.randn(bs,from,inj,ini)
-   local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj)
-   if ceil_mode then sconv:ceil() end
-   if count_exclude_pad then sconv:setCountExcludePad() end
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+      input = input:type(ctype)
+      local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(ctype)
+      if ceil_mode then sconv:ceil() end
+      if count_exclude_pad then sconv:setCountExcludePad() end
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):cuda()
-   if ceil_mode then gconv:ceil() end
-   if count_exclude_pad then gconv:setCountExcludePad() end
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(typename)
+      if ceil_mode then gconv:ceil() end
+      if count_exclude_pad then gconv:setCountExcludePad() end
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+          string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialAveragePooling_backward()
@@ -3015,45 +2670,34 @@ function cunntest.SpatialAveragePooling_backward()
    local ceil_mode = math.random(0,1) == 1
    local count_exclude_pad = math.random(0,1) == 1
 
-   local tm = {}
-   local title = string.format('SpatialAveragePooling.backward %dx%dx%d o %dx%d -> %dx%dx%d',
-                               from, inj, ini, kj, ki, to, outj, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(from,inj,ini):type(typename)
+      local gradOutput = torch.randn(to,outj,outi):type(typename)
 
-   local input = torch.randn(from,inj,ini)
-   local gradOutput = torch.randn(to,outj,outi)
-   local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj)
-   if ceil_mode then sconv:ceil() end
-   if count_exclude_pad then sconv:setCountExcludePad() end
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(ctype)
+      if ceil_mode then sconv:ceil() end
+      if count_exclude_pad then sconv:setCountExcludePad() end
+      sconv:forward(input)
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput)
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):cuda()
-   if ceil_mode then gconv:ceil() end
-   if count_exclude_pad then gconv:setCountExcludePad() end
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(typename)
+      if ceil_mode then gconv:ceil() end
+      if count_exclude_pad then gconv:setCountExcludePad() end
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double()
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on state (backward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialAveragePooling_backward_batch()
@@ -3073,45 +2717,34 @@ function cunntest.SpatialAveragePooling_backward_batch()
    local ceil_mode = math.random(0,1) == 1
    local count_exclude_pad = math.random(0,1) == 1
 
-   local tm = {}
-   local title = string.format('SpatialAveragePooling.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
-                               bs, from, inj, ini, kj, ki, bs, to, outj, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(bs,from,inj,ini):type(typename)
+      local gradOutput = torch.randn(bs,to,outj,outi):type(typename)
 
-   local input = torch.randn(bs,from,inj,ini)
-   local gradOutput = torch.randn(bs,to,outj,outi)
-   local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj)
-   if ceil_mode then sconv:ceil() end
-   if count_exclude_pad then sconv:setCountExcludePad() end
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(ctype)
+      if ceil_mode then sconv:ceil() end
+      if count_exclude_pad then sconv:setCountExcludePad() end
+      sconv:forward(input)
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput)
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):cuda()
-   if ceil_mode then gconv:ceil() end
-   if count_exclude_pad then gconv:setCountExcludePad() end
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(typename)
+      if ceil_mode then gconv:ceil() end
+      if count_exclude_pad then gconv:setCountExcludePad() end
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double()
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on state (backward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialAdaptiveMaxPooling_forward()
@@ -3122,34 +2755,24 @@ function cunntest.SpatialAdaptiveMaxPooling_forward()
    local ini = math.random(10,256)
    local inj = math.random(10,256)
 
-   local tm = {}
-   local title = string.format('SpatialAdaptiveMaxPooling.forward %dx%dx%d -> %dx%dx%d',
-                               from, inj, ini, to, outj, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do -- forward test: repeat the reference-vs-typename comparison for every entry of typenames
+      local input = torch.randn(from,inj,ini):type(typename)
+      local ctype = t2cpu[typename] -- reference type for this typename (presumably its CPU counterpart; t2cpu is defined outside this hunk)
+      input = input:type(ctype)
+      local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(ctype)
+      local groundtruth = sconv:forward(input):type(ctype) -- reference output, converted to ctype
 
-   local input = torch.randn(from,inj,ini)
-   local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj)
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+      input = input:type(typename) -- same data, converted to the type under test
+      local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(typename)
+      local rescuda = gconv:forward(input)
 
-   input = input:cuda()
-   local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
+      local error = rescuda:double() - groundtruth:double() -- both outputs are compared as doubles
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+          string.format('error on state (forward) with %s', typename))
+      local error_ind = gconv.indices:long() - sconv.indices -- difference between the indices recorded by the two modules
+      mytester:asserteq(error_ind:max(), 0, -- NOTE(review): only the maximum difference is checked, so purely negative index differences would go unnoticed
+          string.format('error on indices (forward) with %s', typename))
    end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
-
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
-   local error_ind = gconv.indices:float() - sconv.indices
-   mytester:asserteq(error_ind:max(), 0, 'error on indices (forward) ')
 end
 
 function cunntest.SpatialAdaptiveMaxPooling_forward_noncontig()
@@ -3160,35 +2783,24 @@ function cunntest.SpatialAdaptiveMaxPooling_forward_noncontig()
    local ini = math.random(10,256)
    local inj = math.random(10,256)
 
-   local tm = {}
-   local title = string.format('SpatialAdaptiveMaxPooling.forward %s %dx%dx%d -> %dx%dx%d',
-                               'non-contiguous',from, inj, ini, to, outj, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do -- forward test on a transposed input, repeated for every entry of typenames
+      local input0 = torch.randn(from,ini,inj):type(typename) -- base tensor; each side converts it and then transposes
+      local ctype = t2cpu[typename] -- reference type for this typename (presumably its CPU counterpart; t2cpu is defined outside this hunk)
+      local input = input0:type(ctype):transpose(2,3) -- dims 2 and 3 swapped (the non-contiguous case this test is named for)
+      local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(ctype)
+      local groundtruth = sconv:forward(input)
 
-   local input0 = torch.randn(from,ini,inj)
-   local input = input0:transpose(2,3)
-   local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj)
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+      input = input0:type(typename):transpose(2,3) -- same transposed view, in the type under test
+      local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(typename)
+      local rescuda = gconv:forward(input)
 
-   input = input0:cuda():transpose(2,3)
-   local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
+      local error = rescuda:double() - groundtruth:double() -- both outputs are compared as doubles
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+          string.format('error on state (forward) with %s', typename))
+      local error_ind = gconv.indices:long() - sconv.indices -- difference between the indices recorded by the two modules
+      mytester:asserteq(error_ind:max(), 0, -- NOTE(review): only the maximum difference is checked, so purely negative index differences would go unnoticed
+          string.format('error on indices (forward) with %s', typename))
    end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
-
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
-   local error_ind = gconv.indices:float() - sconv.indices
-   mytester:asserteq(error_ind:max(), 0, 'error on indices (forward) ')
 end
 
 function cunntest.SpatialAdaptiveMaxPooling_forward_batch()
@@ -3200,32 +2812,21 @@ function cunntest.SpatialAdaptiveMaxPooling_forward_batch()
    local ini = math.random(10,256)
    local inj = math.random(10,256)
 
-   local tm = {}
-   local title = string.format('SpatialAdaptiveMaxPooling.forward %dx%dx%dx%d -> %dx%dx%dx%d',
-                               bs, from, inj, ini, bs, to, outj, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do -- batched forward test, repeated for every entry of typenames
+      local input = torch.randn(bs,from,inj,ini):type(typename) -- 4-d input whose leading dimension is bs
+      local ctype = t2cpu[typename] -- reference type for this typename (presumably its CPU counterpart; t2cpu is defined outside this hunk)
+      input = input:type(ctype)
+      local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(ctype)
+      local groundtruth = sconv:forward(input)
 
-   local input = torch.randn(bs,from,inj,ini)
-   local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj)
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+      input = input:type(typename) -- same data, converted to the type under test
+      local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(typename)
+      local rescuda = gconv:forward(input)
 
-   input = input:cuda()
-   local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
+      local error = rescuda:double() - groundtruth:double() -- both outputs are compared as doubles
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+          string.format('error on state (forward) with %s', typename))
    end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
-
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
 end
 
 function cunntest.SpatialAdaptiveMaxPooling_backward()
@@ -3236,41 +2837,29 @@ function cunntest.SpatialAdaptiveMaxPooling_backward()
    local ini = math.random(10,256)
    local inj = math.random(10,256)
 
-   local tm = {}
-   local title = string.format('SpatialAdaptiveMaxPooling.backward %dx%dx%d -> %dx%dx%d',
-                               from, inj, ini, to, outj, outi)
-   times[title] = tm
-
-   local input = torch.randn(from,inj,ini)
-   local gradOutput = torch.randn(to,outj,outi)
-   local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj)
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+   for k, typename in ipairs(typenames) do -- backward test, repeated for every entry of typenames
+      local input = torch.randn(from,inj,ini):type(typename)
+      local gradOutput = torch.randn(to,outj,outi):type(typename)
+      local ctype = t2cpu[typename] -- reference type for this typename (presumably its CPU counterpart; t2cpu is defined outside this hunk)
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(ctype)
+      sconv:forward(input) -- forward is run before backward; its return value is not used here
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput) -- reference gradient from the ctype module
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):cuda()
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename) -- same data, converted to the type under test
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(typename)
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double() -- both gradients are compared as doubles
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on state (backward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialAdaptiveMaxPooling_backward_noncontig()
@@ -3281,42 +2870,29 @@ function cunntest.SpatialAdaptiveMaxPooling_backward_noncontig()
    local ini = math.random(10,256)
    local inj = math.random(10,256)
 
-   local tm = {}
-   local title = string.format('SpatialAdaptiveMaxPooling.backward %s %dx%dx%d -> %dx%dx%d',
-                               'non-contiguous', from, inj, ini, to, outj, outi)
-   times[title] = tm
-
-   local input0 = torch.randn(from,ini,inj)
-   local input = input0:transpose(2,3)
-   local gradOutput = torch.randn(to,outj,outi)
-   local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj)
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+   for k, typename in ipairs(typenames) do -- backward test on a transposed input, repeated for every entry of typenames
+      local input0 = torch.randn(from,ini,inj):type(typename) -- base tensor; each side converts it and then transposes
+      local gradOutput = torch.randn(to,outj,outi):type(typename)
+      local ctype = t2cpu[typename] -- reference type for this typename (presumably its CPU counterpart; t2cpu is defined outside this hunk)
+      local input = input0:type(ctype):transpose(2,3) -- dims 2 and 3 swapped (the non-contiguous case this test is named for)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(ctype)
+      sconv:forward(input) -- forward is run before backward; its return value is not used here
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput) -- reference gradient from the ctype module
 
-   input = input0:cuda():transpose(2,3)
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):cuda()
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input0:type(typename):transpose(2,3) -- same transposed view, in the type under test
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(typename)
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double() -- both gradients are compared as doubles
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on state (backward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialAdaptiveMaxPooling_backward_batch()
@@ -3328,41 +2904,29 @@ function cunntest.SpatialAdaptiveMaxPooling_backward_batch()
    local ini = math.random(10,256)
    local inj = math.random(10,256)
 
-   local tm = {}
-   local title = string.format('SpatialAdaptiveMaxPooling.backward %dx%dx%dx%d -> %dx%dx%dx%d',
-                               bs, from, inj, ini, bs, to, outj, outi)
-   times[title] = tm
-
-   local input = torch.randn(bs,from,inj,ini)
-   local gradOutput = torch.randn(bs,to,outj,outi)
-   local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj)
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+   for k, typename in ipairs(typenames) do -- batched backward test, repeated for every entry of typenames
+      local input = torch.randn(bs,from,inj,ini):type(typename) -- 4-d input whose leading dimension is bs
+      local gradOutput = torch.randn(bs,to,outj,outi):type(typename)
+      local ctype = t2cpu[typename] -- reference type for this typename (presumably its CPU counterpart; t2cpu is defined outside this hunk)
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(ctype)
+      sconv:forward(input) -- forward is run before backward; its return value is not used here
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput) -- reference gradient from the ctype module
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):cuda()
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename) -- same data, converted to the type under test
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(typename)
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double() -- both gradients are compared as doubles
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on state (backward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialLPPooling_forward()
@@ -3495,164 +3059,154 @@ end
 
 function cunntest.BCECriterion_forward()
   local size = math.random(1,100)
-  local input = torch.Tensor(size):uniform()
-  local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input))
-
-  local tm = {}
-  local title = string.format('BCECriterion.forward, Size: %d', size)
-  times[title] = tm
-
-  local crit = nn.BCECriterion()
-  local rescpu = crit:forward(input, target)
-  local a = torch.Timer()
-  for i = 1,nloop do
-     rescpu = crit:forward(input, target)
-  end
-  tm.cpu = a:time().real
-
-  input = input:cuda()
-  target = target:cuda()
-  local g_crit = nn.BCECriterion():cuda()
-  local rescuda = g_crit:forward(input, target)
-  a:reset()
-  for i = 1,nloop do
-     rescuda = g_crit:forward(input, target)
-  end
-  cutorch.synchronize()
-  tm.gpu = a:time().real
-  local errorVal = rescuda - rescpu
-  mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ')
-
-  -- test vs lua implementation
-  buffer = input.new()
-  local restruth = BCECriterion_forward_truth(buffer, input, target, nil, true)
-  for i = 1,nloop do
-    local restruth = BCECriterion_forward_truth(buffer, input, target, nil, true)
+
+  for k, typename in ipairs(typenames) do -- BCECriterion forward test, repeated for every entry of typenames
+     local input = torch.Tensor(size):uniform():type(typename)
+     local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input)) -- 0/1 labels in the same tensor type as input
+
+     local ctype = t2cpu[typename] -- reference type for this typename (presumably its CPU counterpart; t2cpu is defined outside this hunk)
+     input = input:type(ctype)
+     target = target:type(ctype)
+     local crit = nn.BCECriterion():type(ctype)
+     local rescpu = crit:forward(input, target) -- reference loss from the ctype criterion
+
+     input = input:type(typename)
+     target = target:type(typename)
+     local g_crit = nn.BCECriterion():type(typename)
+     local rescuda = g_crit:forward(input, target)
+     local errorVal = math.abs(rescuda - rescpu) -- absolute difference, so a deviation in either direction fails the check
+     mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename),
+        string.format('error on state (forward) with %s', typename))
+
+     -- test vs lua implementation
+     input = input:type(ctype)
+     target = target:type(ctype)
+     local buffer = input.new() -- scratch tensor handed to the reference helper; kept local so it does not leak into the global table
+     local restruth = BCECriterion_forward_truth(buffer, input, target, nil, true)
+     errorVal = math.abs(rescpu - restruth)
+     mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename),
+        string.format('error on state (forward) with %s', typename))
+     errorVal = math.abs(rescuda - restruth)
+     mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename),
+        string.format('error on state (forward) with %s', typename))
   end
-  errorVal = rescpu - restruth
-  mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ')
-  errorVal = rescuda - restruth
-  mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ')
 end
 
-
 function cunntest.BCECriterionWeights_forward()
   local size = math.random(1,100)
-  local input = torch.Tensor(size):uniform()
-  local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input))
-  local weights = torch.Tensor(size):uniform()
-
-  local tm = {}
-  local title = string.format('BCECriterionWeights.forward, Size: %d', size)
-  times[title] = tm
-
-  local crit = nn.BCECriterion(weights)
-  local rescpu = crit:forward(input, target)
-  local a = torch.Timer()
-  for i = 1,nloop do
-    rescpu = crit:forward(input, target)
-  end
-  tm.cpu = a:time().real
-
-  input = input:cuda()
-  target = target:cuda()
-  weights = weights:cuda()
-  local g_crit = nn.BCECriterion(weights):cuda()
-  local rescuda = g_crit:forward(input, target)
-  a:reset()
-  for i = 1,nloop do
-    rescuda = g_crit:forward(input, target)
+  for k, typename in ipairs(typenames) do -- weighted BCECriterion forward test, repeated for every entry of typenames
+     local input = torch.Tensor(size):uniform():type(typename)
+     local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input)) -- 0/1 labels in the same tensor type as input
+     local weights = torch.Tensor(size):uniform():type(typename)
+
+     local ctype = t2cpu[typename] -- reference type for this typename (presumably its CPU counterpart; t2cpu is defined outside this hunk)
+     input = input:type(ctype)
+     target = target:type(ctype)
+     weights = weights:type(ctype)
+     local crit = nn.BCECriterion(weights):type(ctype)
+     local rescpu = crit:forward(input, target) -- reference loss from the ctype criterion
+
+     input = input:type(typename)
+     target = target:type(typename)
+     weights = weights:type(typename)
+     local g_crit = nn.BCECriterion(weights):type(typename)
+     local rescuda = g_crit:forward(input, target)
+
+     local errorVal = math.abs(rescuda - rescpu) -- absolute difference, so a deviation in either direction fails the check
+     mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename),
+        string.format('error on state (forward) with %s', typename))
+
+     -- test vs lua implementation
+     -- FIXME: half does not support dot without CUDA 8.0, so can't compare to lua implementation.
+     if typename ~= 'torch.CudaHalfTensor' then
+        local buffer = input.new() -- scratch tensor handed to the reference helper; kept local so it does not leak into the global table
+        local restruth = BCECriterion_forward_truth(buffer, input, target, weights, true) -- local again, as it was before this patch
+        errorVal = math.abs(rescpu - restruth)
+        mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename),
+           string.format('error on state (forward) with %s', typename))
+        errorVal = math.abs(rescuda - restruth)
+        mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename),
+           string.format('error on state (forward) with %s', typename))
+     end
   end
-  cutorch.synchronize()
-  tm.gpu = a:time().real
-  local errorVal = rescuda - rescpu
-  mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ')
-
-  -- test vs lua implementation
-  buffer = input.new()
-  local restruth = BCECriterion_forward_truth(buffer, input, target, weights, true)
-  for i = 1,nloop do
-    local restruth = BCECriterion_forward_truth(buffer, input, target, weights, true)
-  end
-  errorVal = rescpu - restruth
-  mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ')
-  errorVal = rescuda - restruth
-  mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ')
 end
 
-
 function cunntest.MarginCriterion_forward()
   local size = math.random(1,100)
-  local input = (torch.rand(size)-0.5) * 2 -- data spread from -1 to 1
-  local target = (torch.round(torch.rand(size))*2)-1 -- generate random labels -1, 1
-
-  local tm = {}
-  local title = string.format('MarginCriterion.forward, Size: %d', size)
-  times[title] = tm
-
-  local crit = nn.MarginCriterion()
-  local groundtruth= crit:forward(input, target)
-  local a = torch.Timer()
-  for i = 1,nloop do
-     groundtruth = crit:forward(input, target)
-  end
-  tm.cpu = a:time().real
-
-  input = input:cuda()
-  target = target:cuda()
-  local g_crit = nn.MarginCriterion():cuda()
-  local rescuda = g_crit:forward(input, target)
-  a:reset()
-  for i = 1,nloop do
-     rescuda = g_crit:forward(input, target)
+
+  for k, typename in ipairs(typenames) do -- MarginCriterion forward test, repeated for every entry of typenames
+    local input = ((torch.rand(size)-0.5) * 2):type(typename) -- data spread from -1 to 1
+    local target = ((torch.round(torch.rand(size))*2)-1):type(typename) -- generate random labels -1, 1
+
+    local ctype = t2cpu[typename] -- reference type for this typename (presumably its CPU counterpart; t2cpu is defined outside this hunk)
+    input = input:type(ctype)
+    target = target:type(ctype) -- convert the labels themselves; converting input here would replace the -1/1 labels with the data
+    local crit = nn.MarginCriterion():type(ctype)
+    local groundtruth= crit:forward(input, target)
+
+    input = input:type(typename)
+    target = target:type(typename)
+    local g_crit = nn.MarginCriterion():type(typename)
+    local rescuda = g_crit:forward(input, target)
+    local errorVal = math.abs(rescuda - groundtruth) -- absolute difference, so a deviation in either direction fails the check
+    mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename),
+        string.format('error on state (forward) with %s', typename))
   end
-  cutorch.synchronize()
-  tm.gpu = a:time().real
-  local errorVal = rescuda - groundtruth
-  mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ')
 end
 
 function cunntest.MultiLabelMarginCriterion_forward()
   local size = math.random(1,100)
-  local input = (torch.rand(size)-0.5) * 2 -- data spread from -1 to 1
-  local target = torch.round(torch.rand(size)*(size-1)):add(1) -- generate random labels > 0
-  local zero = math.random(0,size) -- turn some labels into 0 targets
-  if zero > 0 then
-     target:sub(size-zero+1,size):zero()
-  end
 
-  local crit = nn.MultiLabelMarginCriterion()
-  local groundtruth= crit:forward(input, target)
-  input = input:cuda()
-  target = target:cuda()
-  local g_crit = nn.MultiLabelMarginCriterion():cuda()
-  local rescuda = g_crit:forward(input, target)
-  local errorVal = rescuda - groundtruth
-  mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ')
+  for k, typename in ipairs(typenames) do -- MultiLabelMarginCriterion forward test, repeated for every entry of typenames
+     local input = ((torch.rand(size)-0.5) * 2):type(typename) -- data spread from -1 to 1
+     local target = torch.round(torch.rand(size)*(size-1)):add(1) -- generate random labels > 0
+     local zero = math.random(0,size) -- turn some labels into 0 targets
+     if zero > 0 then
+        target:sub(size-zero+1,size):zero()
+     end
+
+     local ctype = t2cpu[typename] -- reference type for this typename (presumably its CPU counterpart; t2cpu is defined outside this hunk)
+     input = input:type(ctype)
+     local crit = nn.MultiLabelMarginCriterion():type(ctype)
+     local groundtruth= crit:forward(input, target) -- target is passed as created above, without a type conversion
+     input = input:type(typename)
+     target = target:type(typename)
+     local g_crit = nn.MultiLabelMarginCriterion():type(typename)
+     local rescuda = g_crit:forward(input, target)
+     local errorVal = math.abs(rescuda - groundtruth) -- absolute difference, so a deviation in either direction fails the check
+     mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename),
+        string.format('error on state (forward) with %s', typename))
+  end
 end
 
 function cunntest.MultiLabelMarginCriterion_backward()
    local size = math.random(1,100)
-   local input = (torch.rand(size)-0.5) * 2 -- data spread from -1 to 1
-   local target = torch.round(torch.rand(size)*(size-1)):add(1) -- generate random labels > 0
-   local zero = math.random(0,size) -- turn some labels into 0 targets
-   if zero > 0 then
-      target:sub(size-zero+1,size):zero()
-   end
 
-   local crit = nn.MultiLabelMarginCriterion()
-   local pred = crit:forward(input, target)
-   local groundgrad = crit:backward(input, target)
+   for k, typename in ipairs(typenames) do -- MultiLabelMarginCriterion backward test, repeated for every entry of typenames
+      local input = ((torch.rand(size)-0.5) * 2):type(typename) -- data spread from -1 to 1
+      local target = torch.round(torch.rand(size)*(size-1)):add(1) -- generate random labels > 0
+      local zero = math.random(0,size) -- turn some labels into 0 targets
+      if zero > 0 then
+         target:sub(size-zero+1,size):zero()
+      end
 
-   input = input:cuda()
-   target = target:cuda()
-   local g_crit = nn.MultiLabelMarginCriterion():cuda()
-   g_crit:forward(input, target)
-   local rescuda = g_crit:backward(input, target)
+      local ctype = t2cpu[typename] -- reference type for this typename (presumably its CPU counterpart; t2cpu is defined outside this hunk)
+      input = input:type(ctype)
+      local crit = nn.MultiLabelMarginCriterion():type(ctype)
+      local pred = crit:forward(input, target) -- forward is run before backward; pred itself is not read afterwards
+      local groundgrad = crit:backward(input, target) -- reference gradient from the ctype criterion
 
-   local error = rescuda:float() - groundgrad
+      input = input:type(typename) -- same data, converted to the type under test
+      target = target:type(typename)
+      local g_crit = nn.MultiLabelMarginCriterion():type(typename)
+      g_crit:forward(input, target)
+      local rescuda = g_crit:backward(input, target)
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      local error = rescuda:double() - groundgrad:double() -- both gradients are compared as doubles
+
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+         string.format('error on state (backward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialCrossMapLRN_forward_batch()
@@ -3664,31 +3218,22 @@ function cunntest.SpatialCrossMapLRN_forward_batch()
    local beta  = math.random(0,100)/100
    local k = math.random(1,3)
 
-   local tm = {}
-   local title = string.format('SpatialCrossMapLRN.forward')
-   times[title] = tm
+   for _, typename in ipairs(typenames) do -- index is discarded so that k below still refers to the random LRN constant chosen above
+      local input = torch.rand(bs, nbfeatures, inputSize, inputSize):type(typename)
 
-   local input = torch.rand(bs, nbfeatures, inputSize, inputSize)
-   local sconv = nn.SpatialCrossMapLRN(size, alpha, beta, k)
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename] -- reference type for this typename (presumably its CPU counterpart; t2cpu is defined outside this hunk)
+      input = input:type(ctype)
+      local sconv = nn.SpatialCrossMapLRN(size, alpha, beta, k):type(ctype)
+      local groundtruth = sconv:forward(input) -- reference output from the ctype module
 
-   input = input:cuda()
-   local gconv = nn.SpatialCrossMapLRN(size, alpha, beta, k):cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename) -- same data, converted to the type under test
+      local gconv = nn.SpatialCrossMapLRN(size, alpha, beta, k):type(typename)
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double() -- both outputs are compared as doubles
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+          string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialCrossMapLRN_backward_batch()
@@ -3700,242 +3245,208 @@ function cunntest.SpatialCrossMapLRN_backward_batch()
    local beta  = math.random(0,100)/100
    local k = math.random(1,3)
 
-   local tm = {}
-   local title = string.format('SpatialCrossMapLRN.backward')
-   times[title] = tm
+   for _, typename in ipairs(typenames) do -- index unused; naming it `k` would shadow the LRN constant k drawn above
+      local input = torch.rand(bs, nbfeatures, inputSize, inputSize):type(typename)
+      local gradOutput = torch.rand(input:size()):type(typename)
 
-   local input = torch.rand(bs, nbfeatures, inputSize, inputSize)
-   local gradOutput = torch.rand(input:size())
-   local sconv = nn.SpatialCrossMapLRN(size, alpha, beta, k)
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+      local ctype = t2cpu[typename] -- reference tensor type paired with typename
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.SpatialCrossMapLRN(size, alpha, beta, k):type(ctype)
+      sconv:forward(input)
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput) -- reference gradInput
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.SpatialCrossMapLRN(size, alpha, beta, k):cuda()
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename) -- must be typename (not ctype), otherwise the reference is compared with itself
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.SpatialCrossMapLRN(size, alpha, beta, k):type(typename)
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput) -- gradInput under the type being tested
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double() -- both sides converted to double before subtracting
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on state (backward) with %s', typename))
+   end
 end
 
 function cunntest.MarginCriterion_backward()
    local size = math.random(1,100)
 
-   local tm = {}
-   local title = string.format('MarginCriterion.backward, Size %d', size)
-   times[title] = tm
-
-   local input = (torch.rand(size)-0.5) * 2 -- data spread from -1 to 1
-   local target = (torch.round(torch.rand(size))*2)-1 -- generate random labels -1, 1
+   for k, typename in ipairs(typenames) do -- run the gradient comparison once per entry of typenames
+      local input = ((torch.rand(size)-0.5) * 2):type(typename) -- data spread from -1 to 1
+      local target = ((torch.round(torch.rand(size))*2)-1):type(typename) -- generate random labels -1, 1
 
-   local crit = nn.MarginCriterion()
-   crit:forward(input, target)
-   local groundgrad = crit:backward(input, target)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundgrad = crit:backward(input, target)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename] -- reference tensor type paired with typename
+      input = input:type(ctype)
+      target = target:type(ctype)
+      local crit = nn.MarginCriterion():type(ctype)
+      crit:forward(input, target)
+      local groundgrad = crit:backward(input, target) -- reference gradient
 
-   input = input:cuda()
-   target = target:cuda()
-   local g_crit = nn.MarginCriterion():cuda()
-   g_crit:forward(input, target)
-   local rescuda = g_crit:backward(input, target)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = g_crit:backward(input, target)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      target = target:type(typename)
+      local g_crit = nn.MarginCriterion():type(typename)
+      g_crit:forward(input, target)
+      local rescuda = g_crit:backward(input, target) -- gradient under the type being tested
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double() -- both sides converted to double before subtracting
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+         string.format('error on state (backward) with %s', typename))
+   end
 end
 
 function cunntest.BCECriterion_backward()
    local size = math.random(1,100)
 
-   local tm = {}
-   local title = string.format('BCECriterion.backward, Size %d', size)
-   times[title] = tm
-
-   local input = torch.Tensor(size):uniform()
-   local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input))
+   for k, typename in ipairs(typenames) do -- run the gradient comparison once per entry of typenames
+      local input = torch.Tensor(size):uniform():type(typename) -- uniform random input, cast to the type under test
+      local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input)) -- 0/1 labels in the same type as input
 
-   local crit = nn.BCECriterion()
-   crit:forward(input, target)
-   local groundgrad = crit:backward(input, target)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundgrad = crit:backward(input, target)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename] -- reference tensor type paired with typename (presumably its CPU counterpart)
+      input = input:type(ctype)
+      target = target:type(ctype)
+      local crit = nn.BCECriterion():type(ctype)
+      crit:forward(input, target)
+      local groundgrad = crit:backward(input, target) -- reference gradient
 
-   input = input:cuda()
-   target = target:cuda()
-   local g_crit = nn.BCECriterion():cuda()
-   g_crit:forward(input, target)
-   local rescuda = g_crit:backward(input, target)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = g_crit:backward(input, target)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      target = target:type(typename)
+      local g_crit = nn.BCECriterion():type(typename)
+      g_crit:forward(input, target)
+      local rescuda = g_crit:backward(input, target) -- gradient under the type being tested
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double() -- both sides converted to double before subtracting
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), -- tolerance depends on typename
+         string.format('error on state (backward) with %s', typename))
+   end
 end
 
 function cunntest.BCECriterionWeights_backward()
   local size = math.random(1,100)
 
-  local tm = {}
-  local title = string.format('BCECriterionWeights.backward, Size %d', size)
-  times[title] = tm
-
-  local input = torch.Tensor(size):uniform()
-  local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input))
-  local weights = torch.Tensor(size):uniform()
-
-  local crit = nn.BCECriterion(weights)
-  crit:forward(input, target)
-  local groundgrad = crit:backward(input, target)
-  local a = torch.Timer()
-  for i = 1,nloop do
-    groundgrad = crit:backward(input, target)
-  end
-  tm.cpu = a:time().real
-
-  input = input:cuda()
-  target = target:cuda()
-  weights = weights:cuda()
-  local g_crit = nn.BCECriterion(weights):cuda()
-  g_crit:forward(input, target)
-  local rescuda = g_crit:backward(input, target)
-  a:reset()
-  for i = 1,nloop do
-    rescuda = g_crit:backward(input, target)
+  for k, typename in ipairs(typenames) do -- run the weighted-BCE gradient comparison once per entry of typenames
+     local input = torch.Tensor(size):uniform():type(typename) -- uniform random input, cast to the type under test
+     local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input)) -- 0/1 labels in the same type as input
+     local weights = torch.Tensor(size):uniform():type(typename) -- one uniform random weight per element
+
+     local ctype = t2cpu[typename] -- reference tensor type paired with typename (presumably its CPU counterpart)
+     input = input:type(ctype)
+     target = target:type(ctype)
+     weights = weights:type(ctype)
+     local crit = nn.BCECriterion(weights):type(ctype)
+     crit:forward(input, target)
+     local groundgrad = crit:backward(input, target) -- reference gradient
+
+     input = input:type(typename)
+     target = target:type(typename)
+     weights = weights:type(typename) -- same weight values, converted for the criterion under test
+     local g_crit = nn.BCECriterion(weights):type(typename)
+     g_crit:forward(input, target)
+     local rescuda = g_crit:backward(input, target) -- gradient under the type being tested
+
+     local error = rescuda:double() - groundgrad:double() -- both sides converted to double before subtracting
+
+     mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), -- tolerance depends on typename
+        string.format('error on state (backward) with %s', typename))
   end
-  cutorch.synchronize()
-  tm.gpu = a:time().real
-
-  local error = rescuda:float() - groundgrad
-
-  mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
 end
 
 function cunntest.mse()
    for sizeAverage = 0, 1 do
-      local size = math.random(3000,5000)
-      local input = torch.randn(size,1,1)
-      local target = torch.randn(size)
-      local mod = nn.MSECriterion(sizeAverage == 1)
-
-      local tm = {}
-      local title = string.format('MSECriterion sizeAverage %d, %d ', sizeAverage, size)
-      times[title] = tm
-
-      local a = torch.Timer()
-      local fout = mod:forward(input,target)
-      local fgin = mod:backward(input,target):clone()
-      tm.cpu = a:time().real
-
-      local cinput = input:cuda()
-      local ctarget = target:cuda()
-      local cmod = nn.MSECriterion(sizeAverage == 1):cuda()
-      a:reset()
-      local cout = cmod:forward(cinput,ctarget)
-      local cgin = cmod:backward(cinput,ctarget)
-      cutorch.synchronize()
-      tm.gpu = a:time().real
-
-      mytester:assertlt(math.abs(fout-cout), 0.02, 'error  on output')
-      local gerr = cgin:float() - fgin
-      mytester:assertlt(gerr:abs():max(), precision_forward, 'error  on gradInput')
+      for k, typename in ipairs(typenames) do -- compare MSECriterion loss and gradInput once per entry of typenames
+         local size = math.random(3000,5000) -- a fresh random size is drawn for every tensor type
+         local input = torch.randn(size,1,1):type(typename)
+         local target = torch.randn(size):type(typename)
+
+         local ctype = t2cpu[typename] -- reference tensor type paired with typename (presumably its CPU counterpart)
+         input = input:type(ctype)
+         target = target:type(ctype)
+         local mod = nn.MSECriterion(sizeAverage == 1):type(ctype)
+
+         local fout = mod:forward(input,target) -- reference loss value
+         local fgin = mod:backward(input,target):clone() -- reference gradInput, cloned into its own tensor
+
+         local cinput = input:type(typename)
+         local ctarget = target:type(typename)
+         local cmod = nn.MSECriterion(sizeAverage == 1):type(typename)
+         local cout = cmod:forward(cinput,ctarget) -- loss under the type being tested
+         local cgin = cmod:backward(cinput,ctarget) -- gradInput under the type being tested
+
+         if (typename == 'torch.CudaHalfTensor') then -- half case only
+            fout = ffi.C.THC_half2float(ffi.C.THC_float2half(fout)) -- round-trip the reference loss through half before comparing
+         end
+         mytester:assertlt(math.abs(fout-cout), precision_forward_type(0.02, typename), -- base tolerance 0.02, adjusted per typename
+            string.format('error  on output with %s', typename))
+         local gerr = cgin:double() - fgin:double() -- both sides converted to double before subtracting
+         mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename),
+            string.format('error  on gradInput with %s', typename))
+      end
    end
 end
 
 function cunntest.SmoothL1()
    for sizeAverage = 0, 1 do
       local size = math.random(3000,5000)
-      local input = torch.randn(size,1,1)
-      local target = torch.randn(size)
-      local mod = nn.SmoothL1Criterion(sizeAverage == 1)
-
-      local tm = {}
-      local title = string.format('SmoothL1Criterion sizeAverage %d, %d ', sizeAverage, size)
-      times[title] = tm
-
-      local a = torch.Timer()
-      local fout = mod:forward(input,target)
-      local fgin = mod:backward(input,target):clone()
-      tm.cpu = a:time().real
 
-      local cinput = input:cuda()
-      local ctarget = target:cuda()
-      local cmod = nn.SmoothL1Criterion(sizeAverage == 1):cuda()
-      a:reset()
-      local cout = cmod:forward(cinput,ctarget)
-      local cgin = cmod:backward(cinput,ctarget)
-      cutorch.synchronize()
-      tm.gpu = a:time().real
-
-      mytester:assertlt(math.abs(fout-cout), 0.01, 'error  on output')
-      local gerr = cgin:float() - fgin
-      mytester:assertlt(gerr:abs():max(), precision_forward, 'error  on gradInput')
+      for k, typename in ipairs(typenames) do -- compare SmoothL1Criterion loss and gradInput once per entry of typenames
+         local input = torch.randn(size,1,1):type(typename)
+         local target = torch.randn(size):type(typename)
+
+         local ctype = t2cpu[typename] -- reference tensor type paired with typename (presumably its CPU counterpart)
+         input = input:type(ctype)
+         target = target:type(ctype)
+         local mod = nn.SmoothL1Criterion(sizeAverage == 1):type(ctype)
+
+         local fout = mod:forward(input,target) -- reference loss value
+         local fgin = mod:backward(input,target):clone() -- reference gradInput, cloned into its own tensor
+
+         local cinput = input:type(typename)
+         local ctarget = target:type(typename)
+         local cmod = nn.SmoothL1Criterion(sizeAverage == 1):type(typename)
+         local cout = cmod:forward(cinput,ctarget) -- loss under the type being tested
+         local cgin = cmod:backward(cinput,ctarget) -- gradInput under the type being tested
+
+         if (typename == 'torch.CudaHalfTensor') then -- half case only
+            fout = ffi.C.THC_half2float(ffi.C.THC_float2half(fout)) -- round-trip the reference loss through half before comparing
+         end
+         mytester:assertlt(math.abs(fout-cout), 0.01, string.format('error  on output with %s', typename)) -- NOTE(review): fixed 0.01 for every type, whereas cunntest.mse scales its tolerance via precision_forward_type - confirm this is intended
+         local gerr = cgin:double() - fgin:double() -- both sides converted to double before subtracting
+         mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename),
+            string.format('error  on gradInput with %s', typename))
+      end
    end
 end
 
 function cunntest.SoftMarginCriterion()
    for sizeAverage = 0, 1 do
-      local size = math.random(3000,5000)
-      local input = torch.randn(size,1,1)
-      local target = torch.randn(size)
-      local mod = nn.SoftMarginCriterion(sizeAverage == 1)
-
-      local tm = {}
-      local title = string.format('SoftMarginCriterion sizeAverage %d, %d ', sizeAverage, size)
-      times[title] = tm
-
-      local a = torch.Timer()
-      local fout = mod:forward(input,target)
-      local fgin = mod:backward(input,target):clone()
-      tm.cpu = a:time().real
-
-      local cinput = input:cuda()
-      local ctarget = target:cuda()
-      local cmod = nn.SoftMarginCriterion(sizeAverage == 1):cuda()
-      a:reset()
-      local cout = cmod:forward(cinput,ctarget)
-      local cgin = cmod:backward(cinput,ctarget)
-      cutorch.synchronize()
-      tm.gpu = a:time().real
-
-      mytester:assertlt(math.abs(fout-cout), 0.01, 'error  on output')
-      local gerr = cgin:float() - fgin
-      mytester:assertlt(gerr:abs():max(), precision_forward, 'error  on gradInput')
+      for k, typename in ipairs(typenames) do -- compare SoftMarginCriterion loss and gradInput once per entry of typenames
+         local size = math.random(3000,5000) -- a fresh random size is drawn for every tensor type
+         local input = torch.randn(size,1,1):type(typename)
+         local target = torch.randn(size):type(typename)
+
+         local ctype = t2cpu[typename] -- reference tensor type paired with typename
+         input = input:type(ctype)
+         target = target:type(ctype)
+         local mod = nn.SoftMarginCriterion(sizeAverage == 1):type(ctype)
+
+         local fout = mod:forward(input,target) -- reference loss value
+         local fgin = mod:backward(input,target):clone() -- reference gradInput, cloned into its own tensor
+
+         local cinput = input:type(typename)
+         local ctarget = target:type(typename)
+         local cmod = nn.SoftMarginCriterion(sizeAverage == 1):type(typename)
+         local cout = cmod:forward(cinput,ctarget) -- loss under the type being tested
+         local cgin = cmod:backward(cinput,ctarget) -- gradInput under the type being tested
+
+         mytester:assertlt(math.abs(fout-cout), 0.01, string.format('error  on output with %s', typename)) -- name the failing type, as the gradInput check does
+         local gerr = cgin:double() - fgin:double() -- both sides converted to double before subtracting
+         mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename),
+            string.format('error  on gradInput with %s', typename))
+      end
    end
 end
 
@@ -3943,31 +3454,31 @@ end
 function cunntest.distkldiv()
    for sizeAverage = 0, 1 do
       local size = math.random(3000,5000)
-      local input = torch.randn(size) -- TODO, make it back to (size, 1, 1), see https://github.com/torch/cunn/issues/245#issuecomment-209260954
-      local target = torch.randn(size)
-      local mod = nn.DistKLDivCriterion(sizeAverage == 1)
 
-      local tm = {}
-      local title = string.format('DistKLDivCriterion sizeAverage %d, %d ',sizeAverage,size)
-      times[title] = tm
-
-      local a = torch.Timer()
-      local fout = mod:forward(input,target)
-      local fgin = mod:backward(input,target):clone()
-      tm.cpu = a:time().real
-
-      local cinput = input:cuda()
-      local ctarget = target:cuda()
-      local cmod = nn.DistKLDivCriterion(sizeAverage == 1):cuda()
-      a:reset()
-      local cout = cmod:forward(cinput,ctarget)
-      local cgin = cmod:backward(cinput,ctarget)
-      cutorch.synchronize()
-      tm.gpu = a:time().real
-
-      mytester:assertlt(math.abs(fout-cout), precision_forward, 'error  on output')
-      local gerr = cgin:float() - fgin
-      mytester:assertlt(gerr:abs():max(), precision_backward, 'error  on gradInput')
+      for k, typename in ipairs(typenames) do -- compare DistKLDivCriterion loss and gradInput once per entry of typenames
+         local input = torch.randn(size):type(typename) -- TODO, make it back to (size, 1, 1), see https://github.com/torch/cunn/issues/245#issuecomment-209260954
+         local target = torch.randn(size):type(typename)
+
+         local ctype = t2cpu[typename] -- reference tensor type paired with typename (presumably its CPU counterpart)
+         input = input:type(ctype)
+         target = target:type(ctype)
+         local mod = nn.DistKLDivCriterion(sizeAverage == 1):type(ctype)
+
+         local fout = mod:forward(input,target) -- reference loss value
+         local fgin = mod:backward(input,target):clone() -- reference gradInput, cloned into its own tensor
+
+         local cinput = input:type(typename)
+         local ctarget = target:type(typename)
+         local cmod = nn.DistKLDivCriterion(sizeAverage == 1):type(typename)
+         local cout = cmod:forward(cinput,ctarget) -- loss under the type being tested
+         local cgin = cmod:backward(cinput,ctarget) -- gradInput under the type being tested
+
+         mytester:assertlt(math.abs(fout-cout), precision_forward_type(precision_forward, typename), -- tolerance depends on typename
+            string.format('error  on output with %s', typename))
+         local gerr = cgin:double() - fgin:double() -- both sides converted to double before subtracting
+         mytester:assertlt(gerr:abs():max(), precision_backward_type(precision_backward, typename),
+            string.format('error  on gradInput with %s', typename))
+      end
    end
 end
 
@@ -3979,34 +3490,24 @@ function cunntest.TemporalConvolution_forward()
    local outi = math.random(1,256) -- nOutputFrame
    local ini = (outi-1)*si+ki -- nInputFrame
 
-   local tm = {}
-   local title = string.format('TemporalConvolution.forward %dx%d o %d -> %dx%d [s: %d]',
-                               from, ini, ki, to, outi, si)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(ini,from):type(typename)
 
-   local input = torch.randn(ini,from)
-   local sconv = nn.TemporalConvolution(from,to,ki,si)
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sconv = nn.TemporalConvolution(from,to,ki,si):type(ctype)
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = nn.TemporalConvolution(from,to,ki,si):cuda()
-   gconv.weight = sconv.weight:cuda()
-   gconv.bias = sconv.bias:cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gconv = nn.TemporalConvolution(from,to,ki,si):type(typename)
+      gconv.weight = sconv.weight:type(typename)
+      gconv.bias = sconv.bias:type(typename)
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+        string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.TemporalConvolution_forward_batch()
@@ -4018,34 +3519,24 @@ function cunntest.TemporalConvolution_forward_batch()
    local outi = math.random(1,256)
    local ini = (outi-1)*si+ki
 
-   local tm = {}
-   local title = string.format('TemporalConvolution.forward %dx%dx%d o %d -> %dx%dx%d [s: %d]',
-                               bs, from, ini, ki, bs, to, outi, si)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(bs,ini,from):type(typename)
 
-   local input = torch.randn(bs,ini,from)
-   local sconv = nn.TemporalConvolution(from,to,ki,si)
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sconv = nn.TemporalConvolution(from,to,ki,si):type(ctype)
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = nn.TemporalConvolution(from,to,ki,si):cuda()
-   gconv.weight = sconv.weight:cuda()
-   gconv.bias = sconv.bias:cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gconv = nn.TemporalConvolution(from,to,ki,si):type(typename)
+      gconv.weight = sconv.weight:type(typename)
+      gconv.bias = sconv.bias:type(typename)
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+        string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.TemporalConvolution_backward()
@@ -4056,52 +3547,44 @@ function cunntest.TemporalConvolution_backward()
    local outi = math.random(1,256)
    local ini = (outi-1)*si+ki
 
-   local tm = {}
-   local title = string.format('TemporalConvolution.backward %dx%d o %d -> %dx%d',
-                               from, ini, ki, to, outi)
-
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(ini,from):type(typename)
+      local gradOutput = torch.randn(outi,to):type(typename)
 
-   local input = torch.randn(ini,from)
-   local gradOutput = torch.randn(outi,to)
-   local sconv = nn.TemporalConvolution(from,to,ki,si)
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.TemporalConvolution(from,to,ki,si):type(ctype)
+      sconv:forward(input)
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   local groundweight = sconv.gradWeight
-   local groundbias = sconv.gradBias
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput)
+      local groundweight = sconv.gradWeight
+      local groundbias = sconv.gradBias
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.TemporalConvolution(from,to,ki,si):cuda()
-   gconv.weight = sconv.weight:cuda()
-   gconv.bias = sconv.bias:cuda()
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.TemporalConvolution(from,to,ki,si):type(typename)
+      gconv.weight = sconv.weight:type(typename)
+      gconv.bias = sconv.bias:type(typename)
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   local weightcuda = gconv.gradWeight
-   local biascuda = gconv.gradBias
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput)
+      local weightcuda = gconv.gradWeight
+      local biascuda = gconv.gradBias
 
-   local error = rescuda:float() - groundgrad
-   local werror = weightcuda:float() - groundweight
-   local berror = biascuda:float() - groundbias
+      local error = rescuda:double() - groundgrad:double()
+      local werror = weightcuda:double() - groundweight:double()
+      local berror = biascuda:double() - groundbias:double()
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
-   mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
-   mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+        string.format('error on state (backward) with %s', typename))
+      mytester:assertlt(werror:abs():max(),
+        precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+        string.format('error on weight (backward) with %s', typename))
+      mytester:assertlt(berror:abs():max(),
+        precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()),
+        string.format('error on bias (backward) with %s', typename))
+   end
 end
 
 function cunntest.TemporalConvolution_backward_batch()
@@ -4113,51 +3596,44 @@ function cunntest.TemporalConvolution_backward_batch()
    local outi = math.random(1,256)
    local ini = (outi-1)*si+ki
 
-   local tm = {}
-   local title = string.format('TemporalConvolution.backward %dx%dx%d o %d -> %dx%dx%d',
-                               bs, from, ini, ki, bs, to, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(bs,ini,from):type(typename)
+      local gradOutput = torch.randn(bs,outi,to):type(typename)
 
-   local input = torch.randn(bs,ini,from)
-   local gradOutput = torch.randn(bs,outi,to)
-   local sconv = nn.TemporalConvolution(from,to,ki,si)
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.TemporalConvolution(from,to,ki,si):type(ctype)
+      sconv:forward(input)
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   local groundweight = sconv.gradWeight
-   local groundbias = sconv.gradBias
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput)
+      local groundweight = sconv.gradWeight
+      local groundbias = sconv.gradBias
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.TemporalConvolution(from,to,ki,si):cuda()
-   gconv.weight = sconv.weight:cuda()
-   gconv.bias = sconv.bias:cuda()
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.TemporalConvolution(from,to,ki,si):type(typename)
+      gconv.weight = sconv.weight:type(typename)
+      gconv.bias = sconv.bias:type(typename)
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   local weightcuda = gconv.gradWeight
-   local biascuda = gconv.gradBias
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput)
+      local weightcuda = gconv.gradWeight
+      local biascuda = gconv.gradBias
 
-   local error = rescuda:float() - groundgrad
-   local werror = weightcuda:float() - groundweight
-   local berror = biascuda:float() - groundbias
+      local error = rescuda:double() - groundgrad:double()
+      local werror = weightcuda:double() - groundweight:double()
+      local berror = biascuda:double() - groundbias:double()
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
-   mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
-   mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+        string.format('error on state (backward) with %s', typename))
+      mytester:assertlt(werror:abs():max(),
+        precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+        string.format('error on weight (backward) with %s', typename))
+      mytester:assertlt(berror:abs():max(),
+        precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()),
+        string.format('error on bias (backward) with %s', typename))
+   end
 end
 
 function cunntest.Dropout()
@@ -4211,66 +3687,46 @@ end
 function cunntest.SoftPlus_forward()
    local size = math.random(1,100)
 
-   local tm = {}
-   local title = string.format('SoftPlus forward %d -> %d', size, size)
-   times[title] = tm
-
-   local input = torch.randn(size)
-   local sconv = nn.SoftPlus()
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+   for k, typename in ipairs(typenames) do -- run the SoftPlus forward comparison once per entry of typenames
+      local input = torch.randn(size):type(typename)
+      local ctype = t2cpu[typename] -- reference tensor type paired with typename (presumably its CPU counterpart)
+      input = input:type(ctype)
+      local sconv = nn.SoftPlus():type(ctype)
+      local groundtruth = sconv:forward(input) -- reference output
 
-   input = input:cuda()
-   local gconv = nn.SoftPlus():cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gconv = nn.SoftPlus():type(typename)
+      local rescuda = gconv:forward(input) -- output under the type being tested
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double() -- both sides converted to double before subtracting
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward,typename), -- tolerance depends on typename
+          string.format('error on state (forward) with %s', typename))
+    end
 end
 
 function cunntest.SoftPlus_backward()
    local size = math.random(1,100)
 
-   local tm = {}
-   local title = string.format('SoftPlus.backward %d -> %d', size, size)
-   times[title] = tm
-
-   local input = torch.randn(size)
-   local gradOutput = torch.randn(size)
-   local sconv = nn.SoftPlus()
-   sconv:forward(input)
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
-
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = sconv:clone():cuda()
-   gconv:forward(input)
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+   for k, typename in ipairs(typenames) do -- run the SoftPlus backward comparison once per entry of typenames
+      local input = torch.randn(size):type(typename)
+      local gradOutput = torch.randn(size):type(typename)
+      local ctype = t2cpu[typename] -- reference tensor type paired with typename (presumably its CPU counterpart)
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.SoftPlus():type(ctype)
+      sconv:forward(input) -- forward is run before backward on the same input
+      local groundgrad = sconv:backward(input, gradOutput) -- reference gradInput
 
-   local error = rescuda:float() - groundgrad
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gconv = sconv:clone():type(typename) -- module under test is a clone of the reference module, converted to typename
+      gconv:forward(input)
+      local rescuda = gconv:backward(input, gradOutput) -- gradInput under the type being tested
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      local error = rescuda:double() - groundgrad:double() -- both sides converted to double before subtracting
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), -- tolerance depends on typename
+          string.format('error on state (backward) with %s', typename))
+    end
 end
 
 function cunntest.SpatialUpSamplingNearest_forward()
@@ -4279,32 +3735,22 @@ function cunntest.SpatialUpSamplingNearest_forward()
    local w = torch.random(3, 15)
    local scale = torch.random(2,5)
 
-   local tm = {}
-   local title = string.format('SpatialUpSamplingNearest.forward %dx%dx%d -> %dx%dx%d',
-                               f, h, w, f, h*scale, w*scale)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(f, h, w):type(typename)
 
-   local input = torch.randn(f, h, w)
-   local sconv = nn.SpatialUpSamplingNearest(scale)
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sconv = nn.SpatialUpSamplingNearest(scale):type(ctype)
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = sconv:clone():cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gconv = sconv:clone():type(typename)
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+          string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialUpSamplingNearest_forward_batch()
@@ -4314,33 +3760,22 @@ function cunntest.SpatialUpSamplingNearest_forward_batch()
    local w = torch.random(3, 15)
    local scale = torch.random(2,5)
 
-   local tm = {}
-   local title = string.format('SpatialUpSamplingNearest.forward %dx%dx%dx%d -> %dx%dx%dx%d',
-                               nbatch, f, h, w, nbatch, f, h*scale, w*scale)
-   times[title] = tm
-
-   local input = torch.randn(nbatch, f, h, w)
-   local sconv = nn.SpatialUpSamplingNearest(scale)
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(nbatch, f, h, w):type(typename)
 
-   input = input:cuda()
-   local gconv = sconv:clone():cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sconv = nn.SpatialUpSamplingNearest(scale):type(ctype)
+      local groundtruth = sconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+      input = input:type(typename)
+      local gconv = sconv:clone():type(typename)
+      local rescuda = gconv:forward(input)
 
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+          string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialUpSamplingNearest_backward()
@@ -4349,41 +3784,30 @@ function cunntest.SpatialUpSamplingNearest_backward()
    local w = torch.random(3, 15)
    local scale = torch.random(2,5)
 
-   local tm = {}
-   local title = string.format('SpatialUpSamplingNearest.backward %dx%dx%d -> %dx%dx%d',
-                               f, h, w, f, h*scale, w*scale)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(f, h, w):type(typename)
+      local gradOutput = torch.randn(f, h*scale, w*scale):type(typename)
 
-   local input = torch.randn(f, h, w)
-   local gradOutput = torch.randn(f, h*scale, w*scale)
-   local sconv = nn.SpatialUpSamplingNearest(scale)
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.SpatialUpSamplingNearest(scale):type(ctype)
+      sconv:forward(input)
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput)
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = sconv:clone():cuda()
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gconv = sconv:clone():type(typename)
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double()
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on state (backward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialUpSamplingNearest_backward_batch()
@@ -4393,41 +3817,30 @@ function cunntest.SpatialUpSamplingNearest_backward_batch()
    local w = torch.random(3, 15)
    local scale = torch.random(2,5)
 
-   local tm = {}
-   local title = string.format('SpatialUpSamplingNearest.backward %dx%dx%dx%d -> %dx%dx%dx%d',
-                               nbatch, f, h, w, nbatch, f, h*scale, w*scale)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(nbatch, f, h, w):type(typename)
+      local gradOutput = torch.randn(nbatch, f, h*scale, w*scale):type(typename)
 
-   local input = torch.randn(nbatch, f, h, w)
-   local gradOutput = torch.randn(nbatch, f, h*scale, w*scale)
-   local sconv = nn.SpatialUpSamplingNearest(scale)
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.SpatialUpSamplingNearest(scale):type(ctype)
+      sconv:forward(input)
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput)
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = sconv:clone():cuda()
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gconv = sconv:clone():type(typename)
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double()
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on state (backward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialUpSamplingBilinear_forward()
@@ -4436,34 +3849,22 @@ function cunntest.SpatialUpSamplingBilinear_forward()
    local w = torch.random(3, 15)
    local scale = torch.random(2,5)
 
-   local tm = {}
-   local title =
-   string.format('SpatialUpSamplingBilinear.forward %dx%dx%d -> %dx%dx%d',
-                               f, h, w, f, h*scale, w*scale)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(f, h, w):type(typename)
 
-   local input = torch.randn(f, h, w)
-   local sconv = nn.SpatialUpSamplingBilinear(scale)
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sconv = nn.SpatialUpSamplingBilinear(scale):type(ctype)
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = sconv:clone():cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gconv = sconv:clone():type(typename)
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward,
-                      'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+                        string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialUpSamplingBilinear_forward_batch()
@@ -4473,35 +3874,22 @@ function cunntest.SpatialUpSamplingBilinear_forward_batch()
    local w = torch.random(3, 15)
    local scale = torch.random(2,5)
 
-   local tm = {}
-   local title =
-   string.format('SpatialUpSamplingBilinear.forward %dx%dx%dx%d -> %dx%dx%dx%d',
-                               nbatch, f, h, w, nbatch, f, h*scale, w*scale)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(nbatch, f, h, w):type(typename)
 
-   local input = torch.randn(nbatch, f, h, w)
-   local sconv = nn.SpatialUpSamplingBilinear(scale)
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
-
-   input = input:cuda()
-   local gconv = sconv:clone():cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sconv = nn.SpatialUpSamplingBilinear(scale):type(ctype)
+      local groundtruth = sconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward,
-                      'error on state (forward) ')
+      input = input:type(typename)
+      local gconv = sconv:clone():type(typename)
+      local rescuda = gconv:forward(input)
 
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+                        string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialUpSamplingBilinear_backward()
@@ -4510,43 +3898,30 @@ function cunntest.SpatialUpSamplingBilinear_backward()
    local w = torch.random(3, 15)
    local scale = torch.random(2,5)
 
-   local tm = {}
-   local title =
-   string.format('SpatialUpSamplingBilinear.backward %dx%dx%d -> %dx%dx%d',
-                               f, h, w, f, h*scale, w*scale)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(f, h, w):type(typename)
+      local gradOutput = torch.randn(f, h*scale, w*scale):type(typename)
 
-   local input = torch.randn(f, h, w)
-   local gradOutput = torch.randn(f, h*scale, w*scale)
-   local sconv = nn.SpatialUpSamplingBilinear(scale)
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.SpatialUpSamplingBilinear(scale):type(ctype)
+      sconv:forward(input)
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput)
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = sconv:clone():cuda()
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gconv = sconv:clone():type(typename)
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double()
 
-   mytester:assertlt(error:abs():max(), precision_backward,
-                      'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+                        string.format('error on state (backward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialUpSamplingBilinear_backward_batch()
@@ -4556,146 +3931,152 @@ function cunntest.SpatialUpSamplingBilinear_backward_batch()
    local w = torch.random(3, 15)
    local scale = torch.random(2,5)
 
-   local input = torch.randn(nbatch, f, h, w)
-   local gradOutput = torch.randn(nbatch, f, h*scale, w*scale)
-   local sconv = nn.SpatialUpSamplingBilinear(scale)
-   local output = sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(nbatch, f, h, w):type(typename)
+      local gradOutput = torch.randn(nbatch, f, h*scale, w*scale):type(typename)
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = sconv:clone():cuda()
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.SpatialUpSamplingBilinear(scale):type(ctype)
+      local output = sconv:forward(input)
+      sconv:zeroGradParameters()
+      local groundgrad = sconv:backward(input, gradOutput)
+
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gconv = sconv:clone():type(typename)
+      gconv:forward(input)
+      gconv:zeroGradParameters()
+      local rescuda = gconv:backward(input, gradOutput)
 
-   local err = rescuda:float() - groundgrad
+      local err = rescuda:double() - groundgrad:double()
 
-   mytester:assertlt(err:abs():max(), precision_backward,
-                      'error on state (backward) ')
+      mytester:assertlt(err:abs():max(), precision_backward_type(precision_backward, typename),
+                        string.format('error on state (backward) with %s', typename))
+   end
 end
 
 function cunntest.l1cost()
    local size = math.random(300,500)
-   local input = torch.randn(size)
-   local mod = nn.L1Cost()
 
-   local tm = {}
-   local title = string.format('L1Cost %d ',size)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+     local input = torch.randn(size):type(typename)
 
-   local a = torch.Timer()
-   local fout = mod:forward(input)
-   local fgin = mod:backward(input):clone()
-   tm.cpu = a:time().real
+     local ctype = t2cpu[typename]
+     input = input:type(ctype)
+     local mod = nn.L1Cost():type(ctype)
 
-   local cinput = input:cuda()
-   local cmod = nn.L1Cost():cuda()
-   a:reset()
-   local cout = cmod:forward(cinput)
-   local cgin = cmod:backward(cinput)
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+     local fout = mod:forward(input)
+     local fgin = mod:backward(input):clone()
+
+     local cinput = input:type(typename)
+     local cmod = nn.L1Cost():type(typename)
+     local cout = cmod:forward(cinput)
+     local cgin = cmod:backward(cinput)
 
-   mytester:assertlt(math.abs(fout-cout), precision_forward, 'error  on output')
-   local gerr = cgin:float() - fgin
-   mytester:assertlt(gerr:abs():max(), precision_forward, 'error  on gradInput')
+     if (typename == 'torch.CudaHalfTensor') then
+        fout = ffi.C.THC_half2float(ffi.C.THC_float2half(fout))
+     end
+     mytester:assertlt(math.abs(fout-cout), precision_forward_type(precision_forward, typename),
+        string.format('error  on output with %s', typename))
+     local gerr = cgin:double() - fgin:double()
+     mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename),
+        string.format('error  on gradInput with %s', typename))
+   end
 end
 
 
 function cunntest.ClassNLLCriterionSingleTarget()
    local size = math.random(3000,5000)
-   local input = torch.randn(size)
-   local target = 1
-   local mod = nn.ClassNLLCriterion()
 
-   local tm = {}
-   local title = string.format('ClassNLLCriterionSingleTarget %d ',size)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(size):type(typename)
+      local target = 1
 
-   local a = torch.Timer()
-   local fout = mod:forward(input, target)
-   local fgin = mod:backward(input, target):clone()
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local mod = nn.ClassNLLCriterion():type(ctype)
 
-   local cinput = input:cuda()
-   local ctarget = torch.CudaTensor(1):fill(target)
-   local cmod = nn.ClassNLLCriterion():cuda()
-   a:reset()
-   local cout = cmod:forward(cinput,ctarget)
-   local cgin = cmod:backward(cinput,ctarget)
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local fout = mod:forward(input, target)
+      local fgin = mod:backward(input, target):clone()
+
+      local cinput = input:type(typename)
+      local ctarget = torch.CudaTensor(1):fill(target)
+      local cmod = nn.ClassNLLCriterion():type(typename)
+      local cout = cmod:forward(cinput,ctarget)
+      local cgin = cmod:backward(cinput,ctarget)
 
-   mytester:assertlt(
-       math.abs(fout-cout), precision_forward, 'error  on output')
-   local gerr = cgin:float() - fgin
-   mytester:assertlt(gerr:abs():max(), precision_forward, 'error  on gradInput')
+      mytester:assertlt(
+         math.abs(fout-cout), precision_forward_type(precision_forward, typename),
+            string.format('error  on output with %s', typename))
+      local gerr = cgin:double() - fgin:double()
+      mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename),
+         string.format('error  on gradInput with %s', typename))
+   end
 end
 
 function cunntest.ClassNLLCriterionSingleTargetWeights()
    local size = math.random(3000,5000)
-   local input = torch.randn(size)
-   local target = 1
-   local weights = torch.rand(size)
-   local mod = nn.ClassNLLCriterion(weights)
 
-   local tm = {}
-   local title = string.format('ClassNLLCriterionSingleTargetWeights %d ',size)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(size):type(typename)
+      local target = 1
+      local weights = torch.rand(size):type(typename)
 
-   local a = torch.Timer()
-   local fout = mod:forward(input, target)
-   local fgin = mod:backward(input, target):clone()
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      weights = weights:type(ctype)
+      local mod = nn.ClassNLLCriterion(weights):type(ctype)
 
-   local cinput = input:cuda()
-   local cweights = weights:cuda()
-   local ctarget = torch.CudaTensor(1):fill(target)
-   local cmod = nn.ClassNLLCriterion(cweights):cuda()
-   a:reset()
-   local cout = cmod:forward(cinput,ctarget)
-   local cgin = cmod:backward(cinput,ctarget)
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local fout = mod:forward(input, target)
+      local fgin = mod:backward(input, target):clone()
 
-   mytester:assertlt(
-       math.abs(fout-cout), precision_forward, 'error  on output')
-   local gerr = cgin:float() - fgin
-   mytester:assertlt(gerr:abs():max(), precision_forward, 'error  on gradInput')
+      local cinput = input:type(typename)
+      local cweights = weights:type(typename)
+      local ctarget = torch.CudaTensor(1):fill(target)
+      local cmod = nn.ClassNLLCriterion(cweights):type(typename)
+      local cout = cmod:forward(cinput,ctarget)
+      local cgin = cmod:backward(cinput,ctarget)
+
+      mytester:assertlt(
+         math.abs(fout-cout), precision_forward_type(precision_forward, typename),
+            string.format('error  on output with %s', typename))
+      local gerr = cgin:double() - fgin:double()
+      mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename),
+         string.format('error  on gradInput with %s', typename))
+   end
 end
 
 function cunntest.ClassNLLCriterionMultipleTarget()
    local size = math.random(3000,5000)
-   local input = torch.randn(size, size)
-   local target = torch.randperm(size)
-   local mod = nn.ClassNLLCriterion()
 
-   local tm = {}
-   local title = string.format('ClassNLLCriterionMultiTarget %d ',size)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(size, size):type(typename)
+      local target = torch.randperm(size)
 
-   local a = torch.Timer()
-   local fout = mod:forward(input, target)
-   local fgin = mod:backward(input, target):clone()
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local mod = nn.ClassNLLCriterion():type(ctype)
 
-   local cinput = input:cuda()
-   local ctarget = target:cuda()
+      local fout = mod:forward(input, target)
+      local fgin = mod:backward(input, target):clone()
 
-   local cmod = nn.ClassNLLCriterion():cuda()
-   a:reset()
-   local cout = cmod:forward(cinput,ctarget)
-   local cgin = cmod:backward(cinput,ctarget)
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local cinput = input:type(typename)
+      local ctarget = target:cuda()
+
+      local cmod = nn.ClassNLLCriterion():type(typename)
+      local cout = cmod:forward(cinput,ctarget)
+      local cgin = cmod:backward(cinput,ctarget)
 
-   mytester:assertlt(
-       math.abs(fout-cout), precision_forward, 'error on output')
+      mytester:assertlt(
+        math.abs(fout-cout), precision_forward_type(precision_forward, typename),
+          string.format('error on output with %s', typename))
 
-   local gerr = cgin:float() - fgin
-   mytester:assertlt(gerr:abs():max(), precision_forward, 'error  on gradInput')
+      local gerr = cgin:double() - fgin:double()
+      mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename),
+        string.format('error  on gradInput with %s', typename))
+   end
 end
 
 function cunntest.SpatialClassNLLCriterion()
@@ -4703,103 +4084,98 @@ function cunntest.SpatialClassNLLCriterion()
    local h = math.random(300, 500)
    local w = math.random(300, 800)
    local classes = math.random(10,30)
-   local input = torch.randn(batchSize, classes, h, w)
-   local target = torch.Tensor(batchSize, h, w)
-   target:apply(function() return math.random(1, classes) end)
-   local mod = nn.SpatialClassNLLCriterion()
 
-   local tm = {}
-   local title = string.format('SpatialClassNLLCriterion %dx%dx%dx%d ',
-         batchSize, classes, h, w)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(batchSize, classes, h, w):type(typename)
+      local target = torch.Tensor(batchSize, h, w)
+      target:apply(function() return math.random(1, classes) end)
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local mod = nn.SpatialClassNLLCriterion():type(ctype)
+      local fout = mod:forward(input, target)
+      local fgin = mod:backward(input, target):clone()
 
-   local a = torch.Timer()
-   local fout = mod:forward(input, target)
-   local fgin = mod:backward(input, target):clone()
-   tm.cpu = a:time().real
+      local cinput = input:type(typename)
+      local ctarget = target:type(typename)
 
-   local cinput = input:cuda()
-   local ctarget = target:cuda()
-
-   local cmod = nn.SpatialClassNLLCriterion():cuda()
-   a:reset()
-   local cout = cmod:forward(cinput,ctarget)
-   local cgin = cmod:backward(cinput,ctarget)
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local cmod = nn.SpatialClassNLLCriterion():type(typename)
+      local cout = cmod:forward(cinput,ctarget)
+      local cgin = cmod:backward(cinput,ctarget)
+      cutorch.synchronize()
 
-   mytester:assertlt(
-       math.abs(fout-cout), precision_forward, 'error on output')
+      mytester:assertlt(
+        math.abs(fout-cout), precision_forward_type(precision_forward, typename),
+          string.format('error on output with %s', typename))
 
-   local gerr = cgin:float() - fgin
-   mytester:assertlt(gerr:abs():max(), precision_forward, 'error  on gradInput')
+      local gerr = cgin:double() - fgin:double()
+      mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename),
+          string.format('error  on gradInput with %s', typename))
+    end
 end
 
 function cunntest.ClassNLLCriterionMultipleTargetWeights()
    local size = math.random(3000,5000)
-   local input = torch.randn(size, size)
-   local target = torch.randperm(size)
-   local weights = torch.rand(size)
-   local mod = nn.ClassNLLCriterion(weights)
 
-   local tm = {}
-   local title = string.format('ClassNLLCriterionMultiTargetWeights %d ',size)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(size, size):type(typename)
+      local target = torch.randperm(size)
+      local weights = torch.rand(size):type(typename)
 
-   local a = torch.Timer()
-   local fout = mod:forward(input, target)
-   local fgin = mod:backward(input, target):clone()
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      weights = weights:type(ctype)
+      local mod = nn.ClassNLLCriterion(weights):type(ctype)
 
-   local cinput = input:cuda()
-   local ctarget = target:cuda()
-   local cweights = weights:cuda()
+      local fout = mod:forward(input, target)
+      local fgin = mod:backward(input, target):clone()
 
-   local cmod = nn.ClassNLLCriterion(cweights):cuda()
-   a:reset()
-   local cout = cmod:forward(cinput,ctarget)
-   local cgin = cmod:backward(cinput,ctarget)
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local cinput = input:type(typename)
+      local ctarget = target:cuda()
+      local cweights = weights:type(typename)
 
-   mytester:assertlt(
-       math.abs(fout-cout), precision_forward, 'error on output')
+      local cmod = nn.ClassNLLCriterion(cweights):type(typename)
+      local cout = cmod:forward(cinput,ctarget)
+      local cgin = cmod:backward(cinput,ctarget)
+
+      mytester:assertlt(
+        math.abs(fout-cout), precision_forward_type(precision_forward, typename),
+          string.format('error on output with %s', typename))
 
-   local gerr = cgin:float() - fgin
-   mytester:assertlt(gerr:abs():max(), precision_forward, 'error  on gradInput')
+      local gerr = cgin:double() - fgin:double()
+      mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename),
+        string.format('error  on gradInput with %s', typename))
+   end
 end
 
 function cunntest.TemporalMaxPooling()
-   local input = torch.rand(16, 18, 3)
    local settings = {{2, 2}, {3, 3}, {4, 2}, {2, 4}, {3, 5}}
 
    for i, setting in ipairs(settings) do
-      local mod = nn.TemporalMaxPooling(setting[1], setting[2])
-
-      local tm = {}
-      local title = 'TemporalMaxPooling '..setting[1]..' '..setting[2]
-      times[title] = tm
-
-      local a = torch.Timer()
-      local fout = mod:forward(input)
-      local fgout = torch.rand(fout:size())
-      local fgin = mod:backward(input, fgout):clone()
-      tm.cpu = a:time().real
-
-      local cinput = input:cuda()
-      local cgout = fgout:cuda()
-      local cmod = nn.TemporalMaxPooling(setting[1], setting[2]):cuda()
-      a:reset()
-      local cout = cmod:forward(cinput)
-      local cgin = cmod:backward(cinput, cgout)
-      cutorch.synchronize()
-      tm.gpu = a:time().real
-
-      local outerror = cout:float() - fout
-      mytester:assertlt(outerror:abs():max(), precision_forward, 'error on output')
-
-      local ginerror = cgin:float() - fgin
-      mytester:assertlt(ginerror:abs():max(), precision_backward, 'error on gradInput')
+      for k, typename in ipairs(typenames) do
+        local input = torch.rand(16, 18, 3):type(typename)
+
+        local ctype = t2cpu[typename]
+        input = input:type(ctype)
+        local mod = nn.TemporalMaxPooling(setting[1], setting[2]):type(ctype)
+
+        local fout = mod:forward(input)
+        local fgout = torch.rand(fout:size()):type(typename):type(ctype)
+        local fgin = mod:backward(input, fgout):clone()
+
+        local cinput = input:type(typename)
+        local cgout = fgout:type(typename)
+        local cmod = nn.TemporalMaxPooling(setting[1], setting[2]):type(typename)
+        local cout = cmod:forward(cinput)
+        local cgin = cmod:backward(cinput, cgout)
+
+        local outerror = cout:double() - fout:double()
+        mytester:assertlt(outerror:abs():max(), precision_forward_type(precision_forward, typename),
+          string.format('error on output with %s', typename))
+
+        local ginerror = cgin:double() - fgin:double()
+        mytester:assertlt(ginerror:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on gradInput with %s', typename))
+      end
    end
 end
 
@@ -4819,35 +4195,26 @@ function cunntest.VolumetricConvolution_forward_single()
    local inj = (outj-1)*sj+kj
    local ink = (outk-1)*sk+kk
 
-   local tm = {}
-   local title = string.format('VolumetricConvolution.forward %dx%dx%dx%d o %dx%dx%d -> %dx%dx%dx%d',
-                           from, ink, inj, ini, kk, kj, ki, to, outk, outj, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(from,ini,inj,ink):type(typename)
 
-   local input = torch.randn(from,ini,inj,ink)
-   local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj)
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):type(ctype)
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):cuda()
-   gconv.weight = sconv.weight:cuda()
-   gconv.bias = sconv.bias:cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):type(typename)
+      gconv.weight = sconv.weight:type(typename)
+      gconv.bias = sconv.bias:type(typename)
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
-   mytester:assert(groundtruth:isSize(rescuda:size()), 'size mismatch on state (forward)')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+        string.format('error on state (forward) with %s', typename))
+      mytester:assert(groundtruth:isSize(rescuda:size()),
+        string.format('size mismatch on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.VolumetricConvolution_forward_batch()
@@ -4867,35 +4234,26 @@ function cunntest.VolumetricConvolution_forward_batch()
    local inj = (outj-1)*sj+kj
    local ink = (outk-1)*sk+kk
 
-   local tm = {}
-   local title = string.format('VolumetricConvolution.forward %dx%dx%dx%dx%d o %dx%dx%d -> %dx%dx%dx%dx%d',
-                           bs, from, ink, inj, ini, kk, kj, ki, bs, to, outk, outj, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(bs,from,ini,inj, ink):type(typename)
 
-   local input = torch.randn(bs,from,ini,inj, ink)
-   local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sj,sk)
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input, sconv)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sj,sk):type(ctype)
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sj,sk):cuda()
-   gconv.weight = sconv.weight:cuda()
-   gconv.bias = sconv.bias:cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sj,sk):type(typename)
+      gconv.weight = sconv.weight:type(typename)
+      gconv.bias = sconv.bias:type(typename)
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
-   mytester:assert(groundtruth:isSize(rescuda:size()), 'size mismatch on state (forward)')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+        string.format('error on state (forward) with %s', typename))
+      mytester:assert(groundtruth:isSize(rescuda:size()),
+        string.format('size mismatch on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.VolumetricConvolution_backward_single()
@@ -4914,50 +4272,44 @@ function cunntest.VolumetricConvolution_backward_single()
    local inj = (outj-1)*sj+kj
    local ink = (outk-1)*sk+kk
 
-   local tm = {}
-   local title = string.format('VolumetricConvolution.backward %dx%dx%dx%d o %dx%dx%d -> %dx%dx%dx%d',
-                               from, ink, inj, ini, kk, kj, ki, to, outk, outj, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(from, ini, inj, ink):type(typename)
+      local gradOutput = torch.randn(to, outi, outj, outk):type(typename)
 
-   local input = torch.randn(from, ini, inj, ink)
-   local gradOutput = torch.randn(to, outi, outj, outk)
-   local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj)
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):type(ctype)
+      sconv:forward(input)
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   local groundweight = sconv.gradWeight
-   local groundbias = sconv.gradBias
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput)
+      local groundweight = sconv.gradWeight
+      local groundbias = sconv.gradBias
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):cuda()
-   gconv.weight = sconv.weight:cuda()
-   gconv.bias = sconv.bias:cuda()
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):type(typename)
+      gconv.weight = sconv.weight:type(typename)
+      gconv.bias = sconv.bias:type(typename)
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
+      local rescuda = gconv:backward(input, gradOutput)
+      local weightcuda = gconv.gradWeight
+      local biascuda = gconv.gradBias
+      local error = rescuda:double() - groundgrad:double()
+      local werror = weightcuda:double() - groundweight:double()
+      local berror = biascuda:double() - groundbias:double()
+      mytester:assert(groundgrad:isSize(rescuda:size()),
+        string.format('size mismatch on state (forward) with %s', typename))
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+        string.format('error on state (backward) with %s', typename))
+      mytester:assertlt(werror:abs():max(),
+        precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+        string.format('error on weight (backward) with %s', typename))
+      mytester:assertlt(berror:abs():max(),
+        precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()),
+        string.format('error on bias (backward) with %s', typename))
    end
-   local weightcuda = gconv.gradWeight
-   local biascuda = gconv.gradBias
-   cutorch.synchronize()
-   tm.gpu = a:time().real
-   local error = rescuda:float() - groundgrad
-   local werror = weightcuda:float() - groundweight
-   local berror = biascuda:float() - groundbias
-   mytester:assert(groundgrad:isSize(rescuda:size()), 'size mismatch on state (forward)')
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
-   mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
-   mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
 end
 
 function cunntest.VolumetricConvolution_backward_batch()
@@ -4977,50 +4329,44 @@ function cunntest.VolumetricConvolution_backward_batch()
    local inj = (outj-1)*sj+kj
    local ink = (outk-1)*sk+kk
 
-   local tm = {}
-   local title = string.format('VolumetricConvolution.backward %dx%dx%dx%dx%d o %dx%dx%d -> %dx%dx%dx%dx%d',
-                           bs, from, ink, inj, ini, kk, kj, ki, bs, to, outk, outj, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(bs, from, ini, inj, ink):type(typename)
+      local gradOutput = torch.randn(bs, to, outi, outj, outk):type(typename)
 
-   local input = torch.randn(bs, from, ini, inj, ink)
-   local gradOutput = torch.randn(bs, to, outi, outj, outk)
-   local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj)
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):type(ctype)
+      sconv:forward(input)
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   local groundweight = sconv.gradWeight
-   local groundbias = sconv.gradBias
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput)
+      local groundweight = sconv.gradWeight
+      local groundbias = sconv.gradBias
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):cuda()
-   gconv.weight = sconv.weight:cuda()
-   gconv.bias = sconv.bias:cuda()
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):type(typename)
+      gconv.weight = sconv.weight:type(typename)
+      gconv.bias = sconv.bias:type(typename)
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
+      local rescuda = gconv:backward(input, gradOutput)
+      local weightcuda = gconv.gradWeight
+      local biascuda = gconv.gradBias
+      local error = rescuda:double() - groundgrad:double()
+      local werror = weightcuda:double() - groundweight:double()
+      local berror = biascuda:double() - groundbias:double()
+      mytester:assert(groundgrad:isSize(rescuda:size()),
+        string.format('size mismatch on state (forward) with %s', typename))
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+        string.format('error on state (backward) with %s', typename))
+      mytester:assertlt(werror:abs():max(),
+        precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+        string.format('error on weight (backward) with %s', typename))
+      mytester:assertlt(berror:abs():max(),
+        precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()),
+        string.format('error on bias (backward) with %s', typename))
    end
-   local weightcuda = gconv.gradWeight
-   local biascuda = gconv.gradBias
-   cutorch.synchronize()
-   tm.gpu = a:time().real
-   local error = rescuda:float() - groundgrad
-   local werror = weightcuda:float() - groundweight
-   local berror = biascuda:float() - groundbias
-   mytester:assert(groundgrad:isSize(rescuda:size()), 'size mismatch on state (forward)')
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
-   mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
-   mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
 end
 
 function cunntest.VolumetricMaxPooling_forward()
@@ -5041,32 +4387,22 @@ function cunntest.VolumetricMaxPooling_forward()
    local oH = math.floor((iH - kH + 2*padH) / dH + 1)
    local oW = math.floor((iW - kW + 2*padW) / dW + 1)
 
-   local tm = {}
-   local title = string.format('VolumetricMaxPooling.forward %dx%dx%dx%d o %dx%dx%d (%dx%dx%d)-> %dx%dx%dx%d',
-                           iF, iT, iH, iW, kT, kH, kW, dT, dH, dW, iF, oT, oH, oW)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1):type(typename)
 
-   local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1)
-   local layer = nn.VolumetricMaxPooling(kT, kW, kH, dT, dW, dH, padT, padW, padH):float()
-   local output = layer:forward(input)
-   local timer = torch.Timer()
-   for i = 1,nloop do
-      output = layer:forward(input)
-   end
-   tm.cpu = timer:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local layer = nn.VolumetricMaxPooling(kT, kW, kH, dT, dW, dH, padT, padW, padH):type(ctype)
+      local output = layer:forward(input)
 
-   local inputCUDA = input:cuda()
-   local layerCUDA = layer:clone():cuda()
-   local outputCUDA = layerCUDA:forward(inputCUDA)
-   timer:reset()
-   for i = 1,nloop do
-      outputCUDA = layerCUDA:forward(inputCUDA)
-   end
-   cutorch.synchronize()
-   tm.gpu = timer:time().real
+      local inputCUDA = input:type(typename)
+      local layerCUDA = layer:clone():type(typename)
+      local outputCUDA = layerCUDA:forward(inputCUDA)
 
-   local error = outputCUDA:float() - output
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+      local error = outputCUDA:double() - output:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+        string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.VolumetricMaxPooling_backward()
@@ -5087,38 +4423,27 @@ function cunntest.VolumetricMaxPooling_backward()
    local oH = math.floor((iH - kH + 2*padH) / dH + 1)
    local oW = math.floor((iW - kW + 2*padW) / dW + 1)
 
-   local tm = {}
-   local title = string.format('VolumetricMaxPooling.backward %dx%dx%dx%d o %dx%dx%d (%dx%dx%d) -> %dx%dx%dx%d',
-                               iF, iT, iH, iW, kT, kH, kW, dT, dH, dW, iF, oT, oH, oW)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1):type(typename)
 
-   local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1)
-   local layer = nn.VolumetricMaxPooling(kT, kW, kH, dT, dW, dH, padT, padW, padH):float()
-   local output = layer:forward(input)
-   local gradOutput = output:clone():uniform(-1, 1)
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local layer = nn.VolumetricMaxPooling(kT, kW, kH, dT, dW, dH, padT, padW, padH):type(ctype)
+      local output = layer:forward(input)
+      local gradOutput = output:clone():uniform(-1, 1)
 
-   local gradInput = layer:backward(input, gradOutput)
-   local timer = torch.Timer()
-   for i = 1,nloop do
-      gradInput = layer:backward(input, gradOutput)
-   end
-   tm.cpu = timer:time().real
+      local gradInput = layer:backward(input, gradOutput)
 
-   local inputCUDA = input:cuda()
-   local layerCUDA = layer:clone():cuda()
-   local outputCUDA = layerCUDA:forward(inputCUDA)
-   local gradOutputCUDA = gradOutput:cuda()
-   local gradInputCUDA = layerCUDA:backward(inputCUDA, gradOutputCUDA)
+      local inputCUDA = input:type(typename)
+      local layerCUDA = layer:clone():type(typename)
+      local outputCUDA = layerCUDA:forward(inputCUDA)
+      local gradOutputCUDA = gradOutput:type(typename)
+      local gradInputCUDA = layerCUDA:backward(inputCUDA, gradOutputCUDA)
 
-   timer:reset()
-   for i = 1,nloop do
-      gradInputCUDA = layerCUDA:backward(inputCUDA, gradOutputCUDA)
+      local error = gradInputCUDA:double() - gradInput:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+        string.format('error on state (backward) with %s', typename))
    end
-   cutorch.synchronize()
-   tm.gpu = timer:time().real
-
-   local error = gradInputCUDA:float() - gradInput
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (backward) ')
 end
 
 function cunntest.VolumetricDilatedMaxPooling_forward_batch()
@@ -5145,34 +4470,24 @@ function cunntest.VolumetricDilatedMaxPooling_forward_batch()
    local inj = (outj-1)*sj+(dilationj*(kj-1)+1)-2*padj
    local ceil_mode = math.random(0,1) == 1
 
-   local tm = {}
-   local title = string.format('VolumetricDilatedMaxPooling.forward %dx%dx%dx%dx%d o %dx%dx%d -> %d%dx%dx%dx%d',
-                               bs, from, int, inj, ini, kt, kj, ki, bs, to, outt, outj, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(bs,from,int,inj,ini):type(typename)
 
-   local input = torch.randn(bs,from,int,inj,ini)
-   local sconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj)
-   if ceil_mode then sconv:ceil() end
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj):type(ctype)
+      if ceil_mode then sconv:ceil() end
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj):cuda()
-   if ceil_mode then gconv:ceil() end
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj):type(typename)
+      if ceil_mode then gconv:ceil() end
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+        string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.VolumetricDilatedMaxPooling_backward_batch()
@@ -5199,43 +4514,32 @@ function cunntest.VolumetricDilatedMaxPooling_backward_batch()
    local inj = (outj-1)*sj+(dilationj*(kj-1)+1)-2*padj
    local ceil_mode = math.random(0,1) == 1
 
-   local tm = {}
-   local title = string.format('VolumetricDilatedMaxPooling.forward %dx%dx%dx%dx%d o %dx%dx%d -> %d%dx%dx%dx%d',
-                               bs, from, int, inj, ini, kt, kj, ki, bs, to, outt, outj, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(bs,from,int,inj,ini):type(typename)
+      local gradOutput = torch.randn(bs,to,outt,outj,outi):type(typename)
 
-   local input = torch.randn(bs,from,int,inj,ini)
-   local gradOutput = torch.randn(bs,to,outt,outj,outi)
-   local sconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj)
-   if ceil_mode then sconv:ceil() end
-   sconv:forward(input)
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local sconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj):type(ctype)
+      if ceil_mode then sconv:ceil() end
+      sconv:forward(input)
       sconv:zeroGradParameters()
-      groundgrad = sconv:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+      local groundgrad = sconv:backward(input, gradOutput)
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj):cuda()
-   if ceil_mode then gconv:ceil() end
-   gconv:forward(input)
-   gconv:zeroGradParameters()
-   local rescuda = gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj):type(typename)
+      if ceil_mode then gconv:ceil() end
+      gconv:forward(input)
       gconv:zeroGradParameters()
-      rescuda = gconv:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gconv:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double()
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+        string.format('error on state (backward) with %s', typename))
+   end
 end
 
 function cunntest.VolumetricMaxUnpooling_forward_batch()
@@ -5252,42 +4556,31 @@ function cunntest.VolumetricMaxUnpooling_forward_batch()
    local padt = math.random(0,kt/2-1)
    local padi = math.random(0,ki/2-1)
    local padj = math.random(0,kj/2-1)
-   local it = ((outt + padt*2 - kt)/st) +1
-   local ii = ((outi + padi*2 - ki)/si) +1
-   local ij = ((outj + padj*2 - kj)/sj) +1
+   local it = math.max(((outt + padt*2 - kt)/st) +1, kt)
+   local ii = math.max(((outi + padi*2 - ki)/si) +1, ki)
+   local ij = math.max(((outj + padj*2 - kj)/sj) +1, kj)
 
-   local tm = {}
-   local title = string.format('VolumetricMaxUnpooling.forward %dx%dx%dx%dx%d o %dx%dx%d -> %d%dx%dx%dx%d',
-                               bs, from, it, ij, ii, kt, kj, ki, bs, to, outt, outj, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local ctype = t2cpu[typename]
 
-   local pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj)
-   local sunpool = nn.VolumetricMaxUnpooling(pooler)
+      local pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj):type(ctype)
+      local sunpool = nn.VolumetricMaxUnpooling(pooler):type(ctype)
 
-   local original = torch.randn(bs,from,it,ij,ii)
-   local input = pooler:forward(original)
-   local groundtruth = sunpool:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = sunpool:forward(input)
-   end
-   tm.cpu = a:time().real
+      local original = torch.randn(bs,from,it,ij,ii):type(typename):type(ctype)
+      local input = pooler:forward(original)
+      local groundtruth = sunpool:forward(input)
 
-   original = original:cuda()
-   pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj):cuda()
-   local gunpool = nn.VolumetricMaxUnpooling(pooler):cuda()
+      original = original:type(typename)
+      pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj):type(typename)
+      local gunpool = nn.VolumetricMaxUnpooling(pooler):type(typename)
 
-   input = pooler:forward(original)
-   local rescuda = gunpool:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gunpool:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = pooler:forward(original)
+      local rescuda = gunpool:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+        string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.VolumetricMaxUnpooling_backward_batch()
@@ -5304,52 +4597,39 @@ function cunntest.VolumetricMaxUnpooling_backward_batch()
    local padt = math.random(0,kt/2-1)
    local padi = math.random(0,ki/2-1)
    local padj = math.random(0,kj/2-1)
-   local it = ((outt + padt*2 - kt)/st) +1
-   local ii = ((outi + padi*2 - ki)/si) +1
-   local ij = ((outj + padj*2 - kj)/sj) +1
+   local it = math.max(((outt + padt*2 - kt)/st) +1, kt)
+   local ii = math.max(((outi + padi*2 - ki)/si) +1, ki)
+   local ij = math.max(((outj + padj*2 - kj)/sj) +1, kj)
 
-   local tm = {}
-   local title = string.format('VolumetricMaxUnpooling.backward %dx%dx%dx%dx%d o %dx%dx%d -> %d%dx%dx%dx%d',
-                               bs, from, it, ij, ii, kt, kj, ki, bs, to, outt, outj, outi)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local ctype = t2cpu[typename]
 
-   local pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj)
-   local sunpool = nn.VolumetricMaxUnpooling(pooler)
+      local pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj):type(ctype)
+      local sunpool = nn.VolumetricMaxUnpooling(pooler):type(ctype)
 
-   local original = torch.randn(bs,from,it,ij,ii)
-   local input = pooler:forward(original)
-   local gradOutput = torch.randn(original:size())
-   sunpool:forward(input)
-   sunpool:zeroGradParameters()
-   local groundgrad = sunpool:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
+      local original = torch.randn(bs,from,it,ij,ii):type(typename):type(ctype)
+      local input = pooler:forward(original)
+      local gradOutput = torch.randn(original:size()):type(typename):type(ctype)
+      sunpool:forward(input)
       sunpool:zeroGradParameters()
-      groundgrad = sunpool:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+      local groundgrad = sunpool:backward(input, gradOutput)
 
-   pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj):cuda()
-   local gunpool = nn.VolumetricMaxUnpooling(pooler):cuda()
+      pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj):type(typename)
+      local gunpool = nn.VolumetricMaxUnpooling(pooler):type(typename)
 
-   original = original:cuda()
-   input = pooler:forward(original)
-   gunpool:forward(input)
+      original = original:type(typename)
+      input = pooler:forward(original)
+      gunpool:forward(input)
 
-   gradOutput = gradOutput:cuda()
-   gunpool:zeroGradParameters()
-   local rescuda = gunpool:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
+      gradOutput = gradOutput:type(typename)
       gunpool:zeroGradParameters()
-      rescuda = gunpool:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gunpool:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
+      local error = rescuda:double() - groundgrad:double()
 
-   mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+      mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+        string.format('error on state (backward) with %s', typename))
+   end
 end
 
 function cunntest.VolumetricAveragePooling_forward()
@@ -5367,32 +4647,22 @@ function cunntest.VolumetricAveragePooling_forward()
    local iH = (oH - 1) * dH + kH
    local iW = (oW - 1) * dW + kW
 
-   local tm = {}
-   local title = string.format('VolumetricAveragePooling.forward %dx%dx%dx%d o %dx%dx%d (%dx%dx%d) -> %dx%dx%dx%d',
-                               iF, iT, iH, iW, kT, kH, kW, dT, dH, dW, iF, oT, oH, oW)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1):type(typename)
 
-   local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1)
-   local layer = nn.VolumetricAveragePooling(kT, kW, kH, dT, dW, dH):float()
-   local output = layer:forward(input)
-   local timer = torch.Timer()
-   for i = 1,nloop do
-      output = layer:forward(input)
-   end
-   tm.cpu = timer:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local layer = nn.VolumetricAveragePooling(kT, kW, kH, dT, dW, dH):type(ctype)
+      local output = layer:forward(input)
 
-   local inputCUDA = input:cuda()
-   local layerCUDA = layer:clone():cuda()
-   local outputCUDA = layerCUDA:forward(inputCUDA)
-   timer:reset()
-   for i = 1,nloop do
-      outputCUDA = layerCUDA:forward(inputCUDA)
-   end
-   cutorch.synchronize()
-   tm.gpu = timer:time().real
+      local inputCUDA = input:type(typename)
+      local layerCUDA = layer:clone():type(typename)
+      local outputCUDA = layerCUDA:forward(inputCUDA)
 
-   local error = outputCUDA:float() - output
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+      local error = outputCUDA:double() - output:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+        string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.VolumetricAveragePooling_backward()
@@ -5410,36 +4680,25 @@ function cunntest.VolumetricAveragePooling_backward()
    local iH = (oH - 1) * dH + kH
    local iW = (oW - 1) * dW + kW
 
-   local tm = {}
-   local title = string.format('VolumetricAveragePooling.backward %dx%dx%dx%d o %dx%dx%d (%dx%dx%d) -> %dx%dx%dx%d',
-                           iF, iT, iH, iW, kT, kH, kW, dT, dH, dW, iF, oT, oH, oW)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1):type(typename)
 
-   local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1)
-   local layer = nn.VolumetricAveragePooling(kT, kW, kH, dT, dW, dH):float()
-   local output = layer:forward(input)
-   local gradOutput = output:clone():uniform(-1, 1)
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local layer = nn.VolumetricAveragePooling(kT, kW, kH, dT, dW, dH):type(ctype)
+      local output = layer:forward(input)
+      local gradOutput = output:clone():uniform(-1, 1)
 
-   local gradInput = layer:backward(input, gradOutput)
-   local timer = torch.Timer()
-   for i = 1,nloop do
-      gradInput = layer:backward(input, gradOutput)
-   end
-   tm.cpu = timer:time().real
+      local gradInput = layer:backward(input, gradOutput)
 
-   local inputCUDA = input:cuda()  local layerCUDA = layer:clone():cuda()
-   local outputCUDA = layerCUDA:forward(inputCUDA)   local gradOutputCUDA = gradOutput:cuda()
-   local gradInputCUDA = layerCUDA:backward(inputCUDA, gradOutputCUDA)
+      local inputCUDA = input:type(typename)  local layerCUDA = layer:clone():type(typename)
+      local outputCUDA = layerCUDA:forward(inputCUDA)   local gradOutputCUDA = gradOutput:type(typename)
+      local gradInputCUDA = layerCUDA:backward(inputCUDA, gradOutputCUDA)
 
-   timer:reset()
-   for i = 1,nloop do
-      gradInputCUDA = layerCUDA:backward(inputCUDA, gradOutputCUDA)
+      local error = gradInputCUDA:double() - gradInput:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+        string.format('error on state (backward) with %s', typename))
    end
-   cutorch.synchronize()
-   tm.gpu = timer:time().real
-
-   local error = gradInputCUDA:float() - gradInput
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (backward) ')
 end
 
 function cunntest.CMul_forward_batch()
@@ -5526,32 +4785,22 @@ function cunntest.PReLU_forward()
     local nOutputPlane = 8
     local w = math.random(1,100)
     local h = math.random(1,100)
+    local input = torch.randn(nOutputPlane,h,w)
 
-    local tm = {}
-    local title = string.format('PReLU forward %d x %d', w, h)
-    times[title] = tm
+    for k, typename in ipairs(typenames) do
+      local ctype = t2cpu[typename]
+      local input = input:type(ctype)
+      local sconv = nn.PReLU(nOutputPlane):type(ctype)
+      local groundtruth = sconv:forward(input)
 
-    local input = torch.randn(nOutputPlane,h,w)
-    local sconv = nn.PReLU(nOutputPlane)
-    local groundtruth = sconv:forward(input)
-    local a = torch.Timer()
-    for i = 1,nloop do
-        groundtruth = sconv:forward(input)
-    end
-    tm.cpu = a:time().real
-
-    input = input:cuda()
-    local gconv = sconv:cuda()
-    local rescuda = gconv:forward(input)
-    a:reset()
-    for i = 1,nloop do
-        rescuda = gconv:forward(input)
-    end
-    cutorch.synchronize()
-    tm.gpu = a:time().real
+      input = input:type(typename)
+      local gconv = sconv:type(typename)
+      local rescuda = gconv:forward(input)
 
-    local error = rescuda:float() - groundtruth
-    mytester:assertlt(error:abs():max(), precision_forward, 'error on state')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+          string.format('error on state with %s', typename))
+    end
 end
 
 function cunntest.PReLU_backward()
@@ -5559,39 +4808,33 @@ function cunntest.PReLU_backward()
     local w = math.random(1,10)
     local h = math.random(1,10)
 
-    local tm = {}
-    local title = string.format('PReLU backward %d x %d', w, h)
-    times[title] = tm
-
-    local input = torch.randn(nOutputPlane, h, w)
-    local gradOutput = torch.randn(#input)
-    local sconv = nn.PReLU(nOutputPlane)
-    local gconv = sconv:clone():cuda()
-
-    sconv:forward(input)
-    local groundgrad = sconv:backward(input, gradOutput)
-    local a = torch.Timer()
-    for i = 1,nloop do
-        groundgrad = sconv:backward(input, gradOutput)
+    for k, typename in ipairs(typenames) do
+        local input = torch.randn(nOutputPlane, h, w):type(typename)
+        local gradOutput = torch.randn(#input):type(typename)
+        local ctype = t2cpu[typename]
+        input = input:type(ctype)
+        gradOutput = gradOutput:type(ctype)
+        local sconv = nn.PReLU(nOutputPlane):type(ctype)
+        local gconv = sconv:clone():type(typename)
+
+        sconv:forward(input)
+        sconv:zeroGradParameters()
+        local groundgrad = sconv:backward(input, gradOutput)
+
+        input = input:type(typename)
+        gradOutput = gradOutput:type(typename)
+        gconv:forward(input)
+        gconv:zeroGradParameters()
+        local rescuda = gconv:backward(input, gradOutput)
+
+        local err = rescuda:double() - groundgrad:double()
+        local weightGradError = gconv.gradWeight:double() - sconv.gradWeight:double()
+
+        mytester:assertlt(err:abs():max(), precision_backward_type(precision_backward, typename),
+            string.format('error on state %s', typename))
+        mytester:assertlt(weightGradError:abs():max(), precision_backward_type(precision_backward, typename),
+            string.format('error on weight %s', typename))
     end
-    tm.cpu = a:time().real
-
-    input = input:cuda()
-    gradOutput = gradOutput:cuda()
-    gconv:forward(input)
-    local rescuda = gconv:backward(input, gradOutput)
-    a:reset()
-    for i = 1,nloop do
-        rescuda = gconv:backward(input, gradOutput)
-    end
-    cutorch.synchronize()
-    tm.gpu = a:time().real
-
-    local err = rescuda:float() - groundgrad
-    local weightGradError = gconv.gradWeight:float() - sconv.gradWeight
-
-    mytester:assertlt(err:abs():max(), precision_backward, 'error on state')
-    mytester:assertlt(weightGradError:abs():max(), precision_backward, 'error on weight')
 end
 
 
@@ -5600,40 +4843,29 @@ function cunntest.RReLU_forward()
     local w = math.random(1,100)
     local h = math.random(1,100)
 
-    for _,train in ipairs({true,false}) do
-       for _,inplace in ipairs({false,true}) do
-          local tm = {}
-          local title = string.format('RReLU forward %d x %d (inplace: %s, train: %s)',
-             w, h, tostring(inplace), tostring(train))
-          times[title] = tm
-
-          local input = torch.randn(nOutputPlane, h, w) - 0.5
-          local sconv = nn.RReLU(1/8, 1/3, inplace)
-          if not train then
-             sconv:evaluate()
-          end
-          local groundtruth = sconv:forward(input:clone())
-          local a = torch.Timer()
-          for i = 1,nloop do
-             groundtruth = sconv:forward(input:clone())
-          end
-          tm.cpu = a:time().real
-
-          input = input:cuda()
-          local gconv = sconv:cuda()
-          local rescuda = gconv:forward(input:clone())
-          a:reset()
-          for i = 1,nloop do
-             rescuda = gconv:forward(input:clone())
-          end
-          cutorch.synchronize()
-          tm.gpu = a:time().real
-
-          if not train then
-             local error = rescuda:float() - groundtruth
-             mytester:assertlt(error:abs():max(), precision_forward, 'error on state')
+    for k, typename in ipairs(typenames) do
+       for _,train in ipairs({true,false}) do
+          for _,inplace in ipairs({false,true}) do
+              local input = torch.randn(nOutputPlane, h, w):type(typename) - 0.5
+              local ctype = t2cpu[typename]
+              input = input:type(ctype)
+              local sconv = nn.RReLU(1/8, 1/3, inplace):type(ctype)
+              if not train then
+                  sconv:evaluate()
+              end
+              local groundtruth = sconv:forward(input:clone())
+
+              input = input:type(typename)
+              local gconv = sconv:type(typename)
+              local rescuda = gconv:forward(input:clone())
+
+              if not train then
+                  local error = rescuda:double() - groundtruth:double()
+                  mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+                      string.format('error on state %s', typename))
+              end
           end
-       end
+      end
     end
 end
 
@@ -5642,50 +4874,40 @@ function cunntest.RReLU_backward()
     local w = math.random(1,10)
     local h = math.random(1,10)
 
-    for _,train in ipairs({true,false}) do
-       for _,inplace in ipairs({false,true}) do
-          local tm = {}
-          local title = string.format('RReLU backward %d x %d (inplace: %s, train: %s)',
-            w, h, tostring(inplace), tostring(train))
-          times[title] = tm
-
-          local input = torch.randn(nOutputPlane, h, w)
-          local gradOutput = torch.randn(#input) - 0.5
-          local sconv = nn.RReLU(1/8, 1/3, inplace)
-          if not train then
-             sconv:evaluate()
-          end
+    for k, typename in ipairs(typenames) do
+        for _,train in ipairs({true,false}) do
+            for _,inplace in ipairs({false,true}) do
+                local ctype = t2cpu[typename]
+                local input = torch.randn(nOutputPlane, h, w):type(typename)
+                local gradOutput = torch.randn(#input):type(typename) - 0.5
+                input = input:type(ctype)
+                gradOutput = gradOutput:type(ctype)
+                local sconv = nn.RReLU(1/8, 1/3, inplace):type(ctype)
+                if not train then
+                  sconv:evaluate()
+                end
 
-          sconv:forward(input:clone())
-          local groundgrad = sconv:backward(input, gradOutput:clone())
-          local a = torch.Timer()
-          for i = 1,nloop do
-             groundgrad = sconv:backward(input, gradOutput:clone())
-          end
-          tm.cpu = a:time().real
+                sconv:forward(input:clone())
+                local groundgrad = sconv:backward(input, gradOutput:clone())
 
-          local gconv = sconv:clone():cuda()
-          input = input:cuda()
-          gradOutput = gradOutput:cuda()
-          gconv:forward(input:clone())
-          local rescuda = gconv:backward(input, gradOutput:clone())
-          a:reset()
-          for i = 1,nloop do
-             rescuda = gconv:backward(input, gradOutput:clone())
-          end
-          cutorch.synchronize()
-          tm.gpu = a:time().real
+                local gconv = sconv:clone():type(typename)
+                input = input:type(typename)
+                gradOutput = gradOutput:type(typename)
+                gconv:forward(input:clone())
+                local rescuda = gconv:backward(input, gradOutput:clone())
 
-          if not train then
-             local err = rescuda:float() - groundgrad
-             mytester:assertlt(err:abs():max(), precision_backward, 'error on state')
-          end
+                if not train then -- gradInput is only compared when the module is in evaluate mode
+                  local err = rescuda:double() - groundgrad:double()
+                  mytester:assertlt(err:abs():max(), precision_backward_type(precision_backward, typename),
+                    string.format('error on state %s', typename)) -- include the tensor type in the failure message
+                end
 
-          input = -torch.rand(1000):cuda()
-          gconv:forward(input) -- fill internal noise tensor
-          local g = gconv:backward(input, torch.ones(1000):cuda())
-          local err = math.abs(g[input:le(0)]:mean()-(gconv.lower+gconv.upper)/2)
-          mytester:assertlt(err, 0.05, 'mean deviation of gradient for negative inputs')
+                input = -torch.rand(1000):type(typename)
+                gconv:forward(input) -- fill internal noise tensor
+                local g = gconv:backward(input, torch.ones(1000):type(typename))
+                local err = math.abs(g[input:le(0)]:mean()-(gconv.lower+gconv.upper)/2)
+                mytester:assertlt(err, 0.05, 'mean deviation of gradient for negative inputs')
+          end
        end
     end
 end
@@ -5705,79 +4927,91 @@ function cunntest.VolumetricFullConvolution_pair_test()
     local inChan = math.random(1,32)
     local outChan = math.random(1,32)
 
-    local module = nn.VolumetricFullConvolution(inChan, outChan, kT, kH, kW,
-                                          dT, dH, dW, pT, pH, pW);
-    module.weight:fill(1);
-    module.bias:fill(0.1);
-
-    local bs = math.random(8,32)
-    local inD = math.random(8,32)
-    local inH = math.random(8,32)
-    local inW = math.random(8,32)
-    local outD = (inD - 1) * dT - 2 * pT + kT
-    local outH = (inH - 1) * dH - 2 * pH + kH
-    local outW = (inW - 1) * dW - 2 * pW + kW
-    local input = torch.Tensor(bs, inChan, inD, inH, inW):fill(1);
-    local gradOut = torch.randn(bs, outChan, outD, outH, outW)
-
-    local outcpu = module:forward(input)
-    local gradcpu = module:backward(input, gradOut)
-    module:cuda()
-    local outgpu = module:forward(input:cuda())
-    local gradgpu = module:backward(input:cuda(), gradOut:cuda())
-
-    local error = outgpu:float() - outcpu
-    mytester:assertlt(error:abs():max(), precision_forward,
-                      'error on state (forward) ')
-
-    local error = gradgpu:float() - gradcpu
-    mytester:assertlt(error:abs():max(), precision_backward,
-                      'error on state (backward) ')
+    for k, typename in ipairs(typenames) do
+      local ctype = t2cpu[typename]
+      local module = nn.VolumetricFullConvolution(inChan, outChan, kT, kH, kW,
+                                                  dT, dH, dW, pT, pH, pW):type(ctype);
+      module.weight:fill(1);
+      module.bias:fill(0.1);
+      module.weight = module.weight:type(typename):type(ctype)
+      module.bias = module.bias:type(typename):type(ctype)
+
+      local bs = math.random(8,32)
+      local inD = math.random(8,32)
+      local inH = math.random(8,32)
+      local inW = math.random(8,32)
+      local outD = (inD - 1) * dT - 2 * pT + kT
+      local outH = (inH - 1) * dH - 2 * pH + kH
+      local outW = (inW - 1) * dW - 2 * pW + kW
+      local input = torch.Tensor(bs, inChan, inD, inH, inW):fill(1):type(typename):type(ctype)
+      local gradOut = torch.randn(bs, outChan, outD, outH, outW):type(typename):type(ctype)
+
+      local outcpu = module:forward(input)
+      local gradcpu = module:backward(input, gradOut)
+      module:type(typename)
+      local outgpu = module:forward(input:type(typename))
+      local gradgpu = module:backward(input:type(typename), gradOut:type(typename))
+
+      local error = outgpu:type(typename) - outcpu:type(typename)
+      mytester:assertlt(error:abs():max(),
+                        precision_forward_type(precision_forward, typename, outgpu:abs():max()),
+                        string.format('error on state (forward) with %s', typename))
+
+      local error = gradgpu:type(typename) - gradcpu:type(typename)
+      mytester:assertlt(error:abs():max(),
+                        precision_backward_type(precision_backward, typename),
+                        string.format('error on state (backward) with %s', typename))
+    end
 end
 
 function cunntest.VolumetricFullConvolution()
-    local module = nn.VolumetricFullConvolution(3, 1, 3, 3, 3, 3, 3, 3);
-    module.weight:fill(1);
-    module.bias:fill(0.1);
-    module:cuda();
-
-    local input = torch.Tensor(1, 3, 2, 2, 2):zero();
-    for c = 1,3 do
-        input[1][c][1][1][1] = 1
-    end
-    local output = module:forward(input:cuda())
-    for t = 1,6 do
-        for h = 1,6 do
-            for w = 1,6 do
-                if t <= 3 and h <= 3 and w <= 3 then
-                    mytester:assertlt(output[1][1][t][h][w] - 3.1, precision_forward, 'error on forward ')
-                else
-                    mytester:assertlt(output[1][1][t][h][w] - 0.1, precision_forward, 'error on forward ')
+    for k, typename in ipairs(typenames) do
+        local ctype = t2cpu[typename]
+        local module = nn.VolumetricFullConvolution(3, 1, 3, 3, 3, 3, 3, 3):type(ctype);
+        module.weight:fill(1);
+        module.bias:fill(0.1);
+        module:type(typename);
+
+        local input = torch.Tensor(1, 3, 2, 2, 2):zero();
+        for c = 1,3 do
+            input[1][c][1][1][1] = 1
+        end
+        local output = module:forward(input:type(typename))
+        for t = 1,6 do
+            for h = 1,6 do
+                for w = 1,6 do
+                    if t <= 3 and h <= 3 and w <= 3 then
+                        mytester:assertlt(output[1][1][t][h][w] - 3.1, precision_forward_type(precision_forward, typename),
+                          string.format('error on forward with %s', typename))
+                    else
+                        mytester:assertlt(output[1][1][t][h][w] - 0.1, precision_forward_type(precision_forward, typename),
+                          string.format('error on forward with %s', typename))
+                    end
                 end
             end
         end
-    end
 
-    module:zeroGradParameters()
-    local gradOut = torch.Tensor(1, 1, 6, 6, 6):fill(0.1);
-    local gradIn = module:backward(input:cuda(), gradOut:cuda())
-    for t = 1,2 do
-        for h = 1,2 do
-            for w = 1,2 do
-                mytester:assertlt(gradIn[1][1][t][h][w] - 2.7, precision_backward,
-                                  'error on backward input gradients ')
+        module:zeroGradParameters()
+        local gradOut = torch.Tensor(1, 1, 6, 6, 6):fill(0.1);
+        local gradIn = module:backward(input:type(typename), gradOut:type(typename))
+        for t = 1,2 do
+            for h = 1,2 do
+                for w = 1,2 do
+                    mytester:assertlt(gradIn[1][1][t][h][w] - 2.7, precision_backward_type(precision_backward, typename),
+                                      string.format('error on backward input gradients with %s', typename))
+                end
             end
         end
-    end
 
-    mytester:assertlt(module.gradBias[1] - 21.6, precision_backward,
-                      'error on backward gradBias ')
-    for c = 1,3 do
-        for t = 1,3 do
-            for h = 1,3 do
-                for w = 1,3 do
-                    mytester:assertlt(module.gradWeight[c][1][t][h][w] - 0.1, precision_backward,
-                                      'error on backward weight gradients ')
+        mytester:assertlt(module.gradBias[1] - 21.6, precision_backward_type(precision_backward, typename),
+                          string.format('error on backward gradBias with %s', typename))
+        for c = 1,3 do
+            for t = 1,3 do
+                for h = 1,3 do
+                    for w = 1,3 do
+                        mytester:assertlt(module.gradWeight[c][1][t][h][w] - 0.1, precision_backward_type(precision_backward, typename),
+                                          string.format('error on backward weight gradients with %s', typename))
+                    end
                 end
             end
         end
@@ -5806,35 +5040,46 @@ function cunntest.VolumetricDilatedConvolution()
    local inj = (outj - 1) * sj - 2 * padH + dilationH * (kj-1) + 1
    local ink = (outk - 1) * sk - 2 * padT + dilationT * (kk-1) + 1
 
-   local input = torch.randn(from,ink,inj,ini)
-   local sconv = nn.VolumetricDilatedConvolution(from,to,kk,ki,kj,sk,si,sj,padT,padW,padH,dilationT,dilationW,dilationH)
-   local output = sconv:forward(input)
-   local gradOutput = output:clone():normal()
-   sconv:zeroGradParameters()
-   local groundgrad = sconv:backward(input, gradOutput)
-   local groundweight = sconv.gradWeight
-   local groundbias = sconv.gradBias
+   for k, typename in ipairs(typenames) do
+      local input = torch.randn(from,ink,inj,ini):type(typename)
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gconv = nn.VolumetricDilatedConvolution(from,to,kk,ki,kj,sk,si,sj,padT,padW,padH,dilationT,dilationW,dilationH):cuda()
-   gconv.weight = sconv.weight:cuda()
-   gconv.bias = sconv.bias:cuda()
-   local rescuda = gconv:forward(input)
-   gconv:zeroGradParameters()
-   local gradcuda = gconv:backward(input, gradOutput)
-   local weightcuda = gconv.gradWeight
-   local biascuda = gconv.gradBias
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local sconv = nn.VolumetricDilatedConvolution(from,to,kk,ki,kj,sk,si,sj,padT,padW,padH,dilationT,dilationW,dilationH):type(ctype)
+      local output = sconv:forward(input)
+      local gradOutput = output:clone():normal()
+      sconv:zeroGradParameters()
+      local groundgrad = sconv:backward(input, gradOutput)
+      local groundweight = sconv.gradWeight
+      local groundbias = sconv.gradBias
 
-   local error = rescuda:float() - output
-   local gerror = gradcuda:float() - groundgrad
-   local werror = weightcuda:float() - groundweight
-   local berror = biascuda:float() - groundbias
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gconv = nn.VolumetricDilatedConvolution(from,to,kk,ki,kj,sk,si,sj,padT,padW,padH,dilationT,dilationW,dilationH):type(typename)
+      gconv.weight = sconv.weight:type(typename)
+      gconv.bias = sconv.bias:type(typename)
+      local rescuda = gconv:forward(input)
+      gconv:zeroGradParameters()
+      local gradcuda = gconv:backward(input, gradOutput)
+      local weightcuda = gconv.gradWeight
+      local biascuda = gconv.gradBias
 
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
-   mytester:assertlt(gerror:abs():max(), precision_backward, 'error on state (backward) ')
-   mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
-   mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+      local error = rescuda:double() - output:double()
+      local gerror = gradcuda:double() - groundgrad:double()
+      local werror = weightcuda:double() - groundweight:double()
+      local berror = biascuda:double() - groundbias:double()
+
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+        string.format('error on state (forward) with %s', typename))
+      mytester:assertlt(gerror:abs():max(), precision_backward_type(precision_backward, typename),
+        string.format('error on state (backward) with %s', typename))
+      mytester:assertlt(werror:abs():max(),
+        precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+        string.format('error on weight (backward) with %s', typename))
+      mytester:assertlt(berror:abs():max(),
+        precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()),
+        string.format('error on bias (backward) with %s', typename))
+   end
 end
 
 function cunntest.LookupTable_forward()
@@ -5842,31 +5087,21 @@ function cunntest.LookupTable_forward()
    local nDim = 100
    local nInput = 1000
 
-   local tm = {}
-   local title = string.format('LookupTable forward %d x %d', nVocab, nDim)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.LongTensor(nInput):random(nVocab)
 
-   local input = torch.LongTensor(nInput):random(nVocab)
-   local sconv = nn.LookupTable(nVocab, nDim)
-   local groundtruth = sconv:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-       groundtruth = sconv:forward(input)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      local sconv = nn.LookupTable(nVocab, nDim):type(ctype)
+      local groundtruth = sconv:forward(input)
 
-   input = input:cuda()
-   local gconv = sconv:cuda()
-   local rescuda = gconv:forward(input)
-   a:reset()
-   for i = 1,nloop do
-       rescuda = gconv:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:cuda()
+      local gconv = sconv:type(typename)
+      local rescuda = gconv:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(), precision_forward, 'error on state')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+        string.format('error on state with %s', typename))
+   end
 end
 
 function cunntest.LookupTable_backward()
@@ -5886,73 +5121,69 @@ function cunntest.LookupTable_backward()
          s[k] = v[torch.random(#v)]
       end
 
-      local input, gradOutput
-      if s.batch then
-         input = torch.LongTensor(s.nInput, 5):random(s.nVocab)
-         gradOutput = torch.randn(s.nInput, 5, s.nDim)
-      else
-         input = torch.LongTensor(s.nInput):random(s.nVocab)
-         gradOutput = torch.randn(s.nInput, s.nDim)
-      end
-
-      local sconv = nn.LookupTable(s.nVocab, s.nDim, s.paddingValue)
-      local gconv = sconv:clone():cuda()
-      if s.scaleGradByFreq then
-         sconv = sconv:scaleGradByFreq()
-         gconv = gconv:scaleGradByFreq()
-      end
+      for k, typename in ipairs(typenames) do
+          local ctype = t2cpu[typename]
+          local input, gradOutput
+          if s.batch then
+              input = torch.LongTensor(s.nInput, 5):random(s.nVocab)
+              gradOutput = torch.randn(s.nInput, 5, s.nDim):type(typename):type(ctype)
+          else
+              input = torch.LongTensor(s.nInput):random(s.nVocab)
+              gradOutput = torch.randn(s.nInput, s.nDim):type(typename):type(ctype)
+          end
 
-      sconv:forward(input)
-      sconv:backward(input, gradOutput)
+          local sconv = nn.LookupTable(s.nVocab, s.nDim, s.paddingValue):type(ctype)
+          local gconv = sconv:clone():type(typename)
+          if s.scaleGradByFreq then
+              sconv = sconv:scaleGradByFreq()
+              gconv = gconv:scaleGradByFreq()
+          end
 
-      input = input:cuda()
-      gradOutput = gradOutput:cuda()
-      gconv:forward(input)
-      gconv:backward(input, gradOutput)
+          sconv:forward(input)
+          sconv:backward(input, gradOutput)
 
-      local weightGradError = gconv.gradWeight:float() - sconv.gradWeight
-      mytester:assertlt(weightGradError:abs():max(), precision_backward,
-         'error on weight for size ' .. tostring(s.nInput) ..
-          ' nVocab: ' .. tostring(s.nVocab) ..
-          ' nDim ' .. tostring(s.nDim) ..
-          ' scaleGradByFreq: ' .. tostring(s.scaleGradByFreq) ..
-          ' batch: ' .. tostring(s.batch) ..
-          ' paddingValue: ' .. tostring(s.paddingValue))
+          input = input:cuda()
+          gradOutput = gradOutput:type(typename)
+          gconv:forward(input)
+          gconv:backward(input, gradOutput)
+
+          local weightGradError = gconv.gradWeight:double() - sconv.gradWeight:double()
+          mytester:assertlt(weightGradError:abs():max(),
+              precision_backward_conv_weightbias(precision_backward, typename, gconv.gradWeight:abs():max()),
+              'error on weight for size ' .. tostring(s.nInput) ..
+              ' nVocab: ' .. tostring(s.nVocab) ..
+              ' nDim ' .. tostring(s.nDim) ..
+              ' scaleGradByFreq: ' .. tostring(s.scaleGradByFreq) ..
+              ' batch: ' .. tostring(s.batch) ..
+              ' paddingValue: ' .. tostring(s.paddingValue) ..
+              ' type:' .. typename)
+      end
    end
 
    local nVocab = 10000
    local nDim = 128
    local nInput = 1000
-   local tm = {}
-   local title = string.format('LookupTable backward %d x %d', nVocab, nDim, nInput)
-   times[title] = tm
 
-   local input = torch.LongTensor(nInput):random(nVocab)
-   local gradOutput = torch.randn(nInput, nDim)
-   local sconv = nn.LookupTable(nVocab, nDim)
-   local gconv = sconv:clone():cuda()
+   for k, typename in ipairs(typenames) do
+      local input = torch.LongTensor(nInput):random(nVocab)
 
-   sconv:forward(input)
-   sconv:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
-       sconv:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      local gradOutput = torch.randn(nInput, nDim):type(ctype)
+      local sconv = nn.LookupTable(nVocab, nDim):type(ctype)
+      local gconv = sconv:clone():type(typename)
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   gconv:forward(input)
-   gconv:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
-       gconv:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      sconv:forward(input)
+      sconv:backward(input, gradOutput)
+
+      input = input:cuda()
+      gradOutput = gradOutput:type(typename)
+      gconv:forward(input)
+      gconv:backward(input, gradOutput)
 
-   local weightGradError = gconv.gradWeight:float() - sconv.gradWeight
-   mytester:assertlt(weightGradError:abs():max(), precision_backward, 'error on weight')
+      local weightGradError = gconv.gradWeight:double() - sconv.gradWeight:double()
+      mytester:assertlt(weightGradError:abs():max(), precision_backward_type(precision_backward, typename),
+          string.format('error on weight with %s', typename))
+   end
 end
 
 function cunntest.getParameters()
@@ -5980,36 +5211,23 @@ function cunntest.SpatialReflectionPadding_forward()
    local padT = math.random(-3,3)
    local padB = math.random(-3,3)
 
-   local tm = {}
-   local title =
-      string.format(
-         'SpatialReflectionPadding.forward %dx%dx%dx%d -> %dx%dx%dx%d',
-         batch, plane, sizeY, sizeX,
-         batch, plane, sizeY + padT + padB, sizeX + padL + padR)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.rand(batch, plane, sizeY, sizeX):type(typename)
 
-   local input = torch.rand(batch, plane, sizeY, sizeX)
-   local module = nn.SpatialReflectionPadding(padL, padR, padT, padB)
-   local groundtruth = module:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = module:forward(input)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local module = nn.SpatialReflectionPadding(padL, padR, padT, padB):type(ctype)
+      local groundtruth = module:forward(input)
 
-   input = input:cuda()
-   local gmodule = nn.SpatialReflectionPadding(padL, padR, padT, padB):cuda()
-   local rescuda = gmodule:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gmodule:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gmodule = nn.SpatialReflectionPadding(padL, padR, padT, padB):type(typename)
+      local rescuda = gmodule:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(),
-                     precision_forward, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(),
+                        precision_forward_type(precision_forward, typename),
+                        string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialReflectionPadding_backward()
@@ -6022,46 +5240,32 @@ function cunntest.SpatialReflectionPadding_backward()
    local padT = math.random(-3,3)
    local padB = math.random(-3,3)
 
-   local tm = {}
-   local title =
-      string.format(
-         'SpatialReflectionPadding.backward %dx%dx%dx%d -> %dx%dx%dx%d',
-         batch, plane, sizeY, sizeX,
-         batch, plane, sizeY + padT + padB, sizeX + padL + padR)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.rand(batch, plane, sizeY, sizeX):type(typename)
+      local gradOutput = torch.rand(
+          batch, plane, sizeY + padT + padB, sizeX + padL + padR
+       ):type(typename)
 
-   local input = torch.rand(batch, plane, sizeY, sizeX)
-   local gradOutput = torch.rand(
-      batch, plane, sizeY + padT + padB, sizeX + padL + padR
-   )
-   local module = nn.SpatialReflectionPadding(padL, padR, padT, padB)
-   module:forward(input)
-   module:zeroGradParameters()
-   local groundgrad = module:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      module:zeroGradParameters()
-      groundgrad = module:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+       local ctype = t2cpu[typename]
+       input = input:type(ctype)
+       gradOutput = gradOutput:type(ctype)
+       local module = nn.SpatialReflectionPadding(padL, padR, padT, padB):type(ctype)
+       module:forward(input)
+       module:zeroGradParameters()
+       local groundgrad = module:backward(input, gradOutput)
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gmodule = nn.SpatialReflectionPadding(padL, padR, padT, padB):cuda()
-   gmodule:forward(input)
-   gmodule:zeroGradParameters()
-   local rescuda = gmodule:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
-      gmodule:zeroGradParameters()
-      rescuda = gmodule:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+       input = input:type(typename)
+       gradOutput = gradOutput:type(typename)
+       local gmodule = nn.SpatialReflectionPadding(padL, padR, padT, padB):type(typename)
+       gmodule:forward(input)
+       gmodule:zeroGradParameters()
+       local rescuda = gmodule:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
-   mytester:assertlt(error:abs():max(),
-                     precision_backward, 'error on state (backward) ')
+       local error = rescuda:double() - groundgrad:double() -- compare both gradInputs in double precision
+       mytester:assertlt(error:abs():max(),
+                         precision_backward_type(precision_backward, typename),
+                         string.format('error on state (backward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialReplicationPadding_forward()
@@ -6074,36 +5278,23 @@ function cunntest.SpatialReplicationPadding_forward()
    local padT = math.random(-3,3)
    local padB = math.random(-3,3)
 
-   local tm = {}
-   local title =
-      string.format(
-         'SpatialReplicationPadding.forward %dx%dx%dx%d -> %dx%dx%dx%d',
-         batch, plane, sizeY, sizeX,
-         batch, plane, sizeY + padT + padB, sizeX + padL + padR)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.rand(batch, plane, sizeY, sizeX):type(typename)
 
-   local input = torch.rand(batch, plane, sizeY, sizeX)
-   local module = nn.SpatialReplicationPadding(padL, padR, padT, padB)
-   local groundtruth = module:forward(input)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      groundtruth = module:forward(input)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local module = nn.SpatialReplicationPadding(padL, padR, padT, padB):type(ctype)
+      local groundtruth = module:forward(input)
 
-   input = input:cuda()
-   local gmodule = nn.SpatialReplicationPadding(padL, padR, padT, padB):cuda()
-   local rescuda = gmodule:forward(input)
-   a:reset()
-   for i = 1,nloop do
-      rescuda = gmodule:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gmodule = nn.SpatialReplicationPadding(padL, padR, padT, padB):type(typename)
+      local rescuda = gmodule:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(),
-                     precision_forward, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double() -- compare both outputs in double precision
+      mytester:assertlt(error:abs():max(),
+                        precision_forward_type(precision_forward, typename),
+                        string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.SpatialReplicationPadding_backward()
@@ -6116,46 +5307,32 @@ function cunntest.SpatialReplicationPadding_backward()
    local padT = math.random(-3,3)
    local padB = math.random(-3,3)
 
-   local tm = {}
-   local title =
-      string.format(
-         'SpatialReplicationPadding.backward %dx%dx%dx%d -> %dx%dx%dx%d',
-         batch, plane, sizeY, sizeX,
-         batch, plane, sizeY + padT + padB, sizeX + padL + padR)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.rand(batch, plane, sizeY, sizeX):type(typename)
+      local gradOutput = torch.rand(
+          batch, plane, sizeY + padT + padB, sizeX + padL + padR
+       ):type(typename)
 
-   local input = torch.rand(batch, plane, sizeY, sizeX)
-   local gradOutput = torch.rand(
-      batch, plane, sizeY + padT + padB, sizeX + padL + padR
-   )
-   local module = nn.SpatialReplicationPadding(padL, padR, padT, padB)
-   module:forward(input)
-   module:zeroGradParameters()
-   local groundgrad = module:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1,nloop do
-      module:zeroGradParameters()
-      groundgrad = module:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+       local ctype = t2cpu[typename]
+       input = input:type(ctype)
+       gradOutput = gradOutput:type(ctype)
+       local module = nn.SpatialReplicationPadding(padL, padR, padT, padB):type(ctype)
+       module:forward(input)
+       module:zeroGradParameters()
+       local groundgrad = module:backward(input, gradOutput)
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gmodule = nn.SpatialReplicationPadding(padL, padR, padT, padB):cuda()
-   gmodule:forward(input)
-   gmodule:zeroGradParameters()
-   local rescuda = gmodule:backward(input, gradOutput)
-   a:reset()
-   for i = 1,nloop do
-      gmodule:zeroGradParameters()
-      rescuda = gmodule:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+       input = input:type(typename)
+       gradOutput = gradOutput:type(typename)
+       local gmodule = nn.SpatialReplicationPadding(padL, padR, padT, padB):type(typename)
+       gmodule:forward(input)
+       gmodule:zeroGradParameters()
+       local rescuda = gmodule:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
-   mytester:assertlt(error:abs():max(),
-                     precision_backward, 'error on state (backward) ')
+       local error = rescuda:double() - groundgrad:double()
+       mytester:assertlt(error:abs():max(),
+                         precision_backward_type(precision_backward, typename),
+                         string.format('error on state (backward) with %s', typename))
+   end
 end
 
 function cunntest.VolumetricReplicationPadding_forward()
@@ -6171,40 +5348,25 @@ function cunntest.VolumetricReplicationPadding_forward()
    local pfront = math.random(-3,3)
    local pback = math.random(-3,3)
 
-   local tm = {}
-   local title =
-      string.format(
-         'VolumetricReplicationPadding.forward %dx%dx%dx%dx%d -> ' ..
-         '%dx%dx%dx%dx%d',
-         batch, plane, sizeZ, sizeY, sizeX,
-         batch, plane, sizeZ + pfront + pback, sizeY + ptop + pbottom,
-         sizeX + pleft + pright)
-   times[title] = tm
+   for k, typename in ipairs(typenames) do
+      local input = torch.rand(batch, plane, sizeZ, sizeY, sizeX):type(typename)
 
-   local input = torch.rand(batch, plane, sizeZ, sizeY, sizeX)
-   local module = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom,
-                                                  pfront, pback)
-   local groundtruth = module:forward(input)
-   local a = torch.Timer()
-   for i = 1, nloop do
-      groundtruth = module:forward(input)
-   end
-   tm.cpu = a:time().real
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      local module = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom,
+                                                     pfront, pback):type(ctype)
+      local groundtruth = module:forward(input)
 
-   input = input:cuda()
-   local gmodule = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom,
-                                                   pfront, pback):cuda()
-   local rescuda = gmodule:forward(input)
-   a:reset()
-   for i = 1, nloop do
-      rescuda = gmodule:forward(input)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      input = input:type(typename)
+      local gmodule = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom,
+                                                      pfront, pback):type(typename)
+      local rescuda = gmodule:forward(input)
 
-   local error = rescuda:float() - groundtruth
-   mytester:assertlt(error:abs():max(),
-                     precision_forward, 'error on state (forward) ')
+      local error = rescuda:double() - groundtruth:double()
+      mytester:assertlt(error:abs():max(),
+                        precision_forward_type(precision_forward, typename),
+                        string.format('error on state (forward) with %s', typename))
+   end
 end
 
 function cunntest.VolumetricReplicationPadding_backward()
@@ -6220,51 +5382,35 @@ function cunntest.VolumetricReplicationPadding_backward()
    local pfront = math.random(-3,3)
    local pback = math.random(-3,3)
 
-   local tm = {}
-   local title =
-      string.format(
-         'VolumetricReplicationPadding.backward %dx%dx%dx%dx%d -> ' ..
-         '%dx%dx%dx%dx%d',
-         batch, plane, sizeZ, sizeY, sizeX,
-         batch, plane, sizeZ + pfront + pback, sizeY + ptop + pbottom,
-         sizeX + pleft + pright)
-   times[title] = tm
-
-   local input = torch.rand(batch, plane, sizeZ, sizeY, sizeX)
-   local gradOutput = torch.rand(
-      batch, plane, sizeZ + pfront + pback, sizeY + ptop + pbottom,
-      sizeX + pleft + pright
-   )
-   local module = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom,
-                                                  pfront, pback)
-   module:forward(input)
-   module:zeroGradParameters()
-   local groundgrad = module:backward(input, gradOutput)
-   local a = torch.Timer()
-   for i = 1, nloop do
+   for k, typename in ipairs(typenames) do
+      local input = torch.rand(batch, plane, sizeZ, sizeY, sizeX):type(typename)
+      local gradOutput = torch.rand(
+        batch, plane, sizeZ + pfront + pback, sizeY + ptop + pbottom,
+        sizeX + pleft + pright
+      ):type(typename)
+
+      local ctype = t2cpu[typename]
+      input = input:type(ctype)
+      gradOutput = gradOutput:type(ctype)
+      local module = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom,
+                                                     pfront, pback):type(ctype)
+      module:forward(input)
       module:zeroGradParameters()
-      groundgrad = module:backward(input, gradOutput)
-   end
-   tm.cpu = a:time().real
+      local groundgrad = module:backward(input, gradOutput)
 
-   input = input:cuda()
-   gradOutput = gradOutput:cuda()
-   local gmodule = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom,
-                                                   pfront, pback):cuda()
-   gmodule:forward(input)
-   gmodule:zeroGradParameters()
-   local rescuda = gmodule:backward(input, gradOutput)
-   a:reset()
-   for i = 1, nloop do
+      input = input:type(typename)
+      gradOutput = gradOutput:type(typename)
+      local gmodule = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom,
+                                                      pfront, pback):type(typename)
+      gmodule:forward(input)
       gmodule:zeroGradParameters()
-      rescuda = gmodule:backward(input, gradOutput)
-   end
-   cutorch.synchronize()
-   tm.gpu = a:time().real
+      local rescuda = gmodule:backward(input, gradOutput)
 
-   local error = rescuda:float() - groundgrad
-   mytester:assertlt(error:abs():max(),
-                     precision_backward, 'error on state (backward) ')
+      local error = rescuda:double() - groundgrad:double()
+      mytester:assertlt(error:abs():max(),
+                        precision_backward_type(precision_backward, typename),
+                        string.format('error on state (backward) with %s', typename))
+   end
 end
 
 function cunntest.GPU()
@@ -6508,6 +5654,7 @@ function nn.testcuda(tests, print_timing, n_loop, seed)
    nloop = n_loop or nloop
    local oldtype = torch.getdefaulttensortype()
    torch.setdefaulttensortype('torch.FloatTensor')
+   checkHalf()
    initSeed(seed)
    mytester = torch.Tester()
    mytester:add(cunntest)
author	Soumith Chintala <soumith@gmail.com>	2016-11-09 02:18:01 +0300
committer	GitHub <noreply@github.com>	2016-11-09 02:18:01 +0300
commit	8d5027feb6aac1ce2ee4e6a346ddcb1b7e95d6fa (patch)
tree	f0a89adfb00e7a49031ca32a9badcf016bc599cb
parent	aa256bc7def4336dd9e6fa43b66bf00758b4922e (diff)
parent	27479c372040b8cab4e53e9338e8ce840bdb67dd (diff)