diff options
author | Soumith Chintala <soumith@gmail.com> | 2016-11-09 02:18:01 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2016-11-09 02:18:01 +0300 |
commit | 8d5027feb6aac1ce2ee4e6a346ddcb1b7e95d6fa (patch) | |
tree | f0a89adfb00e7a49031ca32a9badcf016bc599cb | |
parent | aa256bc7def4336dd9e6fa43b66bf00758b4922e (diff) | |
parent | 27479c372040b8cab4e53e9338e8ce840bdb67dd (diff) |
Merge pull request #346 from gchanan/pointwiseGeneric
Add generic support for pointwise modules
125 files changed, 14077 insertions, 12901 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ce811f..501a2de 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,6 +14,11 @@ FILE(WRITE THCUNN_h.lua "return [[") FILE(APPEND THCUNN_h.lua ${THCUNN_headers}) FILE(APPEND THCUNN_h.lua "]]") +FILE(STRINGS lib/THCUNN/generic/THCUNN.h THCUNN_generic_headers NEWLINE_CONSUME) +FILE(WRITE THCUNN_generic_h.lua "return [[") +FILE(APPEND THCUNN_generic_h.lua ${THCUNN_generic_headers}) +FILE(APPEND THCUNN_generic_h.lua "]]") + FILE(GLOB luasrc *.lua) ADD_SUBDIRECTORY(lib) @@ -12,46 +12,145 @@ function THCUNN.getState() return THCState_ptr(cutorch.getState()); end -local THCUNN_h = require 'cunn.THCUNN_h' +local THCUNN_generic_h = require 'cunn.THCUNN_generic_h' -- strip all lines starting with # -- to remove preprocessor directives originally present -- in THNN.h -THCUNN_h = THCUNN_h:gsub("\n#[^\n]*", "") -THCUNN_h = THCUNN_h:gsub("^#[^\n]*\n", "") +THCUNN_generic_h = THCUNN_generic_h:gsub("\n#[^\n]*", "") +THCUNN_generic_h = THCUNN_generic_h:gsub("^#[^\n]*\n", "") -local preprocessed = string.gsub(THCUNN_h, 'TH_API ', '') +local preprocessed_generic = string.gsub(THCUNN_generic_h, 'TH_API void THNN_%(([%a%d_]+)%)', 'void THNN_TYPE%1') local replacements = { { ['THTensor'] = 'THCudaTensor', - ['THIndexTensor'] = 'THCudaLongTensor', + ['THCIndexTensor'] = 'THCudaLongTensor', ['THIndex_t'] = 'long', ['THInteger_t'] = 'float' } } -for i=1,#replacements do - local r = replacements[i] - local s = preprocessed - for k,v in pairs(r) do - s = string.gsub(s, k, v) +local cct2lt = { + ['THCudaFloatTensor'] = 'torch.CudaTensor', + ['THCudaDoubleTensor'] = 'torch.CudaDoubleTensor', +} + +local replacements_generic = +{ + { + ['THCTensor'] = 'THCudaTensor', + ['THCIndexTensor'] = 'THCudaLongTensor', + ['TYPE'] = 'Cuda', + ['real'] = 'float' + }, + { + ['THCTensor'] = 'THCudaDoubleTensor', + ['THCIndexTensor'] = 'THCudaLongTensor', + ['TYPE'] = 'CudaDouble', + ['real'] = 'double', + } +} + +if cutorch.hasHalf then + ffi.cdef("half 
THC_float2half(float a);") + ffi.cdef("float THC_half2float(half a);") + cct2lt['THCudaHalfTensor'] = 'torch.CudaHalfTensor' + local half_replacement = { + ['THCTensor'] = 'THCudaHalfTensor', + ['THCIndexTensor'] = 'THCudaLongTensor', + ['TYPE'] = 'CudaHalf', + ['real'] = 'half' + } + table.insert(replacements_generic, half_replacement) +end + +for i=1,#replacements_generic do + local r = replacements_generic[i] + local s = preprocessed_generic + for k,v in pairs(r) do + s = string.gsub(s, k, v) + end + ffi.cdef(s) +end + +local function extract_function_names_generic(s) + local t = {} + for n in string.gmatch(s, 'TH_API void THNN_%(([%a%d_]+)%)') do + t[#t+1] = n end - ffi.cdef(s) + return t end -local function extract_function_names(s) +local function find_positions(s, p) + local begin = 0 + local positions = {} + while true do + local start, stop = string.find(s, p, begin) + if (start == nil) then break end + positions[#positions+1] = start + begin = stop + 1 + end + return positions +end + +local function extract_function_names_and_real_args(s) local t = {} - for n in string.gmatch(s, 'TH_API void THNN_Cuda([%a%d_]+)') do - t[#t+1] = n + for n in string.gmatch(s, 'TH_API ([^;]+)') do + local func_name = string.match(n, 'void THNN_%(([%a%d_]+)%)') + local param_positions = find_positions(n, ',') + local positions = {} + for x,y in ipairs(find_positions(n, 'real')) do + local found = false + for cn,cp in ipairs(param_positions) do + if cp > y then + positions[#positions+1] = cn + found = true + break + end + end + -- it is the last param + if not found then positions[#positions+1] = #param_positions + 1 end + end + + t[func_name] = positions end return t end +local real_args = extract_function_names_and_real_args(THCUNN_generic_h) + -- build function table -local function_names = extract_function_names(THCUNN_h) +local function_names_generic = extract_function_names_generic(THCUNN_generic_h) -THNN.kernels['torch.CudaTensor'] = THNN.bind(THCUNN.C, function_names, 
'Cuda', THCUNN.getState) +THNN.kernels['torch.CudaTensor'] = THNN.bind(THCUNN.C, function_names_generic, 'Cuda', THCUNN.getState) torch.getmetatable('torch.CudaTensor').THNN = THNN.kernels['torch.CudaTensor'] +THNN.kernels['torch.CudaDoubleTensor'] = THNN.bind(THCUNN.C, function_names_generic, 'CudaDouble', THCUNN.getState) +torch.getmetatable('torch.CudaDoubleTensor').THNN = THNN.kernels['torch.CudaDoubleTensor'] + +-- in order to call 'half' functions from lua, convert real arguments from +-- to half since there is no other defined conversion +local transform_reals_to_half = function(func_name, real_args, ...) + t = {} + -- this select logic is necessary to deal with nil arguments + for i = 1, select('#', ...) do + t[i] = select(i, ...) + end + for k,v in ipairs(real_args[func_name]) do + -- first argument (THCState) is added implicitly by bind + t[v-1] = ffi.C.THC_float2half(t[v-1]) + end + return t +end + +local raw_half_functions = THNN.bind(THCUNN.C, function_names_generic, 'CudaHalf', THCUNN.getState) +for k,v in pairs(raw_half_functions) do + -- select required in case there are trailing nils + raw_half_functions[k] = function(...) 
v(unpack(transform_reals_to_half(k, real_args, ...), 1, select("#",...))) +end +end +THNN.kernels['torch.CudaHalfTensor'] = raw_half_functions +torch.getmetatable('torch.CudaHalfTensor').THNN = THNN.kernels['torch.CudaHalfTensor'] + return THCUNN diff --git a/lib/THCUNN/Abs.cu b/lib/THCUNN/Abs.cu index 81b3297..0abefd9 100644 --- a/lib/THCUNN/Abs.cu +++ b/lib/THCUNN/Abs.cu @@ -1,32 +1,24 @@ #include "THCUNN.h" -#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +template <typename T> struct absupdateOutput_functor { - __device__ void operator()(float* output, const float* input) const + __device__ void operator()(T* output, const T* input) const { *output = abs(*input); } }; -void THNN_CudaAbs_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output) -{ - THCUNN_assertSameGPU(state, 2, input, output); - THCudaTensor_resizeAs(state, output, input); - THC_pointwiseApply2(state, output, input, absupdateOutput_functor()); -} - +template <typename T> struct absupdateGradInput_functor { - __device__ void operator()(float* gradInput, const float* input, const float* gradOutput) const + __device__ void operator()(T* gradInput, const T* input, const T* gradOutput) const { *gradInput = *input < 0 ? 
- *gradOutput : *gradOutput; } }; -void THNN_CudaAbs_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput) -{ - THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); - THCudaTensor_resizeAs(state, gradInput, input); - THC_pointwiseApply3(state, gradInput, input, gradOutput, absupdateGradInput_functor()); -} +#include "generic/Abs.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/AbsCriterion.cu b/lib/THCUNN/AbsCriterion.cu index fd19cae..057c971 100644 --- a/lib/THCUNN/AbsCriterion.cu +++ b/lib/THCUNN/AbsCriterion.cu @@ -1,5 +1,7 @@ #include "THCUNN.h" #include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" #include <thrust/fill.h> #include <thrust/functional.h> @@ -7,69 +9,30 @@ #include <thrust/reduce.h> #include <thrust/inner_product.h> +template <typename Dtype, typename Acctype> struct abs_functor { - __host__ __device__ float operator()(const float& x, const float& y) const + __host__ __device__ Acctype operator()(const Dtype& x, const Dtype& y) const { - float z = x-y; - return z >= 0 ? z : -z; + Dtype z = x-y; + return ScalarConvert<Dtype, Acctype>::to(z >= 0 ? 
z : -z); } }; -void THNN_CudaAbsCriterion_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *output, bool sizeAverage) -{ - THCUNN_assertSameGPU(state, 2, input, target); - - long size = THCudaTensor_nElement(state, input); - - input = THCudaTensor_newContiguous(state, input); - target = THCudaTensor_newContiguous(state, target); - - thrust::device_ptr<float> input_data(THCudaTensor_data(state, input)); - thrust::device_ptr<float> target_data(THCudaTensor_data(state, target)); - float sum = thrust::inner_product(input_data, input_data+size, target_data, (float) 0, thrust::plus<float>(), abs_functor()); - - if (sizeAverage) - sum /= size; - - THCudaTensor_free(state, input); - THCudaTensor_free(state, target); - - THCudaTensor_set1d(state, output, 0, sum); -} - +template <typename Dtype> struct abs_updateGradInput_functor { - const float norm; + const Dtype norm; - abs_updateGradInput_functor(float norm_) + abs_updateGradInput_functor(Dtype norm_) : norm(norm_) {} - __host__ __device__ float operator()(const float& x, const float& y) const + __host__ __device__ Dtype operator()(const Dtype& x, const Dtype& y) const { return (x - y) >= 0 ? norm : -norm; } }; -void THNN_CudaAbsCriterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage) -{ - THCUNN_assertSameGPU(state, 3, input, target, gradInput); - - long size = THCudaTensor_nElement(state, input); - float norm = (sizeAverage ? 
1./size : 1.); - - input = THCudaTensor_newContiguous(state, input); - target = THCudaTensor_newContiguous(state, target); - - THCudaTensor_resizeAs(state, gradInput, input); - - thrust::device_ptr<float> input_data(THCudaTensor_data(state, input)); - thrust::device_ptr<float> target_data(THCudaTensor_data(state, target)); - thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput)); - - thrust::transform(input_data, input_data+size, target_data, gradInput_data, abs_updateGradInput_functor(norm)); - - THCudaTensor_free(state, input); - THCudaTensor_free(state, target); -} +#include "generic/AbsCriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/BCECriterion.cu b/lib/THCUNN/BCECriterion.cu index c8edcfe..3653fc8 100644 --- a/lib/THCUNN/BCECriterion.cu +++ b/lib/THCUNN/BCECriterion.cu @@ -1,5 +1,7 @@ #include "THCUNN.h" #include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" #include <thrust/functional.h> #include <thrust/device_ptr.h> @@ -7,148 +9,80 @@ #include <thrust/transform.h> #include <thrust/transform_reduce.h> -const float eps = 1e-12f; +template <typename T> +inline __device__ T eps(); +template <> +inline __device__ float eps() { return 1e-12f; } + +template <> +inline __device__ double eps() { return 1e-12; } + +template <typename Dtype, typename Acctype> struct bce_functor { template <class Tuple> __host__ __device__ - float operator()(Tuple x) + Acctype operator()(Tuple x) { - float o = thrust::get<0>(x); - float t = thrust::get<1>(x); - return - (t * logf(o + eps) + (1.f - t) * logf(1.f - o + eps)); + Dtype o = thrust::get<0>(x); + Dtype t = thrust::get<1>(x); + return - (t * THCNumerics<Acctype>::log(o + eps<Acctype>()) + (Acctype(1)- t) * THCNumerics<Acctype>::log(Acctype(1) - o + eps<Acctype>())); } }; +template <typename Dtype, typename Acctype> struct bce_functor_weights { template <class Tuple> __host__ __device__ - float operator()(Tuple x) + Acctype operator()(Tuple x) { - float o 
= thrust::get<0>(x); - float t = thrust::get<1>(x); - float w = thrust::get<2>(x); - return - w * (t * logf(o + eps) + (1.f - t) * logf(1.f - o + eps)); + Dtype o = thrust::get<0>(x); + Dtype t = thrust::get<1>(x); + Dtype w = thrust::get<2>(x); + return - w * (t * THCNumerics<Acctype>::log(o + eps<Acctype>()) + (Acctype(1) - t) * THCNumerics<Acctype>::log(Acctype(1) - o + eps<Acctype>())); } }; -void THNN_CudaBCECriterion_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *output, bool sizeAverage, THCudaTensor *weights) -{ - THCUNN_assertSameGPU(state, 3, input, target, weights); - - long size = THCudaTensor_nElement(state, input); - - input = THCudaTensor_newContiguous(state, input); - target = THCudaTensor_newContiguous(state, target); - - thrust::device_ptr<float> input_data(THCudaTensor_data(state, input)); - thrust::device_ptr<float> target_data(THCudaTensor_data(state, target)); - - float sum; - if (weights) { - weights = THCudaTensor_newContiguous(state, weights); - thrust::device_ptr<float> weights_data(THCudaTensor_data(state, weights)); - sum = thrust::transform_reduce( - thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data, weights_data)), - thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size, weights_data+size)), - bce_functor_weights(), - (float) 0.f, - thrust::plus<float>() - ); - THCudaTensor_free(state, weights); - } else { - sum = thrust::transform_reduce( - thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data)), - thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size)), - bce_functor(), - (float) 0.f, - thrust::plus<float>() - ); - } - - if (sizeAverage) - sum /= size; - - THCudaTensor_free(state, input); - THCudaTensor_free(state, target); - - THCudaTensor_set1d(state, output, 0, sum); -} - +template <typename Dtype, typename Acctype> struct bce_updateGradInput_functor { - const float norm; + const Dtype norm; - 
bce_updateGradInput_functor(float norm_) + bce_updateGradInput_functor(Dtype norm_) : norm(norm_) {} template <class Tuple> __host__ __device__ - float operator()(Tuple x) + Dtype operator()(Tuple x) { - float o = thrust::get<0>(x); - float t = thrust::get<1>(x); - return - (t - o) / ((1 - o + eps) * (o + eps)) * norm; + Dtype o = thrust::get<0>(x); + Dtype t = thrust::get<1>(x); + return ScalarConvert<Acctype,Dtype>::to(- (t - o) / ((Acctype(1) - o + eps<Acctype>()) * (o + eps<Acctype>())) * norm); } }; +template <typename Dtype, typename Acctype> struct bce_updateGradInput_functor_weights { - const float norm; + const Dtype norm; - bce_updateGradInput_functor_weights(float norm_) + bce_updateGradInput_functor_weights(Dtype norm_) : norm(norm_) {} template <class Tuple> __host__ __device__ - float operator()(Tuple x) + Dtype operator()(Tuple x) { - float o = thrust::get<0>(x); - float t = thrust::get<1>(x); - float w = thrust::get<2>(x); - return - (t - o) / ((1 - o + eps) * (o + eps)) * norm * w; + Dtype o = thrust::get<0>(x); + Dtype t = thrust::get<1>(x); + Dtype w = thrust::get<2>(x); + return ScalarConvert<Acctype, Dtype>::to(- (t - o) / ((Acctype(1) - o + eps<Acctype>()) * (o + eps<Acctype>())) * norm * w); } }; -void THNN_CudaBCECriterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage, THCudaTensor *weights) -{ - THCUNN_assertSameGPU(state, 4, input, target, gradInput, weights); - - long size = THCudaTensor_nElement(state, input); - float norm = (sizeAverage ? 
1./size : 1.); - - input = THCudaTensor_newContiguous(state, input); - target = THCudaTensor_newContiguous(state, target); - - THCudaTensor_resizeAs(state, gradInput, input); - - thrust::device_ptr<float> input_data(THCudaTensor_data(state, input)); - thrust::device_ptr<float> target_data(THCudaTensor_data(state, target)); - thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput)); - - if (weights) { - weights = THCudaTensor_newContiguous(state, weights); - thrust::device_ptr<float> weights_data(THCudaTensor_data(state, weights)); - thrust::transform( - thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data, weights_data)), - thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size, weights_data+size)), - gradInput_data, - bce_updateGradInput_functor_weights(norm) - ); - THCudaTensor_free(state, weights); - } else { - thrust::transform( - thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data)), - thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size)), - gradInput_data, - bce_updateGradInput_functor(norm) - ); - } - - THCudaTensor_free(state, input); - THCudaTensor_free(state, target); -} +#include "generic/BCECriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/BatchNormalization.cu b/lib/THCUNN/BatchNormalization.cu index ca2cd8c..125e3ff 100644 --- a/lib/THCUNN/BatchNormalization.cu +++ b/lib/THCUNN/BatchNormalization.cu @@ -1,12 +1,12 @@ #include "THCUNN.h" #include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" const int WARP_SIZE = 32; -typedef THCDeviceTensor<float, 3> DeviceTensor3; -typedef THCDeviceTensor<float, 1> DeviceTensor1; // The maximum number of threads in a block const int MAX_BLOCK_SIZE = 512; @@ -27,11 +27,13 @@ __device__ __forceinline__ int getMSB(int val) { return 31 - __clz(val); } +template <typename Dtype, typename Acctype> struct 
Float2 { - float v1, v2; + Acctype v1, v2; __device__ Float2() {} - __device__ Float2(float v1, float v2) : v1(v1), v2(v2) {} - __device__ Float2(float v) : v1(v), v2(v) {} + __device__ Float2(Dtype v1, Dtype v2) : v1(ScalarConvert<Dtype, Acctype>::to(v1)), v2(ScalarConvert<Dtype, Acctype>::to(v2)) {} + __device__ Float2(Dtype v) : v1(ScalarConvert<Dtype, Acctype>::to(v)), v2(ScalarConvert<Dtype, Acctype>::to(v)) {} + __device__ Float2(int v) : v1(ScalarConvert<int, Acctype>::to(v)), v2(ScalarConvert<int, Acctype>::to(v)) {} __device__ Float2& operator+=(const Float2& a) { v1 += a.v1; v2 += a.v2; @@ -39,45 +41,49 @@ struct Float2 { } }; +template <typename Dtype, typename Acctype, typename DeviceTensor3> struct SumOp { __device__ SumOp(const DeviceTensor3 t) : tensor(t) {} - __device__ __forceinline__ float operator()(int batch, int plane, int n) { - return tensor[batch][plane][n]; + __device__ __forceinline__ Acctype operator()(int batch, int plane, int n) { + return ScalarConvert<Dtype, Acctype>::to(tensor[batch][plane][n]); } const DeviceTensor3 tensor; }; +template <typename Dtype, typename Acctype, typename DeviceTensor3> struct VarOp { - __device__ VarOp(float m, const DeviceTensor3 t) : mean(m), tensor(t) {} - __device__ __forceinline__ float operator()(int batch, int plane, int n) { - float val = tensor[batch][plane][n]; + __device__ VarOp(Acctype m, const DeviceTensor3 t) : mean(m), tensor(t) {} + __device__ __forceinline__ Acctype operator()(int batch, int plane, int n) { + Dtype val = tensor[batch][plane][n]; return (val - mean) * (val - mean); } - const float mean; + const Acctype mean; const DeviceTensor3 tensor; }; +template <typename Dtype, typename Acctype, typename DeviceTensor3> struct GradOp { - __device__ GradOp(float m, const DeviceTensor3 i, const DeviceTensor3 g) + __device__ GradOp(Acctype m, const DeviceTensor3 i, const DeviceTensor3 g) : mean(m), input(i), gradOutput(g) {} - __device__ __forceinline__ Float2 operator()(int batch, int 
plane, int n) { - float g = gradOutput[batch][plane][n]; - float c = input[batch][plane][n] - mean; - return Float2(g, g * c); + __device__ __forceinline__ Float2<Dtype, Acctype> operator()(int batch, int plane, int n) { + Dtype g = gradOutput[batch][plane][n]; + Dtype c = ScalarConvert<Acctype, Dtype>::to(input[batch][plane][n] - mean); + return Float2<Dtype, Acctype>(g, g * c); } - const float mean; + const Acctype mean; const DeviceTensor3 input; const DeviceTensor3 gradOutput; }; // Sum across all threads within a warp -static __device__ __forceinline__ float warpSum(float val) { +template <typename T> +static __device__ __forceinline__ T warpSum(T val) { #if __CUDA_ARCH__ >= 300 for (int i = 0; i < getMSB(WARP_SIZE); ++i) { val += __shfl_xor(val, 1 << i, WARP_SIZE); } #else - __shared__ float values[MAX_BLOCK_SIZE]; + __shared__ T values[MAX_BLOCK_SIZE]; values[threadIdx.x] = val; __threadfence_block(); const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE; @@ -88,14 +94,15 @@ static __device__ __forceinline__ float warpSum(float val) { return val; } -static __device__ __forceinline__ Float2 warpSum(Float2 value) { +template <typename Dtype, typename Acctype> +static __device__ __forceinline__ Float2<Dtype, Acctype> warpSum(Float2<Dtype, Acctype> value) { value.v1 = warpSum(value.v1); value.v2 = warpSum(value.v2); return value; } // Sum across (batch, x/y/z) applying Op() pointwise -template<typename T, typename Op> +template<typename T, typename Op, typename DeviceTensor3> __device__ T reduce(Op op, DeviceTensor3 tensor, int plane) { T sum = (T)0; for (int batch = 0; batch < tensor.getSize(0); ++batch) { @@ -130,32 +137,7 @@ __device__ T reduce(Op op, DeviceTensor3 tensor, int plane) { return shared[0]; } -template <int Dim> -static THCDeviceTensor<float, Dim> devicetensor(THCState *state, THCudaTensor *t) { - if (!t) { - return THCDeviceTensor<float, Dim>(); - } - - int inDim = THCudaTensor_nDimension(state, t); - if (inDim == Dim) { - return 
toDeviceTensor<float, Dim>(state, t); - } - - // View in which the last dimensions are collapsed or expanded as needed - THAssert(THCudaTensor_isContiguous(state, t)); - int size[Dim]; - for (int i = 0; i < Dim || i < inDim; ++i) { - if (i < Dim && i < inDim) { - size[i] = t->size[i]; - } else if (i < Dim) { - size[i] = 1; - } else { - size[Dim - 1] *= t->size[i]; - } - } - return THCDeviceTensor<float, Dim>(THCudaTensor_data(state, t), size); -} - +template <typename Dtype, typename Acctype, typename DeviceTensor1, typename DeviceTensor3> __global__ void BatchNormalizationUpdateOutputInference_kernel( const DeviceTensor3 input, DeviceTensor3 output, @@ -163,31 +145,32 @@ __global__ void BatchNormalizationUpdateOutputInference_kernel( DeviceTensor1 runningVar, const DeviceTensor1 weight, const DeviceTensor1 bias, - float epsilon) { + Acctype epsilon) { int plane = blockIdx.x; - float invstd = 1.0f / sqrt(runningVar[plane].ldg() + epsilon); - float mean = runningMean[plane].ldg(); - float gamma = weight.numElements() > 0 ? weight[plane].ldg() : 1.0f; - float beta = bias.numElements() > 0 ? bias[plane].ldg() : 0.0f; + Acctype invstd = Acctype(1) / sqrt(runningVar[plane].ldg() + epsilon); + Acctype mean = ScalarConvert<Dtype, Acctype>::to(runningMean[plane].ldg()); + Acctype gamma = weight.numElements() > 0 ? ScalarConvert<Dtype, Acctype>::to(weight[plane].ldg()) : Acctype(1); + Acctype beta = bias.numElements() > 0 ? 
ScalarConvert<Dtype, Acctype>::to(bias[plane].ldg()) : Acctype(0); // Write normalized and update the output for (int batch = 0; batch < input.getSize(0); batch++) { for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) { - float inp = input[batch][plane][x].ldg(); - output[batch][plane][x] = gamma * (inp - mean) * invstd + beta; + Dtype inp = input[batch][plane][x].ldg(); + output[batch][plane][x] = ScalarConvert<Acctype, Dtype>::to(gamma * (inp - mean) * invstd + beta); } } } +template <typename Dtype, typename Acctype, typename DeviceTensor1, typename DeviceTensor3> __global__ void BatchNormalizationUpdateOutput_kernel( const DeviceTensor3 input, DeviceTensor3 output, const DeviceTensor1 weight, const DeviceTensor1 bias, - const float epsilon, - const float momentum, + const Acctype epsilon, + const Acctype momentum, DeviceTensor1 runningMean, DeviceTensor1 runningVar, DeviceTensor1 saveMean, @@ -196,73 +179,39 @@ __global__ void BatchNormalizationUpdateOutput_kernel( int plane = blockIdx.x; int N = input.getSize(0) * input.getSize(2); - float norm = 1.0f / N; + Acctype norm = Acctype(1) / N; // Compute the mean and variance across (batch, x/y/z) - float mean = reduce<float>(SumOp(input), input, plane) * norm; + Acctype mean = reduce<Acctype>(SumOp<Dtype, Acctype, DeviceTensor3>(input), input, plane) * norm; __syncthreads(); - float varN = reduce<float>(VarOp(mean, input), input, plane); - float invStd = 0.0f; - if (varN != 0.0f || epsilon != 0.0f) { + Acctype varN = reduce<Acctype>(VarOp<Dtype, Acctype, DeviceTensor3>(mean, input), input, plane); + Acctype invStd = 0; + if (varN != Acctype(0) || epsilon != Acctype(0)) { invStd = 1 / sqrt(varN * norm + epsilon); } // Save the mean, variance, and moving averages if (threadIdx.x == 0) { // Momentum based writeback - float unbiasedVar = varN / (N - 1); - saveMean[plane] = mean; - saveStd[plane] = invStd; - runningMean[plane] = (1 - momentum) * runningMean[plane] + momentum * mean; - runningVar[plane] = 
(1 - momentum) * runningVar[plane] + momentum * unbiasedVar; + Acctype unbiasedVar = varN / (N - 1); + saveMean[plane] = ScalarConvert<Acctype, Dtype>::to(mean); + saveStd[plane] = ScalarConvert<Acctype, Dtype>::to(invStd); + runningMean[plane] = ScalarConvert<Acctype, Dtype>::to((1 - momentum) * runningMean[plane] + momentum * mean); + runningVar[plane] = ScalarConvert<Acctype, Dtype>::to((1 - momentum) * runningVar[plane] + momentum * unbiasedVar); } // Write normalized and update the output - float gamma = weight.numElements() > 0 ? weight[plane] : 1.0f; - float beta = bias.numElements() > 0 ? bias[plane] : 0.0f; + Acctype gamma = weight.numElements() > 0 ? ScalarConvert<Dtype, Acctype>::to(weight[plane]) : ScalarConvert<int, Acctype>::to(1); + Acctype beta = bias.numElements() > 0 ? ScalarConvert<Dtype, Acctype>::to(bias[plane]) : ScalarConvert<int, Acctype>::to(0); for (int batch = 0; batch < input.getSize(0); ++batch) { for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) { - float inp = input[batch][plane][x].ldg(); - output[batch][plane][x] = gamma * (inp - mean) * invStd + beta; + Dtype inp = input[batch][plane][x].ldg(); + output[batch][plane][x] = ScalarConvert<Acctype, Dtype>::to(gamma * (inp - mean) * invStd + beta); } } } -void THNN_CudaBatchNormalization_updateOutput( - THCState *state, THCudaTensor *input_, THCudaTensor *output_, - THCudaTensor *weight_, THCudaTensor *bias_, THCudaTensor *runningMean_, - THCudaTensor *runningVar_, THCudaTensor *saveMean_, THCudaTensor *saveStd_, - bool train, double momentum, double eps) { - - THCUNN_assertSameGPU(state, 8, input_, output_, weight_, bias_, runningMean_, - runningVar_, saveMean_, saveStd_); - DeviceTensor3 input = devicetensor<3>(state, input_); - DeviceTensor3 output = devicetensor<3>(state, output_); - DeviceTensor1 weight = devicetensor<1>(state, weight_); - DeviceTensor1 bias = devicetensor<1>(state, bias_); - DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_); - 
DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_); - DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_); - DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_); - - cudaStream_t s = THCState_getCurrentStream(state); - cudaDeviceProp *prop = THCState_getCurrentDeviceProperties(state); - - if (!train) { - dim3 blocks(input.getSize(1)); - dim3 threads(getNumThreads(input.getSize(2))); - BatchNormalizationUpdateOutputInference_kernel<<<blocks, threads, 0, s>>>( - input, output, runningMean, runningVar, weight, bias, eps); - } else { - dim3 blocks(input.getSize(1)); - dim3 threads(getNumThreads(input.getSize(2))); - BatchNormalizationUpdateOutput_kernel<<<blocks, threads, 0, s>>>( - input, output, weight, bias, eps, momentum, runningMean, runningVar, - saveMean, saveStd); - } - THCudaCheck(cudaGetLastError()); -} - +template <typename Dtype, typename Acctype, typename DeviceTensor1, typename DeviceTensor3> __global__ void BatchNormalizationBackward_kernel( const DeviceTensor3 input, const DeviceTensor3 gradOutput, @@ -275,45 +224,46 @@ __global__ void BatchNormalizationBackward_kernel( const DeviceTensor1 saveMean, const DeviceTensor1 saveStd, bool train, - float scale, + Acctype scale, double eps) { int plane = blockIdx.x; int N = gradOutput.getSize(0) * gradOutput.getSize(2); - float mean, stdVal; + Acctype mean, stdVal; if (train) { - mean = saveMean[plane]; - stdVal = saveStd[plane]; + mean = ScalarConvert<Dtype, Acctype>::to(saveMean[plane]); + stdVal = ScalarConvert<Dtype, Acctype>::to(saveStd[plane]); } else { - mean = runningMean[plane]; + mean = ScalarConvert<Dtype, Acctype>::to(runningMean[plane]); stdVal = 1 / sqrt(runningVar[plane] + eps); } - float weightVal = weight.numElements() > 0 ? weight[plane] : 1.0f; - float norm = 1.0f / N; + Acctype weightVal = weight.numElements() > 0 ? 
ScalarConvert<Dtype, Acctype>::to(weight[plane]) : Acctype(1); + Acctype norm = Acctype(1) / N; // Compute two values across (batch, x/y/z) in one pass: // 1. Sum(gradOutput) // 2. DotProduct(input - mean, gradOutput) - Float2 res = reduce<Float2>(GradOp(mean, input, gradOutput), gradOutput, plane); - float gradOutputSum = res.v1; - float dotP = res.v2; + GradOp<Dtype, Acctype, DeviceTensor3> g(mean, input, gradOutput); + Float2<Dtype, Acctype> res = reduce<Float2<Dtype, Acctype>, GradOp<Dtype, Acctype, DeviceTensor3>, DeviceTensor3>(g, gradOutput, plane); + Acctype gradOutputSum = res.v1; + Acctype dotP = res.v2; - float gradMean = gradOutputSum * norm; - float projScale = dotP * norm * stdVal * stdVal; - float gradScale = stdVal * weightVal; + Acctype gradMean = gradOutputSum * norm; + Acctype projScale = dotP * norm * stdVal * stdVal; + Acctype gradScale = stdVal * weightVal; if (gradInput.numElements() > 0) { for (int batch = 0; batch < gradOutput.getSize(0); ++batch) { for (int x = threadIdx.x; x < gradOutput.getSize(2); x += blockDim.x) { - float gradOut = gradOutput[batch][plane][x]; + Dtype gradOut = gradOutput[batch][plane][x]; if (train) { - float inp = input[batch][plane][x]; - float proj = (inp - mean) * projScale; - gradInput[batch][plane][x] = (gradOut - proj - gradMean) * gradScale; + Dtype inp = input[batch][plane][x]; + Acctype proj = (inp - mean) * projScale; + gradInput[batch][plane][x] = ScalarConvert<Acctype, Dtype>::to((gradOut - proj - gradMean) * gradScale); } else { - gradInput[batch][plane][x] = gradOut * gradScale; + gradInput[batch][plane][x] = ScalarConvert<Acctype, Dtype>::to(gradOut * gradScale); } } } @@ -321,42 +271,16 @@ __global__ void BatchNormalizationBackward_kernel( if (gradWeight.numElements() > 0) { if (threadIdx.x == 0) { - gradWeight[plane] += scale * dotP * stdVal; + gradWeight[plane] += ScalarConvert<Acctype, Dtype>::to(scale * dotP * stdVal); } } if (gradBias.numElements() > 0) { if (threadIdx.x == 0) { - 
gradBias[plane] += scale * gradOutputSum; + gradBias[plane] += ScalarConvert<Acctype, Dtype>::to(scale * gradOutputSum); } } } -void THNN_CudaBatchNormalization_backward( - THCState *state, THCudaTensor *input_, THCudaTensor *gradOutput_, - THCudaTensor *gradInput_, THCudaTensor *gradWeight_, THCudaTensor *gradBias_, - THCudaTensor *weight_, THCudaTensor *runningMean_, THCudaTensor *runningVar_, - THCudaTensor *saveMean_, THCudaTensor *saveStd_, bool train, float scale, double eps) { - - THCUNN_assertSameGPU(state, 10, input_, gradOutput_, gradInput_, gradWeight_, - gradBias_, weight_, runningMean_, runningVar_, saveMean_, saveStd_); - DeviceTensor3 input = devicetensor<3>(state, input_); - DeviceTensor3 gradOutput = devicetensor<3>(state, gradOutput_); - DeviceTensor3 gradInput = devicetensor<3>(state, gradInput_); - DeviceTensor1 gradWeight = devicetensor<1>(state, gradWeight_); - DeviceTensor1 gradBias = devicetensor<1>(state, gradBias_); - DeviceTensor1 weight = devicetensor<1>(state, weight_); - DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_); - DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_); - DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_); - DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_); - - cudaStream_t s = THCState_getCurrentStream(state); - - dim3 blocks(gradOutput.getSize(1)); - dim3 threads(getNumThreads(gradOutput.getSize(2))); - BatchNormalizationBackward_kernel<<<blocks, threads, 0, s>>>( - input, gradOutput, gradInput, gradWeight, gradBias, weight, runningMean, runningVar, - saveMean, saveStd, train, scale, eps); - THCudaCheck(cudaGetLastError()); -} +#include "generic/BatchNormalization.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/CMakeLists.txt b/lib/THCUNN/CMakeLists.txt index 8492503..5dd8712 100644 --- a/lib/THCUNN/CMakeLists.txt +++ b/lib/THCUNN/CMakeLists.txt @@ -31,6 +31,7 @@ ENDIF() FILE(GLOB src-cuda *.cu) +CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) 
CUDA_ADD_LIBRARY(THCUNN MODULE ${src-cuda}) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/lib/THCUNN/ClassNLLCriterion.cu b/lib/THCUNN/ClassNLLCriterion.cu index 2b3ed8b..58684f4 100644 --- a/lib/THCUNN/ClassNLLCriterion.cu +++ b/lib/THCUNN/ClassNLLCriterion.cu @@ -1,16 +1,19 @@ #include "THCUNN.h" #include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" #include <stdio.h> #include <assert.h> static const int NTHREADS = 32; -__global__ void cunn_ClassNLLCriterion_updateOutput_kernel1(float *output, - float *total_weight, - float *input, - long *target, - float *weights, +template <typename Dtype> +__global__ void cunn_ClassNLLCriterion_updateOutput_kernel1(Dtype *output, + Dtype *total_weight, + Dtype *input, + THCIndex_t *target, + Dtype *weights, int size_average, int n_classes) { assert(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0); @@ -20,7 +23,7 @@ __global__ void cunn_ClassNLLCriterion_updateOutput_kernel1(float *output, int t = (int)*target - TH_INDEX_BASE; assert(t >= 0 && t < n_classes); - float cur_weight = weights ? weights[t] : 1.0f; + Dtype cur_weight = weights ? 
weights[t] : ScalarConvert<int, Dtype>::to(1); *output = -cur_weight * input[t]; *total_weight = cur_weight; if (size_average && *total_weight > 0) { @@ -28,25 +31,26 @@ __global__ void cunn_ClassNLLCriterion_updateOutput_kernel1(float *output, } } -__global__ void cunn_ClassNLLCriterion_updateOutput_kernel(float *output, - float *total_weight, - float *input, - long *target, - float *weights, +template <typename Dtype, typename Acctype> +__global__ void cunn_ClassNLLCriterion_updateOutput_kernel(Dtype *output, + Dtype *total_weight, + Dtype *input, + THCIndex_t *target, + Dtype *weights, int size_average, int nframe, int ndim, int n_classes) { - __shared__ float shInputs[NTHREADS], acc_weight[NTHREADS]; + __shared__ Acctype shInputs[NTHREADS], acc_weight[NTHREADS]; int i, t; - float cur_weight; + Dtype cur_weight; - shInputs[threadIdx.x] = 0.0f; - acc_weight[threadIdx.x] = 0.0f; + shInputs[threadIdx.x] = ScalarConvert<int, Acctype>::to(0); + acc_weight[threadIdx.x] = ScalarConvert<int, Acctype>::to(0); for (i = threadIdx.x; i < nframe; i += NTHREADS) { t = target[i] - TH_INDEX_BASE; assert(t >= 0 && t < n_classes); - cur_weight = weights ? weights[t] : 1.0f; + cur_weight = weights ? 
weights[t] : ScalarConvert<int, Dtype>::to(1); shInputs[threadIdx.x] -= input[i * ndim + t] * cur_weight; acc_weight[threadIdx.x] += cur_weight; } @@ -56,39 +60,47 @@ __global__ void cunn_ClassNLLCriterion_updateOutput_kernel(float *output, // updateOutput_kernel if (threadIdx.x == 0) { - *output = *total_weight = 0; + *output = *total_weight = ScalarConvert<int, Dtype>::to(0); + Acctype outputAcc = 0; + Acctype total_weightAcc = 0; for (i = 0; i < NTHREADS; ++i){ - *output += shInputs[i]; - *total_weight += acc_weight[i]; + // FIXME should we do somethigng here + outputAcc += shInputs[i]; + total_weightAcc += acc_weight[i]; } + *total_weight = ScalarConvert<Acctype, Dtype>::to(total_weightAcc); + *output = ScalarConvert<Acctype, Dtype>::to(outputAcc); if (size_average && *total_weight > 0) { - *output /= *total_weight; + *output = ScalarConvert<Acctype, Dtype>::to(outputAcc / total_weightAcc); } + } } +template <typename Dtype> __global__ void cunn_ClassNLLCriterion_updateGradInput_kernel1( - float* gradInput, - float* weights, - long* target, - float* total_weight, + Dtype* gradInput, + Dtype* weights, + THCIndex_t* target, + Dtype* total_weight, int size_average, int n_classes) { if (*total_weight <= 0) { return; } - float norm = size_average ? (1.0f / *total_weight) : 1.0f; + Dtype norm = size_average ? (ScalarConvert<int, Dtype>::to(1) / *total_weight) : ScalarConvert<int, Dtype>::to(1); int t = (int)*target - TH_INDEX_BASE; assert(t >= 0 && t < n_classes); - gradInput[t] = -(weights ? weights[t] : 1.0f) * norm; + gradInput[t] = -(weights ? 
weights[t] : ScalarConvert<int, Dtype>::to(1)) * norm; } +template <typename Dtype> __global__ void cunn_ClassNLLCriterion_updateGradInput_kernel( - float *gradInput, - long *target, - float *weights, - float *total_weight, + Dtype *gradInput, + THCIndex_t *target, + Dtype *weights, + Dtype *total_weight, int size_average, int nframe, int ndim, @@ -98,148 +110,14 @@ __global__ void cunn_ClassNLLCriterion_updateGradInput_kernel( return; } int i, t; - float norm = size_average ? (1.0f / *total_weight) : 1.0f; + Dtype norm = size_average ? (ScalarConvert<int, Dtype>::to(1) / *total_weight) : ScalarConvert<int, Dtype>::to(1); for (i = threadIdx.x; i < nframe; i += NTHREADS) { t = (int)target[i] - TH_INDEX_BASE; assert(t >= 0 && t < n_classes); - gradInput[i * ndim + t] = -(weights ? weights[t] : 1.0f) * norm; - } -} - -void THNN_CudaClassNLLCriterion_updateOutput(THCState *state, THCudaTensor *input, THCudaLongTensor *target, THCudaTensor *output, bool sizeAverage, THCudaTensor *weights, THCudaTensor *total_weight) { - if (THCudaLongTensor_nDimension(state, target) > 1) { - THError("multi-target not supported"); - } - - int n_dims = THCudaTensor_nDimension(state, input); - int n_classes = THCudaTensor_size(state, input, n_dims - 1); - - if (weights) { - THCUNN_assertSameGPU( - state, 5, input, target, weights, output, total_weight - ); - } else { - THCUNN_assertSameGPU( - state, 4, input, target, output, total_weight - ); - } - - if (THCudaTensor_nDimension(state, input) > 2) { - THArgCheck(0, 2, "vector or matrix expected"); - } - if (weights && THCudaTensor_nElement(state, weights) != n_classes) { - THError("weight tensor should be defined either for all or no classes"); - } - - input = THCudaTensor_newContiguous(state, input); - weights = weights ? THCudaTensor_newContiguous(state, weights) : NULL; - target = THCudaLongTensor_newContiguous(state, target); - - float *input_data = THCudaTensor_data(state, input); - float *weights_data = weights ? 
THCudaTensor_data(state, weights) : NULL; - long *target_data = THCudaLongTensor_data(state, target); - float *output_data = THCudaTensor_data(state, output); - float *total_weight_data = THCudaTensor_data(state, total_weight); - - if (THCudaTensor_nDimension(state, input) == 1) { - cunn_ClassNLLCriterion_updateOutput_kernel1 - <<<1, 1, 0, THCState_getCurrentStream(state)>>>( - output_data, - total_weight_data, - input_data, - target_data, - weights_data, - sizeAverage, - n_classes - ); - - } else if (THCudaTensor_nDimension(state, input) == 2) { - cunn_ClassNLLCriterion_updateOutput_kernel - <<<1, NTHREADS, 0, THCState_getCurrentStream(state)>>>( - output_data, - total_weight_data, - input_data, - target_data, - weights_data, - sizeAverage, - THCudaTensor_size(state, input, 0), - THCudaTensor_size(state, input, 1), - n_classes - ); - } - THCudaCheck(cudaGetLastError()); - - if (weights) { - THCudaTensor_free(state, weights); + gradInput[i * ndim + t] = -(weights ? weights[t] : ScalarConvert<int, Dtype>::to(1)) * norm; } - THCudaLongTensor_free(state, target); - THCudaTensor_free(state, input); } -void THNN_CudaClassNLLCriterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaLongTensor *target, THCudaTensor *gradInput, bool sizeAverage, THCudaTensor *weights, THCudaTensor *total_weight) { - if (THCudaLongTensor_nDimension(state, target) > 1) { - THError("multi-target not supported"); - } - - int n_dims = THCudaTensor_nDimension(state, input); - int n_classes = THCudaTensor_size(state, input, n_dims - 1); - - THArgCheck(THCudaTensor_isContiguous(state, gradInput), 4, "gradInput must be contiguous"); - - if (weights) { - THCUNN_assertSameGPU( - state, 5, weights, input, target, gradInput, total_weight - ); - } - else { - THCUNN_assertSameGPU( - state, 4, input, target, gradInput, total_weight - ); - } - - if (THCudaTensor_nDimension(state, input) > 2) { - THArgCheck(0, 2, "vector or matrix expected"); - } - if (weights && THCudaTensor_nElement(state, 
weights) != n_classes) { - THError("weight tensor should be defined either for all or no classes"); - } - - weights = weights ? THCudaTensor_newContiguous(state, weights) : NULL; - target = THCudaLongTensor_newContiguous(state, target); - - float *weights_data = weights ? THCudaTensor_data(state, weights) : NULL; - float *gradInput_data = THCudaTensor_data(state, gradInput); - long *target_data = THCudaLongTensor_data(state, target); - float *total_weight_data = THCudaTensor_data(state, total_weight); - - if (THCudaTensor_nDimension(state, input) == 1) { - cunn_ClassNLLCriterion_updateGradInput_kernel1 - <<<1, 1, 0, THCState_getCurrentStream(state)>>>( - gradInput_data, - weights_data, - target_data, - total_weight_data, - sizeAverage, - n_classes - ); - } else { - cunn_ClassNLLCriterion_updateGradInput_kernel - <<<1, NTHREADS, 0, THCState_getCurrentStream(state)>>>( - gradInput_data, - target_data, - weights_data, - total_weight_data, - sizeAverage, - THCudaTensor_size(state, input, 0), - THCudaTensor_size(state, input, 1), - n_classes - ); - } - THCudaCheck(cudaGetLastError()); - - if (weights) { - THCudaTensor_free(state, weights); - } - THCudaLongTensor_free(state, target); -} +#include "generic/ClassNLLCriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/DistKLDivCriterion.cu b/lib/THCUNN/DistKLDivCriterion.cu index 2984672..aa453f8 100644 --- a/lib/THCUNN/DistKLDivCriterion.cu +++ b/lib/THCUNN/DistKLDivCriterion.cu @@ -1,5 +1,7 @@ #include "THCUNN.h" #include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" #include <thrust/fill.h> #include <thrust/functional.h> @@ -7,76 +9,30 @@ #include <thrust/reduce.h> #include <thrust/inner_product.h> +template <typename Dtype, typename Acctype> struct kl_functor { - __host__ __device__ float operator()(const float& x, const float& y) const + __host__ __device__ Acctype operator()(const Dtype& x, const Dtype& y) const { - return y > 0 ? 
y * (log(y) - x) : 0; + Acctype yAcc = ScalarConvert<Dtype, Acctype>::to(y); + return y > 0 ? yAcc * (THCNumerics<Acctype>::log(yAcc) - x) : Acctype(0); } }; -void THNN_CudaDistKLDivCriterion_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *output, bool sizeAverage) -{ - THCUNN_assertSameGPU(state, 2, input, target); - - THArgCheck(THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2, - "input and target need to have the same number of elements"); - - float sum; - - long size = THCudaTensor_nElement(state, input); - - input = THCudaTensor_newContiguous(state, input); - target = THCudaTensor_newContiguous(state, target); - - thrust::device_ptr<float> input_data(THCudaTensor_data(state, input)); - thrust::device_ptr<float> target_data(THCudaTensor_data(state, target)); - sum = thrust::inner_product(input_data, input_data+size, target_data, (float) 0, thrust::plus<float>(), kl_functor()); - - if (sizeAverage) - sum /= size; - - THCudaTensor_free(state, input); - THCudaTensor_free(state, target); - - THCudaTensor_set1d(state, output, 0, sum); -} - +template <typename Dtype> struct kl_updateGradInput_functor { - const float norm; + const Dtype norm; - kl_updateGradInput_functor(float norm_) + kl_updateGradInput_functor(Dtype norm_) : norm(norm_) {} - __host__ __device__ float operator()(const float& x, const float& y) const + __host__ __device__ Dtype operator()(const Dtype& x, const Dtype& y) const { - return y > 0 ? norm * (-y) : 0; + return y > 0 ? 
norm * (-y) : ScalarConvert<int, Dtype>::to(0); } }; -void THNN_CudaDistKLDivCriterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage) -{ - THCUNN_assertSameGPU(state, 3, input, target, gradInput); - - THArgCheck(THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2, - "input and target need to have the same number of elements"); - - long size = THCudaTensor_nElement(state, input); - float norm = (sizeAverage ? 1./size : 1.); - - input = THCudaTensor_newContiguous(state, input); - target = THCudaTensor_newContiguous(state, target); - - THCudaTensor_resizeAs(state, gradInput, input); - - thrust::device_ptr<float> input_data(THCudaTensor_data(state, input)); - thrust::device_ptr<float> target_data(THCudaTensor_data(state, target)); - thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput)); - - thrust::transform(input_data, input_data+size, target_data, gradInput_data, kl_updateGradInput_functor(norm)); - - THCudaTensor_free(state, input); - THCudaTensor_free(state, target); -} +#include "generic/DistKLDivCriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/ELU.cu b/lib/THCUNN/ELU.cu index e4a05bc..ced993c 100644 --- a/lib/THCUNN/ELU.cu +++ b/lib/THCUNN/ELU.cu @@ -1,93 +1,67 @@ #include "THCUNN.h" -#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +template <typename T> struct ELUupdateOutput_functor { - const float alpha_; + const T alpha_; - ELUupdateOutput_functor(float alpha) + ELUupdateOutput_functor(T alpha) : alpha_(alpha) {} - __device__ void operator()(float *output, const float *input) const + __device__ void operator()(T *output, const T *input) const { *output = *input <= 0 ? 
(exp(*input) - 1) * alpha_ : *input; } }; // in-place variant +template <typename T> struct ELUupdateOutputIP_functor { - const float alpha_; + const T alpha_; - ELUupdateOutputIP_functor(float alpha) + ELUupdateOutputIP_functor(T alpha) : alpha_(alpha) {} - __device__ void operator()(float *x) const + __device__ void operator()(T *x) const { *x = *x <= 0 ? (exp(*x) - 1) * alpha_ : *x; } }; -void THNN_CudaELU_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, - float alpha, bool inplace) -{ - THCUNN_assertSameGPU(state, 2, input, output); - - if (inplace) - { - THC_pointwiseApply1(state, input, ELUupdateOutputIP_functor(alpha)); - THCudaTensor_set(state, output, input); - } - else - { - THCudaTensor_resizeAs(state, output, input); - THC_pointwiseApply2(state, output, input, ELUupdateOutput_functor(alpha)); - } -} - +template <typename T> struct ELUupdateGradInput_functor { - const float alpha_; + const T alpha_; - ELUupdateGradInput_functor(float alpha) + ELUupdateGradInput_functor(T alpha) : alpha_(alpha) {} - __device__ void operator()(float *gradInput, const float *output, const float *gradOutput) const + __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const { *gradInput = (*output) <= 0 ? (*gradOutput * (*output + alpha_)) : (*gradOutput); } }; +template <typename T> struct ELUupdateGradInputIP_functor { - const float alpha_; + const T alpha_; - ELUupdateGradInputIP_functor(float alpha) + ELUupdateGradInputIP_functor(T alpha) : alpha_(alpha) {} - __device__ void operator()(float *gradOutput, const float *output) const + __device__ void operator()(T *gradOutput, const T *output) const { *gradOutput = (*output) <= 0 ? 
(*gradOutput * (*output + alpha_)) : (*gradOutput); } }; -void THNN_CudaELU_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, - THCudaTensor *gradInput, THCudaTensor *output, float alpha, bool inplace) -{ - THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput); - - if (inplace) - { - THC_pointwiseApply2(state, gradOutput, output, ELUupdateGradInputIP_functor(alpha)); - THCudaTensor_set(state, gradInput, gradOutput); - } - else - { - THCudaTensor_resizeAs(state, gradInput, output); - THC_pointwiseApply3(state, gradInput, output, gradOutput, ELUupdateGradInput_functor(alpha)); - } -} +#include "generic/ELU.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/HardTanh.cu b/lib/THCUNN/HardTanh.cu index c7df1dd..b400beb 100644 --- a/lib/THCUNN/HardTanh.cu +++ b/lib/THCUNN/HardTanh.cu @@ -1,17 +1,19 @@ #include "THCUNN.h" -#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +template <typename T> struct hardtanhupdateOutput_functor { - const float max_val_; - const float min_val_; + const T max_val_; + const T min_val_; - hardtanhupdateOutput_functor(float min_val, float max_val) + hardtanhupdateOutput_functor(T min_val, T max_val) : min_val_(min_val) , max_val_(max_val) {} - __device__ void operator()(float *output, const float *input) const + __device__ void operator()(T *output, const T *input) const { if (*input < min_val_) *output = min_val_; @@ -21,7 +23,7 @@ struct hardtanhupdateOutput_functor *output = max_val_; } - __device__ void operator()(float *input) const + __device__ void operator()(T *input) const { if (*input < min_val_) *input = min_val_; @@ -30,74 +32,31 @@ struct hardtanhupdateOutput_functor } }; -void THNN_CudaHardTanh_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - float min_val, - float max_val, - bool inplace) -{ - THCUNN_assertSameGPU(state, 2, input, output); - if(inplace) - { - THCudaTensor_set(state, output, input); - 
THC_pointwiseApply1(state, output, hardtanhupdateOutput_functor(min_val, max_val)); - } - else - { - THCudaTensor_resizeAs(state, output, input); - THC_pointwiseApply2(state, output, input, - hardtanhupdateOutput_functor(min_val, max_val)); - } -} - +template <typename T> struct hardtanhupdateGradInput_functor { - const float max_val_; - const float min_val_; + const T max_val_; + const T min_val_; - hardtanhupdateGradInput_functor(float min_val, float max_val) + hardtanhupdateGradInput_functor(T min_val, T max_val) : min_val_(min_val) , max_val_(max_val) {} - __device__ void operator()(float *gradInput, const float *input, const float *gradOutput) const + __device__ void operator()(T *gradInput, const T *input, const T *gradOutput) const { if (*input < min_val_ || *input > max_val_) - *gradInput = 0; + *gradInput = ScalarConvert<int, T>::to(0); else *gradInput = *gradOutput; } - __device__ void operator()(float *gradInput, const float *input) const + __device__ void operator()(T *gradInput, const T *input) const { if (*input <= min_val_ || *input >= max_val_) - *gradInput = 0; + *gradInput = ScalarConvert<int, T>::to(0); } }; -void THNN_CudaHardTanh_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - float min_val, - float max_val, - bool inplace) -{ - THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); - - if (inplace) - { - THCudaTensor_set(state, gradInput, gradOutput); - THC_pointwiseApply2(state, gradInput, input, - hardtanhupdateGradInput_functor(min_val, max_val)); - } - else - { - THCudaTensor_resizeAs(state, gradInput, input); - THC_pointwiseApply3(state, gradInput, input, gradOutput, - hardtanhupdateGradInput_functor(min_val, max_val)); - } -} +#include "generic/HardTanh.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/L1Cost.cu b/lib/THCUNN/L1Cost.cu index 7dd6bb2..eda58c1 100644 --- a/lib/THCUNN/L1Cost.cu +++ b/lib/THCUNN/L1Cost.cu @@ -1,57 +1,34 @@ #include 
"THCUNN.h" #include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" #include <thrust/device_ptr.h> #include <thrust/reduce.h> -#include <thrust/transform.h> +#include <thrust/transform_reduce.h> +template <typename Dtype, typename Acctype> struct l1cost_functor { - __host__ __device__ float operator()(float x, float y) const + __host__ __device__ Acctype operator()(Dtype x) const { - return abs(x) + abs(y); + return THCNumerics<Acctype>::abs(ScalarConvert<Dtype, Acctype>::to(x)); } }; -void THNN_CudaL1Cost_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output) -{ - THCUNN_assertSameGPU(state, 1, input); - float sum; - long size = THCudaTensor_nElement(state, input); - input = THCudaTensor_newContiguous(state, input); - thrust::device_ptr<float> input_data(THCudaTensor_data(state, input)); - sum = thrust::reduce(input_data, input_data+size, (float) 0, l1cost_functor()); - - THCudaTensor_free(state, input); - - THCudaTensor_set1d(state, output, 0, sum); -} - +template <typename Dtype> struct l1cost_updateGradInput_functor { - __host__ __device__ float operator()(float x) const + __host__ __device__ Dtype operator()(Dtype x) const { if (x > 0) - return 1; + return ScalarConvert<int, Dtype>::to(1); else if (x < 0) - return -1; + return ScalarConvert<int, Dtype>::to(-1); else - return 0; + return ScalarConvert<int, Dtype>::to(0); } }; -void THNN_CudaL1Cost_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput) -{ - THCUNN_assertSameGPU(state, 2, input, gradInput); - long size = THCudaTensor_nElement(state, input); - - input = THCudaTensor_newContiguous(state, input); - THCudaTensor_resizeAs(state, gradInput, input); - - thrust::device_ptr<float> input_data(THCudaTensor_data(state, input)); - thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput)); - - thrust::transform(input_data, input_data+size, gradInput_data, l1cost_updateGradInput_functor()); - - 
THCudaTensor_free(state, input); -} +#include "generic/L1Cost.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/LeakyReLU.cu b/lib/THCUNN/LeakyReLU.cu index a641821..8c0e6f8 100644 --- a/lib/THCUNN/LeakyReLU.cu +++ b/lib/THCUNN/LeakyReLU.cu @@ -1,103 +1,73 @@ #include "THCUNN.h" -#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +template <typename T> struct LeakyReLUUpdateOutput { - const float negval_; + const T negval_; - LeakyReLUUpdateOutput(float negval) + LeakyReLUUpdateOutput(T negval) : negval_(negval) {} - __device__ __forceinline__ void operator()(float *out, float *in) + __device__ __forceinline__ void operator()(T *out, T *in) { - float x = *in; + T x = *in; *out = (x > 0) ? x : x * negval_; } }; // in-place variant +template <typename T> struct LeakyReLUUpdateOutputIP { - const float negval_; + const T negval_; - LeakyReLUUpdateOutputIP(float negval) + LeakyReLUUpdateOutputIP(T negval) : negval_(negval) {} - __device__ __forceinline__ void operator()(float *x) + __device__ __forceinline__ void operator()(T *x) { *x = (*x > 0) ? 
*x : negval_ * (*x); } }; -void THNN_CudaLeakyReLU_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, - double negval, bool inplace) -{ - THCUNN_assertSameGPU(state, 2, input, output); - - if (inplace) - { - THC_pointwiseApply1(state, input, LeakyReLUUpdateOutputIP(negval)); - THCudaTensor_set(state, output, input); - } - else - { - THCudaTensor_resizeAs(state, output, input); - THC_pointwiseApply2(state, output, input, LeakyReLUUpdateOutput(negval)); - } - - THCudaCheck(cudaGetLastError()); -} - +template <typename T> struct LeakyReLUUpdateGradInput { - const float negval_; + const T negval_; - LeakyReLUUpdateGradInput(float negval) + LeakyReLUUpdateGradInput(T negval) : negval_(negval) {} __device__ __forceinline__ void operator()( - float* gradInput, - float* input, - float* gradOutput) const + T* gradInput, + T* input, + T* gradOutput) const { *gradInput = (*input > 0) ? *gradOutput : (*gradOutput) * negval_; } }; +template <typename T> struct LeakyReLUUpdateGradInputIP { - const float negval_; + const T negval_; - LeakyReLUUpdateGradInputIP(float negval) + LeakyReLUUpdateGradInputIP(T negval) : negval_(negval) {} __device__ __forceinline__ void operator()( - float* gradOutput, - float* input) const + T* gradOutput, + T* input) const { *gradOutput = (*input > 0) ? 
*gradOutput : (*gradOutput) * negval_; } }; -void THNN_CudaLeakyReLU_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, - THCudaTensor *gradInput, double negval, bool inplace) -{ - THCUNN_assertSameGPU(state, 3, input, gradInput, gradOutput); - - if (inplace) - { - THC_pointwiseApply2(state, gradOutput, input, LeakyReLUUpdateGradInputIP(negval)); - THCudaTensor_set(state, gradInput, gradOutput); - } - else - { - THCudaTensor_resizeAs(state, gradInput, input); - THC_pointwiseApply3(state, gradInput, input, gradOutput, LeakyReLUUpdateGradInput(negval)); - } - - THCudaCheck(cudaGetLastError()); -} +#include "generic/LeakyReLU.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/LogSigmoid.cu b/lib/THCUNN/LogSigmoid.cu index 2f56081..f008b63 100644 --- a/lib/THCUNN/LogSigmoid.cu +++ b/lib/THCUNN/LogSigmoid.cu @@ -1,35 +1,26 @@ #include "THCUNN.h" -#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +template <typename T> struct logSigmoid_updateOutput_functor { - __device__ void operator()(float *output, const float *input) const + __device__ void operator()(T *output, const T *input) const { - float z = exp(-*input); - *output = -log(1. + z); + T z = exp(-*input); + *output = ScalarConvert<double, T>::to(-log(1. + z)); } }; -void THNN_CudaLogSigmoid_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *buffer) -{ - THCUNN_assertSameGPU(state, 2, input, output); - THCudaTensor_resizeAs(state, output, input); - THC_pointwiseApply2(state, output, input, logSigmoid_updateOutput_functor()); -} - +template <typename T> struct logSigmoid_updateGradInput_functor { - __device__ void operator()(float *gradInput, const float *input, const float *gradOutput) const + __device__ void operator()(T *gradInput, const T *input, const T *gradOutput) const { - float z = exp(-*input); - *gradInput = *gradOutput * z / (1. 
+ z); + T z = exp(-*input); + *gradInput = ScalarConvert<double, T>::to(*gradOutput * z / (1. + z)); } }; -void THNN_CudaLogSigmoid_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, - THCudaTensor *gradInput , THCudaTensor *buffer) -{ - THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); - THCudaTensor_resizeAs(state, gradInput, input); - THC_pointwiseApply3(state, gradInput, input, gradOutput, logSigmoid_updateGradInput_functor()); -} +#include "generic/LogSigmoid.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/LogSoftMax.cu b/lib/THCUNN/LogSoftMax.cu index 70cab80..4d7973e 100644 --- a/lib/THCUNN/LogSoftMax.cu +++ b/lib/THCUNN/LogSoftMax.cu @@ -1,7 +1,10 @@ #include "THCUNN.h" -#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "SharedMem.cuh" -__global__ void cunn_SpatialLogSoftMax_updateOutput_kernel(float *output, float *input, int classSize, int height, int width) +template <typename T, typename AccumT> +__global__ void cunn_SpatialLogSoftMax_updateOutput_kernel(T *output, T *input, int classSize, int height, int width) { int batchIndex = blockIdx.x; int index = threadIdx.x; @@ -18,11 +21,11 @@ __global__ void cunn_SpatialLogSoftMax_updateOutput_kernel(float *output, float (width*classSize)*y + (classSize)*x; - float sum = 0; + AccumT sum = 0; for (int i = 0; i < classSize; i++) { - sum += __expf(input[inputStartIndex + i]); + sum += THCNumerics<T>::exp(input[inputStartIndex + i]); } - sum = 1.0f / sum; + sum = AccumT(1) / sum; for (int i = 0; i < classSize; i++) { // calculate output index in torch layout (B x C x H x W) @@ -31,13 +34,15 @@ __global__ void cunn_SpatialLogSoftMax_updateOutput_kernel(float *output, float (height*width)*i + (width)*y + x; - output[outputIndex] = logf(sum * __expf(input[inputStartIndex + i])); + output[outputIndex] = ScalarConvert<AccumT, T>::to( + THCNumerics<AccumT>::log(sum * THCNumerics<T>::exp(input[inputStartIndex + i]))); } index 
+= blockDim.x; } } -__global__ void cunn_SpatialLogSoftMax_updateGradInput_kernel(float *gradInput, float *output, float *gradOutput, int classSize, int height, int width) +template <typename T, typename AccumT> +__global__ void cunn_SpatialLogSoftMax_updateGradInput_kernel(T *gradInput, T *output, T *gradOutput, int classSize, int height, int width) { int batchIndex = blockIdx.x; int index = threadIdx.x; @@ -54,7 +59,7 @@ __global__ void cunn_SpatialLogSoftMax_updateGradInput_kernel(float *gradInput, (width*classSize)*y + (classSize)*x; - float sum = 0; + AccumT sum = 0; for (int i = 0; i < classSize; i++) { sum += gradOutput[outputStartIndex + i]; } @@ -66,70 +71,76 @@ __global__ void cunn_SpatialLogSoftMax_updateGradInput_kernel(float *gradInput, (height*width)*i + (width)*y + x; - gradInput[inputIndex] = gradOutput[outputStartIndex + i] - __expf(output[outputStartIndex + i]) * sum; + gradInput[inputIndex] = ScalarConvert<AccumT, T>::to( + gradOutput[outputStartIndex + i] - THCNumerics<T>::exp(output[outputStartIndex + i]) * sum); } index += blockDim.x; } } +template <typename T, typename AccumT> struct MaxFloat { - __device__ __forceinline__ float operator()(float max, float v) const + __device__ __forceinline__ AccumT operator()(AccumT max, T v) const { - return fmaxf(max, v); + return fmaxType(max, v); } }; +template<typename T, typename AccumT> struct SumFloat { - __device__ __forceinline__ float operator()(float sum, float v) const + __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const { return sum + v; } }; +template<typename T, typename AccumT> struct SumExpFloat { - __device__ __forceinline__ SumExpFloat(float v) + __device__ __forceinline__ SumExpFloat(T v) : max_k(v) {} - __device__ __forceinline__ float operator()(float sum, float v) const + __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const { - return sum + expf(v - max_k); + return sum + THCNumerics<T>::exp(v - max_k); } - const float max_k; + const T max_k; }; 
+template<typename AccumT> struct NoFinal { - __device__ __forceinline__ float operator()(float v) const + __device__ __forceinline__ AccumT operator()(AccumT v) const { return v; } }; +template<typename AccumT> struct LSMFinal { - __device__ __forceinline__ LSMFinal(float m) + __device__ __forceinline__ LSMFinal(AccumT m) : max_k(m) {} - __device__ __forceinline__ float operator()(float v) const + __device__ __forceinline__ AccumT operator()(AccumT v) const { - return max_k + logf(v); + return max_k + THCNumerics<AccumT>::log(v); } - const float max_k; + const AccumT max_k; }; -template <typename Reduction, typename Finalize> -__device__ __forceinline__ float -blockReduce(float* smem, float val, - const Reduction& r, - float defaultVal, - const Finalize& f) +template <template<typename, typename> class Reduction, template<typename> class Finalize, typename AccumT> +__device__ __forceinline__ AccumT +blockReduce(AccumT* smem, AccumT val, + const Reduction<AccumT, AccumT>& r, + AccumT defaultVal, + const Finalize<AccumT>& f) { // To avoid RaW races from chaining blockReduce calls together, we // need a sync here @@ -139,7 +150,7 @@ blockReduce(float* smem, float val, __syncthreads(); - float warpVal = defaultVal; + AccumT warpVal = defaultVal; // First warp will perform per-warp reductions for the remaining warps if ((threadIdx.x / 32) == 0) // only threads in warp1 go into this (if) @@ -162,7 +173,7 @@ blockReduce(float* smem, float val, __syncthreads(); // First thread will perform a reduction of the above per-warp reductions - float blockVal = defaultVal; + AccumT blockVal = defaultVal; if (threadIdx.x == 0) { @@ -179,23 +190,23 @@ blockReduce(float* smem, float val, return smem[0]; } -template <typename Reduction> -__device__ __forceinline__ float -blockReduce(float* smem, float val, - const Reduction& r, - float defaultVal) +template <template<typename, typename> class Reduction, typename AccumT> +__device__ __forceinline__ AccumT +blockReduce(AccumT* smem, 
AccumT val, + const Reduction<AccumT, AccumT>& r, + AccumT defaultVal) { - return blockReduce<Reduction, NoFinal>(smem, val, r, defaultVal, NoFinal()); + return blockReduce<Reduction, NoFinal, AccumT>(smem, val, r, defaultVal, NoFinal<AccumT>()); } -template <typename Reduction, int ILP> -__device__ __forceinline__ float -ilpReduce(float* data, +template <template<typename, typename> class Reduction, int ILP, typename T, typename AccumT> +__device__ __forceinline__ AccumT +ilpReduce(T* data, int size, - const Reduction& r, - float defaultVal) + const Reduction<T, AccumT>& r, + AccumT defaultVal) { - float threadVal = defaultVal; + AccumT threadVal = defaultVal; int offset = threadIdx.x; int last = size % (ILP * blockDim.x); @@ -203,7 +214,7 @@ ilpReduce(float* data, // Body (unroll by ILP times) for (; offset < size - last; offset += blockDim.x * ILP) { - float tmp[ILP]; + T tmp[ILP]; #pragma unroll for (int j = 0; j < ILP; ++j) @@ -227,28 +238,30 @@ ilpReduce(float* data, return threadVal; } -template <int ILP> +template <int ILP, typename T, typename AccumT> __global__ void -cunn_LogSoftMax_updateOutput_kernel(float *output, float *input, int classes) +cunn_LogSoftMax_updateOutput_kernel(T *output, T *input, int classes) { - extern __shared__ float buffer[]; + SharedMem<AccumT> smem; + AccumT *buffer = smem.getPointer(); // forward pointers to batch[blockIdx.x] // each block handles a sample in the mini-batch input += blockIdx.x * classes; output += blockIdx.x * classes; // find the max of the batch - float threadMax = - ilpReduce<MaxFloat, ILP>(input, classes, MaxFloat(), -FLT_MAX); + AccumT threadMax = ilpReduce<MaxFloat, ILP, T, AccumT>( + input, classes, MaxFloat<T, AccumT>(), -THCNumerics<AccumT>::max()); // find the max over all batches - float max_k = - blockReduce<MaxFloat>(buffer, threadMax, MaxFloat(), -FLT_MAX); + AccumT max_k = blockReduce<MaxFloat, AccumT>( + buffer, threadMax, MaxFloat<AccumT, AccumT>(), -THCNumerics<AccumT>::max()); + T 
max_k_non_accum = ScalarConvert<AccumT, T>::to(max_k); - float threadExp = - ilpReduce<SumExpFloat, ILP>(input, classes, SumExpFloat(max_k), 0.0f); - float logsum_k = - blockReduce<SumFloat, LSMFinal>( - buffer, threadExp, SumFloat(), 0.0f, LSMFinal(max_k)); + AccumT threadExp = ilpReduce<SumExpFloat, ILP, T, AccumT>( + input, classes, SumExpFloat<T, AccumT>(max_k_non_accum), AccumT(0)); + T logsum_k = ScalarConvert<AccumT, T>::to( + blockReduce<SumFloat, LSMFinal, AccumT>( + buffer, threadExp, SumFloat<AccumT, AccumT>(), AccumT(0), LSMFinal<AccumT>(max_k))); // Output LSM (hand ILP) int offset = threadIdx.x; @@ -256,7 +269,7 @@ cunn_LogSoftMax_updateOutput_kernel(float *output, float *input, int classes) int last = classes % (ILP * blockDim.x); for (; offset < classes - last; offset += blockDim.x * ILP) { - float tmp[ILP]; + T tmp[ILP]; #pragma unroll for (int j = 0; j < ILP; ++j) { @@ -276,30 +289,32 @@ cunn_LogSoftMax_updateOutput_kernel(float *output, float *input, int classes) } } -template <int ILP> +template <int ILP, typename T, typename AccumT> __global__ void -cunn_LogSoftMax_updateGradInput_kernel(float *gradInput, - float *output, - float *gradOutput, +cunn_LogSoftMax_updateGradInput_kernel(T *gradInput, + T *output, + T *gradOutput, int classes) { - extern __shared__ float buffer[]; + SharedMem<AccumT> smem; + AccumT *buffer = smem.getPointer(); gradInput += blockIdx.x * classes; output += blockIdx.x * classes; gradOutput += blockIdx.x * classes; - float threadSum = - ilpReduce<SumFloat, 4>(gradOutput, classes, SumFloat(), 0.0f); - float sum_k = - blockReduce<SumFloat>(buffer, threadSum, SumFloat(), 0.0f); + AccumT threadSum = ilpReduce<SumFloat, 4, T, AccumT>( + gradOutput, classes, SumFloat<T, AccumT>(), AccumT(0)); + T sum_k = ScalarConvert<AccumT, T>::to( + blockReduce<SumFloat, AccumT>( + buffer, threadSum, SumFloat<AccumT, AccumT>(), AccumT(0))); // Update gradInput (hand ILP) int offset = threadIdx.x; int last = classes % (ILP * blockDim.x); for 
(; offset < classes - last; offset += blockDim.x * ILP) { - float tmpGradOutput[ILP]; - float tmpOutput[ILP]; + T tmpGradOutput[ILP]; + T tmpOutput[ILP]; #pragma unroll for (int j = 0; j < ILP; ++j) @@ -312,237 +327,16 @@ cunn_LogSoftMax_updateGradInput_kernel(float *gradInput, for (int j = 0; j < ILP; ++j) { gradInput[offset + j * blockDim.x] = - tmpGradOutput[j] - __expf(tmpOutput[j]) * sum_k; + tmpGradOutput[j] - THCNumerics<T>::exp(tmpOutput[j]) * sum_k; } } for (; offset < classes; offset += blockDim.x) { gradInput[offset] = - gradOutput[offset] - __expf(output[offset]) * sum_k; + gradOutput[offset] - THCNumerics<T>::exp(output[offset]) * sum_k; } } -void THNN_CudaLogSoftMax_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output) -{ - THCUNN_assertSameGPU(state, 2, input, output); - - THCudaTensor_resizeAs(state, output, input); - - bool spatial = false; - int batchSize = 1; - int classSize = 0; - int height = 0; - int width = 0; - - int ndims = THCudaTensor_nDimension(state, input); - - if (ndims == 1) - { - classSize = THCudaTensor_size(state, input, 0); - input = THCudaTensor_newContiguous(state, input); - } - else if (ndims == 2) - { - batchSize = THCudaTensor_size(state, input, 0); - classSize = THCudaTensor_size(state, input, 1); - input = THCudaTensor_newContiguous(state, input); - } - else if (ndims == 3) - { - spatial = true; - classSize = THCudaTensor_size(state, input, 0); - height = THCudaTensor_size(state, input, 1); - width = THCudaTensor_size(state, input, 2); - - // create contiguous tensor with cuda layout from tensor with torch layout - // C x H x W -> W x H x C - THCudaTensor_transpose(state, input, input, 0, 2); - // W x H x C -> H x W x C - THCudaTensor_transpose(state, input, input, 0, 1); - THCudaTensor *transposedInput = THCudaTensor_newContiguous(state, input); - THCudaTensor_transpose(state, input, input, 0, 1); - THCudaTensor_transpose(state, input, input, 0, 2); - input = transposedInput; - } - else if (ndims == 4) 
- { - spatial = true; - batchSize = THCudaTensor_size(state, input, 0); - classSize = THCudaTensor_size(state, input, 1); - height = THCudaTensor_size(state, input, 2); - width = THCudaTensor_size(state, input, 3); - - // create contiguous tensor with cuda layout from tensor with torch layout - // B x C x H x W -> B x W x H x C - THCudaTensor_transpose(state, input, input, 1, 3); - // B x W x H x C -> B x H x W x C - THCudaTensor_transpose(state, input, input, 1, 2); - THCudaTensor *transposedInput = THCudaTensor_newContiguous(state, input); - THCudaTensor_transpose(state, input, input, 1, 2); - THCudaTensor_transpose(state, input, input, 1, 3); - input = transposedInput; - } - else - { - THError("1D, 2D, 3D or 4D Tensor expected"); - } - - if (!spatial) - { - dim3 grid(batchSize); - dim3 block(1024); - - cunn_LogSoftMax_updateOutput_kernel<2> - <<<grid, block, block.x * sizeof(float), THCState_getCurrentStream(state)>>>( - THCudaTensor_data(state, output), - THCudaTensor_data(state, input), - classSize - ); - } - else - { - dim3 grid(batchSize); - dim3 block(1024); - - cunn_SpatialLogSoftMax_updateOutput_kernel - <<<grid, block, 0, THCState_getCurrentStream(state)>>>( - THCudaTensor_data(state, output), - THCudaTensor_data(state, input), - classSize, height, width - ); - } - - cudaError errcode = cudaGetLastError(); - if (errcode != cudaSuccess) - { - THError(cudaGetErrorString(errcode)); - } - - THCudaTensor_free(state, input); -} - -void THNN_CudaLogSoftMax_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, - THCudaTensor *gradInput, THCudaTensor *output) -{ - THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput); - - THCudaTensor_resizeAs(state, gradInput, output); - - bool spatial = false; - int batchSize = 1; - int classSize = 0; - int height = 0; - int width = 0; - - int ndims = THCudaTensor_nDimension(state, input); - - if (ndims == 1) - { - classSize = THCudaTensor_size(state, gradInput, 0); - output = 
THCudaTensor_newContiguous(state, output); - gradOutput = THCudaTensor_newContiguous(state, gradOutput); - } - else if (ndims == 2) - { - batchSize = THCudaTensor_size(state, gradInput, 0); - classSize = THCudaTensor_size(state, gradInput, 1); - output = THCudaTensor_newContiguous(state, output); - gradOutput = THCudaTensor_newContiguous(state, gradOutput); - } - else if (ndims == 3) - { - spatial = true; - classSize = THCudaTensor_size(state, input, 0); - height = THCudaTensor_size(state, input, 1); - width = THCudaTensor_size(state, input, 2); - - // create contiguous tensor with cuda layout from tensor with torch layout - // C x H x W -> W x H x C - THCudaTensor_transpose(state, output, output, 0, 2); - // W x H x C -> H x W x C - THCudaTensor_transpose(state, output, output, 0, 1); - THCudaTensor *transposedOutput = THCudaTensor_newContiguous(state, output); - THCudaTensor_transpose(state, output, output, 0, 1); - THCudaTensor_transpose(state, output, output, 0, 2); - output = transposedOutput; - - // create contiguous tensor with cuda layout from tensor with torch layout - // C x H x W -> W x H x C - THCudaTensor_transpose(state, gradOutput, gradOutput, 0, 2); - // W x H x C -> H x W x C - THCudaTensor_transpose(state, gradOutput, gradOutput, 0, 1); - THCudaTensor *transposedGradOutput = THCudaTensor_newContiguous(state, gradOutput); - THCudaTensor_transpose(state, gradOutput, gradOutput, 0, 1); - THCudaTensor_transpose(state, gradOutput, gradOutput, 0, 2); - gradOutput = transposedGradOutput; - } - else if (ndims == 4) - { - spatial = true; - batchSize = THCudaTensor_size(state, gradInput, 0); - classSize = THCudaTensor_size(state, input, 1); - height = THCudaTensor_size(state, input, 2); - width = THCudaTensor_size(state, input, 3); - - // create contiguous tensor with cuda layout from tensor with torch layout - // B x C x H x W -> B x W x H x C - THCudaTensor_transpose(state, output, output, 1, 3); - // B x W x H x C -> B x H x W x C - 
THCudaTensor_transpose(state, output, output, 1, 2); - THCudaTensor *transposedOutput = THCudaTensor_newContiguous(state, output); - THCudaTensor_transpose(state, output, output, 1, 2); - THCudaTensor_transpose(state, output, output, 1, 3); - output = transposedOutput; - - // create contiguous tensor with cuda layout from tensor with torch layout - // B x C x H x W -> B x W x H x C - THCudaTensor_transpose(state, gradOutput, gradOutput, 1, 3); - // B x W x H x C -> B x H x W x C - THCudaTensor_transpose(state, gradOutput, gradOutput, 1, 2); - THCudaTensor *transposedGradOutput = THCudaTensor_newContiguous(state, gradOutput); - THCudaTensor_transpose(state, gradOutput, gradOutput, 1, 2); - THCudaTensor_transpose(state, gradOutput, gradOutput, 1, 3); - gradOutput = transposedGradOutput; - } - else - { - THError("1D, 2D, 3D or 4D Tensor expected"); - } - - if (!spatial) - { - dim3 grid(batchSize); - dim3 block(1024); - - cunn_LogSoftMax_updateGradInput_kernel<2> - <<<grid, block, block.x * sizeof(float), THCState_getCurrentStream(state)>>>( - THCudaTensor_data(state, gradInput), - THCudaTensor_data(state, output), - THCudaTensor_data(state, gradOutput), - classSize - ); - } - else - { - dim3 grid(batchSize); - dim3 block(1024); - - cunn_SpatialLogSoftMax_updateGradInput_kernel - <<<grid, block, 0, THCState_getCurrentStream(state)>>>( - THCudaTensor_data(state, gradInput), - THCudaTensor_data(state, output), - THCudaTensor_data(state, gradOutput), - classSize, height, width - ); - } - - cudaError errcode = cudaGetLastError(); - if (errcode != cudaSuccess) - { - THError(cudaGetErrorString(errcode)); - } - - THCudaTensor_free(state, gradOutput); - THCudaTensor_free(state, output); -} +#include "generic/LogSoftMax.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/LookupTable.cu b/lib/THCUNN/LookupTable.cu index 2b2040e..bb91c7e 100644 --- a/lib/THCUNN/LookupTable.cu +++ b/lib/THCUNN/LookupTable.cu @@ -9,6 +9,8 @@ #include 
<thrust/system/cuda/execution_policy.h> #endif #include <thrust/unique.h> +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" #ifndef DIVUP #define DIVUP(x, y) (((x) + (y) - 1) / (y)) @@ -49,8 +51,9 @@ __device__ __forceinline__ bool warpHasCollision(int val) return __any(dup) != 0; } +template <typename Dtype> __global__ void cunn_LookupTable_accGradParametersKernelByFeature( - long *input, float *gradOutput, float *gradWeight, float scale, long numel, + long *input, Dtype *gradOutput, Dtype *gradWeight, Dtype scale, long numel, long stride, int paddingValue) { const int featureDim = blockIdx.x * 4 + threadIdx.x / 32; @@ -78,8 +81,9 @@ __global__ void cunn_LookupTable_accGradParametersKernelByFeature( continue; } - float update = gradOutput[i*stride + featureDim] * scale; + Dtype update = gradOutput[i*stride + featureDim] * scale; + // FIXME: should we accumulate as accreal? // Check for collision if (warpHasCollision(weightIndex)) { // Run all lanes sequentially; warp divergence @@ -95,9 +99,10 @@ __global__ void cunn_LookupTable_accGradParametersKernelByFeature( } } +template <typename Dtype, typename Acctype> __global__ void cunn_LookupTable_accGradParametersKernel( - long *input, long *indices, float *gradOutput, float *gradWeight, - long *count, float defaultScale, long numel, long stride, int paddingValue) { + long *input, long *indices, Dtype *gradOutput, Dtype *gradWeight, + long *count, Dtype defaultScale, long numel, long stride, int paddingValue) { int idx = blockIdx.x * 4 + threadIdx.y; @@ -122,10 +127,10 @@ __global__ void cunn_LookupTable_accGradParametersKernel( const int startFeature = threadIdx.x + blockIdx.y * blockDim.x * SZ; const int weightRow = ((int) input[idx] - TH_INDEX_BASE) * stride; const int gradOutputRow = ((int) indices[idx] - TH_INDEX_BASE) * stride; - const float scale = count ? defaultScale / count[idx] : defaultScale; + const Acctype scale = count ? 
ScalarConvert<Dtype, Acctype>::to(defaultScale) / count[idx] : ScalarConvert<Dtype, Acctype>::to(defaultScale); - float gradient[SZ]; - float weight[SZ]; + Acctype gradient[SZ]; + Acctype weight[SZ]; #pragma unroll for (int ii = 0; ii < SZ; ii++) @@ -133,8 +138,8 @@ __global__ void cunn_LookupTable_accGradParametersKernel( int featureDim = startFeature + ii * WARP_SIZE; if (featureDim < stride) { - gradient[ii] = gradOutput[gradOutputRow + featureDim]; - weight[ii] = gradWeight[weightRow + featureDim]; + gradient[ii] = ScalarConvert<Dtype, Acctype>::to(gradOutput[gradOutputRow + featureDim]); + weight[ii] = ScalarConvert<Dtype, Acctype>::to(gradWeight[weightRow + featureDim]); } } @@ -150,7 +155,7 @@ __global__ void cunn_LookupTable_accGradParametersKernel( int featureDim = startFeature + ii * WARP_SIZE; if (featureDim < stride) { - gradWeight[weightRow + featureDim] = weight[ii]; + gradWeight[weightRow + featureDim] = ScalarConvert<Acctype, Dtype>::to(weight[ii]); } } @@ -159,129 +164,23 @@ __global__ void cunn_LookupTable_accGradParametersKernel( } } -void THNN_CudaLookupTable_accGradParameters( - THCState *state, - THIndexTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradWeight, - THIndexTensor *count, - THIndexTensor *sorted, - THIndexTensor *indices, - bool scaleGradByFreq, - int paddingValue, - float scale) -{ - THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, sorted, indices); - if (!(THIndexTensor_(isContiguous)(state, input) && - THCudaTensor_isContiguous(state, gradOutput) && - THCudaTensor_isContiguous(state, gradWeight))) - { - THError("Tensors must be contiguous"); - } - - int nDim = THIndexTensor_(nDimension)(state, input); - if (nDim != 1 && nDim != 2) - THError("input must be a vector or matrix"); - - long numel = THIndexTensor_(nElement)(state, input); - long stride = gradWeight->stride[0]; - - cudaStream_t stream = THCState_getCurrentStream(state); - - if (numel <= 768 && !scaleGradByFreq) { - 
cunn_LookupTable_accGradParametersKernelByFeature<<<DIVUP(stride,4), 128, 0, stream>>>( - THIndexTensor_(data)(state, input), - THCudaTensor_data(state, gradOutput), - THCudaTensor_data(state, gradWeight), - scale, - numel, - stride, - paddingValue); - THCudaCheck(cudaGetLastError()); - return; - } - - THLongStorage *inputSize = THIndexTensor_(newSizeOf)(state, input); - THIndexTensor_(resize)(state, sorted, inputSize, NULL); - THIndexTensor_(resize)(state, indices, inputSize, NULL); - THLongStorage_free(inputSize); - - // Sort the inputs into sorted with the corresponding indices - THIndexTensor_(sort)(state, sorted, indices, input, 0, 0); - - long *sorted_data = THIndexTensor_(data)(state, sorted); - long *indices_data = THIndexTensor_(data)(state, indices); - long *count_data = NULL; - - if (scaleGradByFreq) - { - THIndexTensor_(resizeAs)(state, count, input); - count_data = THIndexTensor_(data)(state, count); - - thrust::device_ptr<long> sorted_ptr(sorted_data); - thrust::device_ptr<long> count_ptr(count_data); - - // Compute an increasing sequence per unique item in sorted: - // sorted: 2 5 5 5 7 7 8 9 9 - // count: 1 1 2 3 1 2 1 1 2 - thrust::inclusive_scan_by_key( -#if CUDA_VERSION >= 7000 - thrust::cuda::par.on(THCState_getCurrentStream(state)), -#endif - sorted_ptr, - sorted_ptr + numel, - thrust::make_constant_iterator(1), - count_ptr - ); - - // Take the maximum of each count per unique key in reverse: - // sorted: 2 5 5 5 7 7 8 9 9 - // count: 1 3 3 3 2 2 1 2 2 - thrust::inclusive_scan_by_key( -#if CUDA_VERSION >= 7000 - thrust::cuda::par.on(THCState_getCurrentStream(state)), -#endif - thrust::make_reverse_iterator(sorted_ptr + numel), - thrust::make_reverse_iterator(sorted_ptr), - thrust::make_reverse_iterator(count_ptr + numel), - thrust::make_reverse_iterator(count_ptr + numel), - thrust::equal_to<long>(), - thrust::maximum<long>() - ); - } - - dim3 grid(DIVUP(numel,4), DIVUP(stride,128)); - dim3 block(32, 4); - 
cunn_LookupTable_accGradParametersKernel<<<grid, block, 0, stream>>>( - sorted_data, - indices_data, - THCudaTensor_data(state, gradOutput), - THCudaTensor_data(state, gradWeight), - count_data, - scale, - numel, - stride, - paddingValue - ); - THCudaCheck(cudaGetLastError()); -} - /* * Keep the norm of weight smaller than maxNorm */ -template <typename T> +template <typename Dtype, typename Acctype> struct pow_v { - T normType; - pow_v(T v) : normType(v) {} + Acctype normType; + pow_v(Dtype v) : normType(ScalarConvert<Dtype, Acctype>::to(v)) {} __host__ __device__ - T operator()(const T& x) const { + Acctype operator()(const Dtype& x) const { + Acctype xA = ScalarConvert<Dtype, Acctype>::to(x); if (normType == 1) - return std::abs(x); + return std::abs(xA); else if (normType == 2) - return x * x; + return xA * xA; else - return std::pow(std::abs(x), normType); + return std::pow(std::abs(xA), normType); } }; @@ -296,47 +195,5 @@ struct multiply_s } }; -void THNN_CudaLookupTable_renorm( - THCState *state, - THIndexTensor *idx, - THCudaTensor *weight, - float maxNorm, - float normType) -{ - THCUNN_assertSameGPU(state, 2, idx, weight); - if (!(THIndexTensor_(isContiguous)(state, idx) && - THCudaTensor_isContiguous(state, weight))) - { - THError("Tensors must be contiguous"); - } - if (THIndexTensor_(nDimension)(state, idx) != 1) - THError("idx must be a vector"); - if (normType <= 0) - THError("non-positive-norm not supported"); - - long numel = THIndexTensor_(nElement)(state, idx); - long stride = weight->stride[0]; - - // get the unique indices - thrust::device_ptr<float> weight_ptr(THCudaTensor_data(state, weight)); - thrust::device_ptr<long> idx_ptr(THIndexTensor_(data)(state, idx)); - thrust::device_ptr<long> end_ptr = thrust::unique(idx_ptr, idx_ptr+numel); - numel = end_ptr - idx_ptr; - - pow_v<float> unary_pow(normType); - thrust::plus<float> binary_plus; - // numel << stride, since idx usually contains sparse row indices - for (long i = 0; i < numel; i++) - { 
- long k = idx_ptr[i] - TH_INDEX_BASE; - thrust::device_ptr<float> row_ptr = weight_ptr + k * stride; - float norm = thrust::transform_reduce(row_ptr, row_ptr + stride, - unary_pow, 0, binary_plus); - norm = std::pow(norm, (float) (1.0 / normType)); - if (norm > maxNorm) - { - multiply_s<float> unary_mul(maxNorm / (norm + 1e-7)); - thrust::transform(row_ptr, row_ptr + stride, row_ptr, unary_mul); - } - } -} +#include "generic/LookupTable.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/MSECriterion.cu b/lib/THCUNN/MSECriterion.cu index 982e19a..26a35a5 100644 --- a/lib/THCUNN/MSECriterion.cu +++ b/lib/THCUNN/MSECriterion.cu @@ -1,5 +1,7 @@ #include "THCUNN.h" #include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" #include <thrust/fill.h> #include <thrust/functional.h> @@ -10,87 +12,32 @@ #include <thrust/system/cuda/execution_policy.h> #endif +template <typename Dtype, typename Acctype> struct mse_functor { mse_functor() {} - __host__ __device__ float operator()(const float &x, const float &y) const + __host__ __device__ Acctype operator()(const Dtype &x, const Dtype &y) const { - float z = x-y; + Acctype z = ScalarConvert<Dtype, Acctype>::to(x)-y; return z*z; } }; -void THNN_CudaMSECriterion_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *output, bool sizeAverage) -{ - THCUNN_assertSameGPU(state, 2, input, target); - THArgCheck(THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2, - "input and target need to have the same number of elements" - ); - - long size = THCudaTensor_nElement(state, input); - - input = THCudaTensor_newContiguous(state, input); - target = THCudaTensor_newContiguous(state, target); - - thrust::device_ptr<float> input_data(THCudaTensor_data(state, input)); - thrust::device_ptr<float> target_data(THCudaTensor_data(state, target)); - float sum = thrust::inner_product( -#if CUDA_VERSION >= 7000 - 
thrust::cuda::par.on(THCState_getCurrentStream(state)), -#endif - input_data, input_data+size, target_data, (float) 0, - thrust::plus<float>(), mse_functor()); - - if (sizeAverage) - sum /= size; - - THCudaTensor_free(state, input); - THCudaTensor_free(state, target); - - THCudaTensor_set1d(state, output, 0, sum); -} - +template <typename Dtype, typename Acctype> struct mse_updateGradInput_functor { - const float norm; + const Acctype norm; - mse_updateGradInput_functor(float norm_) + mse_updateGradInput_functor(Acctype norm_) : norm(norm_) {} - __host__ __device__ float operator()(const float &x, const float &y) const + __host__ __device__ Dtype operator()(const Dtype &x, const Dtype &y) const { - return norm * (x - y); + return ScalarConvert<Acctype, Dtype>::to(norm * (ScalarConvert<Dtype, Acctype>::to(x) - y)); } }; -void THNN_CudaMSECriterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage) -{ - THCUNN_assertSameGPU(state, 3, input, target, gradInput); - THArgCheck(THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2, - "input and target need to have the same number of elements" - ); - - long size = THCudaTensor_nElement(state, input); - float norm = sizeAverage ? 
2.f/size : 2.f; - - input = THCudaTensor_newContiguous(state, input); - target = THCudaTensor_newContiguous(state, target); - - THCudaTensor_resizeAs(state, gradInput, input); - - thrust::device_ptr<float> input_data(THCudaTensor_data(state, input)); - thrust::device_ptr<float> target_data(THCudaTensor_data(state, target)); - thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput)); - - thrust::transform( -#if CUDA_VERSION >= 7000 - thrust::cuda::par.on(THCState_getCurrentStream(state)), -#endif - input_data, input_data+size, target_data, gradInput_data, - mse_updateGradInput_functor(norm)); - - THCudaTensor_free(state, input); - THCudaTensor_free(state, target); -} +#include "generic/MSECriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/MarginCriterion.cu b/lib/THCUNN/MarginCriterion.cu index b49e1dc..7ccdbb7 100644 --- a/lib/THCUNN/MarginCriterion.cu +++ b/lib/THCUNN/MarginCriterion.cu @@ -1,5 +1,7 @@ #include "THCUNN.h" #include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" #include <thrust/fill.h> #include <thrust/functional.h> @@ -7,76 +9,37 @@ #include <thrust/reduce.h> #include <thrust/inner_product.h> +template <typename Dtype, typename Acctype> struct margin_functor { - margin_functor(float margin) + margin_functor(Acctype margin) : margin(margin) {} - __host__ __device__ float operator()(const float &x, const float &y) const + __host__ __device__ Acctype operator()(const Dtype &x, const Dtype &y) const { - float z = margin - x * y; + Acctype z = margin - ScalarConvert<Dtype, Acctype>::to(x) * y; return z >= 0 ? 
z : 0; } - const float margin; + const Acctype margin; }; -void THNN_CudaMarginCriterion_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *output, bool sizeAverage, float margin) -{ - THCUNN_assertSameGPU(state, 2, input, target); - - long size = THCudaTensor_nElement(state, input); - - input = THCudaTensor_newContiguous(state, input); - target = THCudaTensor_newContiguous(state, target); - - thrust::device_ptr<float> input_data(THCudaTensor_data(state, input)); - thrust::device_ptr<float> target_data(THCudaTensor_data(state, target)); - float sum = thrust::inner_product(input_data, input_data+size, target_data, (float) 0, thrust::plus<float>(), margin_functor(margin)); - - if (sizeAverage) - sum /= size; - - THCudaTensor_free(state, input); - THCudaTensor_free(state, target); - - THCudaTensor_set1d(state, output, 0, sum); -} - +template <typename Dtype, typename Acctype> struct margin_updateGradInput_functor { - const float margin, norm; + const Acctype margin, norm; - margin_updateGradInput_functor(float margin_, float norm_) + margin_updateGradInput_functor(Acctype margin_, Acctype norm_) : margin(margin_) , norm(norm_) {} - __host__ __device__ float operator()(const float &x, const float &y) const + __host__ __device__ Dtype operator()(const Dtype &x, const Dtype &y) const { - return (x * y) < margin ? -norm * y : 0; + return ScalarConvert<Acctype, Dtype>::to((ScalarConvert<Dtype, Acctype>::to(x) * y) < margin ? -norm * y : 0); } }; -void THNN_CudaMarginCriterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage, float margin) -{ - THCUNN_assertSameGPU(state, 3, input, target, gradInput); - - long size = THCudaTensor_nElement(state, input); - float norm = sizeAverage ? 
1.f/size : 1; - - input = THCudaTensor_newContiguous(state, input); - target = THCudaTensor_newContiguous(state, target); - - THCudaTensor_resizeAs(state, gradInput, input); - - thrust::device_ptr<float> input_data(THCudaTensor_data(state, input)); - thrust::device_ptr<float> target_data(THCudaTensor_data(state, target)); - thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput)); - - thrust::transform(input_data, input_data+size, target_data, gradInput_data, margin_updateGradInput_functor(margin, norm)); - - THCudaTensor_free(state, input); - THCudaTensor_free(state, target); -} +#include "generic/MarginCriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/MultiLabelMarginCriterion.cu b/lib/THCUNN/MultiLabelMarginCriterion.cu index 903e064..72fc486 100644 --- a/lib/THCUNN/MultiLabelMarginCriterion.cu +++ b/lib/THCUNN/MultiLabelMarginCriterion.cu @@ -1,60 +1,63 @@ #include "THCUNN.h" #include "common.h" #include "THCReduceApplyUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" #include <thrust/functional.h> #define MULTILABELMARGIN_THREADS 1024 -__global__ void cunn_MultiLabelMarginCriterion_updateOutput_kernel(float *output, - float *input, - float *target, - float *istarget, +template <typename Dtype, typename Acctype> +__global__ void cunn_MultiLabelMarginCriterion_updateOutput_kernel(Dtype *output, + Dtype *input, + THCIndex_t *target, + Dtype *istarget, int nframe, int dim, int sizeaverage) { // Temporary sums (for mapreduce) - __shared__ float sums[MULTILABELMARGIN_THREADS]; + __shared__ Acctype sums[MULTILABELMARGIN_THREADS]; // vectors: int k = blockIdx.x; - float *input_k = input + k*dim; - float *target_k = target + k*dim; - float *output_k = output + k; - float *istarget_k = istarget + k*dim; + Dtype *input_k = input + k*dim; + THCIndex_t *target_k = target + k*dim; + Dtype *output_k = output + k; + Dtype *istarget_k = istarget + k*dim; // zero istarget for (int d = threadIdx.x; d < dim; d += 
blockDim.x) { - istarget_k[d] = 0; + istarget_k[d] = ScalarConvert<int, Dtype>::to(0); } __syncthreads(); // mark targets in istarget if (threadIdx.x == 0) { for (int dt = 0; dt < dim; dt++) { - int target_idx = (int)target_k[dt] - TH_INDEX_BASE; + int target_idx = target_k[dt] - TH_INDEX_BASE; if (target_idx < 0) break; - istarget_k[target_idx] = 1; + istarget_k[target_idx] = ScalarConvert<int, Dtype>::to(1); } } __syncthreads(); // iterate over targets - float sum = 0; + Acctype sum = 0; for (int dt = 0; dt < dim; dt++) { // next target: - int target_idx = (int)target_k[dt] - TH_INDEX_BASE; + int target_idx = target_k[dt] - TH_INDEX_BASE; if (target_idx < 0) break; // current value for target - float input_target_k = input_k[target_idx]; + Dtype input_target_k = input_k[target_idx]; // compare to all inputs (multithreaded): for (int d = threadIdx.x; d < dim; d += blockDim.x) { // contribute to loss only if not a target - if (!istarget_k[d]) { - float z = 1 - input_target_k + input_k[d]; + if (!ScalarConvert<Dtype, int>::to(istarget_k[d])) { + Dtype z = 1 - input_target_k + input_k[d]; if (z > 0) sum += z; } @@ -62,40 +65,41 @@ __global__ void cunn_MultiLabelMarginCriterion_updateOutput_kernel(float *output } // reduce - float totalSum = reduceBlock(sums, blockDim.x, sum, thrust::plus<float>(), 0.0f); + Acctype totalSum = reduceBlock(sums, blockDim.x, sum, thrust::plus<Acctype>(), (Acctype)0); if (threadIdx.x == 0) { if (sizeaverage) { - *output_k = (totalSum / dim) / nframe; + *output_k = ScalarConvert<Acctype, Dtype>::to((totalSum / dim) / nframe); } else { - *output_k = totalSum / dim; + *output_k = ScalarConvert<Acctype, Dtype>::to(totalSum / dim); } } } -__global__ void cunn_MultiLabelMarginCriterion_updateGradInput_kernel(float *gradInput, - float *input, - float *target, - float *istarget, +template <typename Dtype, typename Acctype> +__global__ void cunn_MultiLabelMarginCriterion_updateGradInput_kernel(Dtype *gradInput, + Dtype *input, + THCIndex_t 
*target, + Dtype *istarget, int nframe, int dim, int sizeaverage) { // Temporary sums (for mapreduce) - __shared__ float sums[MULTILABELMARGIN_THREADS]; + __shared__ Acctype sums[MULTILABELMARGIN_THREADS]; // vectors: int k = blockIdx.x; - float *input_k = input + k*dim; - float *gradInput_k = gradInput + k*dim; - float *target_k = target + k*dim; - float *istarget_k = istarget + k*dim; + Dtype *input_k = input + k*dim; + Dtype *gradInput_k = gradInput + k*dim; + THCIndex_t *target_k = target + k*dim; + Dtype *istarget_k = istarget + k*dim; // gain: - float g = ( sizeaverage ? 1./((float)(nframe*dim)) : 1./((float)dim) ); + Dtype g = ScalarConvert<Acctype, Dtype>::to( sizeaverage ? 1./((Acctype)(nframe*dim)) : 1./((Acctype)dim) ); // zero gradients: for (int d = threadIdx.x; d < dim; d += blockDim.x) { - gradInput_k[d] = 0; + gradInput_k[d] = ScalarConvert<int, Dtype>::to(0); } __syncthreads(); @@ -106,14 +110,14 @@ __global__ void cunn_MultiLabelMarginCriterion_updateGradInput_kernel(float *gra if (target_idx < 0) break; // current value for target - float input_target_k = input_k[target_idx]; + Dtype input_target_k = input_k[target_idx]; // compare to all inputs (multithreaded): - float sum = 0; + Acctype sum = 0; for (int d = threadIdx.x; d < dim; d += blockDim.x) { // contribute to loss only if not a target - if (!istarget_k[d]) { - float z = 1 - input_target_k + input_k[d]; + if (!ScalarConvert<Dtype, int>::to(istarget_k[d])) { + Dtype z = 1 - input_target_k + input_k[d]; if (z > 0) { sum -= g; gradInput_k[d] += g; @@ -123,118 +127,15 @@ __global__ void cunn_MultiLabelMarginCriterion_updateGradInput_kernel(float *gra __syncthreads(); // reduce sum - float totalSum = reduceBlock(sums, blockDim.x, sum, thrust::plus<float>(), 0.0f); + Acctype totalSum = reduceBlock(sums, blockDim.x, sum, thrust::plus<Acctype>(), (Acctype)0); if (threadIdx.x == 0) { - gradInput_k[target_idx] += totalSum; + gradInput_k[target_idx] += ScalarConvert<Acctype, Dtype>::to(totalSum); } 
__syncthreads(); } } -void THNN_CudaMultiLabelMarginCriterion_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor *output, - THCudaTensor *istarget, - bool sizeaverage) -{ - input = THCudaTensor_newContiguous(state, input); - target = THCudaTensor_newContiguous(state, target); - istarget = THCudaTensor_newContiguous(state, istarget); - THCudaTensor_resizeAs(state, istarget, input); - - if(input->nDimension == 1) - { - THCudaTensor_resize1d(state, output, 1); - - dim3 blocks(1); - dim3 threads(MULTILABELMARGIN_THREADS); - - cunn_MultiLabelMarginCriterion_updateOutput_kernel<<<blocks,threads>>>( - THCudaTensor_data(state, output), - THCudaTensor_data(state, input), - THCudaTensor_data(state, target), - THCudaTensor_data(state, istarget), - 1, input->size[0], - sizeaverage - ); - THCudaCheck(cudaGetLastError()); - } - else if(input->nDimension == 2) - { - THCudaTensor *output_tmp = THCudaTensor_newWithSize1d(state, input->size[0]); - - dim3 blocks(input->size[0]); - dim3 threads(MULTILABELMARGIN_THREADS); - - cunn_MultiLabelMarginCriterion_updateOutput_kernel<<<blocks,threads>>>( - THCudaTensor_data(state, output_tmp), - THCudaTensor_data(state, input), - THCudaTensor_data(state, target), - THCudaTensor_data(state, istarget), - input->size[0], input->size[1], - sizeaverage - ); - THCudaCheck(cudaGetLastError()); - THCudaTensor_resize1d(state, output, 1); - THCudaTensor_set1d(state, output, 0, THCudaTensor_sumall(state, output_tmp)); - THCudaTensor_free(state, output_tmp); - } - else - THError("vector or matrix expected"); - - THCudaTensor_free(state, input); - THCudaTensor_free(state, target); - THCudaTensor_free(state, istarget); -} - -void THNN_CudaMultiLabelMarginCriterion_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor *gradInput, - THCudaTensor *istarget, - bool sizeaverage) -{ - input = THCudaTensor_newContiguous(state, input); - target = 
THCudaTensor_newContiguous(state, target); - istarget = THCudaTensor_newContiguous(state, istarget); - THCudaTensor_resizeAs(state, gradInput, input); - - if(gradInput->nDimension == 1) - { - dim3 blocks(1); - dim3 threads(MULTILABELMARGIN_THREADS); - - cunn_MultiLabelMarginCriterion_updateGradInput_kernel<<<blocks,threads>>>(THCudaTensor_data(state, gradInput), - THCudaTensor_data(state, input), - THCudaTensor_data(state, target), - THCudaTensor_data(state, istarget), - 1, gradInput->size[0], - sizeaverage); - - } - else if(gradInput->nDimension == 2) - { - dim3 blocks(gradInput->size[0]); - dim3 threads(MULTILABELMARGIN_THREADS); - - cunn_MultiLabelMarginCriterion_updateGradInput_kernel<<<blocks,threads>>>(THCudaTensor_data(state, gradInput), - THCudaTensor_data(state, input), - THCudaTensor_data(state, target), - THCudaTensor_data(state, istarget), - gradInput->size[0], gradInput->size[1], - sizeaverage); - } - else - THError("vector or matrix expected"); - - THCudaCheck(cudaGetLastError()); - - THCudaTensor_free(state, input); - THCudaTensor_free(state, target); - THCudaTensor_free(state, istarget); -} +#include "generic/MultiLabelMarginCriterion.cu" +#include "THCGenerateFloatTypes.h" #undef MULTILABELMARGIN_THREADS diff --git a/lib/THCUNN/MultiMarginCriterion.cu b/lib/THCUNN/MultiMarginCriterion.cu index 31caa75..b5ed75b 100644 --- a/lib/THCUNN/MultiMarginCriterion.cu +++ b/lib/THCUNN/MultiMarginCriterion.cu @@ -1,17 +1,19 @@ #include "THCUNN.h" #include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" #define MULTIMARGIN_THREADS 128 -template <int P> -__global__ void cunn_MultiMarginCriterion_updateOutput_kernel(float *output, float *input, float *target, float *weights, int nframe, int dim, bool sizeAverage, float margin) +template <int P, typename Dtype, typename Acctype> +__global__ void cunn_MultiMarginCriterion_updateOutput_kernel(Dtype *output, Dtype *input, THCIndex_t *target, Dtype *weights, int nframe, int dim, bool sizeAverage, 
Dtype margin) { - __shared__ float buffer[MULTIMARGIN_THREADS]; + __shared__ Acctype buffer[MULTIMARGIN_THREADS]; int k = blockIdx.x; - float *input_k = input + k*dim; - float *output_k = output + k; + Dtype *input_k = input + k*dim; + Dtype *output_k = output + k; int target_k = ((int)target[k]) - TH_INDEX_BASE; - float input_target_k = input_k[target_k]; + Dtype input_target_k = input_k[target_k]; int i_start = threadIdx.x; int i_end = dim; @@ -20,12 +22,12 @@ __global__ void cunn_MultiMarginCriterion_updateOutput_kernel(float *output, flo buffer[threadIdx.x] = 0; for (int i = i_start; i < i_end; i += i_step) { - float z = margin - input_target_k + input_k[i]; + Dtype z = margin - input_target_k + input_k[i]; if (i == target_k) continue; if (z > 0) { - float h = (P==1) ? z : z*z; + Dtype h = (P==1) ? z : z*z; if(weights) h *= weights[target_k]; buffer[threadIdx.x] += h; @@ -36,26 +38,26 @@ __global__ void cunn_MultiMarginCriterion_updateOutput_kernel(float *output, flo // reduce if (threadIdx.x == 0) { - float sum = 0; + Acctype sum = 0; for (int i=0; i < blockDim.x; i++) sum += buffer[i]; - *output_k = sum/dim; + *output_k = ScalarConvert<Acctype, Dtype>::to(sum/dim); if(sizeAverage) *output_k /= nframe; } } -template <int P> -__global__ void cunn_MultiMarginCriterion_updateGradInput_kernel(float *gradInput, float *input, float *target, float *weights, int nframe, int dim, bool sizeAverage, float margin) +template <int P, typename Dtype, typename Acctype> +__global__ void cunn_MultiMarginCriterion_updateGradInput_kernel(Dtype *gradInput, Dtype *input, THCIndex_t *target, Dtype *weights, int nframe, int dim, bool sizeAverage, Dtype margin) { - __shared__ float buffer[MULTIMARGIN_THREADS]; + __shared__ Acctype buffer[MULTIMARGIN_THREADS]; int k = blockIdx.x; - float *input_k = input + k*dim; - float *gradInput_k = gradInput + k*dim; + Dtype *input_k = input + k*dim; + Dtype *gradInput_k = gradInput + k*dim; int target_k = ((int)target[k]) - TH_INDEX_BASE; - float 
input_target_k = input_k[target_k]; - float g = (sizeAverage ? 1./((float)(nframe*dim)) : 1./((float)dim)); + Dtype input_target_k = input_k[target_k]; + Acctype g = (sizeAverage ? 1./((Acctype)(nframe*dim)) : 1./((Acctype)dim)); int i_start = threadIdx.x; int i_end = dim; @@ -64,20 +66,20 @@ __global__ void cunn_MultiMarginCriterion_updateGradInput_kernel(float *gradInpu buffer[threadIdx.x] = 0; for (int i=i_start; i<i_end; i+=i_step) { - float z = margin - input_target_k + input_k[i]; + Dtype z = margin - input_target_k + input_k[i]; if (i == target_k) continue; if (z > 0) { - float h = (P == 1) ? g : 2*g*z; + Dtype h = ScalarConvert<Acctype, Dtype>::to((P == 1) ? g : 2*g*z); if(weights) h *= weights[target_k]; buffer[threadIdx.x] -= h; gradInput_k[i] = h; } else - gradInput_k[i] = 0; + gradInput_k[i] = ScalarConvert<int, Dtype>::to(0); } __syncthreads(); @@ -85,175 +87,14 @@ __global__ void cunn_MultiMarginCriterion_updateGradInput_kernel(float *gradInpu // reduce if (threadIdx.x == 0) { - float gradInput_target_k = 0; + Acctype gradInput_target_k = 0; for (int i=0; i<blockDim.x; i++) gradInput_target_k += buffer[i]; - gradInput_k[target_k] = gradInput_target_k; + gradInput_k[target_k] = ScalarConvert<Acctype, Dtype>::to(gradInput_target_k); } } -void THNN_CudaMultiMarginCriterion_updateOutput(THCState *state, THCudaTensor *input, - THCudaTensor *target, THCudaTensor *output, - bool sizeAverage, int p, THCudaTensor *weights, - float margin) -{ - THCUNN_assertSameGPU(state, 2, input, target); - input = THCudaTensor_newContiguous(state, input); - if(weights) - weights = THCudaTensor_newContiguous(state, weights); - if (input->nDimension == 1) - { - dim3 blocks(1); - dim3 threads(MULTIMARGIN_THREADS); - if (p == 1) - { - cunn_MultiMarginCriterion_updateOutput_kernel<1> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>( - THCudaTensor_data(state, output), - THCudaTensor_data(state, input), - THCudaTensor_data(state, target), - weights ? 
THCudaTensor_data(state, weights) : NULL, - 1, input->size[0], - sizeAverage, - margin - ); - } - else if (p == 2) - { - cunn_MultiMarginCriterion_updateOutput_kernel<2> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>( - THCudaTensor_data(state, output), - THCudaTensor_data(state, input), - THCudaTensor_data(state, target), - weights ? THCudaTensor_data(state, weights) : NULL, - 1, input->size[0], - sizeAverage, - margin - ); - } - THCudaCheck(cudaGetLastError()); - } - else if (input->nDimension == 2) - { - THCudaTensor *output_ = THCudaTensor_newWithSize1d(state, input->size[0]); // tmp outupt buffer - dim3 blocks(input->size[0]); - dim3 threads(MULTIMARGIN_THREADS); - if (p == 1) - { - cunn_MultiMarginCriterion_updateOutput_kernel<1> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>( - THCudaTensor_data(state, output_), - THCudaTensor_data(state, input), - THCudaTensor_data(state, target), - weights ? THCudaTensor_data(state, weights) : NULL, - input->size[0], input->size[1], - sizeAverage, - margin - ); - } - else if (p == 2) - { - cunn_MultiMarginCriterion_updateOutput_kernel<2> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>( - THCudaTensor_data(state, output_), - THCudaTensor_data(state, input), - THCudaTensor_data(state, target), - weights ? 
THCudaTensor_data(state, weights) : NULL, - input->size[0], input->size[1], - sizeAverage, - margin - ); - } - THCudaCheck(cudaGetLastError()); - float sum = THCudaTensor_sumall(state, output_); - THCudaTensor_set1d(state, output, 0, sum); - THCudaTensor_free(state, output_); - } - else - { - THError("vector or matrix expected"); - } - - THCudaTensor_free(state, input); - if(weights) - THCudaTensor_free(state, weights); -} - -void THNN_CudaMultiMarginCriterion_updateGradInput(THCState *state, THCudaTensor *input, - THCudaTensor *target, THCudaTensor *gradInput, - bool sizeAverage, int p, THCudaTensor *weights, - float margin) -{ - THCUNN_assertSameGPU(state, 3, input, gradInput, target); - input = THCudaTensor_newContiguous(state, input); - THCudaTensor_resizeAs(state, gradInput, input); - if(weights) - weights = THCudaTensor_newContiguous(state, weights); +#include "generic/MultiMarginCriterion.cu" +#include "THCGenerateFloatTypes.h" - if (input->nDimension == 1) - { - dim3 blocks(1); - dim3 threads(MULTIMARGIN_THREADS); - - if (p == 1) - { - cunn_MultiMarginCriterion_updateGradInput_kernel<1> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>( - THCudaTensor_data(state, gradInput), - THCudaTensor_data(state, input), - THCudaTensor_data(state, target), - weights ? THCudaTensor_data(state, weights) : NULL, - 1, gradInput->size[0], - sizeAverage, - margin - ); - } - else if (p == 2) - { - cunn_MultiMarginCriterion_updateGradInput_kernel<2> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>( - THCudaTensor_data(state, gradInput), - THCudaTensor_data(state, input), - THCudaTensor_data(state, target), - weights ? 
THCudaTensor_data(state, weights) : NULL, - 1, gradInput->size[0], - sizeAverage, - margin - ); - } - THCudaCheck(cudaGetLastError()); - } - else if (input->nDimension == 2) - { - dim3 blocks(gradInput->size[0]); - dim3 threads(MULTIMARGIN_THREADS); - - if (p == 1) - { - cunn_MultiMarginCriterion_updateGradInput_kernel<1> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>( - THCudaTensor_data(state, gradInput), - THCudaTensor_data(state, input), - THCudaTensor_data(state, target), - weights ? THCudaTensor_data(state, weights) : NULL, - gradInput->size[0], gradInput->size[1], - sizeAverage, - margin - ); - } - else if (p == 2) - { - cunn_MultiMarginCriterion_updateGradInput_kernel<2> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>( - THCudaTensor_data(state, gradInput), - THCudaTensor_data(state, input), - THCudaTensor_data(state, target), - weights ? THCudaTensor_data(state, weights) : NULL, - gradInput->size[0], gradInput->size[1], - sizeAverage, - margin - ); - } - THCudaCheck(cudaGetLastError()); - } - else - { - THError("vector or matrix expected"); - } - - THCudaTensor_free(state, input); - if(weights) - THCudaTensor_free(state, weights); -} +#undef MULTIMARGIN_THREADS diff --git a/lib/THCUNN/PReLU.cu b/lib/THCUNN/PReLU.cu index 048c0b4..dfac397 100644 --- a/lib/THCUNN/PReLU.cu +++ b/lib/THCUNN/PReLU.cu @@ -1,25 +1,27 @@ #include "THCUNN.h" -#include "THCReduce.cuh" -#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" -#include <thrust/functional.h> +#include "common.h" +template <typename T> struct PReLUUpdateOutput { - float* weight_; + T* weight_; - PReLUUpdateOutput(float* weight) + PReLUUpdateOutput(T* weight) : weight_(weight) {} - __device__ __forceinline__ void operator()(float *out, float *in) + __device__ __forceinline__ void operator()(T *out, T *in) { - float x = *in; + T x = *in; *out = (x > 0) ? 
x : weight_[0] * x; } }; -__global__ void preluForward(float *output, const float *input, const float *weight, int n, int nElemsPerSample, int mapSize) +template <typename T> +__global__ void preluForward(T *output, const T *input, const T *weight, int n, int nElemsPerSample, int mapSize) { CUDA_KERNEL_LOOP(i, n) { @@ -29,63 +31,27 @@ __global__ void preluForward(float *output, const float *input, const float *wei } } -void THNN_CudaPReLU_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, - long nOutputPlane) -{ - THCudaTensor_resizeAs(state, output, input); - - float *w = THCudaTensor_data(state, weight); - - if (nOutputPlane == 0) - { - THC_pointwiseApply2(state, output, input, PReLUUpdateOutput(w)); - } - else - { - int ndim = THCudaTensor_nDimension(state, input); - input = THCudaTensor_newContiguous(state, input); - - int n = THCudaTensor_nElement(state, input); - int mapSize = 1; - if (ndim == 3) - mapSize = (input->size[1] * input->size[2]); - else if (ndim == 4) - mapSize = (input->size[2] * input->size[3]); - int nElemsPerSample = nOutputPlane * mapSize; - preluForward<<<GET_BLOCKS(n), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>( - THCudaTensor_data(state, output), - THCudaTensor_data(state, input), - w, - n, nElemsPerSample, mapSize - ); - THCudaCheck(cudaGetLastError()); - THCudaTensor_free(state, input); - } -} - +template <typename T> struct PReLUUpdateGradInput { - float *weight_; + T *weight_; - PReLUUpdateGradInput(float *weight) + PReLUUpdateGradInput(T *weight) : weight_(weight) {} - __device__ __forceinline__ void operator()(float *gradInput, float *gradOutput, float *input) + __device__ __forceinline__ void operator()(T *gradInput, T *gradOutput, T *input) { *gradInput = *input > 0 ? 
*gradOutput : *gradOutput * *weight_; } }; +template <typename T> __global__ void preluBackward( - float *gradInput, - const float *input, - const float *weight, - const float *gradOutput, + T *gradInput, + const T *input, + const T *weight, + const T *gradOutput, int n, int nElemsPerSample, int mapSize) { CUDA_KERNEL_LOOP(i, n) @@ -96,149 +62,44 @@ __global__ void preluBackward( } } -void THNN_CudaPReLU_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *weight, - long nOutputPlane) -{ - THCudaTensor_resizeAs(state, gradInput, input); - - float *w = THCudaTensor_data(state, weight); - if (nOutputPlane == 0) - { - THC_pointwiseApply3(state, gradInput, gradOutput, input, PReLUUpdateGradInput(w)); - } - else - { - int ndim = THCudaTensor_nDimension(state, input); - input = THCudaTensor_newContiguous(state, input); - gradOutput = THCudaTensor_newContiguous(state, gradOutput); - - int n = THCudaTensor_nElement(state, input); - int mapSize = 1; - if (ndim == 3) - mapSize = (input->size[1] * input->size[2]); - else if (ndim == 4) - mapSize = (input->size[2] * input->size[3]); - int nElemsPerSample = nOutputPlane * mapSize; - preluBackward<<<GET_BLOCKS(n), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>( - THCudaTensor_data(state, gradInput), - THCudaTensor_data(state, input), - w, - THCudaTensor_data(state, gradOutput), - n, nElemsPerSample, mapSize - ); - THCudaCheck(cudaGetLastError()); - THCudaTensor_free(state, input); - THCudaTensor_free(state, gradOutput); - } -} - +template <typename T> struct PReLUAccGradParametersShared { - __device__ __forceinline__ void operator()(float *gradInput, float *input, float *gradOutput) + __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) { *gradInput = (*input) * (*gradOutput) * (*input <= 0); } }; +template <typename T> struct PReLUAccGradParameters { - float scale; + T scale; - PReLUAccGradParameters(float scale) 
+ PReLUAccGradParameters(T scale) : scale(scale) {} - __device__ __forceinline__ void operator()(float *gradInput, float *input, float *gradOutput) + __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) { *gradInput = (*input) * (*gradOutput) * scale * (*input <= 0); } }; +template <typename T> struct PReLUAccGradParameters1to1 { - float scale; + T scale; - PReLUAccGradParameters1to1(float scale) + PReLUAccGradParameters1to1(T scale) : scale(scale) {} - __device__ __forceinline__ void operator()(float *gradWeight, float *input, float *gradOutput) + __device__ __forceinline__ void operator()(T *gradWeight, T *input, T *gradOutput) { *gradWeight += (*input) * (*gradOutput) * scale * (*input <= 0); } }; -void THNN_CudaPReLU_accGradParameters( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *weight, - THCudaTensor *gradWeight, - THCudaTensor *gradWeightBuf, - THCudaTensor *gradWeightBuf2, - long nOutputPlane, - float scale) -{ - // use grad input for temporary storage, then call updateGradInput again - - if (nOutputPlane == 0) - { - THC_pointwiseApply3(state, gradInput, input, gradOutput, PReLUAccGradParametersShared()); - - // introduces a sync point - float sum = THCudaTensor_sumall(state, gradInput); - float w = THCudaTensor_get1d(state, gradWeight, 0); - THCudaTensor_set1d(state, gradWeight, 0, w + sum * scale); - - // restore gradInput - THNN_CudaPReLU_updateGradInput(state, input, gradOutput, gradInput, weight, nOutputPlane); - } - else - { - int ndim = THCudaTensor_nDimension(state, input); - - if (ndim == 1) - { - THC_pointwiseApply3(state, gradWeight, input, gradOutput, PReLUAccGradParameters1to1(scale)); - } - else - { - THC_pointwiseApply3(state, gradInput, input, gradOutput, PReLUAccGradParameters(scale)); - THCudaTensor *sumbuf = gradWeightBuf2; - THCudaTensor_resizeAs(state, gradWeightBuf, gradWeight); - - if (ndim == 2) - { - THCudaTensor_sum(state, 
gradWeightBuf, gradInput, 0); - THCudaTensor_cadd(state, gradWeight, gradWeight, scale, gradWeightBuf); - } - else if (ndim == 3) - { - THCudaTensor *buffer = THCudaTensor_newContiguous(state, gradInput); - THCudaTensor_resize2d(state, buffer, nOutputPlane, input->size[1] * input->size[2]); - THCudaTensor_sum(state, gradWeightBuf, buffer, 1); - THCudaTensor_cadd(state, gradWeight, gradWeight, scale, gradWeightBuf); - THCudaTensor_free(state, buffer); - } - else if (ndim == 4) - { - THCudaTensor *buffer = THCudaTensor_newContiguous(state, gradInput); - THCudaTensor_resize3d(state, buffer, input->size[0], nOutputPlane, input->size[2] * input->size[3]); - THCudaTensor_resize2d(state, sumbuf, input->size[0], nOutputPlane); - THCudaTensor_sum(state, sumbuf, buffer, 2); - THCudaTensor_sum(state, gradWeightBuf, sumbuf, 0); - THCudaTensor_cadd(state, gradWeight, gradWeight, scale, gradWeightBuf); - THCudaTensor_free(state, buffer); - } - - // restore gradInput - THNN_CudaPReLU_updateGradInput(state, input, gradOutput, gradInput, weight, nOutputPlane); - } - } -} +#include "generic/PReLU.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/RReLU.cu b/lib/THCUNN/RReLU.cu index 245598b..55a34ec 100644 --- a/lib/THCUNN/RReLU.cu +++ b/lib/THCUNN/RReLU.cu @@ -1,4 +1,6 @@ #include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" #include "common.h" #include <curand.h> #include <curand_kernel.h> @@ -8,51 +10,72 @@ #define BLOCK_SIZE 256 #define NUM_BLOCKS(n) min((int)THCCeilDiv(n, (long) BLOCK_SIZE), MAX_NUM_BLOCKS) +template<typename T> +inline T __device__ curand_uniform_type(curandStateMtgp32 *state); + +template <> +inline half __device__ curand_uniform_type<half>(curandStateMtgp32 *state) { + return ScalarConvert<float, half>::to(curand_uniform(state)); +} + +template <> +inline float __device__ curand_uniform_type<float>(curandStateMtgp32 *state) { + return curand_uniform(state); +} + +template <> +inline double __device__ 
curand_uniform_type<double>(curandStateMtgp32 *state) { + return curand_uniform_double(state); +} + +template <typename T> __global__ void rreluUpdateOutputTrain(int n, curandStateMtgp32 *state, - float *input, float* noise, float *output, double a, double b) + T *input, T* noise, T *output, double a, double b) { CUDA_KERNEL_LOOP(i, n) { if (input[i] <= 0) { - float r = curand_uniform(&state[blockIdx.x]); - r = r * (b-a) + a; + T r = curand_uniform_type<T>(&state[blockIdx.x]); + r = ScalarConvert<double, T>::to(r * (b-a) + a); output[i] = input[i] * r; noise[i] = r; } else { output[i] = input[i]; - noise[i] = 1; + noise[i] = ScalarConvert<int, T>::to(1); } } } +template <typename T> struct RReLUUpdateOutputEval_functor { - const float negSlope_; + const T negSlope_; - RReLUUpdateOutputEval_functor(float negSlope) + RReLUUpdateOutputEval_functor(T negSlope) : negSlope_(negSlope) {} - __device__ __forceinline__ void operator()(float *out, float *in) + __device__ __forceinline__ void operator()(T *out, T *in) { - const float x = *in; - const float r = x <= 0 ? negSlope_ : 1; + const T x = *in; + const T r = x <= 0 ? 
negSlope_ : ScalarConvert<int, T>::to(1); *out = x * r; } }; +template <typename T> struct RReLUUpdateOutputEvalIP_functor { - const float negSlope_; + const T negSlope_; - RReLUUpdateOutputEvalIP_functor(float negSlope) + RReLUUpdateOutputEvalIP_functor(T negSlope) : negSlope_(negSlope) {} - __device__ __forceinline__ void operator()(float *x) + __device__ __forceinline__ void operator()(T *x) { if (*x <= 0) { @@ -61,74 +84,31 @@ struct RReLUUpdateOutputEvalIP_functor } }; -void THNN_CudaRReLU_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, - THCudaTensor *noise, double lower, double upper, bool train, bool inplace, void *generator) -{ - THCUNN_assertSameGPU(state, 3, input, output, noise); - struct curandStateMtgp32* gen_states = THCRandom_generatorStates(state); - - if (train) - { - input = THCudaTensor_newContiguous(state, input); - THCudaTensor_resizeAs(state, noise, input); - float *input_data = THCudaTensor_data(state, input); - float *noise_data = THCudaTensor_data(state, noise); - long n = THCudaTensor_nElement(state, input); - if (inplace) - { - rreluUpdateOutputTrain<<<NUM_BLOCKS(n), BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>( - n, gen_states, input_data, noise_data, input_data, lower, upper); - THCudaTensor_set(state, output, input); - } - else - { - THCudaTensor_resizeAs(state, output, input); - float *output_data = THCudaTensor_data(state, output); - rreluUpdateOutputTrain<<<NUM_BLOCKS(n), BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>( - n, gen_states, input_data, noise_data, output_data, lower, upper); - } - THCudaCheck(cudaGetLastError()); - THCudaTensor_free(state, input); - } - else - { - const double negSlope = (lower + upper) / 2; - if (inplace) - { - THC_pointwiseApply1(state, input, RReLUUpdateOutputEvalIP_functor(negSlope)); - THCudaTensor_set(state, output, input); - } - else - { - THCudaTensor_resizeAs(state, output, input); - THC_pointwiseApply2(state, output, input, 
RReLUUpdateOutputEval_functor(negSlope)); - } - } -} - +template <typename T> struct RReLUupdateGradInputEval_functor { - const float negSlope_; + const T negSlope_; - RReLUupdateGradInputEval_functor(float negSlope) + RReLUupdateGradInputEval_functor(T negSlope) : negSlope_(negSlope) {} - __device__ __forceinline__ void operator()(float *gradIn, float *gradOut, float *in) + __device__ __forceinline__ void operator()(T *gradIn, T *gradOut, T *in) { *gradIn = (*in) <= 0 ? (*gradOut) * negSlope_ : (*gradOut); } }; +template <typename T> struct RReLUupdateGradInputEvalIP_functor { - const float negSlope_; + const T negSlope_; - RReLUupdateGradInputEvalIP_functor(float negSlope) + RReLUupdateGradInputEvalIP_functor(T negSlope) : negSlope_(negSlope) {} - __device__ __forceinline__ void operator()(float *gradOut, float *in) + __device__ __forceinline__ void operator()(T *gradOut, T *in) { if (*in <= 0) { @@ -137,42 +117,5 @@ struct RReLUupdateGradInputEvalIP_functor } }; -void THNN_CudaRReLU_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, - THCudaTensor *gradInput, THCudaTensor *noise, double lower, double upper, bool train, bool inplace) -{ - THCUNN_assertSameGPU(state, 4, input, gradOutput, gradInput, noise); - - gradOutput = THCudaTensor_newContiguous(state, gradOutput); - - if (train && upper - lower > 1E-6) // e.g. 
if upper == lower, RReLU behaves like LeakyReLU - { - // multiply the gradient by the noise tensor - if (inplace) - { - THCudaTensor_cmul(state, gradOutput, gradOutput, noise); - THCudaTensor_set(state, gradInput, gradOutput); - } - else - { - THCudaTensor_resizeAs(state, gradInput, input); - THCudaTensor_cmul(state, gradInput, gradOutput, noise); - } - } - else - { - // use constant factor for negative input values - const double negSlope = (lower + upper) / 2; - if (inplace) - { - THC_pointwiseApply2(state, gradOutput, input, RReLUupdateGradInputEvalIP_functor(negSlope)); - THCudaTensor_set(state, gradInput, gradOutput); - } - else - { - THCudaTensor_resizeAs(state, gradInput, input); - THC_pointwiseApply3(state, gradInput, gradOutput, input, RReLUupdateGradInputEval_functor(negSlope)); - } - } - - THCudaTensor_free(state, gradOutput); -} +#include "generic/RReLU.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/SharedMem.cuh b/lib/THCUNN/SharedMem.cuh new file mode 100644 index 0000000..8d83d9f --- /dev/null +++ b/lib/THCUNN/SharedMem.cuh @@ -0,0 +1,43 @@ +// Based on the simpleTempltes CUDA example + +#ifndef THCUNN_SHAREDMEM_H +#define THCUNN_SHAREDMEM_H + +template <typename T> +struct SharedMem { + __device__ T *getPointer() + { + extern __device__ void error(void); + error(); + return NULL; + } +}; + +template <> +struct SharedMem<half> +{ + __device__ half *getPointer() { + extern __shared__ half s_half[]; + return s_half; + } +}; + +template <> +struct SharedMem<float> +{ + __device__ float *getPointer() { + extern __shared__ float s_float[]; + return s_float; + } +}; + +template <> +struct SharedMem<double> +{ + __device__ double *getPointer() { + extern __shared__ double s_double[]; + return s_double; + } +}; + +#endif diff --git a/lib/THCUNN/Sigmoid.cu b/lib/THCUNN/Sigmoid.cu index f2a3675..623b84d 100644 --- a/lib/THCUNN/Sigmoid.cu +++ b/lib/THCUNN/Sigmoid.cu @@ -1,32 +1,24 @@ #include "THCUNN.h" -#include "common.h" +#include 
"THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +template <typename T> struct sigmoidupdateOutput_functor { - __device__ void operator()(float *output, const float *input) const + __device__ void operator()(T *output, const T *input) const { - *output = 1./(1.+ exp(-*input)); + *output = ScalarConvert<double, T>::to(1./(1.+ exp(-*input))); } }; -void THNN_CudaSigmoid_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output) -{ - THCUNN_assertSameGPU(state, 2, input, output); - THCudaTensor_resizeAs(state, output, input); - THC_pointwiseApply2(state, output, input, sigmoidupdateOutput_functor()); -} - +template <typename T> struct sigmoidupdateGradInput_functor { - __device__ void operator()(float *gradInput, const float *output, const float *gradOutput) const + __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const { - *gradInput = *gradOutput * (1.-*output) * (*output); + *gradInput = ScalarConvert<double, T>::to(*gradOutput * (1.-*output) * (*output)); } }; -void THNN_CudaSigmoid_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *output) -{ - THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput); - THCudaTensor_resizeAs(state, gradInput, output); - THC_pointwiseApply3(state, gradInput, output, gradOutput, sigmoidupdateGradInput_functor()); -} +#include "generic/Sigmoid.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/SmoothL1Criterion.cu b/lib/THCUNN/SmoothL1Criterion.cu index 2b528ef..8e94fbc 100644 --- a/lib/THCUNN/SmoothL1Criterion.cu +++ b/lib/THCUNN/SmoothL1Criterion.cu @@ -1,5 +1,7 @@ #include "THCUNN.h" #include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" #include <thrust/fill.h> #include <thrust/functional.h> @@ -10,97 +12,38 @@ #include <thrust/system/cuda/execution_policy.h> #endif +template <typename Dtype, typename Acctype> struct smoothl1_functor { smoothl1_functor() {} - __host__ __device__ 
float operator()(const float &x, const float &y) const + __host__ __device__ Acctype operator()(const Dtype &x, const Dtype &y) const { - float z = fabsf(x-y); - return z < 1.f ? 0.5f*z*z : z - 0.5f; + Acctype z = ScalarConvert<Dtype, Acctype>::to(THCNumerics<Dtype>::abs(x-y)); + return z < Acctype(1) ? 0.5f*z*z : z - 0.5f; } }; -void THNN_CudaSmoothL1Criterion_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *output, bool sizeAverage) -{ - THCUNN_assertSameGPU(state, 2, input, target); - THArgCheck( - THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2, - "input and target need to have the same number of elements" - ); - - long size = THCudaTensor_nElement(state, input); - - input = THCudaTensor_newContiguous(state, input); - target = THCudaTensor_newContiguous(state, target); - - thrust::device_ptr<float> input_data(THCudaTensor_data(state, input)); - thrust::device_ptr<float> target_data(THCudaTensor_data(state, target)); - float sum = thrust::inner_product( -#if CUDA_VERSION >= 7000 - thrust::cuda::par.on(THCState_getCurrentStream(state)), -#endif - input_data, input_data+size, target_data, (float) 0, - thrust::plus<float>(), smoothl1_functor() - ); - - if (sizeAverage) - sum /= size; - - THCudaTensor_free(state, input); - THCudaTensor_free(state, target); - - THCudaTensor_set1d(state, output, 0, sum); -} - +template <typename Dtype> struct smoothl1_updateGradInput_functor { - const float norm; + const Dtype norm; - smoothl1_updateGradInput_functor(float norm_) + smoothl1_updateGradInput_functor(Dtype norm_) : norm(norm_) {} - __host__ __device__ float operator()(const float &x, const float &y) const + __host__ __device__ Dtype operator()(const Dtype &x, const Dtype &y) const { - float z = x - y; - if (z < -1.f) + Dtype z = x - y; + if (z < ScalarConvert<int, Dtype>::to(-1)) return -norm; - else if (z > 1.f) + else if (z > ScalarConvert<int, Dtype>::to(1)) return norm; else return norm * z; } }; 
-void THNN_CudaSmoothL1Criterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage) -{ - THCUNN_assertSameGPU(state, 3, input, target, gradInput); - THArgCheck( - THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2, - "input and target need to have the same number of elements" - ); - - long size = THCudaTensor_nElement(state, input); - float norm = sizeAverage ? 1./size : 1.; - - input = THCudaTensor_newContiguous(state, input); - target = THCudaTensor_newContiguous(state, target); - - THCudaTensor_resizeAs(state, gradInput, input); - - thrust::device_ptr<float> input_data(THCudaTensor_data(state, input)); - thrust::device_ptr<float> target_data(THCudaTensor_data(state, target)); - thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput)); - - thrust::transform( -#if CUDA_VERSION >= 7000 - thrust::cuda::par.on(THCState_getCurrentStream(state)), -#endif - input_data, input_data+size, target_data, gradInput_data, - smoothl1_updateGradInput_functor(norm) - ); - - THCudaTensor_free(state, input); - THCudaTensor_free(state, target); -} +#include "generic/SmoothL1Criterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/SoftMarginCriterion.cu b/lib/THCUNN/SoftMarginCriterion.cu index 2a15f4a..95c8e95 100644 --- a/lib/THCUNN/SoftMarginCriterion.cu +++ b/lib/THCUNN/SoftMarginCriterion.cu @@ -1,5 +1,7 @@ #include "THCUNN.h" #include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" #include <thrust/fill.h> #include <thrust/functional.h> @@ -7,81 +9,29 @@ #include <thrust/reduce.h> #include <thrust/inner_product.h> +template <typename Dtype, typename Acctype> struct softmargin_functor { - __host__ __device__ float operator()(const float& x, const float& y) const + __host__ __device__ Acctype operator()(const Dtype& x, const Dtype& y) const { - return log(1 + exp(-x*y)); + return log(1 + exp(ScalarConvert<Dtype, 
Acctype>::to(-x)*y)); } }; - -void THNN_CudaSoftMarginCriterion_updateOutput(THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor *output, - int sizeAverage - ) -{ - THCUNN_assertSameGPU(state, 2, input, target); - float sum; - - long size = THCudaTensor_nElement(state, input); - - input = THCudaTensor_newContiguous(state, input); - target = THCudaTensor_newContiguous(state, target); - - thrust::device_ptr<float> input_data(THCudaTensor_data(state, input)); - thrust::device_ptr<float> target_data(THCudaTensor_data(state, target)); - sum = thrust::inner_product(input_data, input_data+size, target_data, (float) 0, thrust::plus<float>(), softmargin_functor()); - - if(sizeAverage) - sum /= size; - - THCudaTensor_free(state, input); - THCudaTensor_free(state, target); - - THCudaTensor_set1d(state, output, 0, sum); -} - - +template <typename Dtype, typename Acctype> struct softmargin_updateGradInput_functor { - const float norm; + const Acctype norm; - softmargin_updateGradInput_functor(float norm_) : + softmargin_updateGradInput_functor(Acctype norm_) : norm(norm_) {} - __host__ __device__ float operator()(const float& x, const float& y) const + __host__ __device__ Dtype operator()(const Dtype& x, const Dtype& y) const { - float temp = exp(-x*y); - return -y*temp*norm/(1.f + temp); + Acctype temp = exp(ScalarConvert<Dtype, Acctype>::to(-x)*y); + return ScalarConvert<Acctype, Dtype>::to(-y*temp*norm/(ScalarConvert<int, Acctype>::to(1) + temp)); } }; -void THNN_CudaSoftMarginCriterion_updateGradInput(THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor *gradInput, - int sizeAverage - ) -{ - THCUNN_assertSameGPU(state, 3, input, target, gradInput); - - long size = THCudaTensor_nElement(state, input); - float norm = (sizeAverage ? 
1./size : 1.); - - input = THCudaTensor_newContiguous(state, input); - target = THCudaTensor_newContiguous(state, target); - - THCudaTensor_resizeAs(state, gradInput, input); - - thrust::device_ptr<float> input_data(THCudaTensor_data(state, input)); - thrust::device_ptr<float> target_data(THCudaTensor_data(state, target)); - thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput)); - - thrust::transform(input_data, input_data+size, target_data, gradInput_data, softmargin_updateGradInput_functor(norm)); - - THCudaTensor_free(state, input); - THCudaTensor_free(state, target); -} +#include "generic/SoftMarginCriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/SoftMax.cu b/lib/THCUNN/SoftMax.cu index 987325f..5ae6b70 100644 --- a/lib/THCUNN/SoftMax.cu +++ b/lib/THCUNN/SoftMax.cu @@ -1,26 +1,29 @@ #include "THCUNN.h" -#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" #define SOFTMAX_THREADS 128 +template <typename T, typename AccumT> __global__ void cunn_SoftMax_updateOutput_kernel( - float *output, float *input, int nframe, int dim, int stride0, int stride1) + T *output, T *input, int nframe, int dim, int stride0, int stride1) { - __shared__ float buffer[SOFTMAX_THREADS+1]; - float *input_k = input + blockIdx.x*dim*stride0 + blockIdx.y*stride1 + blockIdx.z; - float *output_k = output + blockIdx.x*dim*stride0 + blockIdx.y*stride1 + blockIdx.z; + __shared__ AccumT buffer[SOFTMAX_THREADS+1]; + T *input_k = input + blockIdx.x*dim*stride0 + blockIdx.y*stride1 + blockIdx.z; + T *output_k = output + blockIdx.x*dim*stride0 + blockIdx.y*stride1 + blockIdx.z; int i_start = threadIdx.x; int i_end = dim; int i_step = blockDim.x; // max? 
- buffer[threadIdx.x] = -FLT_MAX; + buffer[threadIdx.x] = -THCNumerics<AccumT>::max(); for (int i=i_start; i<i_end; i+=i_step) { - float z = input_k[i*stride0]; - if (buffer[threadIdx.x] < z) - buffer[threadIdx.x] = z; + T z = input_k[i*stride0]; + AccumT zAcc = ScalarConvert<T, AccumT>::to(z); + if (buffer[threadIdx.x] < zAcc) + buffer[threadIdx.x] = zAcc; } @@ -29,7 +32,7 @@ __global__ void cunn_SoftMax_updateOutput_kernel( // reduce if (threadIdx.x == 0) { - float max_k = -FLT_MAX; + AccumT max_k = -THCNumerics<AccumT>::max(); for (int i=0; i<blockDim.x; i++) { if (max_k < buffer[i]) @@ -41,11 +44,11 @@ __global__ void cunn_SoftMax_updateOutput_kernel( __syncthreads(); // sum? - float max_k = buffer[SOFTMAX_THREADS]; - buffer[threadIdx.x] = 0; + T max_k = ScalarConvert<AccumT, T>::to(buffer[SOFTMAX_THREADS]); + buffer[threadIdx.x] = ScalarConvert<int, AccumT>::to(0); for (int i=i_start; i<i_end; i+=i_step) { - float z = __expf(input_k[i*stride0]-max_k); - buffer[threadIdx.x] += z; + T z = THCNumerics<T>::exp(input_k[i*stride0]-max_k); + buffer[threadIdx.x] += ScalarConvert<T, AccumT>::to(z); output_k[i*stride0] = z; } @@ -54,7 +57,7 @@ __global__ void cunn_SoftMax_updateOutput_kernel( // reduce if (threadIdx.x == 0) { - float sum_k = 0; + AccumT sum_k = ScalarConvert<int, AccumT>::to(0); for (int i=0; i<blockDim.x; i++) sum_k += buffer[i]; buffer[SOFTMAX_THREADS] = sum_k; @@ -63,34 +66,35 @@ __global__ void cunn_SoftMax_updateOutput_kernel( __syncthreads(); // softmax - float sum_k = buffer[SOFTMAX_THREADS]; + T sum_k = ScalarConvert<AccumT, T>::to(buffer[SOFTMAX_THREADS]); for (int i=i_start; i<i_end; i+=i_step) output_k[i*stride0] = output_k[i*stride0] / sum_k; } +template <typename T, typename AccumT> __global__ void cunn_SoftMax_updateGradInput_kernel( - float *gradInput, float *output, float *gradOutput, int nframe, int dim, int stride0, int stride1) + T *gradInput, T *output, T *gradOutput, int nframe, int dim, int stride0, int stride1) { - __shared__ 
float buffer[SOFTMAX_THREADS]; - float *gradInput_k = gradInput + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z; - float *output_k = output + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z; - float *gradOutput_k = gradOutput + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z; + __shared__ AccumT buffer[SOFTMAX_THREADS]; + T *gradInput_k = gradInput + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z; + T *output_k = output + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z; + T *gradOutput_k = gradOutput + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z; int i_start = threadIdx.x; int i_end = dim; int i_step = blockDim.x; // sum? - buffer[threadIdx.x] = 0; + buffer[threadIdx.x] = ScalarConvert<int, AccumT>::to(0); for (int i=i_start; i<i_end; i+=i_step) - buffer[threadIdx.x] += gradOutput_k[i*stride0] * output_k[i*stride0]; + buffer[threadIdx.x] += ScalarConvert<T, AccumT>::to(gradOutput_k[i*stride0] * output_k[i*stride0]); __syncthreads(); // reduce if (threadIdx.x == 0) { - float sum_k = 0; + AccumT sum_k = ScalarConvert<int, AccumT>::to(0); for (int i=0; i<blockDim.x; i++) sum_k += buffer[i]; buffer[0] = sum_k; @@ -98,147 +102,12 @@ __global__ void cunn_SoftMax_updateGradInput_kernel( __syncthreads(); - float sum_k = buffer[0]; + T sum_k = ScalarConvert<AccumT, T>::to(buffer[0]); for (int i=i_start; i<i_end; i+=i_step) gradInput_k[i*stride0] = output_k[i*stride0] * (gradOutput_k[i*stride0] - sum_k); } -void THNN_CudaSoftMax_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output) -{ - THCUNN_assertSameGPU(state, 2, input, output); - - input = THCudaTensor_newContiguous(state, input); - THCudaTensor_resizeAs(state, output, input); - long batchSize, dim, stride0, stride1 = 1; - long blocksY = 1, blocksZ = 1; - - if (input->nDimension == 1) - { - batchSize = 1; - dim = input->size[0]; - stride0 = 1; - } - else if (input->nDimension == 2) - { - batchSize = input->size[0]; - dim = 
input->size[1]; - stride0 = 1; - } - else if (input->nDimension == 3) - { - batchSize = 1; - dim = input->size[0]; - blocksY = input->size[1]; - blocksZ = input->size[2]; - stride0 = blocksY * blocksZ; - stride1 = blocksZ; - } - else if (input->nDimension == 4) - { - batchSize = input->size[0]; - dim = input->size[1]; - blocksY = input->size[2]; - blocksZ = input->size[3]; - stride0 = blocksY * blocksZ; - stride1 = blocksZ; - } - else - { - THError("1D, 2D, 3D or 4D tensor expected"); - } - - // when possible use only 2d grid of thread blocks to stay compatible with compute capability 2.X devices. - if (blocksY * blocksZ < 65536) - { - blocksY *= blocksZ; - blocksZ = 1; - if (input->nDimension == 3 || input->nDimension == 4) { - stride0 = blocksY * blocksZ; - stride1 = blocksZ; - } - } - - dim3 blocks(batchSize, blocksY, blocksZ); - dim3 threads(SOFTMAX_THREADS); - cunn_SoftMax_updateOutput_kernel<<<blocks, threads, 0, THCState_getCurrentStream(state)>>>( - THCudaTensor_data(state, output), - THCudaTensor_data(state, input), - batchSize, dim, stride0, stride1 - ); - THCudaCheck(cudaGetLastError()); - - THCudaTensor_free(state, input); -} - -void THNN_CudaSoftMax_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *output) -{ - THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput); - - output = THCudaTensor_newContiguous(state, output); - gradOutput = THCudaTensor_newContiguous(state, gradOutput); - - THCudaTensor_resizeAs(state, gradInput, output); - long batchSize, dim, stride0, stride1 = 1; - long blocksY = 1, blocksZ = 1; - - if (gradInput->nDimension == 1) - { - batchSize = 1; - dim = gradInput->size[0]; - stride0 = 1; - } - else if (gradInput->nDimension == 2) - { - batchSize = gradInput->size[0]; - dim = gradInput->size[1]; - stride0 = 1; - } - else if (gradInput->nDimension == 3) - { - batchSize = 1; - dim = gradInput->size[0]; - blocksY = gradInput->size[1]; - blocksZ = 
gradInput->size[2]; - stride0 = blocksY * blocksZ; - stride1 = blocksZ; - } - else if (gradInput->nDimension == 4) - { - batchSize = gradInput->size[0]; - dim = gradInput->size[1]; - blocksY = gradInput->size[2]; - blocksZ = gradInput->size[3]; - stride0 = blocksY * blocksZ; - stride1 = blocksZ; - } - else - { - THError("1D, 2D, 3D or 4D tensor expected"); - } - - // when possible use only 2d grid of thread blocks to stay compatible with compute capability 2.X devices. - if (blocksY * blocksZ < 65536) - { - blocksY *= blocksZ; - blocksZ = 1; - if (input->nDimension == 3 || input->nDimension == 4) { - stride0 = blocksY * blocksZ; - stride1 = blocksZ; - } - } - - dim3 blocks(batchSize, blocksY, blocksZ); - dim3 threads(SOFTMAX_THREADS); - cunn_SoftMax_updateGradInput_kernel<<<blocks, threads, 0, THCState_getCurrentStream(state)>>>( - THCudaTensor_data(state, gradInput), - THCudaTensor_data(state, output), - THCudaTensor_data(state, gradOutput), - batchSize, dim, stride0, stride1 - ); - THCudaCheck(cudaGetLastError()); - - THCudaTensor_free(state, gradOutput); - THCudaTensor_free(state, output); -} +#include "generic/SoftMax.cu" +#include "THCGenerateFloatTypes.h" #undef SOFTMAX_THREADS diff --git a/lib/THCUNN/SoftPlus.cu b/lib/THCUNN/SoftPlus.cu index 0d1609a..cb9ecb7 100644 --- a/lib/THCUNN/SoftPlus.cu +++ b/lib/THCUNN/SoftPlus.cu @@ -1,52 +1,42 @@ #include "THCUNN.h" -#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +template <typename T> struct softPlusupdateOutput_functor { - const float threshold; - const float beta; + const T threshold; + const T beta; - softPlusupdateOutput_functor(float threshold_, float beta_) + softPlusupdateOutput_functor(T threshold_, T beta_) : threshold(threshold_) , beta(beta_) {} - __device__ void operator()(float *output, const float *input) const - { - float betain = beta * (*input); + __device__ void operator()(T *output, const T *input) const { + T betain = beta * (*input); *output = ((betain) > 
threshold) ? *input : (1/beta) * log1p(exp(betain)); } }; -void THNN_CudaSoftPlus_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, float beta, float threshold) -{ - THCUNN_assertSameGPU(state, 2, input, output); - THCudaTensor_resizeAs(state, output, input); - THC_pointwiseApply2(state, output, input, softPlusupdateOutput_functor(threshold, beta)); -} - +template <typename T> struct softPlusupdateGradInput_functor { - const float threshold; - const float beta; + const T threshold; + const T beta; - softPlusupdateGradInput_functor(float threshold_, float beta_) + softPlusupdateGradInput_functor(T threshold_, T beta_) : threshold(threshold_) , beta(beta_) {} - __device__ void operator()(float *gradInput, const float *output, const float *gradOutput) const + __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const { - float betaout = beta * (*output); - float exp_bo = exp(betaout); + T betaout = beta * (*output); + T exp_bo = exp(betaout); *gradInput = ((betaout) > threshold) ? 
*gradOutput : *gradOutput * (exp_bo - 1) / exp_bo; } }; -void THNN_CudaSoftPlus_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, - THCudaTensor *output, float beta, float threshold) -{ - THCUNN_assertSameGPU(state, 4, input, output, gradOutput, gradInput); - THCudaTensor_resizeAs(state, gradInput, output); - THC_pointwiseApply3(state, gradInput, output, gradOutput, softPlusupdateGradInput_functor(threshold, beta)); -} +#include "generic/SoftPlus.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/SoftShrink.cu b/lib/THCUNN/SoftShrink.cu index 2a08570..7762449 100644 --- a/lib/THCUNN/SoftShrink.cu +++ b/lib/THCUNN/SoftShrink.cu @@ -1,54 +1,43 @@ #include "THCUNN.h" -#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +template <typename T> struct SoftShrinkUpdateOutput { - const float lambda_; + const T lambda_; - SoftShrinkUpdateOutput(float lambda) + SoftShrinkUpdateOutput(T lambda) : lambda_(lambda) {} - __device__ __forceinline__ void operator()(float *out, float *in) + __device__ __forceinline__ void operator()(T *out, T *in) { - float x = *in; + T x = *in; if (x > lambda_) *out = x - lambda_; else if (x < -lambda_) *out = x + lambda_; - else *out = 0; + else *out = ScalarConvert<int, T>::to(0); } }; -void THNN_CudaSoftShrink_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, double lambda) -{ - THCUNN_assertSameGPU(state, 2, input, output); - THCudaTensor_resizeAs(state, output, input); - THC_pointwiseApply2(state, output, input, SoftShrinkUpdateOutput(lambda)); - THCudaCheck(cudaGetLastError()); -} - +template <typename T> struct SoftShrinkUpdateGradInput { - const float lambda_; + const T lambda_; - SoftShrinkUpdateGradInput(float lambda) + SoftShrinkUpdateGradInput(T lambda) : lambda_(lambda) {} - __device__ __forceinline__ void operator()(float *gradInput, float *input, float *gradOutput) const + __device__ __forceinline__ void operator()(T 
*gradInput, T *input, T *gradOutput) const { - float x = *input; + T x = *input; if (x > lambda_ || x < -lambda_) *gradInput = *gradOutput; else - *gradInput = 0; + *gradInput = ScalarConvert<int, T>::to(0); } }; - -void THNN_CudaSoftShrink_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, double lambda) -{ - THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); - THCudaTensor_resizeAs(state, gradInput, input); - THC_pointwiseApply3(state, gradInput, input, gradOutput, SoftShrinkUpdateGradInput(lambda)); - THCudaCheck(cudaGetLastError()); -} +#include "generic/SoftShrink.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/SparseLinear.cu b/lib/THCUNN/SparseLinear.cu index 577eec5..9435735 100644 --- a/lib/THCUNN/SparseLinear.cu +++ b/lib/THCUNN/SparseLinear.cu @@ -1,4 +1,7 @@ #include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" + #include <cusparse.h> #include <thrust/device_vector.h> @@ -13,250 +16,70 @@ static void init_cusparse() { } } -static bool checkInput(THCudaTensor* t) -{ - return t->nDimension == 2 && t->size[1] == 3; -} - -static bool checkSize2D(THCudaTensor* t, long size0, long size1) -{ - return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1; -} - -static bool checkSize1D(THCudaTensor* t, long size0) -{ - return t->nDimension == 1 && t->size[0] == size0; -} - -void THNN_CudaSparseLinear_updateOutput(THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, - THCudaTensor *bias) -{ - THAssert(THCudaTensor_checkGPU(state, 4, input, output, weight, bias)); - - long h; - long outDim = THCudaTensor_size(state, weight, 0); - long inDim = THCudaTensor_size(state, weight, 1); - - THArgCheck(checkInput(input), 2, "input size must be nnz x 3"); - THArgCheck(THCudaTensor_nDimension(state, output) == 2, 3, "output must be batchsize x outputsize"); - THArgCheck(checkSize1D(bias, outDim), 5, "bias size wrong"); - - 
long batchnum = THCudaTensor_size(state, output, 0); - long nnz = THCudaTensor_size(state, input, 0); - - THCudaTensor *buffer = THCudaTensor_new(state); - THCudaTensor *sel = THCudaTensor_new(state); - THCudaTensor *values = THCudaTensor_new(state); - THCudaIntTensor *rowbuf = THCudaIntTensor_new(state); - THCudaIntTensor *csrPtrs = THCudaIntTensor_new(state); - THCudaIntTensor *colInds = THCudaIntTensor_new(state); - - THCudaTensor_resize1d(state, values, nnz); - THCudaIntTensor_resize1d(state, rowbuf, nnz); - THCudaIntTensor_resize1d(state, colInds, nnz); - THCudaIntTensor_resize1d(state, csrPtrs, batchnum+1); - - // Get data ready for cusparse, need CudaInt buffers - // We do not need to sort, since rows are already in order - // If rows might get out of order in future implementations, or if cusparse - // complains with an illegal memory access, sort like we do in AccGradParameters - THCudaTensor_select(state, sel, input, 1, 0); - THCudaIntTensor_copyCudaFloat(state, rowbuf, sel); - THCudaTensor_select(state, sel, input, 1, 1); - THCudaIntTensor_copyCudaFloat(state, colInds, sel); - THCudaTensor_select(state, sel, input, 1, 2); - THCudaTensor_copyCuda(state, values, sel); - - init_cusparse(); - cusparseXcoo2csr(cusparse_handle, - THCudaIntTensor_data(state, rowbuf), nnz, batchnum, - THCudaIntTensor_data(state, csrPtrs), CUSPARSE_INDEX_BASE_ONE); - - // output = bias - THCudaTensor_resize2d(state, buffer, outDim, batchnum); - THCudaTensor_zero(state, buffer); - for (h=0; h<batchnum; h++) { - THCudaTensor_select(state, sel, buffer, 1, h); - THCudaTensor_copy(state, sel, bias); - } - - // output = W * x - float one = 1; - cusparseMatDescr_t descr = 0; - cusparseCreateMatDescr(&descr); - cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE); - cusparseScsrmm(cusparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - batchnum, outDim, inDim, nnz, - &one, - descr, - THCudaTensor_data(state, values), - 
THCudaIntTensor_data(state, csrPtrs), - THCudaIntTensor_data(state, colInds), - THCudaTensor_data(state, weight), inDim, - &one, THCudaTensor_data(state, buffer), batchnum - ); - THCudaTensor_transpose(state, buffer, NULL, 0, 1); - - // We do work in the buffer to keep the output contiguous - THCudaTensor_copy(state, output, buffer); - - cusparseDestroyMatDescr(descr); - descr = 0; - THCudaTensor_free(state, buffer); - THCudaTensor_free(state, sel); - THCudaTensor_free(state, values); - THCudaIntTensor_free(state, rowbuf); - THCudaIntTensor_free(state, colInds); - THCudaIntTensor_free(state, csrPtrs); +void THNN_CudaHalfSparseLinear_updateOutput( + THCState *state, + THCudaHalfTensor *input, + THCudaHalfTensor *output, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias) { + THError("THCudaHalfTensor not supported with SparseLinear"); } -void THNN_CudaSparseLinear_accGradParameters( +void THNN_CudaHalfSparseLinear_accGradParameters( THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, - THCudaTensor *weight, - THCudaTensor *bias, + THCudaHalfTensor *input, + THCudaHalfTensor *gradOutput, + THCudaHalfTensor *gradWeight, + THCudaHalfTensor *gradBias, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias, double weightDecay, - double scale) -{ - long outDim = THCudaTensor_size(state, weight, 0); - long inDim = THCudaTensor_size(state, weight, 1); - - THArgCheck(checkInput(input), 2, "input size must be batchsize x nnz x 2"); - THArgCheck(checkSize2D(gradWeight, outDim, inDim), 4, "gradWeight size wrong"); - THArgCheck(checkSize1D(gradBias, outDim), 5, "gradBias size wrong"); - THArgCheck(THCudaTensor_isContiguous(state, gradOutput), 3, - "gradOutput must be contiguous"); - - long nnz = THCudaTensor_size(state, input, 0); - long batchnum = THCudaTensor_size(state, gradOutput, 0); - - THCudaTensor *buf = THCudaTensor_new(state); - THCudaTensor *cols = THCudaTensor_new(state); - THCudaTensor *sel = 
THCudaTensor_new(state); - THCudaLongTensor *inds = THCudaLongTensor_new(state); - THCudaTensor *values = THCudaTensor_new(state); - THCudaIntTensor *colbuf = THCudaIntTensor_new(state); - THCudaIntTensor *colPtrs = THCudaIntTensor_new(state); - THCudaIntTensor *rowInds = THCudaIntTensor_new(state); - - THCudaTensor_select(state, sel, input, 1, 0); // rowInds - THCudaTensor_select(state, cols, input, 1, 1); // colInds - THCudaTensor_cadd(state, buf, sel, batchnum, cols); // colInds * buatchdim + rowInds - THCudaTensor_sort(state, buf, inds, buf, 0, 0); // Indicies are now in ind - THCudaTensor_indexSelect(state, buf, input, 0, inds); - - THCudaTensor_resize1d(state, values, nnz); - THCudaIntTensor_resize1d(state, colbuf, nnz); - THCudaIntTensor_resize1d(state, rowInds, nnz); - THCudaIntTensor_resize1d(state, colPtrs, inDim+1); - - // Get data ready for cusparse, need CudaInt buffers - THCudaTensor_select(state, sel, buf, 1, 0); - THCudaIntTensor_copyCudaFloat(state, rowInds, sel); - THCudaTensor_select(state, sel, buf, 1, 1); - THCudaIntTensor_copyCudaFloat(state, colbuf, sel); - THCudaTensor_select(state, sel, buf, 1, 2); - THCudaTensor_copyCuda(state, values, sel); - - init_cusparse(); - // Secretly coo2csc - cusparseXcoo2csr(cusparse_handle, - THCudaIntTensor_data(state, colbuf), nnz, inDim, - THCudaIntTensor_data(state, colPtrs), CUSPARSE_INDEX_BASE_ONE); - - // FORTRAN expects contiguous col-major matricies - THCudaTensor_transpose(state, gradOutput, NULL, 0, 1); - THCudaTensor_resize2d(state, buf, batchnum, outDim); - THCudaTensor_copy(state, buf, gradOutput); - THCudaTensor_transpose(state, gradOutput, NULL, 0, 1); // Restore gradOutput - - float one = 1; - cusparseMatDescr_t descr = 0; - cusparseCreateMatDescr(&descr); - cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE); - cusparseScsrmm(cusparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - inDim, outDim, batchnum, nnz, - &one, - descr, - 
THCudaTensor_data(state, values), - THCudaIntTensor_data(state, colPtrs), - THCudaIntTensor_data(state, rowInds), - THCudaTensor_data(state, buf), batchnum, - &one, THCudaTensor_data(state, gradWeight), inDim - ); - - THCudaTensor_sum(state, buf, gradOutput, 0); - THCudaTensor_resize1d(state, buf, outDim); - THCudaTensor_cadd(state, gradBias, gradBias, scale, buf); - - if (weightDecay != 0) - { - THCudaTensor_cadd(state, gradWeight, gradWeight, weightDecay, weight); - THCudaTensor_cadd(state, gradBias, gradBias, weightDecay, bias); - } - - THCudaTensor_free(state, buf); - THCudaTensor_free(state, sel); - THCudaTensor_free(state, cols); - THCudaLongTensor_free(state, inds); - THCudaTensor_free(state, values); - THCudaIntTensor_free(state, colbuf); - THCudaIntTensor_free(state, rowInds); - THCudaIntTensor_free(state, colPtrs); + double scale) { + THError("THCudaHalfTensor not supported with SparseLinear"); } -void THNN_CudaSparseLinear_legacyUpdateOutput( +void THNN_CudaHalfSparseLinear_legacyUpdateOutput( THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, - THCudaTensor *bias) { - THError("CUDA does not support legacy input format, please use a table of nnz x 2 vectors"); + THCudaHalfTensor *input, + THCudaHalfTensor *output, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias) { + THError("THCudaHalfTensor not supported with SparseLinear"); } -void THNN_CudaSparseLinear_legacyAccGradParameters( + +void THNN_CudaHalfSparseLinear_legacyAccGradParameters( THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, - THCudaTensor *weight, - THCudaTensor *bias, + THCudaHalfTensor *input, + THCudaHalfTensor *gradOutput, + THCudaHalfTensor *gradWeight, + THCudaHalfTensor *gradBias, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias, double weightDecay, double scale) { - THError("CUDA does not support legacy input format, please use a table of nnz x 2 vectors"); + 
THError("THCudaHalfTensor not supported with SparseLinear"); } -// Dense updates are pretty fast on the GPU -void THNN_CudaSparseLinear_zeroGradParameters( +void THNN_CudaHalfSparseLinear_zeroGradParameters( THCState *state, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, - THCudaTensor *lastInput) { - THCudaTensor_zero(state, gradWeight); - THCudaTensor_zero(state, gradBias); + THCudaHalfTensor *gradWeight, + THCudaHalfTensor *gradBias, + THCudaHalfTensor *lastInput) { + THError("THCudaHalfTensor not supported with SparseLinear"); } -TH_API void THNN_CudaSparseLinear_updateParameters( +void THNN_CudaHalfSparseLinear_updateParameters( THCState *state, - THCudaTensor *weight, - THCudaTensor *bias, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, - THCudaTensor *lastInput, + THCudaHalfTensor *weight, + THCudaHalfTensor *bias, + THCudaHalfTensor *gradWeight, + THCudaHalfTensor *gradBias, + THCudaHalfTensor *lastInput, double learningRate) { - THCudaTensor_cadd(state, weight, weight, -learningRate, gradWeight); - THCudaTensor_cadd(state, bias, bias, -learningRate, gradBias); + THError("THCudaHalfTensor not supported with SparseLinear"); } -void THNN_CudaSparseLinear_cudaClearState(THCState *state) { -} +#include "generic/SparseLinear.cu" +#include "THCGenerateFloatType.h" +#include "generic/SparseLinear.cu" +#include "THCGenerateDoubleType.h" diff --git a/lib/THCUNN/SpatialAdaptiveMaxPooling.cu b/lib/THCUNN/SpatialAdaptiveMaxPooling.cu index 5dd8659..2957953 100644 --- a/lib/THCUNN/SpatialAdaptiveMaxPooling.cu +++ b/lib/THCUNN/SpatialAdaptiveMaxPooling.cu @@ -1,5 +1,7 @@ #include "THCUNN.h" -#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" #define CUDA_MAX_THREADS 1024 // this is safe, in reality 256 is our limit @@ -8,7 +10,8 @@ * this function adaptively maxpools an input 4D tensor along dimensions 2 and 3 * 4D input, 4D output, 4D argmax x and y */ -__global__ void adaptivemaxpool(float *input, float 
*output, float *indices_x, float *indices_y, + template <typename T> +__global__ void adaptivemaxpool(T *input, T *output, THCIndex_t *indices_x, THCIndex_t *indices_y, int input_n, int input_h, int input_w, int output_h, int output_w, int strideh, int stridew, @@ -29,7 +32,6 @@ __global__ void adaptivemaxpool(float *input, float *output, float *indices_x, f int yy_start = blockDim.y*blockIdx.y + threadIdx.y; int yy_end = output_h; const int yy_step = blockDim.y*gridDim.y; - // select input/output plane output = output + o*output_w*output_h; input = input + i*strided; @@ -50,17 +52,17 @@ __global__ void adaptivemaxpool(float *input, float *output, float *indices_x, f int kW = x_end-x_start; // Compute the mean of the input image... - float *ptr_input = input + y_start*strideh + x_start*stridew; - float *ptr_output = output + yy*output_w + xx; - float *ptr_ind_x = indices_x + yy*output_w + xx; - float *ptr_ind_y = indices_y + yy*output_w + xx; + T *ptr_input = input + y_start*strideh + x_start*stridew; + T *ptr_output = output + yy*output_w + xx; + THCIndex_t *ptr_ind_x = indices_x + yy*output_w + xx; + THCIndex_t *ptr_ind_y = indices_y + yy*output_w + xx; int argmax_x = -1; int argmax_y = -1; - float max = -FLT_MAX; + T max = THCNumerics<T>::min(); int kx, ky; for(ky = 0; ky < kH; ky++) { for(kx = 0; kx < kW; kx++) { - float val = ptr_input[kx*stridew]; + T val = ptr_input[kx*stridew]; if (val > max) { max = val; argmax_x = kx; @@ -81,7 +83,8 @@ __global__ void adaptivemaxpool(float *input, float *output, float *indices_x, f * Description: * this function computes the gradInput from weight and gradOutput */ -__global__ void adaptivemaxgradinput(float *gradInput, float *gradOutput, float *indices_x, float *indices_y, + template <typename T> +__global__ void adaptivemaxgradinput(T *gradInput, T *gradOutput, THCIndex_t *indices_x, THCIndex_t *indices_y, int input_n, int input_h, int input_w, int output_h, int output_w) { @@ -116,11 +119,11 @@ __global__ void 
adaptivemaxgradinput(float *gradInput, float *gradOutput, float int x_start = (int)floor(float(xx) / output_w * input_w); - float *ptr_gradInput = gradInput + y_start*input_w + x_start; - float *ptr_gradOutput = gradOutput + yy*output_w + xx; - float *ptr_ind_x = indices_x + yy*output_w + xx; - float *ptr_ind_y = indices_y + yy*output_w + xx; - float z = *ptr_gradOutput; + T *ptr_gradInput = gradInput + y_start*input_w + x_start; + T *ptr_gradOutput = gradOutput + yy*output_w + xx; + THCIndex_t *ptr_ind_x = indices_x + yy*output_w + xx; + THCIndex_t *ptr_ind_y = indices_y + yy*output_w + xx; + T z = *ptr_gradOutput; int argmax_x = (*ptr_ind_x) - TH_INDEX_BASE; int argmax_y = (*ptr_ind_y) - TH_INDEX_BASE; @@ -135,8 +138,9 @@ __global__ void adaptivemaxgradinput(float *gradInput, float *gradOutput, float * this function computes the gradInput from weight and gradOutput * when kH != dH or kW != dW (uses atomic add) */ + template <typename T> __global__ void atomicadaptivemaxgradinput( - float *gradInput, float *gradOutput, float *indices_x, float *indices_y, + T *gradInput, T *gradOutput, THCIndex_t *indices_x, THCIndex_t *indices_y, int input_n, int input_h, int input_w, int output_h, int output_w ) { @@ -170,11 +174,11 @@ __global__ void atomicadaptivemaxgradinput( int x_start = (int)floor(float(xx) / output_w * input_w); - float *ptr_gradInput = gradInput + y_start*input_w + x_start; - float *ptr_gradOutput = gradOutput + yy*output_w + xx; - float *ptr_ind_x = indices_x + yy*output_w + xx; - float *ptr_ind_y = indices_y + yy*output_w + xx; - float z = *ptr_gradOutput; + T *ptr_gradInput = gradInput + y_start*input_w + x_start; + T *ptr_gradOutput = gradOutput + yy*output_w + xx; + THCIndex_t *ptr_ind_x = indices_x + yy*output_w + xx; + THCIndex_t *ptr_ind_y = indices_y + yy*output_w + xx; + T z = *ptr_gradOutput; int argmax_x = (*ptr_ind_x) - TH_INDEX_BASE; int argmax_y = (*ptr_ind_y) - TH_INDEX_BASE; @@ -185,174 +189,7 @@ __global__ void 
atomicadaptivemaxgradinput( } } -void THNN_CudaSpatialAdaptiveMaxPooling_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices, int nOutputCols, int nOutputRows) -{ - THCUNN_assertSameGPU(state, 3, input, output, indices); - - float *indices_data; - float *output_data; - float *input_data; - - THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected"); - - if (input->nDimension == 3) { - long nInputCols = input->size[2]; - long nInputRows = input->size[1]; - long nInputPlane = input->size[0]; - - long istride_d = input->stride[0]; - long istride_h = input->stride[1]; - long istride_w = input->stride[2]; - - input_data = THCudaTensor_data(state, input); - - THCudaTensor_resize3d(state, output, nInputPlane, nOutputRows, nOutputCols); - THCudaTensor_resize4d(state, indices, 2, nInputPlane, nOutputRows, nOutputCols); - - indices_data = THCudaTensor_data(state, indices); - output_data = THCudaTensor_data(state, output); - - // cuda blocks & threads: - int yblocks = (int)(16L / nInputPlane); - yblocks = yblocks < 1 ? 
1 : yblocks; - dim3 blocks(nInputPlane,yblocks); - dim3 threads(32,8); - - // run maxpool kernel - adaptivemaxpool <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (input_data, output_data, - indices_data+nInputPlane*nOutputCols*nOutputRows, indices_data, - nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, - istride_h, istride_w, istride_d); - THCudaCheck(cudaGetLastError()); - - } else { - long nInputCols = input->size[3]; - long nInputRows = input->size[2]; - long nInputPlane = input->size[1]; - long nbatch = input->size[0]; - - long istride_d = input->stride[1]; - long istride_h = input->stride[2]; - long istride_w = input->stride[3]; - - input = THCudaTensor_newContiguous(state, input); - input_data = THCudaTensor_data(state, input); - - THCudaTensor_resize4d(state, output, nbatch, nInputPlane, nOutputRows, nOutputCols); - THCudaTensor_resize5d(state, indices, 2, nbatch, nInputPlane, nOutputRows, nOutputCols); - - indices_data = THCudaTensor_data(state, indices); - output_data = THCudaTensor_data(state, output); - - // cuda blocks & threads: - int yblocks = (int)(16L / nInputPlane); - yblocks = yblocks < 1 ? 
1 : yblocks; - dim3 blocks(nInputPlane*nbatch,yblocks); - dim3 threads(32,8); - - // run maxpool kernel - adaptivemaxpool <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (input_data, output_data, - indices_data+nbatch*nInputPlane*nOutputCols*nOutputRows, indices_data, - nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, - istride_h, istride_w, istride_d); - THCudaCheck(cudaGetLastError()); - // clean - THCudaTensor_free(state, input); - } -} - -void THNN_CudaSpatialAdaptiveMaxPooling_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *indices) -{ - bool atomic = true; // suboptimal, but without atomic it doesn't pass the tests - - THCUNN_assertSameGPU(state, 4, input, indices, gradOutput, gradInput); - - float *indices_data; - float *gradInput_data; - float *gradOutput_data; - - gradOutput = THCudaTensor_newContiguous(state, gradOutput); - - if (input->nDimension == 3) { - long nInputCols = input->size[2]; - long nInputRows = input->size[1]; - long nInputPlane = input->size[0]; - long nOutputCols = gradOutput->size[2]; - long nOutputRows = gradOutput->size[1]; - - //bool atomic = (nInputCols%nOutputCols != 0) || (nInputRows%nOutputRows != 0); - - THCudaTensor_resizeAs(state, gradInput, input); - THCudaTensor_zero(state, gradInput); - - indices_data = THCudaTensor_data(state, indices); - gradOutput_data = THCudaTensor_data(state, gradOutput); - gradInput_data = THCudaTensor_data(state, gradInput); - - // cuda blocks & threads: - int yblocks = (int)(16L / nInputPlane); - yblocks = yblocks < 1 ? 
1 : yblocks; - dim3 blocks(nInputPlane,yblocks); - dim3 threads(32,8); - - if(atomic) - { - // run updateGradInput kernel, accumulate gradients atomically - atomicadaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data, - indices_data+nInputPlane*nOutputCols*nOutputRows, indices_data, - nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols); - } - else - { - // run updateGradInput kernel - atomicadaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data, - indices_data+nInputPlane*nOutputCols*nOutputRows, indices_data, - nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols); - } - THCudaCheck(cudaGetLastError()); - } else { - long nInputCols = input->size[3]; - long nInputRows = input->size[2]; - long nInputPlane = input->size[1]; - long nbatch = input->size[0]; - long nOutputCols = gradOutput->size[3]; - long nOutputRows = gradOutput->size[2]; - - //bool atomic = //(nInputCols%nOutputCols != 0) || (nInputRows%nOutputRows != 0); - - THCudaTensor_resizeAs(state, gradInput, input); - THCudaTensor_zero(state, gradInput); - - indices_data = THCudaTensor_data(state, indices); - gradOutput_data = THCudaTensor_data(state, gradOutput); - gradInput_data = THCudaTensor_data(state, gradInput); - - // cuda blocks & threads: - int yblocks = (int)(16L / nInputPlane); - yblocks = yblocks < 1 ? 
1 : yblocks; - dim3 blocks(nInputPlane*nbatch,yblocks); - dim3 threads(32,8); - - if(atomic) - { - // run updateGradInput kernel, accumulate gradients atomically - atomicadaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data, - indices_data+nbatch*nInputPlane*nOutputCols*nOutputRows, indices_data, - nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols); - } - else - { - // run updateGradInput kernel, accumulate gradients atomically - adaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data, - indices_data+nbatch*nInputPlane*nOutputCols*nOutputRows, indices_data, - nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols); - } - THCudaCheck(cudaGetLastError()); - } - - // clean - THCudaTensor_free(state,gradOutput); - -} +#include "generic/SpatialAdaptiveMaxPooling.cu" +#include "THCGenerateFloatTypes.h" #undef CUDA_MAX_THREADS diff --git a/lib/THCUNN/SpatialAveragePooling.cu b/lib/THCUNN/SpatialAveragePooling.cu index 65060c9..5f77e06 100644 --- a/lib/THCUNN/SpatialAveragePooling.cu +++ b/lib/THCUNN/SpatialAveragePooling.cu @@ -1,7 +1,9 @@ #include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" #include "common.h" -template <typename Dtype, bool COUNT_INCLUDE_PAD> +template <typename Dtype, typename Acctype, bool COUNT_INCLUDE_PAD> __global__ void AvePoolForward(const int nthreads, const Dtype* const bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, @@ -22,7 +24,7 @@ __global__ void AvePoolForward(const int nthreads, wstart = max(wstart, 0); hend = min(hend, height); wend = min(wend, width); - Dtype aveval = 0; + Acctype aveval = Acctype(0); const Dtype* const bottom_slice = bottom_data + (n * channels + c) * height * width; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -30,87 +32,13 @@ __global__ void AvePoolForward(const int 
nthreads, } } if(COUNT_INCLUDE_PAD) - top_data[index] = aveval / pool_size; + top_data[index] = ScalarConvert<Acctype, Dtype>::to(aveval / pool_size); else - top_data[index] = aveval / ((hend - hstart) * (wend - wstart)); + top_data[index] = ScalarConvert<Acctype, Dtype>::to(aveval / ((hend - hstart) * (wend - wstart))); } } - -void THNN_CudaSpatialAveragePooling_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode, bool count_include_pad) -{ - THCUNN_assertSameGPU(state, 2, input, output); - THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected"); - - long nInputCols, nInputRows, nInputPlane, batchSize; - long nOutputCols, nOutputRows; - - if (input->nDimension == 3) { - nInputCols = input->size[2]; - nInputRows = input->size[1]; - nInputPlane = input->size[0]; - batchSize = 1; - } - else - { - nInputCols = input->size[3]; - nInputRows = input->size[2]; - nInputPlane = input->size[1]; - batchSize = input->size[0]; - } - - THArgCheck(nInputCols >= kW - 2*padW && nInputRows >= kH - 2*padH, 2, "input image smaller than kernel size"); - THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size"); - - if(ceil_mode) { - nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1; - nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1; - } - else { - nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1; - nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1; - } - if (padW || padH) - { - // ensure that the last pooling starts inside the image - // needed to avoid problems in ceil mode - if ((nOutputRows - 1)*dH >= nInputRows + padH) - --nOutputRows; - if ((nOutputCols - 1)*dW >= nInputCols + padW) - --nOutputCols; - } - - input = THCudaTensor_newContiguous(state, input); - float* input_data = THCudaTensor_data(state, input); - - 
THCudaTensor_resize4d(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols); - - float* output_data = THCudaTensor_data(state, output); - - int count = THCudaTensor_nElement(state, output); - - if(count_include_pad) - AvePoolForward<float, true> - <<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>( - count, input_data, - batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, - kH, kW, dH, dW, padH, padW, output_data); - else - AvePoolForward<float, false> - <<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>( - count, input_data, - batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, - kH, kW, dH, dW, padH, padW, output_data); - THCudaCheck(cudaGetLastError()); - - if(input->nDimension == 3) - THCudaTensor_resize3d(state, output, nInputPlane, nOutputRows, nOutputCols); - - THCudaTensor_free(state, input); - -} - -template <typename Dtype, bool COUNT_INCLUDE_PAD> +template <typename Dtype, typename Acctype, bool COUNT_INCLUDE_PAD> __global__ void AvePoolBackward(const int nthreads, const Dtype* const top_diff, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, @@ -128,7 +56,7 @@ __global__ void AvePoolBackward(const int nthreads, const Dtype* const top_diff, const int phend = min(h / stride_h + 1, pooled_height); const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; const int pwend = min(w / stride_w + 1, pooled_width); - Dtype gradient = 0; + Acctype gradient = Acctype(0); const Dtype* const top_diff_slice = top_diff + (n * channels + c) * pooled_height * pooled_width; for (int ph = phstart; ph < phend; ++ph) { @@ -149,76 +77,9 @@ __global__ void AvePoolBackward(const int nthreads, const Dtype* const top_diff, gradient += top_diff_slice[ph * pooled_width + pw] / ((hend - hstart) * (wend - wstart)); } } - bottom_diff[index] = gradient; - } -} - -void THNN_CudaSpatialAveragePooling_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode, bool count_include_pad) -{ - THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); - - input = THCudaTensor_newContiguous(state, input); - gradOutput = THCudaTensor_newContiguous(state, gradOutput); - - long nInputCols, nInputRows, nInputPlane, batchSize; - long nOutputCols, nOutputRows; - - if (input->nDimension == 3) { - nInputCols = input->size[2]; - nInputRows = input->size[1]; - nInputPlane = input->size[0]; - batchSize = 1; - } - else - { - nInputCols = input->size[3]; - nInputRows = input->size[2]; - nInputPlane = input->size[1]; - batchSize = input->size[0]; + bottom_diff[index] = ScalarConvert<Acctype, Dtype>::to(gradient); } - - if(ceil_mode) { - nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1; - nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1; - } - else { - nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1; - nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1; - } - if (padW || padH) - { - // ensure that the last pooling starts inside the image - // needed to avoid problems in ceil mode - if ((nOutputRows - 1)*dH >= nInputRows + padH) - --nOutputRows; - if ((nOutputCols - 1)*dW >= nInputCols + padW) - --nOutputCols; - } - - 
THCudaTensor_resizeAs(state, gradInput, input); - - int count = THCudaTensor_nElement(state, input); - - if(count_include_pad) - AvePoolBackward<float, true> - <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> - (count, - THCudaTensor_data(state, gradOutput), - batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, - kH, kW, dH, dW, padH, padW, - THCudaTensor_data(state, gradInput)); - else - AvePoolBackward<float, false> - <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> - (count, - THCudaTensor_data(state, gradOutput), - batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, - kH, kW, dH, dW, padH, padW, - THCudaTensor_data(state, gradInput)); - THCudaCheck(cudaGetLastError()); - - // clean - THCudaTensor_free(state, input); - THCudaTensor_free(state, gradOutput); } +#include "generic/SpatialAveragePooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/SpatialClassNLLCriterion.cu b/lib/THCUNN/SpatialClassNLLCriterion.cu index 0469520..4b2f523 100644 --- a/lib/THCUNN/SpatialClassNLLCriterion.cu +++ b/lib/THCUNN/SpatialClassNLLCriterion.cu @@ -1,29 +1,30 @@ #include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" #include "common.h" -#include <stdio.h> -#include <assert.h> - #include <thrust/functional.h> +template <typename T, typename AccumT> __global__ void cunn_SpatialClassNLLCriterion_updateOutput_kernel( - float *output, - float *total_weight, - float *input, - long *target, - float *weights, + T *output, + T *total_weight, + T *input, + THCIndex_t *target, + T *weights, int size_average, int batch_size, int n_classes, int map_nelem, int blocks_per_sample) { - __shared__ float partial_sums[CUDA_NUM_THREADS]; + __shared__ AccumT partial_sums[CUDA_NUM_THREADS]; int i, t; - float cur_weight; - float input_sum = 0; - float acc_weight = 0; + T cur_weight; + AccumT input_sum = 0; + AccumT acc_weight = 
0; int sample = blockIdx.x / blocks_per_sample; int toffset = sample * map_nelem; @@ -34,30 +35,31 @@ __global__ void cunn_SpatialClassNLLCriterion_updateOutput_kernel( i += step) { t = target[toffset + i] - TH_INDEX_BASE; assert(t >= 0 && t < n_classes); - cur_weight = weights ? weights[t] : 1.0f; + cur_weight = weights ? weights[t] : ScalarConvert<int, T>::to(1); input_sum -= input[ioffset + i + map_nelem * t] * cur_weight; acc_weight += cur_weight; } __syncthreads(); - input_sum = reduceBlock(partial_sums, blockDim.x, input_sum, thrust::plus<float>(), 0.0f); - acc_weight = reduceBlock(partial_sums, blockDim.x, acc_weight, thrust::plus<float>(), 0.0f); + input_sum = reduceBlock(partial_sums, blockDim.x, input_sum, thrust::plus<AccumT>(), AccumT(0)); + acc_weight = reduceBlock(partial_sums, blockDim.x, acc_weight, thrust::plus<AccumT>(), AccumT(0)); if (threadIdx.x == 0) { - atomicAdd(total_weight, acc_weight); + atomicAdd(total_weight, ScalarConvert<AccumT, T>::to(acc_weight)); if (size_average && acc_weight > 0) - atomicAdd(output, input_sum / acc_weight / gridDim.x); + atomicAdd(output, ScalarConvert<AccumT, T>::to(input_sum / acc_weight / gridDim.x)); else - atomicAdd(output, input_sum); + atomicAdd(output, ScalarConvert<AccumT, T>::to(input_sum)); } } +template<typename T> __global__ void cunn_SpatialClassNLLCriterion_updateGradInput_kernel( - float *gradInput, - long *target, - float *weights, - float *total_weight, + T *gradInput, + THCIndex_t *target, + T *weights, + T *total_weight, int size_average, int batch_size, int n_classes, @@ -68,7 +70,7 @@ __global__ void cunn_SpatialClassNLLCriterion_updateGradInput_kernel( return; int i, t; - float norm = size_average ? (1.0f / *total_weight) : 1.0f; + T norm = size_average ? 
(ScalarConvert<int, T>::to(1) / *total_weight) : ScalarConvert<int, T>::to(1); int sample = blockIdx.x / blocks_per_sample; int step = blockDim.x * blocks_per_sample; @@ -79,127 +81,9 @@ __global__ void cunn_SpatialClassNLLCriterion_updateGradInput_kernel( i += step) { t = (int)target[toffset + i] - TH_INDEX_BASE; assert(t >= 0 && t < n_classes); - gradInput[ioffset + i + map_nelem * t] = -(weights ? weights[t] : 1.0f) * norm; + gradInput[ioffset + i + map_nelem * t] = -(weights ? weights[t] : ScalarConvert<int, T>::to(1)) * norm; } } -void THNN_CudaSpatialClassNLLCriterion_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaLongTensor *target, - THCudaTensor *output, - bool sizeAverage, - THCudaTensor *weights, - THCudaTensor *total_weight) -{ - THArgCheck(THCudaLongTensor_nDimension(state, target) == 3, 1, - "only batches of spatial targets supported (3D tensors)"); - THArgCheck(THCudaTensor_nDimension(state, input) == 4, 2, - "only batches of spatial inputs supported (4D tensors)"); - if (weights && THCudaTensor_nElement(state, weights) != THCudaTensor_size(state, input, 1)) { - THError("weight tensor should be defined either for all or no classes"); - } - - if (weights) - THCUNN_assertSameGPU(state, 5, input, target, weights, output, total_weight); - else - THCUNN_assertSameGPU(state, 4, input, target, output, total_weight); - - input = THCudaTensor_newContiguous(state, input); - weights = weights ? THCudaTensor_newContiguous(state, weights) : NULL; - target = THCudaLongTensor_newContiguous(state, target); - - float *input_data = THCudaTensor_data(state, input); - float *weights_data = weights ? 
THCudaTensor_data(state, weights) : NULL; - long *target_data = THCudaLongTensor_data(state, target); - float *output_data = THCudaTensor_data(state, output); - float *total_weight_data = THCudaTensor_data(state, total_weight); - - long batch_size = THCudaLongTensor_size(state, target, 0); - long map_nelem = THCudaLongTensor_nElement(state, target) / batch_size; - int blocks_per_sample = GET_BLOCKS(map_nelem) / 128; - blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample; - int total_blocks = blocks_per_sample * batch_size; - - THCudaTensor_fill(state, output, 0); - THCudaTensor_fill(state, total_weight, 0); - - cunn_SpatialClassNLLCriterion_updateOutput_kernel - <<<total_blocks, CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>( - output_data, - total_weight_data, - input_data, - target_data, - weights_data, - sizeAverage, - THCudaTensor_size(state, input, 0), - THCudaTensor_size(state, input, 1), - THCudaTensor_size(state, input, 2) * THCudaTensor_size(state, input, 3), - blocks_per_sample - ); - THCudaCheck(cudaGetLastError()); - - if (weights) - THCudaTensor_free(state, weights); - THCudaLongTensor_free(state, target); - THCudaTensor_free(state, input); -} - -void THNN_CudaSpatialClassNLLCriterion_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaLongTensor *target, - THCudaTensor *gradInput, - bool sizeAverage, - THCudaTensor *weights, - THCudaTensor *total_weight) -{ - THArgCheck(THCudaLongTensor_nDimension(state, target) == 3, 1, - "only batches of spatial targets supported (3D tensors)"); - THArgCheck(THCudaTensor_nDimension(state, input) == 4, 2, - "only batches of spatial inputs supported (4D tensors)"); - THArgCheck(THCudaTensor_isContiguous(state, gradInput), 4, - "gradInput must be contiguous"); - if (weights && THCudaTensor_nElement(state, weights) != THCudaTensor_size(state, input, 1)) { - THError("weight tensor should be defined either for all or no classes"); - } - - if (weights) - THCUNN_assertSameGPU(state, 
5, weights, input, target, gradInput, total_weight); - else - THCUNN_assertSameGPU(state, 4, input, target, gradInput, total_weight); - - input = THCudaTensor_newContiguous(state, input); - weights = weights ? THCudaTensor_newContiguous(state, weights) : NULL; - target = THCudaLongTensor_newContiguous(state, target); - - float *weights_data = weights ? THCudaTensor_data(state, weights) : NULL; - float *gradInput_data = THCudaTensor_data(state, gradInput); - long *target_data = THCudaLongTensor_data(state, target); - float *total_weight_data = THCudaTensor_data(state, total_weight); - - long batch_size = THCudaLongTensor_size(state, target, 0); - long map_nelem = THCudaLongTensor_nElement(state, target) / batch_size; - int blocks_per_sample = GET_BLOCKS(map_nelem) / 128; - blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample; - int total_blocks = blocks_per_sample * batch_size; - - cunn_SpatialClassNLLCriterion_updateGradInput_kernel - <<<total_blocks, CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>( - gradInput_data, - target_data, - weights_data, - total_weight_data, - sizeAverage, - THCudaTensor_size(state, input, 0), - THCudaTensor_size(state, input, 1), - THCudaTensor_size(state, input, 2) *THCudaTensor_size(state, input, 3), - blocks_per_sample - ); - THCudaCheck(cudaGetLastError()); - - if (weights) - THCudaTensor_free(state, weights); - THCudaLongTensor_free(state, target); - THCudaTensor_free(state, input); -} +#include "generic/SpatialClassNLLCriterion.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/SpatialConvolutionLocal.cu b/lib/THCUNN/SpatialConvolutionLocal.cu index 7f758ef..e5b1f98 100644 --- a/lib/THCUNN/SpatialConvolutionLocal.cu +++ b/lib/THCUNN/SpatialConvolutionLocal.cu @@ -2,321 +2,8 @@ #include "common.h" #include "im2col.h" -void THNN_CudaSpatialConvolutionLocal_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, - THCudaTensor *bias, - THCudaTensor 
*finput, - THCudaTensor *fgradInput, - int kW, int kH, - int dW, int dH, - int padW, int padH, - long inputWidth, long inputHeight, - long outputWidth, long outputHeight) -{ - THCUNN_assertSameGPU(state, 5, input, output, weight, - bias, finput); +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" - long nInputPlane = THCudaTensor_size(state,weight,2)/(kW*kH); - long nOutputPlane = THCudaTensor_size(state,weight,1); - - int batch = 1; - if (input->nDimension == 3) { - // Force batch - batch = 0; - THCudaTensor_resize4d(state, input, 1, nInputPlane, inputHeight, inputWidth); - } - - // Batch size + input planes - long batchSize = input->size[0]; - - // Resize output - THCudaTensor_resize4d(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); - - // Augment the input - THCudaTensor_resize3d(state, finput, batchSize, nInputPlane*kW*kH, outputHeight*outputWidth); - - // Helpers - THCudaTensor *input_n = THCudaTensor_new(state); - THCudaTensor *finput_n = THCudaTensor_new(state); - THCudaTensor *output_n = THCudaTensor_new(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) { - THCudaTensor *finput3d, *output3d; - THCudaTensor *wslice = THCudaTensor_new(state); - THCudaTensor *islice = THCudaTensor_new(state); - THCudaTensor *oslice = THCudaTensor_new(state); - - // Matrix mulitply per output: - THCudaTensor_select(state, input_n, input, 0, elt); - THCudaTensor_select(state, finput_n, finput, 0, elt); - THCudaTensor_select(state, output_n, output, 0, elt); - - // Extract columns: - im2col( - THCState_getCurrentStream(state), - THCudaTensor_data(state, input_n), - nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, - 1, 1, THCudaTensor_data(state, finput_n) - ); - - output3d = THCudaTensor_newWithStorage3d(state, output_n->storage, output_n->storageOffset, - outputHeight*outputWidth, 1, - nOutputPlane, outputHeight*outputWidth, - 1, nOutputPlane*outputHeight*outputWidth); - - finput3d = 
THCudaTensor_newWithStorage3d(state, finput_n->storage, finput_n->storageOffset, - outputHeight*outputWidth, 1, - kW*kH*nInputPlane, outputHeight*outputWidth, - 1, kW*kH*nInputPlane*outputHeight*outputWidth); - - THCudaTensor_copy(state, output_n, bias); - - for (int i = 0; i < outputHeight; i++) { - for(int j = 0; j < outputWidth; j++) { - int sliceidx = i * outputWidth + j; - THCudaTensor_select(state, wslice, weight, 0, sliceidx); - THCudaTensor_select(state, islice, finput3d, 0, sliceidx); - THCudaTensor_select(state, oslice, output3d, 0, sliceidx); - THCudaTensor_addmm(state, oslice, 1.0, oslice, 1.0, wslice, islice); - } - } - - - // weight: oH*oW x nOutputPlane x nInputPlane*kH*kW - // finput3d: oH*oW x nInputPlane*kH*kW x 1 - // THCudaTensor_baddbmm(state, output3d, 1.0, output3d, 1.0, weight, finput3d); - // output3d: oH*oW x nOutputPlane x 1 - - THCudaTensor_free(state, output3d); - THCudaTensor_free(state, finput3d); - THCudaTensor_free(state, wslice); - THCudaTensor_free(state, islice); - THCudaTensor_free(state, oslice); - } - - // Free - THCudaTensor_free(state, input_n); - THCudaTensor_free(state, finput_n); - THCudaTensor_free(state, output_n); - - // Resize output - if (batch == 0) { - THCudaTensor_resize3d(state, output, nOutputPlane, outputHeight, outputWidth); - THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); - } -} - -void THNN_CudaSpatialConvolutionLocal_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *weight, - THCudaTensor *finput, - THCudaTensor *fgradInput, - int kW, int kH, - int dW, int dH, - int padW, int padH, - long inputWidth, long inputHeight, - long outputWidth, long outputHeight) -{ - THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, - fgradInput, gradInput); - - long nInputPlane = THCudaTensor_size(state,weight,2)/(kW*kH); - long nOutputPlane = THCudaTensor_size(state,weight,1); - - int batch = 1; - if 
(input->nDimension == 3) { - // Force batch - batch = 0; - THCudaTensor_resize4d(state, input, 1, nInputPlane, inputHeight, inputWidth); - THCudaTensor_resize4d(state, gradOutput, 1, nOutputPlane, outputHeight, outputWidth); - } - - // Batch size + input planes - long batchSize = input->size[0]; - - // Resize output - THCudaTensor_resize4d(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); - - // Resize temporary columns - THCudaTensor_resize3d(state, fgradInput, batchSize, nInputPlane*kW*kH, outputHeight*outputWidth); - - // Helpers - THCudaTensor *gradInput_n = THCudaTensor_new(state); - THCudaTensor *fgradInput_n = THCudaTensor_new(state); - THCudaTensor *gradOutput_n = THCudaTensor_new(state); - - THCudaTensor_transpose(state, weight, weight, 1, 2); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) { - THCudaTensor *gradOutput3d, *fgradInput3d; - THCudaTensor *wslice = THCudaTensor_new(state); - THCudaTensor *gislice = THCudaTensor_new(state); - THCudaTensor *goslice = THCudaTensor_new(state); - - // Matrix mulitply per sample: - THCudaTensor_select(state, gradInput_n, gradInput, 0, elt); - THCudaTensor_select(state, fgradInput_n, fgradInput, 0, elt); - THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); - - gradOutput3d = THCudaTensor_newWithStorage3d(state, gradOutput_n->storage, gradOutput_n->storageOffset, - outputHeight*outputWidth, 1, - nOutputPlane, outputHeight*outputWidth, - 1, nOutputPlane*outputHeight*outputWidth); - fgradInput3d = THCudaTensor_newWithStorage3d(state, fgradInput_n->storage, fgradInput_n->storageOffset, - outputHeight*outputWidth, 1, - kW*kH*nInputPlane, outputHeight*outputWidth, - 1, kW*kH*nInputPlane*outputHeight*outputWidth); - - for (int i = 0; i < outputHeight; i++) { - for(int j = 0; j < outputWidth; j++) { - int sliceidx = i * outputWidth + j; - THCudaTensor_select(state, wslice, weight, 0, sliceidx); - THCudaTensor_select(state, gislice, fgradInput3d, 0, sliceidx); - 
THCudaTensor_select(state, goslice, gradOutput3d, 0, sliceidx); - THCudaTensor_addmm(state, gislice, 0.0, gislice, 1.0, wslice, goslice); - } - } - - // weight: oH*oW x nInputPlane*kH*kW x nOutputPlane - // gradOutput3d: oH*oW x nOutputPlane x 1 - //THCudaTensor_baddbmm(state, fgradInput3d, 0.0, fgradInput3d, 1.0, weight, gradOutput3d); - // fgradInput3d: oH*oW x nInputPlane*kH*kW x 1 - - // Unpack columns back into input: - col2im( - THCState_getCurrentStream(state), - THCudaTensor_data(state, fgradInput_n), - nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, - 1, 1, THCudaTensor_data(state, gradInput_n) - ); - - THCudaTensor_free(state, gradOutput3d); - THCudaTensor_free(state, fgradInput3d); - THCudaTensor_free(state, wslice); - THCudaTensor_free(state, gislice); - THCudaTensor_free(state, goslice); - } - - // Free - THCudaTensor_free(state, gradInput_n); - THCudaTensor_free(state, fgradInput_n); - THCudaTensor_free(state, gradOutput_n); - - // Resize output - if (batch == 0) { - THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth); - THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); - THCudaTensor_resize3d(state, gradInput, nInputPlane, inputHeight, inputWidth); - } - - THCudaTensor_transpose(state, weight, weight, 1, 2); -} - -void THNN_CudaSpatialConvolutionLocal_accGradParameters( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, - THCudaTensor *finput, - THCudaTensor *fgradInput, - int kW, int kH, - int dW, int dH, - int padW, int padH, - long inputWidth, long inputHeight, - long outputWidth, long outputHeight, - float scale) -{ - THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, - gradBias, finput); - - long nInputPlane = THCudaTensor_size(state,gradWeight,2)/(kW*kH); - long nOutputPlane = THCudaTensor_size(state,gradWeight,1); - - int batch = 1; - if (input->nDimension == 3) { - // Force batch - batch 
= 0; - THCudaTensor_resize4d(state, input, 1, nInputPlane, inputHeight, inputWidth); - THCudaTensor_resize4d(state, gradOutput, 1, nOutputPlane, outputHeight, outputWidth); - } - - // Batch size + input planes - long batchSize = input->size[0]; - - // Helpers - THCudaTensor *input_n = THCudaTensor_new(state); - THCudaTensor *finput_n = THCudaTensor_new(state); - THCudaTensor *gradOutput_n = THCudaTensor_new(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) { - THCudaTensor *gradOutput3d, *finput3d; - THCudaTensor *gwslice = THCudaTensor_new(state); - THCudaTensor *islice = THCudaTensor_new(state); - THCudaTensor *goslice = THCudaTensor_new(state); - - // Matrix mulitply per output: - THCudaTensor_select(state, input_n, input, 0, elt); - THCudaTensor_select(state, finput_n, finput, 0, elt); - THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); - - gradOutput3d = THCudaTensor_newWithStorage3d(state, gradOutput_n->storage, gradOutput_n->storageOffset, - outputHeight*outputWidth, 1, - nOutputPlane, outputHeight*outputWidth, - 1, nOutputPlane*outputHeight*outputWidth); - finput3d = THCudaTensor_newWithStorage3d(state, finput_n->storage, finput_n->storageOffset, - outputHeight*outputWidth, 1, - 1, kW*kH*nInputPlane*outputHeight*outputWidth, - kW*kH*nInputPlane, outputHeight*outputWidth); - - // Extract columns: - im2col( - THCState_getCurrentStream(state), - THCudaTensor_data(state, input_n), - nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, - 1, 1, THCudaTensor_data(state, finput_n) - ); - - for (int i = 0; i < outputHeight; i++) { - for(int j = 0; j < outputWidth; j++) { - int sliceidx = i * outputWidth + j; - THCudaTensor_select(state, gwslice, gradWeight, 0, sliceidx); - THCudaTensor_select(state, goslice, gradOutput3d, 0, sliceidx); - THCudaTensor_select(state, islice, finput3d, 0, sliceidx); - THCudaTensor_addmm(state, gwslice, 1.0, gwslice, scale, goslice, islice); - } - } - // gradOutput3d: oH*oW x 
nOutputPlane x 1 - // finput3d: oH*oW x 1 x kW*kH*nInputPlane - //THCudaTensor_baddbmm(state, gradWeight, 1.0, gradWeight, scale, gradOutput3d, finput3d); - // gradWeight: oH*oW x nOutputPlane x kW*kH*nInputPlane - - THCudaTensor_cadd(state, gradBias, gradBias, scale, gradOutput_n); - - THCudaTensor_free(state, gradOutput3d); - THCudaTensor_free(state, finput3d); - THCudaTensor_free(state, gwslice); - THCudaTensor_free(state, goslice); - THCudaTensor_free(state, islice); - } - - // Free - THCudaTensor_free(state, input_n); - THCudaTensor_free(state, finput_n); - THCudaTensor_free(state, gradOutput_n); - - // Resize - if (batch == 0) { - THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth); - THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); - } -} +#include "generic/SpatialConvolutionLocal.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/SpatialConvolutionMM.cu b/lib/THCUNN/SpatialConvolutionMM.cu index 4714e9d..2a88047 100644 --- a/lib/THCUNN/SpatialConvolutionMM.cu +++ b/lib/THCUNN/SpatialConvolutionMM.cu @@ -2,352 +2,8 @@ #include "common.h" #include "im2col.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" -void THNN_CudaSpatialConvolutionMM_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *weight, THCudaTensor *bias, THCudaTensor *columns, THCudaTensor *ones, int kW, int kH, int dW, int dH, int padW, int padH) { - - THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones); - if (bias) { - THCUNN_assertSameGPU(state, 2, weight, bias); - } - THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); - THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias"); - THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero"); - THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero"); - THArgCheck(weight->nDimension == 2 
|| weight->nDimension == 4, 4, "weight tensor should be 2D or 4D"); - - int freeWeight = 0; - - // Params: - int nInputPlane = weight->nDimension == 2 ? weight->size[1]/(kH*kW) : weight->size[1]; - int nOutputPlane = weight->size[0]; - - if (weight->nDimension == 4) { - long s1 = weight->size[0]; - long s2 = weight->size[1] * weight->size[2] * weight->size[3]; - weight = THCudaTensor_newWithStorage2d(state, weight->storage, weight->storageOffset, s1, -1, s2, -1); - freeWeight = 1; - } - - int batch = 1; - if (input->nDimension == 3) { - THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); - // Force batch - batch = 0; - THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]); - } else { - THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); - } - - long inputWidth = input->size[3]; - long inputHeight = input->size[2]; - long outputWidth = (inputWidth + 2*padW - kW) / dW + 1; - long outputHeight = (inputHeight + 2*padH - kH) / dH + 1; - - if (outputWidth < 1 || outputHeight < 1) - THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small", - nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth); - - // Batch size + input planes - long batchSize = input->size[0]; - - // Resize output - THCudaTensor_resize4d(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); - - // Resize temporary columns - THCudaTensor_resize2d(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth); - - // Define a buffer of ones, for bias accumulation - // Note: this buffer can be shared with other modules, it only ever gets increased, - // and always contains ones. - if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { - // Resize plane and fill with ones... 
- THCudaTensor_resize2d(state, ones, outputHeight, outputWidth); - THCudaTensor_fill(state, ones, 1); - } - - // Helpers - THCudaTensor *input_n = THCudaTensor_new(state); - THCudaTensor *output_n = THCudaTensor_new(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per output: - THCudaTensor_select(state, input_n, input, 0, elt); - THCudaTensor_select(state, output_n, output, 0, elt); - - // Do Bias first: - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m_ = nOutputPlane; - long n_ = outputHeight * outputWidth; - long k_ = 1; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - if (bias) { - THCudaBlas_Sgemm( - state, - 't', 'n', - n_, m_, k_, - 1, - THCudaTensor_data(state, ones), k_, - THCudaTensor_data(state, bias), k_, - 0, - THCudaTensor_data(state, output_n), n_ - ); - } else { - THCudaTensor_zero(state, output_n); - } - - // Extract columns: - im2col( - THCState_getCurrentStream(state), - THCudaTensor_data(state, input_n), - nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, - 1, 1, THCudaTensor_data(state, columns) - ); - - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m = nOutputPlane; - long n = columns->size[1]; - long k = nInputPlane*kH*kW; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - THCudaBlas_Sgemm( - state, - 'n', 'n', - n, m, k, - 1, - THCudaTensor_data(state, columns), n, - THCudaTensor_data(state, weight), k, - 1, - THCudaTensor_data(state, output_n), n - ); - } - - // Free - THCudaTensor_free(state, input_n); - THCudaTensor_free(state, output_n); - if (freeWeight) - THCudaTensor_free(state, weight); - - // Resize output - if (batch == 0) { - THCudaTensor_resize3d(state, output, nOutputPlane, outputHeight, outputWidth); - THCudaTensor_resize3d(state, input, 
nInputPlane, inputHeight, inputWidth); - } -} - -void THNN_CudaSpatialConvolutionMM_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *weight, THCudaTensor *gradColumns, THCudaTensor *ones, int kW, int kH, int dW, int dH, int padW, int padH) { - - THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, - gradColumns, gradInput); - THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); - THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero"); - THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero"); - THArgCheck(weight->nDimension == 2 || weight->nDimension == 4, 4, "weight tensor should be 2D or 4D"); - - // Params - int nInputPlane = weight->nDimension == 2 ? weight->size[1]/(kW*kH) : weight->size[1]; - int nOutputPlane = weight->size[0]; - - int freeWeight = 0; - if (weight->nDimension == 4) { - long s1 = weight->size[0]; - long s2 = weight->size[1] * weight->size[2] * weight->size[3]; - weight = THCudaTensor_newWithStorage2d(state, weight->storage, weight->storageOffset, s1, -1, s2, -1); - freeWeight = 1; - } - - int batch = 1; - if (input->nDimension == 3) { - // Force batch - batch = 0; - THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]); - THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); - } - - long inputWidth = input->size[3]; - long inputHeight = input->size[2]; - long outputWidth = (inputWidth + 2*padW - kW) / dW + 1; - long outputHeight = (inputHeight + 2*padH - kH) / dH + 1; - - // Batch size + input planes - long batchSize = input->size[0]; - - // Resize output - THCudaTensor_resize4d(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); - - // Resize temporary columns - THCudaTensor_resize2d(state, gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth); - - // Helpers - THCudaTensor 
*gradInput_n = THCudaTensor_new(state); - THCudaTensor *gradOutput_n = THCudaTensor_new(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per sample: - THCudaTensor_select(state, gradInput_n, gradInput, 0, elt); - THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); - - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m = nInputPlane*kW*kH; - long n = gradColumns->size[1]; - long k = nOutputPlane; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - THCudaBlas_Sgemm( - state, - 'n', 't', - n, m, k, - 1, - THCudaTensor_data(state, gradOutput_n), n, - THCudaTensor_data(state, weight), m, - 0, - THCudaTensor_data(state, gradColumns), n - ); - - // Unpack columns back into input: - col2im( - THCState_getCurrentStream(state), - THCudaTensor_data(state, gradColumns), - nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, - 1, 1, THCudaTensor_data(state, gradInput_n) - ); - } - - // Free - THCudaTensor_free(state, gradInput_n); - THCudaTensor_free(state, gradOutput_n); - if (freeWeight) - THCudaTensor_free(state, weight); - - // Resize output - if (batch == 0) { - THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth); - THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); - THCudaTensor_resize3d(state, gradInput, nInputPlane, inputHeight, inputWidth); - } -} - -void THNN_CudaSpatialConvolutionMM_accGradParameters(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradWeight, THCudaTensor *gradBias, THCudaTensor *columns, THCudaTensor *ones, int kW, int kH, int dW, int dH, int padW, int padH, float scale) { - - THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, columns, ones); - if (gradBias) { - THCUNN_assertSameGPU(state, 2, gradWeight, gradBias); - } - THArgCheck(input->nDimension == 3 || 
input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); - THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias"); - THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero"); - THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero"); - THArgCheck(gradWeight->nDimension == 2 || gradWeight->nDimension == 4, 4, "gradWeight tensor should be 2D or 4D"); - - // Params - int nInputPlane = gradWeight->nDimension == 2 ? gradWeight->size[1]/(kW*kH) : gradWeight->size[1]; - int nOutputPlane = gradWeight->size[0]; - - int freeWeight = 0; - if (gradWeight->nDimension == 4) { - long s1 = gradWeight->size[0]; - long s2 = gradWeight->size[1] * gradWeight->size[2] * gradWeight->size[3]; - gradWeight = THCudaTensor_newWithStorage2d(state, gradWeight->storage, gradWeight->storageOffset, s1, -1, s2, -1); - freeWeight = 1; - } - - int batch = 1; - if (input->nDimension == 3) { - // Force batch - batch = 0; - THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]); - THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); - } - - long inputWidth = input->size[3]; - long inputHeight = input->size[2]; - long outputWidth = (inputWidth + 2*padW - kW) / dW + 1; - long outputHeight = (inputHeight + 2*padH - kH) / dH + 1; - - // Batch size + input planes - long batchSize = input->size[0]; - - // Define a buffer of ones, for bias accumulation - if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { - // Resize plane and fill with ones... 
- THCudaTensor_resize2d(state, ones, outputHeight, outputWidth); - THCudaTensor_fill(state, ones, 1); - } - - // Resize temporary columns - THCudaTensor_resize2d(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth); - - // Helpers - THCudaTensor *input_n = THCudaTensor_new(state); - THCudaTensor *gradOutput_n = THCudaTensor_new(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per output: - THCudaTensor_select(state, input_n, input, 0, elt); - THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); - - // Extract columns: - im2col( - THCState_getCurrentStream(state), - THCudaTensor_data(state, input_n), - nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, - 1, 1, THCudaTensor_data(state, columns) - ); - - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m = nOutputPlane; - long n = nInputPlane*kW*kH; - long k = columns->size[1]; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - THCudaBlas_Sgemm( - state, - 't', 'n', - n, m, k, - scale, - THCudaTensor_data(state, columns), k, - THCudaTensor_data(state, gradOutput_n), k, - 1, - THCudaTensor_data(state, gradWeight), n - ); - - // Do Bias: - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m_ = nOutputPlane; - long k_ = outputHeight * outputWidth; - - // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) - if (gradBias) { - THCudaBlas_Sgemv( - state, - 't', - k_, m_, - scale, - THCudaTensor_data(state, gradOutput_n), k_, - THCudaTensor_data(state, ones), 1, - 1, - THCudaTensor_data(state, gradBias), 1 - ); - } - } - - // Free - THCudaTensor_free(state, input_n); - THCudaTensor_free(state, gradOutput_n); - if (freeWeight) - THCudaTensor_free(state, gradWeight); - - // Resize - if (batch == 0) { - THCudaTensor_resize3d(state, gradOutput, 
nOutputPlane, outputHeight, outputWidth); - THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); - } -} +#include "generic/SpatialConvolutionMM.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/SpatialCrossMapLRN.cu b/lib/THCUNN/SpatialCrossMapLRN.cu index 9c93926..cd37320 100644 --- a/lib/THCUNN/SpatialCrossMapLRN.cu +++ b/lib/THCUNN/SpatialCrossMapLRN.cu @@ -1,7 +1,9 @@ #include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" #include "common.h" -template <typename Dtype> +template <typename Dtype, typename Acctype> __global__ void #if __CUDA_ARCH__ >= 320 __launch_bounds__(CUDA_NUM_THREADS) @@ -22,7 +24,7 @@ LRNFillScale(const int nthreads, const Dtype* const in, int head = 0; const int pre_pad = (size - 1) / 2; const int post_pad = size - pre_pad - 1; - Dtype accum_scale = 0; + Acctype accum_scale = Acctype(0); // fill the scale at [n, :, h, w] // accumulate values while (head < post_pad && head < channels) { @@ -36,7 +38,7 @@ LRNFillScale(const int nthreads, const Dtype* const in, accum_scale -= in_off[(head - size) * step] * in_off[(head - size) * step]; } - scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; + scale_off[(head - post_pad) * step] = ScalarConvert<Acctype, Dtype>::to(k + accum_scale * alpha_over_size); ++head; } // subtract only @@ -45,20 +47,21 @@ LRNFillScale(const int nthreads, const Dtype* const in, accum_scale -= in_off[(head - size) * step] * in_off[(head - size) * step]; } - scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size; + scale_off[(head - post_pad) * step] = ScalarConvert<Acctype, Dtype>::to(k + accum_scale * alpha_over_size); ++head; } } } -__global__ void LRNComputeOutput(const int nthreads, const float* in, - const float* scale, const float negative_beta, float* out) { +template <typename Dtype> +__global__ void LRNComputeOutput(const int nthreads, const Dtype* in, + const Dtype* scale, const Dtype negative_beta, Dtype* out) { 
CUDA_KERNEL_LOOP(index, nthreads) { out[index] = in[index] * pow(scale[index], negative_beta); } } -template <typename Dtype> +template <typename Dtype, typename Acctype> __global__ void LRNComputeDiff(const int nthreads, const Dtype* const bottom_data, const Dtype* const top_data, const Dtype* const scale, const Dtype* const top_diff, @@ -80,7 +83,7 @@ __global__ void LRNComputeDiff(const int nthreads, int head = 0; const int pre_pad = size - (size + 1) / 2; const int post_pad = size - pre_pad - 1; - Dtype accum_ratio = 0; + Acctype accum_ratio = Acctype(0); // accumulate values while (head < post_pad && head < channels) { accum_ratio += top_diff_off[head * step] * top_off[head * step] / @@ -96,9 +99,9 @@ __global__ void LRNComputeDiff(const int nthreads, top_off[(head - size) * step] / scale_off[(head - size) * step]; } bottom_diff_off[(head - post_pad) * step] = - top_diff_off[(head - post_pad) * step] + ScalarConvert<Acctype, Dtype>::to(top_diff_off[(head - post_pad) * step] * pow(scale_off[(head - post_pad) * step], negative_beta) - - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; + - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio); ++head; } // subtract only @@ -108,121 +111,14 @@ __global__ void LRNComputeDiff(const int nthreads, top_off[(head - size) * step] / scale_off[(head - size) * step]; } bottom_diff_off[(head - post_pad) * step] = - top_diff_off[(head - post_pad) * step] + ScalarConvert<Acctype, Dtype>::to(top_diff_off[(head - post_pad) * step] * pow(scale_off[(head - post_pad) * step], negative_beta) - - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio; + - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio); ++head; } } } -extern "C" -void LRNforward(THCState* state, THCudaTensor* input, THCudaTensor* output, - THCudaTensor* scale, int local_size, float alpha, float beta, float k) -{ - THCudaTensor_resizeAs(state, output, input); - THCudaTensor_resizeAs(state, scale, input); - - int 
batchSize; - int nInputPlane; - int imsize_h; - int imsize_w; - - if (input->nDimension == 3) { - batchSize = 1; - nInputPlane = input->size[0]; - imsize_h = input->size[1]; - imsize_w = input->size[2]; - } - else - { - batchSize = input->size[0]; - nInputPlane = input->size[1]; - imsize_h = input->size[2]; - imsize_w = input->size[3]; - } - - input = THCudaTensor_newContiguous(state, input); - - int n_threads = batchSize * imsize_h * imsize_w; - LRNFillScale<<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>( - n_threads, THCudaTensor_data(state, input), batchSize, nInputPlane, imsize_h, imsize_w, local_size, - alpha / local_size, k, THCudaTensor_data(state, scale)); - n_threads *= nInputPlane; - THCudaCheck(cudaGetLastError()); - LRNComputeOutput<<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>( - n_threads, THCudaTensor_data(state, input), THCudaTensor_data(state, scale), -beta, THCudaTensor_data(state, output)); - THCudaCheck(cudaGetLastError()); - - THCudaTensor_free(state, input); -} - - -extern "C" -void LRNbackward(THCState* state, THCudaTensor* input, THCudaTensor* output, - THCudaTensor* gradOutput, THCudaTensor* gradInput, THCudaTensor* scale, - int local_size, float alpha, float beta, float k) -{ - THCudaTensor_resizeAs(state, gradInput, input); - - int batchSize; - int nInputPlane; - int imsize_h; - int imsize_w; - - if (input->nDimension == 3) { - batchSize = 1; - nInputPlane = input->size[0]; - imsize_h = input->size[1]; - imsize_w = input->size[2]; - } - else - { - batchSize = input->size[0]; - nInputPlane = input->size[1]; - imsize_h = input->size[2]; - imsize_w = input->size[3]; - } - - input = THCudaTensor_newContiguous(state, input); - gradOutput = THCudaTensor_newContiguous(state, gradOutput); - - int n_threads = batchSize * imsize_h * imsize_w; - LRNComputeDiff<<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>( - n_threads, THCudaTensor_data(state, 
input), THCudaTensor_data(state, output), - THCudaTensor_data(state, scale), THCudaTensor_data(state, gradOutput), batchSize, nInputPlane, imsize_h, imsize_w, - local_size, -beta, float(2. * alpha * beta / local_size), - THCudaTensor_data(state, gradInput)); - THCudaCheck(cudaGetLastError()); - - THCudaTensor_free(state, input); - THCudaTensor_free(state, gradOutput); -} - -void THNN_CudaSpatialCrossMapLRN_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *scale, - int size, - float alpha, - float beta, - float k) -{ - LRNforward(state, input, output, scale, size, alpha, beta, k); -} -void THNN_CudaSpatialCrossMapLRN_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *scale, - THCudaTensor *output, - int size, - float alpha, - float beta, - float k) -{ - LRNbackward(state, input, output, gradOutput, gradInput, scale, size, alpha, beta, k); -} +#include "generic/SpatialCrossMapLRN.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/SpatialDilatedConvolution.cu b/lib/THCUNN/SpatialDilatedConvolution.cu index 5b2f0e6..a4a8e38 100644 --- a/lib/THCUNN/SpatialDilatedConvolution.cu +++ b/lib/THCUNN/SpatialDilatedConvolution.cu @@ -2,338 +2,8 @@ #include "common.h" #include "im2col.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" -void THNN_CudaSpatialDilatedConvolution_updateOutput(THCState *state, - THCudaTensor *input, THCudaTensor *output, THCudaTensor *weight, - THCudaTensor *bias, THCudaTensor *columns, - THCudaTensor *ones, int kW, int kH, int dW, int dH, - int padW, int padH, int dilationW, int dilationH) { - - THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones); - if (bias) { - THCUNN_assertSameGPU(state, 2, weight, bias); - } - THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); - THArgCheck(weight->nDimension == 4, 4, "weight tensor must be 4D 
(nOutputPlane,nInputPlane,kH,kW)"); - THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias"); - THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero"); - THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero"); - - // Params: - int nInputPlane = weight->size[1]; - int nOutputPlane = weight->size[0]; - - int batch = 1; - if (input->nDimension == 3) { - THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); - // Force batch - batch = 0; - THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]); - } else { - THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); - } - - long inputWidth = input->size[3]; - long inputHeight = input->size[2]; - long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; - long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; - - if (outputWidth < 1 || outputHeight < 1) - THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small", - nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth); - - // Batch size + input planes - long batchSize = input->size[0]; - - // Resize output - THCudaTensor_resize4d(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); - - // Resize temporary columns - THCudaTensor_resize2d(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth); - - // Define a buffer of ones, for bias accumulation - // Note: this buffer can be shared with other modules, it only ever gets increased, - // and always contains ones. - if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { - // Resize plane and fill with ones... 
- THCudaTensor_resize2d(state, ones, outputHeight, outputWidth); - THCudaTensor_fill(state, ones, 1); - } - - // Helpers - THCudaTensor *input_n = THCudaTensor_new(state); - THCudaTensor *output_n = THCudaTensor_new(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per output: - THCudaTensor_select(state, input_n, input, 0, elt); - THCudaTensor_select(state, output_n, output, 0, elt); - - // Do Bias first: - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m_ = nOutputPlane; - long n_ = outputHeight * outputWidth; - long k_ = 1; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - if (bias) { - THCudaBlas_Sgemm( - state, - 't', 'n', - n_, m_, k_, - 1, - THCudaTensor_data(state, ones), k_, - THCudaTensor_data(state, bias), k_, - 0, - THCudaTensor_data(state, output_n), n_ - ); - } else { - THCudaTensor_zero(state, output_n); - } - - // Extract columns: - im2col( - THCState_getCurrentStream(state), - THCudaTensor_data(state, input_n), - nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, - dilationH, dilationW, - THCudaTensor_data(state, columns) - ); - - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m = nOutputPlane; - long n = columns->size[1]; - long k = nInputPlane*kH*kW; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - THCudaBlas_Sgemm( - state, - 'n', 'n', - n, m, k, - 1, - THCudaTensor_data(state, columns), n, - THCudaTensor_data(state, weight), k, - 1, - THCudaTensor_data(state, output_n), n - ); - } - - // Free - THCudaTensor_free(state, input_n); - THCudaTensor_free(state, output_n); - - // Resize output - if (batch == 0) { - THCudaTensor_resize3d(state, output, nOutputPlane, outputHeight, outputWidth); - THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); - 
} -} - -void THNN_CudaSpatialDilatedConvolution_updateGradInput(THCState *state, - THCudaTensor *input, THCudaTensor *gradOutput, - THCudaTensor *gradInput, THCudaTensor *weight, - THCudaTensor *gradColumns, - int kW, int kH, int dW, int dH, int padW, int padH, - int dilationW, int dilationH ) { - - THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, - gradColumns, gradInput); - THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); - THArgCheck(weight->nDimension == 4, 4, "weight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)"); - THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero"); - THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero"); - - // Params - int nInputPlane = weight->size[1]; - int nOutputPlane = weight->size[0]; - - int batch = 1; - if (input->nDimension == 3) { - // Force batch - batch = 0; - THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]); - THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); - } - - long inputWidth = input->size[3]; - long inputHeight = input->size[2]; - long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; - long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; - - // Batch size + input planes - long batchSize = input->size[0]; - - // Resize output - THCudaTensor_resize4d(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); - - // Resize temporary columns - THCudaTensor_resize2d(state, gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth); - - // Helpers - THCudaTensor *gradInput_n = THCudaTensor_new(state); - THCudaTensor *gradOutput_n = THCudaTensor_new(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per sample: - THCudaTensor_select(state, gradInput_n, gradInput, 0, elt); - 
THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); - - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m = nInputPlane*kW*kH; - long n = gradColumns->size[1]; - long k = nOutputPlane; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - THCudaBlas_Sgemm( - state, - 'n', 't', - n, m, k, - 1, - THCudaTensor_data(state, gradOutput_n), n, - THCudaTensor_data(state, weight), m, - 0, - THCudaTensor_data(state, gradColumns), n - ); - - // Unpack columns back into input: - col2im( - THCState_getCurrentStream(state), - THCudaTensor_data(state, gradColumns), - nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, - dilationH, dilationW, - THCudaTensor_data(state, gradInput_n) - ); - } - - // Free - THCudaTensor_free(state, gradInput_n); - THCudaTensor_free(state, gradOutput_n); - - // Resize output - if (batch == 0) { - THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth); - THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); - THCudaTensor_resize3d(state, gradInput, nInputPlane, inputHeight, inputWidth); - } -} - -void THNN_CudaSpatialDilatedConvolution_accGradParameters(THCState *state, - THCudaTensor *input, THCudaTensor *gradOutput, - THCudaTensor *gradWeight, THCudaTensor *gradBias, - THCudaTensor *columns, THCudaTensor *ones, - int kW, int kH, int dW, int dH, - int padW, int padH, int dilationW, int dilationH, float scale) { - - THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, columns, ones); - if (gradBias) { - THCUNN_assertSameGPU(state, 2, gradWeight, gradBias); - } - THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); - THArgCheck(gradWeight->nDimension == 4, 4, "gradWeight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)"); - THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in 
gradWeight and gradBias"); - THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero"); - THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero"); - - // Params - int nInputPlane = gradWeight->size[1]; - int nOutputPlane = gradWeight->size[0]; - - int batch = 1; - if (input->nDimension == 3) { - // Force batch - batch = 0; - THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]); - THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); - } - - long inputWidth = input->size[3]; - long inputHeight = input->size[2]; - long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; - long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; - - // Batch size + input planes - long batchSize = input->size[0]; - - // Define a buffer of ones, for bias accumulation - if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { - // Resize plane and fill with ones... 
- THCudaTensor_resize2d(state, ones, outputHeight, outputWidth); - THCudaTensor_fill(state, ones, 1); - } - - // Resize temporary columns - THCudaTensor_resize2d(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth); - - // Helpers - THCudaTensor *input_n = THCudaTensor_new(state); - THCudaTensor *gradOutput_n = THCudaTensor_new(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per output: - THCudaTensor_select(state, input_n, input, 0, elt); - THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); - - // Extract columns: - im2col( - THCState_getCurrentStream(state), - THCudaTensor_data(state, input_n), - nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, - dilationH, dilationW, - THCudaTensor_data(state, columns) - ); - - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m = nOutputPlane; - long n = nInputPlane*kW*kH; - long k = columns->size[1]; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - THCudaBlas_Sgemm( - state, - 't', 'n', - n, m, k, - scale, - THCudaTensor_data(state, columns), k, - THCudaTensor_data(state, gradOutput_n), k, - 1, - THCudaTensor_data(state, gradWeight), n - ); - - // Do Bias: - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m_ = nOutputPlane; - long k_ = outputHeight * outputWidth; - - // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) - if (gradBias) { - THCudaBlas_Sgemv( - state, - 't', - k_, m_, - scale, - THCudaTensor_data(state, gradOutput_n), k_, - THCudaTensor_data(state, ones), 1, - 1, - THCudaTensor_data(state, gradBias), 1 - ); - } - } - - // Free - THCudaTensor_free(state, input_n); - THCudaTensor_free(state, gradOutput_n); - - // Resize - if (batch == 0) { - THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth); - 
THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); - } -} +#include "generic/SpatialDilatedConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/SpatialDilatedMaxPooling.cu b/lib/THCUNN/SpatialDilatedMaxPooling.cu index 26ac65d..94fd681 100644 --- a/lib/THCUNN/SpatialDilatedMaxPooling.cu +++ b/lib/THCUNN/SpatialDilatedMaxPooling.cu @@ -1,15 +1,17 @@ #include "THCUNN.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" #include "common.h" // kernels borrowed from Caffe -template <typename Dtype> +template <typename Dtype, typename AccType> __global__ void MaxPoolForward(const int nthreads, const Dtype* bottom_data, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, Dtype* top_data, - Dtype* top_mask) { + long* top_mask) { CUDA_KERNEL_LOOP(index, nthreads) { int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; @@ -23,26 +25,26 @@ __global__ void MaxPoolForward(const int nthreads, const Dtype* bottom_data, hstart += dilation_h; while(wstart < 0) wstart += dilation_w; - Dtype maxval = -FLT_MAX; + AccType maxval = THCNumerics<AccType>::min(); int maxidx = -1; bottom_data += (n * channels + c) * height * width; for (int h = hstart; h < hend; h += dilation_h) { for (int w = wstart; w < wend; w += dilation_w) { - if (bottom_data[h * width + w] > maxval) { + if (ScalarConvert<Dtype, AccType>::to(bottom_data[h * width + w]) > maxval) { maxidx = h * width + w; - maxval = bottom_data[maxidx]; + maxval = ScalarConvert<Dtype, AccType>::to(bottom_data[maxidx]); } } } - top_data[index] = maxval; + top_data[index] = ScalarConvert<AccType, Dtype>::to(maxval); top_mask[index] = maxidx + TH_INDEX_BASE; } } -template <typename Dtype> +template <typename Dtype, typename AccType> 
__global__ void MaxPoolBackward(const int nthreads, const Dtype* top_diff, - const Dtype* top_mask, const int num, const int channels, + const long* top_mask, const int num, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, @@ -61,147 +63,21 @@ __global__ void MaxPoolBackward(const int nthreads, const Dtype* top_diff, int pwstart = (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) / stride_w + 1; int pwend = min((w + pad_w) / stride_w + 1, pooled_width); - - Dtype gradient = 0; + + AccType gradient = AccType(0); int offset = (n * channels + c) * pooled_height * pooled_width; top_diff += offset; top_mask += offset; for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { - if (top_mask[ph * pooled_width + pw] - TH_INDEX_BASE == h * width + w) { - gradient += top_diff[ph * pooled_width + pw]; - } + if (top_mask[ph * pooled_width + pw] - TH_INDEX_BASE == h * width + w) { + gradient += ScalarConvert<Dtype, AccType>::to(top_diff[ph * pooled_width + pw]); + } } } - bottom_diff[index] = gradient; - } -} - -void THNN_CudaSpatialDilatedMaxPooling_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, bool ceil_mode) -{ - - THCUNN_assertSameGPU(state, 3, input, output, indices); - THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected"); - - long nInputCols, nInputRows, nInputPlane, batchSize; - long nOutputCols, nOutputRows; - - if (input->nDimension == 3) { - nInputCols = input->size[2]; - nInputRows = input->size[1]; - nInputPlane = input->size[0]; - batchSize = 1; + bottom_diff[index] = ScalarConvert<AccType, Dtype>::to(gradient); } - else - { - nInputCols = 
input->size[3]; - nInputRows = input->size[2]; - nInputPlane = input->size[1]; - batchSize = input->size[0]; - } - - THArgCheck(nInputCols >= kW - padW && nInputRows >= kH - padH, 2, "input image smaller than kernel size"); - THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size"); - - if(ceil_mode) { - nOutputCols = ceil(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1; - nOutputRows = ceil(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1; - } - else { - nOutputCols = floor(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1; - nOutputRows = floor(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1; - } - -if (nOutputCols < 1 || nOutputRows < 1) - THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small", - nInputPlane,nInputRows,nInputCols,nInputPlane,nOutputRows,nOutputCols); - -if (padW || padH) - { - // ensure that the last pooling starts inside the image - if ((nOutputRows - 1)*dH >= nInputRows + padH) - --nOutputRows; - if ((nOutputCols - 1)*dW >= nInputCols + padW) - --nOutputCols; - } - - input = THCudaTensor_newContiguous(state, input); - float* input_data = THCudaTensor_data(state, input); - - THCudaTensor_resize4d(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols); - THCudaTensor_resizeAs(state, indices, output); - - float* indices_data = THCudaTensor_data(state, indices); - float* output_data = THCudaTensor_data(state, output); - - int count = THCudaTensor_nElement(state, output); - - MaxPoolForward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> - (count, input_data, - batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, output_data, indices_data); - THCudaCheck(cudaGetLastError()); - - if(input->nDimension == 3) - THCudaTensor_resize3d(state, output, nInputPlane, 
nOutputRows, nOutputCols); - - THCudaTensor_free(state, input); } -void THNN_CudaSpatialDilatedMaxPooling_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *indices, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, bool ceil_mode) -{ - THCUNN_assertSameGPU(state, 4, input, gradOutput, indices, gradInput); - - input = THCudaTensor_newContiguous(state, input); - gradOutput = THCudaTensor_newContiguous(state, gradOutput); - - long nInputCols, nInputRows, nInputPlane, batchSize; - long nOutputCols, nOutputRows; - - if (input->nDimension == 3) { - nInputCols = input->size[2]; - nInputRows = input->size[1]; - nInputPlane = input->size[0]; - batchSize = 1; - } - else - { - nInputCols = input->size[3]; - nInputRows = input->size[2]; - nInputPlane = input->size[1]; - batchSize = input->size[0]; - } - - if(ceil_mode) { - nOutputCols = ceil(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1; - nOutputRows = ceil(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1; - } - else { - nOutputCols = floor(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1; - nOutputRows = floor(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1; - } - - if (nOutputCols < 1 || nOutputRows < 1) - THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). 
Output size is too small", - nInputPlane,nInputRows,nInputCols,nInputPlane,nOutputRows,nOutputCols); - - gradOutput = THCudaTensor_newContiguous(state, gradOutput); - THCudaTensor_resizeAs(state, gradInput, input); - - int count = THCudaTensor_nElement(state, input); - - MaxPoolBackward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> - (count, - THCudaTensor_data(state, gradOutput), - THCudaTensor_data(state, indices), - batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - THCudaTensor_data(state, gradInput)); - THCudaCheck(cudaGetLastError()); - - THCudaTensor_free(state, gradOutput); - - // clean - THCudaTensor_free(state, input); - THCudaTensor_free(state, gradOutput); -} +#include "generic/SpatialDilatedMaxPooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/SpatialFractionalMaxPooling.cu b/lib/THCUNN/SpatialFractionalMaxPooling.cu index 289b1d6..22a2878 100644 --- a/lib/THCUNN/SpatialFractionalMaxPooling.cu +++ b/lib/THCUNN/SpatialFractionalMaxPooling.cu @@ -3,15 +3,19 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" #include <cfloat> -__device__ inline float getInterval(float sample, - int index, - int inputSize, - int outputSize, - int poolSize) { - float alpha = (float)(inputSize - poolSize) / (float) (outputSize - 1); +template <typename Dtype, typename Acctype> +__device__ inline int getInterval(Acctype sample, + int index, + int inputSize, + int outputSize, + int poolSize) { + Acctype alpha = (Acctype)(inputSize - poolSize) / (Acctype) (outputSize - 1); if (index == outputSize - 1) { return inputSize - poolSize; } else { @@ -20,12 +24,12 @@ __device__ inline float getInterval(float sample, } // We template on poolSizeW to allow the innermost loop to be unrolled -template <int PoolSizeWStatic> 
+template <int PoolSizeWStatic, typename Dtype, typename Acctype> __global__ void SpatialFractionalMaxPooling_updateOutput( - THCDeviceTensor<float, 4> input, - THCDeviceTensor<float, 4> output, - THCDeviceTensor<float, 4> indices, - THCDeviceTensor<float, 3> samples, + THCDeviceTensor<Dtype, 4> input, + THCDeviceTensor<Dtype, 4> output, + THCDeviceTensor<THCIndex_t, 4> indices, + THCDeviceTensor<Dtype, 3> samples, int poolSizeW, int poolSizeH) { // Output (h, w) point that this thread is responsible for @@ -38,33 +42,39 @@ __global__ void SpatialFractionalMaxPooling_updateOutput( int outputW = ourOutputPoint % output.getSize(3); int outputH = ourOutputPoint / output.getSize(3); - int poolW = getInterval(samples[batch][plane][0], outputW, + int poolW = getInterval<Dtype, Acctype>(ScalarConvert<Dtype, Acctype>::to(samples[batch][plane][0]), outputW, input.getSize(3), output.getSize(3), poolSizeW); - int poolH = getInterval(samples[batch][plane][1], outputH, + int poolH = getInterval<Dtype, Acctype>(ScalarConvert<Dtype, Acctype>::to(samples[batch][plane][1]), outputH, input.getSize(2), output.getSize(2), poolSizeH); - float maxVal = -FLT_MAX; + Dtype maxVal = THCNumerics<Dtype>::min(); int maxIndex = -1; for (int h = poolH; h < poolH + poolSizeH; ++h) { if (PoolSizeWStatic == -1) { for (int w = poolW; w < poolW + poolSizeW; ++w) { - float val = input[batch][plane][h][w]; - maxVal = fmaxf(val, maxVal); - maxIndex = (maxVal == val) ? (h * input.getSize(3) + w) : maxIndex; + Dtype val = input[batch][plane][h][w]; + // for consistency with THNN, favor the first max + if (val > maxVal) { + maxIndex = h * input.getSize(3) + w; + maxVal = val; + } } } else { #pragma unroll for (int i = 0; i < PoolSizeWStatic; ++i) { int w = i + poolW; - float val = input[batch][plane][h][w]; - maxVal = fmaxf(val, maxVal); - maxIndex = (maxVal == val) ? 
(h * input.getSize(3) + w) : maxIndex; + Dtype val = input[batch][plane][h][w]; + // for consistency with THNN, favor the first max + if (val > maxVal) { + maxIndex = h * input.getSize(3) + w; + maxVal = val; + } } } } - assert(maxVal != -FLT_MAX); + assert(maxVal != THCNumerics<Dtype>::min()); assert(maxIndex != -1); // +1 for Lua index @@ -73,100 +83,11 @@ __global__ void SpatialFractionalMaxPooling_updateOutput( } } -void THNN_CudaSpatialFractionalMaxPooling_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - int outputW, int outputH, - int poolSizeW, int poolSizeH, - THCudaTensor *indices, - THCudaTensor *randomSamples) -{ - int planeDim = 0; - int dimh = 1; - int dimw = 2; - long numBatch = 1; - - long numInputDims = THCudaTensor_nDimension(state, input); - THArgCheck(numInputDims == 3 || numInputDims == 4, 2, - "3D or 4D (batch mode) tensor expected"); - - if (numInputDims == 4) { - numBatch = THCudaTensor_size(state, input, 0); - planeDim++; - dimh++; - dimw++; - } - - /* sizes */ - long numPlanes = THCudaTensor_size(state, input, planeDim); - long inputH = THCudaTensor_size(state, input, dimh); - long inputW = THCudaTensor_size(state, input, dimw); - - THArgCheck(outputH + poolSizeH - 1 < inputH, 6, - "poolSizeH too large relative to input height"); - THArgCheck(outputW + poolSizeW - 1 < inputW, 5, - "poolSizeW too large relative to input width"); - - THCDeviceTensor<float, 4> devInput; - THCDeviceTensor<float, 4> devOutput; - THCDeviceTensor<float, 4> devIndices; - THCDeviceTensor<float, 3> devSamples = - toDeviceTensor<float, 3>(state, randomSamples); - - if (numInputDims == 3) { - /* resize output */ - THCudaTensor_resize3d(state, output, numPlanes, outputH, outputW); - /* indices will contain the locations for each output point */ - THCudaTensor_resize3d(state, indices, numPlanes, outputH, outputW); - - devInput = toDeviceTensor<float, 3>(state, input).upcastOuter<4>(); - devOutput = toDeviceTensor<float, 3>(state, 
output).upcastOuter<4>(); - devIndices = toDeviceTensor<float, 3>(state, indices).upcastOuter<4>(); - } else { - THCudaTensor_resize4d(state, output, numBatch, numPlanes, outputH, outputW); - /* indices will contain the locations for each output point */ - THCudaTensor_resize4d(state, indices, numBatch, numPlanes, outputH, outputW); - - devInput = toDeviceTensor<float, 4>(state, input); - devOutput = toDeviceTensor<float, 4>(state, output); - devIndices = toDeviceTensor<float, 4>(state, indices); - } - - // block is limited to 4 warps - // grid handles overflow per each plane - int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3); - dim3 grid(THCCeilDiv(outputPlaneSize, 128), - devInput.getSize(1), - devInput.getSize(0)); - dim3 block(outputPlaneSize > 128 ? 128 : outputPlaneSize); - -#define SFMP_UPDATE_OUTPUT(POOL_W) \ - SpatialFractionalMaxPooling_updateOutput<POOL_W> \ - <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \ - devInput, devOutput, devIndices, devSamples, poolSizeW, poolSizeH); - -#define SFMP_UPDATE_OUTPUT_CASE(POOL_W) \ - case POOL_W: SFMP_UPDATE_OUTPUT(POOL_W); break - - switch (poolSizeW) { - SFMP_UPDATE_OUTPUT_CASE(2); - SFMP_UPDATE_OUTPUT_CASE(3); - SFMP_UPDATE_OUTPUT_CASE(4); - SFMP_UPDATE_OUTPUT_CASE(5); - SFMP_UPDATE_OUTPUT_CASE(6); - SFMP_UPDATE_OUTPUT_CASE(7); - default: - // dynamic pool width - SFMP_UPDATE_OUTPUT_CASE(-1); - } - THCudaCheck(cudaGetLastError()); -} - +template <typename Dtype> __global__ void SpatialFractionalMaxPooling_updateGradInput( - THCDeviceTensor<float, 4> gradInput, - THCDeviceTensor<float, 4> gradOutput, - THCDeviceTensor<float, 4> indices) { + THCDeviceTensor<Dtype, 4> gradInput, + THCDeviceTensor<Dtype, 4> gradOutput, + THCDeviceTensor<THCIndex_t, 4> indices) { // Output (h, w) point that this thread is responsible for int ourOutputPoint = threadIdx.x + blockIdx.x * blockDim.x; int plane = blockIdx.y; @@ -188,62 +109,5 @@ __global__ void SpatialFractionalMaxPooling_updateGradInput( } } 
-void THNN_CudaSpatialFractionalMaxPooling_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - int outputW, int outputH, - int poolSizeW, int poolSizeH, - THCudaTensor *indices) -{ - int dimh = 1; - int dimw = 2; - - long numInputDims = THCudaTensor_nDimension(state, input); - if (numInputDims == 4) { - dimh++; - dimw++; - } - - /* sizes */ - long inputH = THCudaTensor_size(state, input, dimh); - long inputW = THCudaTensor_size(state, input, dimw); - - THArgCheck(outputH == THCudaTensor_size(state, gradOutput, dimh), 3, - "gradOutput height unexpected"); - THArgCheck(outputW == THCudaTensor_size(state, gradOutput, dimw), 3, - "gradOutput width unexpected"); - - /* resize */ - THCudaTensor_resizeAs(state, gradInput, input); - THCudaTensor_zero(state, gradInput); - - THCDeviceTensor<float, 4> devGradInput; - THCDeviceTensor<float, 4> devGradOutput; - THCDeviceTensor<float, 4> devIndices; - - /* backprop */ - if (numInputDims == 3) { - devGradInput = toDeviceTensor<float, 3>(state, gradInput).upcastOuter<4>(); - devGradOutput = toDeviceTensor<float, 3>(state, gradOutput).upcastOuter<4>(); - devIndices = toDeviceTensor<float, 3>(state, indices).upcastOuter<4>(); - } else { - devGradInput = toDeviceTensor<float, 4>(state, gradInput); - devGradOutput = toDeviceTensor<float, 4>(state, gradOutput); - devIndices = toDeviceTensor<float, 4>(state, indices); - } - - // block is limited to 4 warps - // grid handles overflow per each plane - int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3); - dim3 grid(THCCeilDiv(outputPlaneSize, 128), - devGradInput.getSize(1), - devGradInput.getSize(0)); - dim3 block(outputPlaneSize > 128 ? 
128 : outputPlaneSize); - - SpatialFractionalMaxPooling_updateGradInput - <<<grid, block, 0, THCState_getCurrentStream(state)>>>( - devGradInput, devGradOutput, devIndices); - THCudaCheck(cudaGetLastError()); -} +#include "generic/SpatialFractionalMaxPooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/SpatialFullConvolution.cu b/lib/THCUNN/SpatialFullConvolution.cu index 4c31485..4e37ecf 100644 --- a/lib/THCUNN/SpatialFullConvolution.cu +++ b/lib/THCUNN/SpatialFullConvolution.cu @@ -1,337 +1,8 @@ #include "THCUNN.h" #include "im2col.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" -void THNN_CudaSpatialFullConvolution_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, - THCudaTensor *bias, - THCudaTensor *columns, - THCudaTensor *ones, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int adjW, int adjH) -{ - - int nInputPlane = THCudaTensor_size(state, weight, 0); - int nOutputPlane = THCudaTensor_size(state, weight, 1); - - THCUNN_assertSameGPU(state, 6, input, output, weight, - bias, columns, ones); - - THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); - - int batch = 1; - if (input->nDimension == 3) { - THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); - // Force batch - batch = 0; - THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]); - } else { - THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); - } - - long inputWidth = input->size[3]; - long inputHeight = input->size[2]; - long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; - long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; - - // Batch size + input planes - long batchSize = input->size[0]; - - // Resize output - THCudaTensor_resize4d(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); - - // Resize 
temporary columns - THCudaTensor_resize2d(state, columns, nOutputPlane*kW*kH, inputHeight*inputWidth); - - // Define a buffer of ones, for bias accumulation - // Note: this buffer can be shared with other modules, it only ever gets increased, - // and always contains ones. - if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { - // Resize plane and fill with ones... - THCudaTensor_resize2d(state, ones, outputHeight, outputWidth); - THCudaTensor_fill(state, ones, 1); - } - - // Helpers - THCudaTensor *input_n = THCudaTensor_new(state); - THCudaTensor *output_n = THCudaTensor_new(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per output: - THCudaTensor_select(state, input_n, input, 0, elt); - THCudaTensor_select(state, output_n, output, 0, elt); - - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m = weight->size[1] * weight->size[2] * weight->size[3]; - long n = columns->size[1]; - long k = weight->size[0]; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - THCudaBlas_Sgemm( - state, - 'n', 't', - n, m, k, - 1, - THCudaTensor_data(state, input_n), n, - THCudaTensor_data(state, weight), m, - 0, - THCudaTensor_data(state, columns), n - ); - - // Unpack columns back into input: - col2im( - THCState_getCurrentStream(state), - THCudaTensor_data(state, columns), - nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, - 1, 1, THCudaTensor_data(state, output_n) - ); - - // Do Bias after: - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m_ = nOutputPlane; - long n_ = outputHeight * outputWidth; - long k_ = 1; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - if (bias) { - THCudaBlas_Sgemm( - state, - 't', 'n', - n_, m_, k_, - 1, - THCudaTensor_data(state, 
ones), k_, - THCudaTensor_data(state, bias), k_, - 1, - THCudaTensor_data(state, output_n), n_ - ); - } - - } - - // Free - THCudaTensor_free(state, input_n); - THCudaTensor_free(state, output_n); - - // Resize output - if (batch == 0) { - THCudaTensor_resize3d(state, output, nOutputPlane, outputHeight, outputWidth); - THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); - } -} - -void THNN_CudaSpatialFullConvolution_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *weight, - THCudaTensor *gradColumns, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int adjW, int adjH) -{ - int nInputPlane = THCudaTensor_size(state, weight, 0); - int nOutputPlane = THCudaTensor_size(state, weight, 1); - - THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, - gradColumns, gradInput); - THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); - - int batch = 1; - if (input->nDimension == 3) { - // Force batch - batch = 0; - THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]); - THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); - } - - long inputWidth = input->size[3]; - long inputHeight = input->size[2]; - long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; - long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; - - // Batch size + input planes - long batchSize = input->size[0]; - - // Resize output - THCudaTensor_resize4d(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); - - // Resize temporary columns - THCudaTensor_resize2d(state, gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth); - - // Helpers - THCudaTensor *gradInput_n = THCudaTensor_new(state); - THCudaTensor *gradOutput_n = THCudaTensor_new(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; 
elt ++) { - // Matrix mulitply per sample: - THCudaTensor_select(state, gradInput_n, gradInput, 0, elt); - THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); - - // Extract columns: - im2col( - THCState_getCurrentStream(state), - THCudaTensor_data(state, gradOutput_n), - nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, - 1, 1, THCudaTensor_data(state, gradColumns) - ); - - - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m = weight->size[0]; - long n = gradColumns->size[1]; - long k = weight->size[1] * weight->size[2] * weight->size[3]; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - THCudaBlas_Sgemm( - state, - 'n', 'n', - n, m, k, - 1, - THCudaTensor_data(state, gradColumns), n, - THCudaTensor_data(state, weight), k, - 0, - THCudaTensor_data(state, gradInput_n), n - ); - } - - - // Free - THCudaTensor_free(state, gradInput_n); - THCudaTensor_free(state, gradOutput_n); - - // Resize output - if (batch == 0) { - THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth); - THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); - THCudaTensor_resize3d(state, gradInput, nInputPlane, inputHeight, inputWidth); - } -} - - -void THNN_CudaSpatialFullConvolution_accGradParameters( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, - THCudaTensor *columns, - THCudaTensor *ones, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int adjW, int adjH, - float scale) -{ - int nInputPlane = THCudaTensor_size(state, gradWeight, 0); - int nOutputPlane = THCudaTensor_size(state, gradWeight, 1); - - THCUNN_assertSameGPU(state, 6, input, gradOutput, gradWeight, - gradBias, columns, ones); - - THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); - - int batch = 1; - if 
(input->nDimension == 3) { - // Force batch - batch = 0; - THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]); - THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); - } - - long inputWidth = input->size[3]; - long inputHeight = input->size[2]; - long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; - long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; - - // Batch size + input planes - long batchSize = input->size[0]; - - // Define a buffer of ones, for bias accumulation - if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { - // Resize plane and fill with ones... - THCudaTensor_resize2d(state, ones, outputHeight, outputWidth); - THCudaTensor_fill(state, ones, 1); - } - - // Resize temporary columns - THCudaTensor_resize2d(state, columns, nOutputPlane*kW*kH, inputHeight*inputWidth); - - // Helpers - THCudaTensor *input_n = THCudaTensor_new(state); - THCudaTensor *gradOutput_n = THCudaTensor_new(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per output: - THCudaTensor_select(state, input_n, input, 0, elt); - THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); - - // Extract columns: - im2col( - THCState_getCurrentStream(state), - THCudaTensor_data(state, gradOutput_n), - nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, - 1, 1, THCudaTensor_data(state, columns) - ); - - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long n = columns->size[0]; // nOutputPlane * kh * kw - long m = input_n->size[0]; // nInputPlane - long k = columns->size[1]; // inputHeight * inputWidth - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - THCudaBlas_Sgemm( - state, - 't', 'n', - n, m, k, - scale, - THCudaTensor_data(state, columns), k, - 
THCudaTensor_data(state, input_n), k, - 1, - THCudaTensor_data(state, gradWeight), n - ); - - // Do Bias: - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m_ = nOutputPlane; - long k_ = outputHeight * outputWidth; - - // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) - if (gradBias) { - THCudaBlas_Sgemv( - state, - 't', - k_, m_, - scale, - THCudaTensor_data(state, gradOutput_n), k_, - THCudaTensor_data(state, ones), 1, - 1, - THCudaTensor_data(state, gradBias), 1 - ); - } - } - - // Free - THCudaTensor_free(state, input_n); - THCudaTensor_free(state, gradOutput_n); - - // Resize - if (batch == 0) { - THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth); - THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); - } -} +#include "generic/SpatialFullConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/SpatialMaxPooling.cu b/lib/THCUNN/SpatialMaxPooling.cu index ac6e3fd..90e6fe4 100644 --- a/lib/THCUNN/SpatialMaxPooling.cu +++ b/lib/THCUNN/SpatialMaxPooling.cu @@ -1,18 +1,4 @@ #include "THCUNN.h" -#include "common.h" -void THNN_CudaSpatialMaxPooling_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode) -{ - THNN_CudaSpatialDilatedMaxPooling_updateOutput( - state, input, output, indices, - kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode); - -} - -void THNN_CudaSpatialMaxPooling_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *indices, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode) -{ - THNN_CudaSpatialDilatedMaxPooling_updateGradInput( - state, input, gradOutput, gradInput, indices, - kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode); - -} +#include "generic/SpatialMaxPooling.cu" +#include 
"THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/SpatialMaxUnpooling.cu b/lib/THCUNN/SpatialMaxUnpooling.cu index b56bd56..2d8a481 100644 --- a/lib/THCUNN/SpatialMaxUnpooling.cu +++ b/lib/THCUNN/SpatialMaxUnpooling.cu @@ -2,7 +2,7 @@ #include "common.h" template <typename Dtype> -__global__ void MaxUnpoolForward(const int nthreads, const Dtype* bottom_data, const Dtype* bottom_mask, +__global__ void MaxUnpoolForward(const int nthreads, const Dtype* bottom_data, const long* bottom_mask, const int num, const int channels, const int iheight, const int iwidth, const int oheight, const int owidth, Dtype* top_data) { CUDA_KERNEL_LOOP(index, nthreads) { //index here indices the input pixels int c = (index / iwidth / iheight) % channels; @@ -15,7 +15,7 @@ __global__ void MaxUnpoolForward(const int nthreads, const Dtype* bottom_data, c } template <typename Dtype> -__global__ void MaxUnpoolBackward(const int nthreads, const Dtype* top_diff, const Dtype* bottom_mask, +__global__ void MaxUnpoolBackward(const int nthreads, const Dtype* top_diff, const long* bottom_mask, const int num, const int channels, const int iheight, const int iwidth, const int oheight, const int owidth, Dtype* bottom_diff) { CUDA_KERNEL_LOOP(index, nthreads) { int c = (index / iwidth / iheight) % channels; @@ -27,79 +27,5 @@ __global__ void MaxUnpoolBackward(const int nthreads, const Dtype* top_diff, con } } -void THNN_CudaSpatialMaxUnpooling_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices, int owidth, int oheight) -{ - THCUNN_assertSameGPU(state, 3, input, output, indices); - THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected"); - - long nInputCols, nInputRows, nInputPlane, batchSize; - - if (input->nDimension == 3) { - nInputCols = input->size[2]; - nInputRows = input->size[1]; - nInputPlane = input->size[0]; - batchSize = 1; - } - else - { - nInputCols = input->size[3]; - nInputRows = input->size[2]; - 
nInputPlane = input->size[1]; - batchSize = input->size[0]; - } - - input = THCudaTensor_newContiguous(state, input); - indices = THCudaTensor_newContiguous(state, indices); - THCudaTensor_resize4d(state, output, batchSize, nInputPlane, oheight, owidth); - THCudaTensor_zero(state, output); - - int count = THCudaTensor_nElement(state, input); - - MaxUnpoolForward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> - (count, THCudaTensor_data(state, input), THCudaTensor_data(state, indices), - batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCudaTensor_data(state, output)); - THCudaCheck(cudaGetLastError()); - - if(input->nDimension == 3) - THCudaTensor_resize3d(state, output, nInputPlane, oheight, owidth); - - THCudaTensor_free(state, input); - -} - -void THNN_CudaSpatialMaxUnpooling_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *indices, int owidth, int oheight) -{ - THCUNN_assertSameGPU(state, 4, input, gradOutput, indices, gradInput); - - long nInputCols, nInputRows, nInputPlane, batchSize; - - if (input->nDimension == 3) { - nInputCols = input->size[2]; - nInputRows = input->size[1]; - nInputPlane = input->size[0]; - batchSize = 1; - } - else - { - nInputCols = input->size[3]; - nInputRows = input->size[2]; - nInputPlane = input->size[1]; - batchSize = input->size[0]; - } - - input = THCudaTensor_newContiguous(state, input); - indices = THCudaTensor_newContiguous(state, indices); - gradOutput = THCudaTensor_newContiguous(state, gradOutput); - THCudaTensor_resizeAs(state, gradInput, input); - - int count = THCudaTensor_nElement(state, input); - - MaxUnpoolBackward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> - (count, THCudaTensor_data(state, gradOutput), THCudaTensor_data(state, indices), - batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCudaTensor_data(state, gradInput)); - 
THCudaCheck(cudaGetLastError()); - - // clean - THCudaTensor_free(state, input); - THCudaTensor_free(state, gradOutput); -} +#include "generic/SpatialMaxUnpooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/SpatialReflectionPadding.cu b/lib/THCUNN/SpatialReflectionPadding.cu index c3ae14e..75aae1a 100644 --- a/lib/THCUNN/SpatialReflectionPadding.cu +++ b/lib/THCUNN/SpatialReflectionPadding.cu @@ -5,9 +5,14 @@ #include "THCDeviceUtils.cuh" #include "THCReduceApplyUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template<typename Dtype> __global__ void SpatialReflectionPadding_updateOutput( - THCDeviceTensor<float, 4> input, - THCDeviceTensor<float, 4> output, + THCDeviceTensor<Dtype, 4> input, + THCDeviceTensor<Dtype, 4> output, int padT, int padB, int padL, int padR) { int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; @@ -36,70 +41,14 @@ __global__ void SpatialReflectionPadding_updateOutput( + 2 * padT + input.getSize(2) - 1 - oStartY + iStartY; - float valueToCopy = input[batch][plane][inputPointY][inputPointX]; + Dtype valueToCopy = input[batch][plane][inputPointY][inputPointX]; output[batch][plane][outputPointY][outputPointX] = valueToCopy; } -void THNN_CudaSpatialReflectionPadding_updateOutput(THCState *state, - THCudaTensor *input, - THCudaTensor *output, - int padL, int padR, - int padT, int padB - ) { - THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input), 2, - "input tensor must fit into 32-bit index math"); - - int planeDim = 0; - int dimh = 1; - int dimw = 2; - int numBatch = 1; - - int numInputDims = THCudaTensor_nDimension(state, input); - THArgCheck(numInputDims == 3 || numInputDims == 4, 2, - "input must be 3 or 4-dimensional"); - - if (numInputDims == 4) { - numBatch = THCudaTensor_size(state, input, 0); - planeDim++; - dimh++; - dimw++; - } - - int numPlanes = THCudaTensor_size(state, input, planeDim); - int inputH = THCudaTensor_size(state, input, 
dimh); - int inputW = THCudaTensor_size(state, input, dimw); - int outputH = inputH + padT + padB; - int outputW = inputW + padL + padR; - - THCDeviceTensor<float, 4> devInput; - THCDeviceTensor<float, 4> devOutput; - - if (numInputDims == 3) { - THCudaTensor_resize3d(state, output, numPlanes, outputH, outputW); - - devInput = toDeviceTensor<float, 3>(state, input).upcastOuter<4>(); - devOutput = toDeviceTensor<float, 3>(state, output).upcastOuter<4>(); - } else { - THCudaTensor_resize4d(state, output, numBatch, numPlanes, outputH, outputW); - - devInput = toDeviceTensor<float, 4>(state, input); - devOutput = toDeviceTensor<float, 4>(state, output); - } - - int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3); - dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), - devOutput.getSize(1), - devOutput.getSize(0)); - dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); - - SpatialReflectionPadding_updateOutput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>( - devInput, devOutput, padT, padB, padL, padR); - THCudaCheck(cudaGetLastError()); -} - +template <typename Dtype> __global__ void SpatialReflectionPadding_updateGradInput( - THCDeviceTensor<float, 4> gradInput, - THCDeviceTensor<float, 4> gradOutput, + THCDeviceTensor<Dtype, 4> gradInput, + THCDeviceTensor<Dtype, 4> gradOutput, int padT, int padB, int padL, int padR) { int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; @@ -128,54 +77,9 @@ __global__ void SpatialReflectionPadding_updateGradInput( + 2 * padT + gradInput.getSize(2) - 1 - oStartY + iStartY; - float valueToCopy = gradOutput[batch][plane][outputPointY][outputPointX]; + Dtype valueToCopy = gradOutput[batch][plane][outputPointY][outputPointX]; atomicAdd(&gradInput[batch][plane][inputPointY][inputPointX], valueToCopy); } -void THNN_CudaSpatialReflectionPadding_updateGradInput(THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - int padL, int padR, - int padT, int padB) { - 
- THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input), 2, - "input tensor must fit into 32-bit index math"); - THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, gradOutput), 3, - "output gradient tensor must fit into 32-bit index math"); - - int planeDim = 0; - int dimh = 1; - int dimw = 2; - - int numInputDims = THCudaTensor_nDimension(state, input); - if (numInputDims == 4) { - planeDim++; - dimh++; - dimw++; - } - - THCudaTensor_resizeAs(state, gradInput, input); - THCudaTensor_zero(state, gradInput); - - THCDeviceTensor<float, 4> devGradInput; - THCDeviceTensor<float, 4> devGradOutput; - - if (numInputDims == 3) { - devGradInput = toDeviceTensor<float, 3>(state, gradInput).upcastOuter<4>(); - devGradOutput = toDeviceTensor<float, 3>(state, gradOutput).upcastOuter<4>(); - } else { - devGradInput = toDeviceTensor<float, 4>(state, gradInput); - devGradOutput = toDeviceTensor<float, 4>(state, gradOutput); - } - - int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3); - dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), - devGradOutput.getSize(1), - devGradOutput.getSize(0)); - dim3 blockSize(outputPlaneSize > 256 ? 
256 : outputPlaneSize); - - SpatialReflectionPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>( - devGradInput, devGradOutput, padT, padB, padL, padR); - THCudaCheck(cudaGetLastError()); -} +#include "generic/SpatialReflectionPadding.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/SpatialReplicationPadding.cu b/lib/THCUNN/SpatialReplicationPadding.cu index fc09291..f022e57 100644 --- a/lib/THCUNN/SpatialReplicationPadding.cu +++ b/lib/THCUNN/SpatialReplicationPadding.cu @@ -5,9 +5,14 @@ #include "THCDeviceUtils.cuh" #include "THCReduceApplyUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" + +template <typename Dtype> __global__ void SpatialReplicationPadding_updateOutput( - THCDeviceTensor<float, 4> input, - THCDeviceTensor<float, 4> output, + THCDeviceTensor<Dtype, 4> input, + THCDeviceTensor<Dtype, 4> output, int padT, int padB, int padL, int padR) { int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; @@ -27,70 +32,14 @@ __global__ void SpatialReplicationPadding_updateOutput( int inputPointX = min(max(padL, outputPointX), input.getSize(3) + padL - 1) - oStartX + iStartX; int inputPointY = min(max(padT, outputPointY), input.getSize(2) + padT - 1) - oStartY + iStartY; - float valueToCopy = input[batch][plane][inputPointY][inputPointX]; + Dtype valueToCopy = input[batch][plane][inputPointY][inputPointX]; output[batch][plane][outputPointY][outputPointX] = valueToCopy; } -void THNN_CudaSpatialReplicationPadding_updateOutput(THCState *state, - THCudaTensor *input, - THCudaTensor *output, - int padL, int padR, - int padT, int padB - ) { - THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input), 2, - "input tensor must fit into 32-bit index math"); - - int planeDim = 0; - int dimh = 1; - int dimw = 2; - int numBatch = 1; - - int numInputDims = THCudaTensor_nDimension(state, input); - THArgCheck(numInputDims == 3 || numInputDims == 4, 2, - "input must 
be 3 or 4-dimensional"); - - if (numInputDims == 4) { - numBatch = THCudaTensor_size(state, input, 0); - planeDim++; - dimh++; - dimw++; - } - - int numPlanes = THCudaTensor_size(state, input, planeDim); - int inputH = THCudaTensor_size(state, input, dimh); - int inputW = THCudaTensor_size(state, input, dimw); - int outputH = inputH + padT + padB; - int outputW = inputW + padL + padR; - - THCDeviceTensor<float, 4> devInput; - THCDeviceTensor<float, 4> devOutput; - - if (numInputDims == 3) { - THCudaTensor_resize3d(state, output, numPlanes, outputH, outputW); - - devInput = toDeviceTensor<float, 3>(state, input).upcastOuter<4>(); - devOutput = toDeviceTensor<float, 3>(state, output).upcastOuter<4>(); - } else { - THCudaTensor_resize4d(state, output, numBatch, numPlanes, outputH, outputW); - - devInput = toDeviceTensor<float, 4>(state, input); - devOutput = toDeviceTensor<float, 4>(state, output); - } - - int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3); - dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), - devOutput.getSize(1), - devOutput.getSize(0)); - dim3 blockSize(outputPlaneSize > 256 ? 
256 : outputPlaneSize); - - SpatialReplicationPadding_updateOutput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>( - devInput, devOutput, padT, padB, padL, padR); - -} - +template <typename Dtype> __global__ void SpatialReplicationPadding_updateGradInput( - THCDeviceTensor<float, 4> gradInput, - THCDeviceTensor<float, 4> gradOutput, + THCDeviceTensor<Dtype, 4> gradInput, + THCDeviceTensor<Dtype, 4> gradOutput, int padT, int padB, int padL, int padR) { int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; @@ -110,54 +59,10 @@ __global__ void SpatialReplicationPadding_updateGradInput( int inputPointX = min(max(padL, outputPointX), gradInput.getSize(3) + padL - 1) - oStartX + iStartX; int inputPointY = min(max(padT, outputPointY), gradInput.getSize(2) + padT - 1) - oStartY + iStartY; - float valueToCopy = gradOutput[batch][plane][outputPointY][outputPointX]; + Dtype valueToCopy = gradOutput[batch][plane][outputPointY][outputPointX]; atomicAdd(&gradInput[batch][plane][inputPointY][inputPointX], valueToCopy); } -void THNN_CudaSpatialReplicationPadding_updateGradInput(THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - int padL, int padR, - int padT, int padB) { - - THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input), 2, - "input tensor must fit into 32-bit index math"); - THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, gradOutput), 3, - "output gradient tensor must fit into 32-bit index math"); - - int planeDim = 0; - int dimh = 1; - int dimw = 2; - - int numInputDims = THCudaTensor_nDimension(state, input); - if (numInputDims == 4) { - planeDim++; - dimh++; - dimw++; - } - - THCudaTensor_resizeAs(state, gradInput, input); - THCudaTensor_zero(state, gradInput); - THCDeviceTensor<float, 4> devGradInput; - THCDeviceTensor<float, 4> devGradOutput; - - if (numInputDims == 3) { - devGradInput = toDeviceTensor<float, 3>(state, gradInput).upcastOuter<4>(); - devGradOutput = 
toDeviceTensor<float, 3>(state, gradOutput).upcastOuter<4>(); - } else { - devGradInput = toDeviceTensor<float, 4>(state, gradInput); - devGradOutput = toDeviceTensor<float, 4>(state, gradOutput); - } - - int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3); - dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), - devGradOutput.getSize(1), - devGradOutput.getSize(0)); - dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); - - SpatialReplicationPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>( - devGradInput, devGradOutput, padT, padB, padL, padR); - -} +#include "generic/SpatialReplicationPadding.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/SpatialSubSampling.cu b/lib/THCUNN/SpatialSubSampling.cu index d801862..a38661b 100644 --- a/lib/THCUNN/SpatialSubSampling.cu +++ b/lib/THCUNN/SpatialSubSampling.cu @@ -1,5 +1,7 @@ #include "THCUNN.h" -#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" #define CUDA_MAX_THREADS 1024 // this is safe, in reality 256 is our limit @@ -8,7 +10,8 @@ * this function subsamples an input 3D tensor along dimensions 1 and 2 * 3D input, 3D output, 1D weight, 1D bias */ -__global__ void subsample(float *input, float *output, float *weight, float *bias, + template <typename Dtype, typename Acctype> +__global__ void subsample(Dtype *input, Dtype *output, Dtype *weight, Dtype *bias, int input_n, int input_h, int input_w, int kH, int kW, int dH, int dW) { @@ -37,18 +40,18 @@ __global__ void subsample(float *input, float *output, float *weight, float *bia input = input + i*input_w*input_h; // Get the good mask for (k,i) (k out, i in) - float the_weight = weight[k]; + Dtype the_weight = weight[k]; // Initialize to the bias - float the_bias = bias[k]; + Dtype the_bias = bias[k]; // For all output pixels... 
for(yy = yy_start; yy < yy_end; yy+=yy_step) { for(xx = xx_start; xx < xx_end; xx+=xx_step) { // Compute the mean of the input image... - float *ptr_input = input + yy*dH*input_w + xx*dW; - float *ptr_output = output + yy*output_w + xx; - float sum = 0; + Dtype *ptr_input = input + yy*dH*input_w + xx*dW; + Dtype *ptr_output = output + yy*output_w + xx; + Acctype sum = 0; int kx, ky; for(ky = 0; ky < kH; ky++) { for(kx = 0; kx < kW; kx++) @@ -56,7 +59,7 @@ __global__ void subsample(float *input, float *output, float *weight, float *bia ptr_input += input_w; // next input line } // Update output - *ptr_output = the_weight*sum + the_bias; + *ptr_output = ScalarConvert<Acctype, Dtype>::to(the_weight*sum + the_bias); } } } @@ -65,7 +68,8 @@ __global__ void subsample(float *input, float *output, float *weight, float *bia * Description: * this function computes the gradWeight from input and gradOutput */ -__global__ void subgradweight(float *input, float *gradOutput, float *gradWeight, float *gradBias, + template <typename Dtype, typename Acctype> +__global__ void subgradweight(Dtype *input, Dtype *gradOutput, Dtype *gradWeight, Dtype *gradBias, int input_n, int input_h, int input_w, int kH, int kW, int dH, int dW, float scale) @@ -98,15 +102,15 @@ __global__ void subgradweight(float *input, float *gradOutput, float *gradWeight int tid = blockDim.x*threadIdx.y + threadIdx.x; // create array to hold partial sums - __shared__ float sums[CUDA_MAX_THREADS]; + __shared__ Acctype sums[CUDA_MAX_THREADS]; sums[tid] = 0; // compute partial sums for(yy = yy_start; yy < yy_end; yy+=yy_step) { for(xx = xx_start; xx < xx_end; xx+=xx_step) { - float *ptr_input = input + yy*dH*input_w + xx*dW; - float *ptr_gradOutput = gradOutput + yy*output_w + xx; - float z = *ptr_gradOutput; + Dtype *ptr_input = input + yy*dH*input_w + xx*dW; + Dtype *ptr_gradOutput = gradOutput + yy*output_w + xx; + Dtype z = *ptr_gradOutput; long kx, ky; for(ky = 0; ky < kH; ky++) { for(kx = 0; kx < kW; kx++) { @@ 
-120,7 +124,11 @@ __global__ void subgradweight(float *input, float *gradOutput, float *gradWeight // reduce: accumulate all partial sums to produce final gradWeight if ((threadIdx.x == 0) && (threadIdx.y == 0)) { - for(int i = 0; i < blockDim.x*blockDim.y; i++) gradWeight[k] += scale*sums[i]; + Acctype scaledSums = Acctype(0); + for(int i = 0; i < blockDim.x*blockDim.y; i++) { + scaledSums += scale*sums[i]; + } + gradWeight[k] += ScalarConvert<Acctype, Dtype>::to(scaledSums); } __syncthreads(); @@ -133,8 +141,11 @@ __global__ void subgradweight(float *input, float *gradOutput, float *gradWeight // reduce gradBias if ((threadIdx.x == 0) && (threadIdx.y == 0)) { - for (int i=0; i<(blockDim.x*blockDim.y); i++) - gradBias[k] += scale*sums[i]; + Acctype scaledSums = Acctype(0); + for (int i=0; i<(blockDim.x*blockDim.y); i++) { + scaledSums += scale*sums[i]; + } + gradBias[k] += ScalarConvert<Acctype, Dtype>::to(scaledSums); } } @@ -142,7 +153,8 @@ __global__ void subgradweight(float *input, float *gradOutput, float *gradWeight * Description: * this function computes the gradInput from weight and gradOutput */ -__global__ void subgradinput(float *gradInput, float *gradOutput, float *weight, + template <typename Dtype> +__global__ void subgradinput(Dtype *gradInput, Dtype *gradOutput, Dtype *weight, int input_n, int input_h, int input_w, int kH, int kW, int dH, int dW) { @@ -171,18 +183,20 @@ __global__ void subgradinput(float *gradInput, float *gradOutput, float *weight, gradInput = gradInput + i*input_w*input_h; // get weight - float the_weight = weight[k]; + Dtype the_weight = weight[k]; // compute gradInput for(yy = yy_start; yy < yy_end; yy+=yy_step) { for(xx = xx_start; xx < xx_end; xx+=xx_step) { - float *ptr_gradInput = gradInput + yy*dH*input_w + xx*dW; - float *ptr_gradOutput = gradOutput + yy*output_w + xx; - float z = *ptr_gradOutput * the_weight; + Dtype *ptr_gradInput = gradInput + yy*dH*input_w + xx*dW; + Dtype *ptr_gradOutput = gradOutput + yy*output_w + 
xx; + Dtype z = *ptr_gradOutput * the_weight; int kx, ky; for(ky = 0; ky < kH; ky++) { - for(kx = 0; kx < kW; kx++) + for(kx = 0; kx < kW; kx++) { + // FIXME: should this be done at accreal precision? ptr_gradInput[kx] += z; + } ptr_gradInput += input_w; } } @@ -193,7 +207,8 @@ __global__ void subgradinput(float *gradInput, float *gradOutput, float *weight, * Description: * this function computes the gradInput from weight and gradOutput */ -__global__ void subgradinputAtomic(float *gradInput, float *gradOutput, float *weight, + template <typename Dtype> +__global__ void subgradinputAtomic(Dtype *gradInput, Dtype *gradOutput, Dtype *weight, int input_n, int input_h, int input_w, int kH, int kW, int dH, int dW) { @@ -222,17 +237,18 @@ __global__ void subgradinputAtomic(float *gradInput, float *gradOutput, float *w gradInput = gradInput + i*input_w*input_h; // get weight - float the_weight = weight[k]; + Dtype the_weight = weight[k]; // compute gradInput for(yy = yy_start; yy < yy_end; yy+=yy_step) { for(xx = xx_start; xx < xx_end; xx+=xx_step) { - float *ptr_gradInput = gradInput + yy*dH*input_w + xx*dW; - float *ptr_gradOutput = gradOutput + yy*output_w + xx; - float z = *ptr_gradOutput * the_weight; + Dtype *ptr_gradInput = gradInput + yy*dH*input_w + xx*dW; + Dtype *ptr_gradOutput = gradOutput + yy*output_w + xx; + Dtype z = *ptr_gradOutput * the_weight; int kx, ky; for(ky = 0; ky < kH; ky++) { for(kx = 0; kx < kW; kx++) { + // FIXME: should this be done at accreal precision? 
atomicAdd(&(ptr_gradInput[kx]), z); } ptr_gradInput += input_w; @@ -241,205 +257,8 @@ __global__ void subgradinputAtomic(float *gradInput, float *gradOutput, float *w } } -void THNN_CudaSpatialSubSampling_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *weight, THCudaTensor *bias, int kW, int kH, int dW, int dH) -{ - float *weight_data = THCudaTensor_data(state, weight); - float *bias_data = THCudaTensor_data(state, bias); - float *output_data; - float *input_data; - - int nInputPlane = THCudaTensor_size(state, weight, 0); - - THCUNN_assertSameGPU(state, 4, input, output, weight, bias); - THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected"); - - if (input->nDimension == 3) { - long nInputCols = input->size[2]; - long nInputRows = input->size[1]; - long nOutputCols = (nInputCols - kW) / dW + 1; - long nOutputRows = (nInputRows - kH) / dH + 1; - - THArgCheck(input->size[0] == nInputPlane, 2, "invalid number of input planes"); - THArgCheck(nInputCols >= kW && nInputRows >= kH, 2, "input image smaller than kernel size"); - - input = THCudaTensor_newContiguous(state, input); - input_data = THCudaTensor_data(state, input); - - THCudaTensor_resize3d(state, output, nInputPlane, nOutputRows, nOutputCols); - output_data = THCudaTensor_data(state, output); - - // cuda blocks & threads: - int yblocks = (int)(16L / nInputPlane); - yblocks = yblocks < 1 ? 
1 : yblocks; - dim3 blocks(nInputPlane,yblocks); - dim3 threads(32,8); - - // run subsample kernel - subsample <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( - input_data, output_data, weight_data, bias_data, - nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); - THCudaCheck(cudaGetLastError()); - } else { - long nInputCols = input->size[3]; - long nInputRows = input->size[2]; - long nbatch = input->size[0]; - long nOutputCols = (nInputCols - kW) / dW + 1; - long nOutputRows = (nInputRows - kH) / dH + 1; - - THArgCheck(input->size[1] == nInputPlane, 2, "invalid number of input planes"); - THArgCheck(nInputCols >= kW && nInputRows >= kH, 2, "input image smaller than kernel size"); - - input = THCudaTensor_newContiguous(state, input); - input_data = THCudaTensor_data(state, input); - - THCudaTensor_resize4d(state, output, nbatch, nInputPlane, nOutputRows, nOutputCols); - output_data = THCudaTensor_data(state, output); - - // cuda blocks & threads: - int yblocks = (int)(16L / nInputPlane); - yblocks = yblocks < 1 ? 
1 : yblocks; - dim3 blocks(nInputPlane*nbatch,yblocks); - dim3 threads(32,8); - - // run subsample kernel - subsample <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( - input_data, output_data, weight_data, bias_data, - nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); - THCudaCheck(cudaGetLastError()); - } - - // clean - THCudaTensor_free(state, input); - -} - -void THNN_CudaSpatialSubSampling_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *weight, int kW, int kH, int dW, int dH) -{ - THCUNN_assertSameGPU(state, 4, input, gradOutput, weight, gradInput); - - int nInputPlane = THCudaTensor_size(state, weight, 0); - - if (input->nDimension == 3) { - long nInputCols = input->size[2]; - long nInputRows = input->size[1]; - - float *weight_data = THCudaTensor_data(state, weight); - float *gradOutput_data = THCudaTensor_data(state, gradOutput); - float *gradInput_data; - - THCudaTensor_resizeAs(state, gradInput, input); - THCudaTensor_zero(state, gradInput); - gradInput_data = THCudaTensor_data(state, gradInput); - - // cuda blocks & threads: - int yblocks = (int)(16L / nInputPlane); - yblocks = yblocks < 1 ? 
1 : yblocks; - dim3 blocks(nInputPlane,yblocks); - dim3 threads(32,8); - - // run updateGradInput kernel - if (kH <= dH && kW <= dW) { - subgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( - gradInput_data, gradOutput_data, weight_data, - nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); - } else { - subgradinputAtomic <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( - gradInput_data, gradOutput_data, weight_data, - nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); - } - THCudaCheck(cudaGetLastError()); - } else { - long nInputCols = input->size[3]; - long nInputRows = input->size[2]; - long nbatch = input->size[0]; - - float *weight_data = THCudaTensor_data(state, weight); - float *gradOutput_data = THCudaTensor_data(state, gradOutput); - float *gradInput_data; - - THCudaTensor_resizeAs(state, gradInput, input); - THCudaTensor_zero(state, gradInput); - gradInput_data = THCudaTensor_data(state, gradInput); - - // cuda blocks & threads: - int yblocks = (int)(16L / nInputPlane); - yblocks = yblocks < 1 ? 
1 : yblocks; - dim3 blocks(nInputPlane*nbatch,yblocks); - dim3 threads(32,8); - - // run updateGradInput kernel - if (kH <= dH && kW <= dW) { - subgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( - gradInput_data, gradOutput_data, weight_data, - nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); - } else { - subgradinputAtomic <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( - gradInput_data, gradOutput_data, weight_data, - nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); - } - THCudaCheck(cudaGetLastError()); - } -} - -void THNN_CudaSpatialSubSampling_accGradParameters(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradWeight, THCudaTensor *gradBias, int kW, int kH, int dW, int dH, float scale) -{ - THCUNN_assertSameGPU(state, 4, input, gradOutput, gradWeight, gradBias); - - int nInputPlane = THCudaTensor_size(state, gradWeight, 0); - - if (input->nDimension == 3) { - long nInputCols = input->size[2]; - long nInputRows = input->size[1]; - - float *gradWeight_data = THCudaTensor_data(state, gradWeight); - float *gradBias_data = THCudaTensor_data(state, gradBias); - float *gradOutput_data = THCudaTensor_data(state, gradOutput); - float *input_data; - - input = THCudaTensor_newContiguous(state, input); - input_data = THCudaTensor_data(state, input); - - // cuda blocks & threads: - dim3 blocks(nInputPlane); - dim3 threads(32,8); - - // run gradweight kernel - subgradweight <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( - input_data, gradOutput_data, gradWeight_data, gradBias_data, - nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale); - THCudaCheck(cudaGetLastError()); - } else { - long nInputCols = input->size[3]; - long nInputRows = input->size[2]; - long nbatch = input->size[0]; - - float *gradWeight_data = THCudaTensor_data(state, gradWeight); - float *gradBias_data = THCudaTensor_data(state, gradBias); - float *gradOutput_data = THCudaTensor_data(state, gradOutput); - 
float *input_data; - - input = THCudaTensor_newContiguous(state, input); - input_data = THCudaTensor_data(state, input); - - // cuda blocks & threads: - dim3 blocks(nInputPlane); - dim3 threads(32,8); - - // run gradweight kernel - long sl; - for (sl=0; sl<nbatch; sl++) { - subgradweight <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( - input_data + sl*input->stride[0], - gradOutput_data + sl*gradOutput->stride[0], - gradWeight_data, gradBias_data, - nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale); - } - THCudaCheck(cudaGetLastError()); - } - // clean - THCudaTensor_free(state, input); - -} +#include "generic/SpatialSubSampling.cu" +#include "THCGenerateFloatTypes.h" #undef CUDA_MAX_THREADS diff --git a/lib/THCUNN/SpatialUpSamplingBilinear.cu b/lib/THCUNN/SpatialUpSamplingBilinear.cu index 27fab85..039f8e7 100644 --- a/lib/THCUNN/SpatialUpSamplingBilinear.cu +++ b/lib/THCUNN/SpatialUpSamplingBilinear.cu @@ -5,10 +5,14 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" +template<typename Dtype, typename Acctype> __global__ void caffe_gpu_interp2_kernel(const int n, - const float rheight, const float rwidth, - const THCDeviceTensor<float, 4> data1, THCDeviceTensor<float, 4> data2) { + const Acctype rheight, const Acctype rwidth, + const THCDeviceTensor<Dtype, 4> data1, THCDeviceTensor<Dtype, 4> data2) { int index = threadIdx.x + blockIdx.x * blockDim.x; const int batchsize = data1.getSize(0); const int channels = data1.getSize(1); @@ -26,72 +30,42 @@ __global__ void caffe_gpu_interp2_kernel(const int n, const int w1 = w2; for (int n = 0; n < batchsize ; n++){ for (int c = 0; c < channels; ++c) { - const float val = data1[n][c][h1][w1]; + const Dtype val = data1[n][c][h1][w1]; data2[n][c][h2][w2] = val; } } return; } // - const float h1r = rheight * h2; + const Acctype h1r = rheight * h2; const int h1 = h1r; 
const int h1p = (h1 < height1 - 1) ? 1 : 0; - const float h1lambda = h1r - h1; - const float h0lambda = 1.0f - h1lambda; + const Acctype h1lambda = h1r - h1; + const Acctype h0lambda = Acctype(1) - h1lambda; // - const float w1r = rwidth * w2; + const Acctype w1r = rwidth * w2; const int w1 = w1r; const int w1p = (w1 < width1 - 1) ? 1 : 0; - const float w1lambda = w1r - w1; - const float w0lambda = 1.0f - w1lambda; + const Acctype w1lambda = w1r - w1; + const Acctype w0lambda = Acctype(1) - w1lambda; // for (int n = 0; n < batchsize ; n++){ for (int c = 0; c < channels; ++c) { - const float val = h0lambda * (w0lambda * data1[n][c][h1][w1] + const Acctype val = h0lambda * (w0lambda * data1[n][c][h1][w1] + w1lambda * data1[n][c][h1][w1+w1p]) + h1lambda * (w0lambda * data1[n][c][h1+h1p][w1] + w1lambda * data1[n][c][h1+h1p][w1+w1p]); - data2[n][c][h2][w2] = val; + data2[n][c][h2][w2] = ScalarConvert<Acctype, Dtype>::to(val); } } } } -void THNN_CudaSpatialUpSamplingBilinear_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - int outputHeight, - int outputWidth) { - input = THCudaTensor_newContiguous(state, input); - output = THCudaTensor_newContiguous(state, output); - THCUNN_assertSameGPU(state, 2, input, output); - THCudaTensor_zero(state, output); - THCDeviceTensor<float, 4> idata = toDeviceTensor<float, 4>(state, input); - THCDeviceTensor<float, 4> odata = toDeviceTensor<float, 4>(state, output); - int height1 = idata.getSize(2); - int width1 = idata.getSize(3); - int height2 = odata.getSize(2); - int width2 = odata.getSize(3); - assert( height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0); - const float rheight= (height2 > 1) ? (float)(height1 - 1)/(height2 - 1) : 0.f; - const float rwidth = (width2 > 1) ? 
(float)(width1 - 1)/(width2 - 1) : 0.f; - const int num_kernels = height2 * width2; - const int num_threads = - THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - caffe_gpu_interp2_kernel<<<THCCeilDiv(num_kernels, num_threads), num_threads , - 0 , stream>>>(num_kernels, rheight, rwidth, idata, odata); - THCudaCheck(cudaGetLastError()); - THCudaTensor_free(state, input); - THCudaTensor_free(state, output); -} - - // Backward (adjoint) operation 1 <- 2 (accumulates) +template <typename Dtype, typename Acctype> __global__ void caffe_gpu_interp2_kernel_backward(const int n, - const float rheight, const float rwidth, - THCDeviceTensor<float, 4> data1, const THCDeviceTensor<float, 4> data2){ + const Acctype rheight, const Acctype rwidth, + THCDeviceTensor<Dtype, 4> data1, const THCDeviceTensor<Dtype, 4> data2){ int index = threadIdx.x + blockIdx.x * blockDim.x; const int batchsize = data1.getSize(0); const int channels = data1.getSize(1); @@ -108,69 +82,41 @@ __global__ void caffe_gpu_interp2_kernel_backward(const int n, const int w1 = w2; for (int n = 0; n < batchsize ; n++){ for (int c = 0; c < channels; ++c) { - const float val = data2[n][c][h1][w1]; + const Dtype val = data2[n][c][h1][w1]; data1[n][c][h2][w2] += val; } } return; } // - const float h1r = rheight * h2; + const Acctype h1r = rheight * h2; const int h1 = h1r; const int h1p = (h1 < height1 - 1) ? 1 : 0; - const float h1lambda = h1r - h1; - const float h0lambda = 1.0f - h1lambda; + const Acctype h1lambda = h1r - h1; + const Acctype h0lambda = Acctype(1) - h1lambda; // - const float w1r = rwidth * w2; + const Acctype w1r = rwidth * w2; const int w1 = w1r; const int w1p = (w1 < width1 - 1) ? 
1 : 0; - const float w1lambda = w1r - w1; - const float w0lambda = 1.0f - w1lambda; + const Acctype w1lambda = w1r - w1; + const Acctype w0lambda = Acctype(1) - w1lambda; // for (int n = 0; n < batchsize ; n++){ for (int c = 0; c < channels; ++c) { - const float d2val = data2[n][c][h2][w2]; - atomicAdd(data1[n][c][h1][w1].data(), h0lambda * w0lambda * d2val); - atomicAdd(data1[n][c][h1][w1+w1p].data(), h0lambda * w1lambda * d2val); - atomicAdd(data1[n][c][h1+h1p][w1].data(), h1lambda * w0lambda * d2val); + const Dtype d2val = data2[n][c][h2][w2]; + atomicAdd(data1[n][c][h1][w1].data(), + ScalarConvert<Acctype, Dtype>::to(h0lambda * w0lambda * d2val)); + atomicAdd(data1[n][c][h1][w1+w1p].data(), + ScalarConvert<Acctype, Dtype>::to(h0lambda * w1lambda * d2val)); + atomicAdd(data1[n][c][h1+h1p][w1].data(), + ScalarConvert<Acctype, Dtype>::to(h1lambda * w0lambda * d2val)); atomicAdd(data1[n][c][h1+h1p][w1+w1p].data(), - h1lambda * w1lambda * d2val); + ScalarConvert<Acctype, Dtype>::to(h1lambda * w1lambda * d2val)); } } } } -void THNN_CudaSpatialUpSamplingBilinear_updateGradInput( - THCState *state, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - int nbatch, - int nchannels, - int inputHeight, - int inputWidth, - int outputHeight, - int outputWidth) { - gradInput = THCudaTensor_newContiguous(state, gradInput); - gradOutput = THCudaTensor_newContiguous(state, gradOutput); - THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); - THCudaTensor_zero(state, gradInput); - THCDeviceTensor<float, 4> data1 = toDeviceTensor<float, 4>(state, gradInput); - THCDeviceTensor<float, 4> data2 = toDeviceTensor<float, 4>(state, gradOutput); - int height1 = data1.getSize(2); - int width1 = data1.getSize(3); - int height2 = data2.getSize(2); - int width2 = data2.getSize(3); - assert(height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0); - const float rheight= (height2 > 1) ? (float)(height1 - 1)/(height2 - 1) : 0.f; - const float rwidth = (width2 > 1) ? 
(float)(width1 - 1) / (width2 - 1) : 0.f; - const int num_kernels = height2 * width2; - const int num_threads = - THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock; - cudaStream_t stream = THCState_getCurrentStream(state); - caffe_gpu_interp2_kernel_backward<<<THCCeilDiv(num_kernels, num_threads), - num_threads, 0, stream>>>(num_kernels, rheight, rwidth, data1, data2); - THCudaCheck(cudaGetLastError()); - THCudaTensor_free(state, gradInput); - THCudaTensor_free(state, gradOutput); -} +#include "generic/SpatialUpSamplingBilinear.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/SpatialUpSamplingNearest.cu b/lib/THCUNN/SpatialUpSamplingNearest.cu index c188d3a..5753c6e 100644 --- a/lib/THCUNN/SpatialUpSamplingNearest.cu +++ b/lib/THCUNN/SpatialUpSamplingNearest.cu @@ -6,6 +6,8 @@ #include <thrust/transform_reduce.h> #include <thrust/functional.h> +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" /* * Description: @@ -46,7 +48,8 @@ __device__ int translate_idx_inv(int ii, int d1, int d2, int d3, int scale_facto } -__global__ void upscale(float *input, float *output, long no_elements, +template <typename Dtype> +__global__ void upscale(Dtype *input, Dtype *output, long no_elements, int scale_factor, int d1, int d2, int d3) { // output offset: @@ -57,121 +60,26 @@ __global__ void upscale(float *input, float *output, long no_elements, output[ii]=input[ipidx]; } - -void THNN_CudaSpatialUpSamplingNearest_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, int scale_factor) -{ - THCudaTensor_zero(state, output); - - THCUNN_assertSameGPU(state, 2, input, output); - - input = THCudaTensor_newContiguous(state, input); - // This is for allocating output Tensor - long no_elements = 1; - for(int i = 0; i < input->nDimension; i++){ - no_elements *= input->size[i]; - } - no_elements *= scale_factor * scale_factor; - - int d1; - int d2; - int d3; - - if (input->nDimension == 3) { - d1 = output->size[0]; - d2 = output->size[1]; 
- d3 = output->size[2]; - } else { - d1 = output->size[1]; - d2 = output->size[2]; - d3 = output->size[3]; - } - - float *input_data = THCudaTensor_data(state, input); - float *output_data = THCudaTensor_data(state, output); - - // cuda blocks & threads: - long nthreads = 256; - // Max number of blocks: http://en.wikipedia.org/wiki/CUDA - // 65535 for SM 2.x, 2^32 -1 for >= 3.0 - // TODO: When we move to SM 3.5 we should update this - long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535); - long n_yblocks = (long)ceil((float)no_elements / (float)(n_xblocks * nthreads)); - if (n_yblocks > 65535) { - THError("Input size is too large! aborting"); - } - dim3 blocks(n_xblocks, n_yblocks); - dim3 threads(nthreads); - - // kernel: - upscale<<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (input_data, output_data, no_elements, scale_factor, d1, d2, d3); - THCudaCheck(cudaGetLastError()); - - // final cut: - THCudaTensor_free(state, input); -} - /* * Description: */ -__global__ void downscale(float *gradInput_data, float *gradOutput_data, long no_elements, +template <typename Dtype, typename Acctype> +__global__ void downscale(Dtype *gradInput_data, Dtype *gradOutput_data, long no_elements, int scale_factor, int d1, int d2, int d3) { // output offset: long ii = threadIdx.x + blockDim.x * blockIdx.x; ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y; if (ii >= no_elements) return; + Acctype sum = Acctype(0); for (int i=0; i < scale_factor; i++){ for(int j=0; j < scale_factor; j++){ int ipidx = translate_idx_inv(ii, d1, d2, d3, scale_factor, i, j); - gradInput_data[ii] += gradOutput_data[ipidx]; + sum += gradOutput_data[ipidx]; } } + gradInput_data[ii] += ScalarConvert<Acctype, Dtype>::to(sum); } - -void THNN_CudaSpatialUpSamplingNearest_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, int scale_factor) -{ - THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); - - 
THCudaTensor_zero(state, gradInput); - - float *gradInput_data = THCudaTensor_data(state, gradInput); - float *gradOutput_data = THCudaTensor_data(state, gradOutput); - - long no_elements = 1; - for(int i = 0; i < gradInput->nDimension; i++){ - no_elements *= gradInput->size[i]; - } - - int d1; - int d2; - int d3; - - if (gradInput->nDimension == 3) { - d1 = gradInput->size[0]; - d2 = gradInput->size[1]; - d3 = gradInput->size[2]; - } else { - d1 = gradInput->size[1]; - d2 = gradInput->size[2]; - d3 = gradInput->size[3]; - } - - // cuda blocks & threads: - long nthreads = 256; - // Max number of blocks: http://en.wikipedia.org/wiki/CUDA - // 65535 for SM 2.x, 2^32 -1 for >= 3.0 - // TODO: When we move to SM 3.5 we should update this - long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535); - long n_yblocks = (long)ceil((float)no_elements / (float)(n_xblocks * nthreads)); - if (n_yblocks > 65535) { - THError("Input size is too large! aborting"); - } - dim3 blocks(n_xblocks, n_yblocks); - dim3 threads(nthreads); - - // kernel: - downscale<<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data, no_elements, - scale_factor, d1, d2, d3); - THCudaCheck(cudaGetLastError()); -} +#include "generic/SpatialUpSamplingNearest.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/Sqrt.cu b/lib/THCUNN/Sqrt.cu index e1a4fe3..9d0dddc 100644 --- a/lib/THCUNN/Sqrt.cu +++ b/lib/THCUNN/Sqrt.cu @@ -1,40 +1,32 @@ #include "THCUNN.h" -#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +template <typename T> struct sqrtupdateOutput_functor { - const float bias; + const T bias; - sqrtupdateOutput_functor(float bias_) + sqrtupdateOutput_functor(T bias_) : bias(bias_) {} - __device__ void operator()(float *output, const float *input) const + __device__ void operator()(T *output, const T *input) const { *output = sqrt(*input + bias); } }; -void THNN_CudaSqrt_updateOutput(THCState *state, 
THCudaTensor *input, THCudaTensor *output, float eps) -{ - THCUNN_assertSameGPU(state, 2, input, output); - THCudaTensor_resizeAs(state, output, input); - THC_pointwiseApply2(state, output, input, sqrtupdateOutput_functor(eps)); -} - +template <typename T> struct sqrtupdateGradInput_functor { sqrtupdateGradInput_functor() {} - __device__ void operator()(float *gradInput, const float *output, const float *gradOutput) const + __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const { - *gradInput = (*output == 0.0f) ? 0.0f : ((0.5f * *gradOutput) / *output); + *gradInput = (THCNumerics<T>::eq(*output,ScalarConvert<float, T>::to(0.0f))) ? ScalarConvert<float, T>::to(0.0f) : ((ScalarConvert<float, T>::to(0.5f) * *gradOutput) / *output); } }; -void THNN_CudaSqrt_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *output) -{ - THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput); - THCudaTensor_resizeAs(state, gradInput, output); - THC_pointwiseApply3(state, gradInput, output, gradOutput, sqrtupdateGradInput_functor()); -} +#include "generic/Sqrt.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/Square.cu b/lib/THCUNN/Square.cu index a6d147c..4acf67f 100644 --- a/lib/THCUNN/Square.cu +++ b/lib/THCUNN/Square.cu @@ -1,32 +1,24 @@ #include "THCUNN.h" -#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +template <typename T> struct squareupdateOutput_functor { - __device__ void operator()(float* output, const float* input) const + __device__ void operator()(T* output, const T* input) const { *output = (*input) * (*input); } }; -void THNN_CudaSquare_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output) -{ - THCUNN_assertSameGPU(state, 2, input, output); - THCudaTensor_resizeAs(state, output, input); - THC_pointwiseApply2(state, output, input, squareupdateOutput_functor()); -} - +template <typename T> struct 
squareupdateGradInput_functor { - __device__ void operator()(float* gradInput, const float* input, const float* gradOutput) const + __device__ void operator()(T* gradInput, const T* input, const T* gradOutput) const { - *gradInput = 2.0 * (*gradOutput) * (*input); + *gradInput = ScalarConvert<double, T>::to(2.0) * (*gradOutput) * (*input); } }; -void THNN_CudaSquare_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput) -{ - THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); - THCudaTensor_resizeAs(state, gradInput, input); - THC_pointwiseApply3(state, gradInput, input, gradOutput, squareupdateGradInput_functor()); -} +#include "generic/Square.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/THCHalfAutoNumerics.cuh b/lib/THCUNN/THCHalfAutoNumerics.cuh new file mode 100644 index 0000000..89a9602 --- /dev/null +++ b/lib/THCUNN/THCHalfAutoNumerics.cuh @@ -0,0 +1,248 @@ +#ifndef THC_HALF_AUTO_NUMERICS_INC +#define THC_HALF_AUTO_NUMERICS_INC + +#include "THCHalf.h" +#include "THCNumerics.cuh" + +// Half numerics functions defined as free functions, so cunn code can be +//written generically, i.e. without excessive calling of THCNumerics<half> functions. + +#ifdef CUDA_HALF_TENSOR + +// these functions should move to THCNumerics + +inline __host__ __device__ half fmaxType(half x, half y) { + return THCNumerics<half>::ge(x, y) ? 
x : y; +} + +inline __host__ __device__ float fmaxType(float x, half y) { + return fmaxf(x, ScalarConvert<half, float>::to(y)); +} + +inline __host__ __device__ float fmaxType(float x, float y) { + return fmaxf(x, y); +} + +inline __host__ __device__ double fmaxType(double x, double y) { + return fmax(x, y); +} + +inline __host__ __device__ half mul(half a, half b) { + #ifdef __CUDA_ARCH__ + #ifdef CUDA_HALF_INSTRUCTIONS + return __hmul(a, b); + #else + float fa = __half2float(a); + float fb = __half2float(b); + return __float2half( fa * fb ); + #endif + #else // __CUDA_ARCH__ + return THC_float2half(THC_half2float(a) * THC_half2float(b)); + #endif +} + +inline __host__ __device__ half div(half a, half b) { + #ifdef __CUDA_ARCH__ + #ifdef CUDA_HALF_INSTRUCTIONS + return __hdiv(a, b); + #else + float fa = __half2float(a); + float fb = __half2float(b); + return __float2half( fa / fb ); + #endif + #else // __CUDA_ARCH__ + return THC_float2half(THC_half2float(a) / THC_half2float(b)); + #endif +} + +// arithmetic functions + +inline __host__ __device__ half operator+(half a, half b) { + return THCNumerics<half>::add(a, b); +} + +inline __host__ __device__ float operator+(half a, float b) { + return ScalarConvert<half, float>::to(a) + b; +} + +inline __host__ __device__ float operator+(float a, half b) { + return a + ScalarConvert<half, float>::to(b); +} + +inline __host__ __device__ double operator+(double a, half b) { + return a + ScalarConvert<half, double>::to(b); +} + +inline __host__ __device__ half operator-(half a) { + return THCNumerics<half>::neg(a); +} + +inline __host__ __device__ half operator-(half a, half b) { + return THCNumerics<half>::add(a, THCNumerics<half>::neg(b)); +} + +inline __host__ __device__ half operator-(half a, int b) { + return THCNumerics<half>::add(a, THCNumerics<half>::neg(ScalarConvert<int, half>::to(b))); +} + +inline __host__ __device__ float operator-(half a, float b) { + return ScalarConvert<half, float>::to(a) - b; +} + +inline 
__host__ __device__ double operator-(half a, double b) { + return ScalarConvert<half, double>::to(a) - b; +} + +inline __host__ __device__ half operator-(int a, half b) { + return THCNumerics<half>::add(ScalarConvert<int, half>::to(a), THCNumerics<half>::neg(b)); +} + +inline __host__ __device__ float operator-(float a, half b) { + return a - ScalarConvert<half, float>::to(b); +} + +inline __host__ __device__ double operator-(double a, half b) { + return a - ScalarConvert<half, double>::to(b); +} + +inline __host__ __device__ half operator*(half a, half b) { + return mul(a, b); +} + +inline __host__ __device__ float operator*(half a, float b) { + return ScalarConvert<half, float>::to(a) * b; +} + +inline __host__ __device__ double operator*(half a, double b) { + return ScalarConvert<half, double>::to(a) * b; +} + +inline __host__ __device__ half operator*(half a, int b) { + return a * ScalarConvert<int, half>::to(b); +} + +inline __host__ __device__ float operator*(float a, half b) { + return a * ScalarConvert<half, float>::to(b); +} + +inline __host__ __device__ double operator*(double a, half b) { + return a * ScalarConvert<half, double>::to(b); +} + +inline __host__ __device__ half operator/(half a, half b) { + return div(a, b); +} + +inline __host__ __device__ float operator/(float a, half b) { + return a / ScalarConvert<half, float>::to(b); +} + +inline __host__ __device__ double operator/(double a, half b) { + return a / ScalarConvert<half, double>::to(b); +} + +inline __host__ __device__ half operator/(int a, half b) { + return ScalarConvert<int, half>::to(a) / b; +} + +inline __host__ __device__ float operator/(half a, float b) { + return ScalarConvert<half, float>::to(a) / b; +} + +inline __host__ __device__ double operator/(half a, double b) { + return ScalarConvert<half, double>::to(a) / b; +} + +inline __host__ __device__ half operator/(half a, int b) { + return a / ScalarConvert<int, half>::to(b); +} + +inline __host__ __device__ half& operator+=(half 
&lhs, const half &rhs) { + lhs = lhs + rhs; + return lhs; +} +inline __host__ __device__ float& operator+=(float &lhs, const half &rhs) { + lhs = lhs + rhs; + return lhs; +} + +inline __host__ __device__ float& operator-=(float &lhs, const half &rhs) { + lhs = lhs - rhs; + return lhs; +} + +inline __host__ __device__ half& operator*=(half &lhs, const half &rhs) { + lhs = lhs * rhs; + return lhs; +} + +inline __host__ __device__ half& operator/=(half &lhs, const int &rhs) { + lhs = lhs / rhs; + return lhs; +} + +inline __host__ __device__ half& operator/=(half &lhs, const half &rhs) { + lhs = lhs / rhs; + return lhs; +} + +inline __host__ __device__ half abs(half a) { + return THCNumerics<half>::abs(a); +} + +inline __host__ __device__ half exp(half a) { + return THCNumerics<half>::exp(a); +} + +inline __host__ __device__ half log1p(half a) { + return THCNumerics<half>::log1p(a); +} + +inline __host__ __device__ half pow(half a, half b) { + return THCNumerics<half>::pow(a, b); +} + +inline __host__ __device__ half sqrt(half a) { + return THCNumerics<half>::sqrt(a); +} + +inline __host__ __device__ half tanh(half a) { + return THCNumerics<half>::tanh(a); +} + +// comparison functions + +inline __host__ __device__ bool operator<(half a, half b) { + return THCNumerics<half>::lt(a, b); +} + +inline __host__ __device__ bool operator<=(half a, half b) { + return THCNumerics<half>::le(a, b); +} + +inline __host__ __device__ bool operator<=(half a, int b) { + return THCNumerics<half>::le(a, ScalarConvert<int, half>::to(b)); +} + +inline __host__ __device__ bool operator<(half a, int b) { + return THCNumerics<half>::lt(a, ScalarConvert<int, half>::to(b)); +} + +inline __host__ __device__ bool operator>(half a, half b) { + return THCNumerics<half>::gt(a, b); +} + +inline __host__ __device__ bool operator>(half a, int b) { + return THCNumerics<half>::gt(a, ScalarConvert<int, half>::to(b)); +} + +inline __host__ __device__ bool operator>=(half a, half b) { + return 
THCNumerics<half>::ge(a, b); +} + +inline __host__ __device__ bool operator>=(half a, int b) { + return THCNumerics<half>::ge(a, ScalarConvert<int ,half>::to(b)); +} + +#endif +#endif diff --git a/lib/THCUNN/THCUNN.h b/lib/THCUNN/THCUNN.h index 9aeef2d..4e95b06 100644 --- a/lib/THCUNN/THCUNN.h +++ b/lib/THCUNN/THCUNN.h @@ -1,1090 +1,11 @@ #include <THC/THC.h> #include <THC/THCApply.cuh> -#define THIndexTensor THCudaLongTensor -#define THIndexTensor_(NAME) THCudaLongTensor_ ## NAME +#define THCIndexTensor THCudaLongTensor +#define THCIndexTensor_(NAME) THCudaLongTensor_ ## NAME +typedef long THCIndex_t; -TH_API void THNN_CudaAbs_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output); -TH_API void THNN_CudaAbs_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput); +#define THNN_(NAME) TH_CONCAT_3(THNN_, CReal, NAME) -TH_API void THNN_CudaAbsCriterion_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor *output, - bool sizeAverage); -TH_API void THNN_CudaAbsCriterion_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor *gradInput, - bool sizeAverage); - -TH_API void THNN_CudaBCECriterion_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor *output, - bool sizeAverage, - THCudaTensor *weights); // [OPTIONAL] -TH_API void THNN_CudaBCECriterion_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor *gradInput, - bool sizeAverage, - THCudaTensor *weights); // [OPTIONAL] - -TH_API void THNN_CudaClassNLLCriterion_updateOutput( - THCState *state, - THCudaTensor *input, - THIndexTensor *target, - THCudaTensor *output, - bool sizeAverage, - THCudaTensor *weights, // [OPTIONAL] - THCudaTensor *total_weight); -TH_API void THNN_CudaClassNLLCriterion_updateGradInput( - THCState *state, - THCudaTensor *input, - THIndexTensor 
*target, - THCudaTensor *gradInput, - bool sizeAverage, - THCudaTensor *weights, // [OPTIONAL] - THCudaTensor *total_weight); - -TH_API void THNN_CudaSpatialClassNLLCriterion_updateOutput( - THCState *state, - THCudaTensor *input, - THIndexTensor *target, - THCudaTensor *output, - bool sizeAverage, - THCudaTensor *weights, // [OPTIONAL] - THCudaTensor *total_weight); -TH_API void THNN_CudaSpatialClassNLLCriterion_updateGradInput( - THCState *state, - THCudaTensor *input, - THIndexTensor *target, - THCudaTensor *gradInput, - bool sizeAverage, - THCudaTensor *weights, // [OPTIONAL] - THCudaTensor *total_weight); - -TH_API void THNN_CudaDistKLDivCriterion_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor *output, - bool sizeAverage); -TH_API void THNN_CudaDistKLDivCriterion_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor *gradInput, - bool sizeAverage); - -TH_API void THNN_CudaELU_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - float alpha, - bool inplace); -TH_API void THNN_CudaELU_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *output, - float alpha, - bool inplace); - -TH_API void THNN_CudaHardTanh_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - float min_val, - float max_val, - bool inplace); -TH_API void THNN_CudaHardTanh_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - float min_val, - float max_val, - bool inplace); - -TH_API void THNN_CudaL1Cost_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output); -TH_API void THNN_CudaL1Cost_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, // [OPTIONAL] - THCudaTensor *gradInput); - -TH_API void THNN_CudaLeakyReLU_updateOutput( - THCState *state, - 
THCudaTensor *input, - THCudaTensor *output, - double negval, bool inplace); -TH_API void THNN_CudaLeakyReLU_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - double negval, - bool inplace); - -TH_API void THNN_CudaLogSigmoid_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *buffer); -TH_API void THNN_CudaLogSigmoid_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *buffer); - -TH_API void THNN_CudaLogSoftMax_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output); -TH_API void THNN_CudaLogSoftMax_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *output); - -TH_API void THNN_CudaLookupTable_accGradParameters( - THCState *state, - THIndexTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradWeight, - THIndexTensor *count, - THIndexTensor *sorted, // [OPTIONAL] - THIndexTensor *indices, // [OPTIONAL] - bool scaleGradByFreq, - int paddingValue, - float scale); - -TH_API void THNN_CudaLookupTable_renorm( - THCState *state, - THIndexTensor *idx, - THCudaTensor *weight, - float maxNorm, - float normType); - -TH_API void THNN_CudaMarginCriterion_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor *output, - bool sizeAverage, - float margin); -TH_API void THNN_CudaMarginCriterion_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor *gradInput, - bool sizeAverage, - float margin); - -TH_API void THNN_CudaSoftMarginCriterion_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor *output, - int sizeAverage); - -TH_API void THNN_CudaSoftMarginCriterion_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor 
*gradInput, - int sizeAverage); - -TH_API void THNN_CudaMSECriterion_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor *output, - bool sizeAverage); -TH_API void THNN_CudaMSECriterion_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor *gradInput, - bool sizeAverage); - -TH_API void THNN_CudaMultiMarginCriterion_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor *output, - bool sizeAverage, - int p, - THCudaTensor *weights, // [OPTIONAL] - float margin); -TH_API void THNN_CudaMultiMarginCriterion_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor *gradInput, - bool sizeAverage, - int p, - THCudaTensor *weights, // [OPTIONAL] - float margin); - -TH_API void THNN_CudaMultiLabelMarginCriterion_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor *output, - THCudaTensor *istarget, - bool sizeAverage); -TH_API void THNN_CudaMultiLabelMarginCriterion_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor *gradInput, - THCudaTensor *istarget, - bool sizeAverage); - -TH_API void THNN_CudaPReLU_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, - long nOutputPlane); -TH_API void THNN_CudaPReLU_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *weight, - long nOutputPlane); -TH_API void THNN_CudaPReLU_accGradParameters( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *weight, - THCudaTensor *gradWeight, - THCudaTensor *gradWeightBuf, - THCudaTensor *gradWeightBuf2, - long nOutputPlane, - float scale); - -TH_API void THNN_CudaRReLU_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - 
THCudaTensor *noise, - double lower, - double upper, - bool train, - bool inplace, - void *generator); -TH_API void THNN_CudaRReLU_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *noise, - double lower, - double upper, - bool train, - bool inplace); - -TH_API void THNN_CudaSigmoid_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output); -TH_API void THNN_CudaSigmoid_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *output); - -TH_API void THNN_CudaSmoothL1Criterion_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor *output, - bool sizeAverage); -TH_API void THNN_CudaSmoothL1Criterion_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *target, - THCudaTensor *gradInput, - bool sizeAverage); - -TH_API void THNN_CudaSoftMax_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output); -TH_API void THNN_CudaSoftMax_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *output); - -TH_API void THNN_CudaSoftPlus_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - float beta, - float threshold); -TH_API void THNN_CudaSoftPlus_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *output, - float beta, - float threshold); - -TH_API void THNN_CudaSoftShrink_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - double lambda); -TH_API void THNN_CudaSoftShrink_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - double lambda); - -TH_API void THNN_CudaSqrt_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - float 
eps); -TH_API void THNN_CudaSqrt_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *output); - -TH_API void THNN_CudaSquare_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output); -TH_API void THNN_CudaSquare_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput); - -TH_API void THNN_CudaTanh_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output); -TH_API void THNN_CudaTanh_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *output); - -TH_API void THNN_CudaThreshold_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - double threshold, - double val, - bool inplace); -TH_API void THNN_CudaThreshold_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - double threshold, - double val, - bool inplace); - -TH_API void THNN_CudaTemporalConvolution_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, - THCudaTensor *bias, - int kW, int dW, - int inputFrameSize, - int outputFrameSize); - -TH_API void THNN_CudaTemporalConvolution_updateGradInput( - THCState* state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *weight, - int kW, int dW); - -TH_API void THNN_CudaTemporalConvolution_accGradParameters( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, - int kW, int dW, - float scale); - -TH_API void THNN_CudaTemporalMaxPooling_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *indices, - int kW, int dW); - -TH_API void THNN_CudaTemporalMaxPooling_updateGradInput( - THCState *state, - THCudaTensor *input, - 
THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *indices, - int kW, int dW); - -TH_API void THNN_CudaSparseLinear_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, - THCudaTensor *bias); -TH_API void THNN_CudaSparseLinear_accGradParameters( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, - THCudaTensor *weight, - THCudaTensor *bias, - double weightDecay, - double scale); -TH_API void THNN_CudaSparseLinear_legacyUpdateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, - THCudaTensor *bias); -TH_API void THNN_CudaSparseLinear_legacyAccGradParameters( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, - THCudaTensor *weight, - THCudaTensor *bias, - double weightDecay, - double scale); -TH_API void THNN_CudaSparseLinear_zeroGradParameters( - THCState *state, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, - THCudaTensor *lastInput); -TH_API void THNN_CudaSparseLinear_updateParameters( - THCState *state, - THCudaTensor *weight, - THCudaTensor *bias, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, - THCudaTensor *lastInput, - double learningRate); - -TH_API void THNN_CudaBatchNormalization_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, // [OPTIONAL] - THCudaTensor *bias, // [OPTIONAL] - THCudaTensor *runningMean, - THCudaTensor *runningVar, - THCudaTensor *saveMean, - THCudaTensor *saveStd, - bool train, - double momentum, - double eps); -TH_API void THNN_CudaBatchNormalization_backward( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, // [OPTIONAL] - THCudaTensor *gradWeight, // [OPTIONAL] - THCudaTensor *gradBias, // [OPTIONAL] - THCudaTensor *weight, // [OPTIONAL] - THCudaTensor 
*running_mean, - THCudaTensor *running_var, - THCudaTensor *save_mean, - THCudaTensor *save_std, - bool train, - float scale, - double eps); - -TH_API void THNN_CudaSpatialConvolutionMM_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, - THCudaTensor *bias, // [OPTIONAL] - THCudaTensor *columns, - THCudaTensor *ones, - int kW, int kH, - int dW, int dH, - int padW, int padH); -TH_API void THNN_CudaSpatialConvolutionMM_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *weight, - THCudaTensor *columns, - THCudaTensor *ones, - int kW, int kH, - int dW, int dH, - int padW, int padH); -TH_API void THNN_CudaSpatialConvolutionMM_accGradParameters( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, // [OPTIONAL] - THCudaTensor *columns, - THCudaTensor *ones, - int kW, int kH, - int dW, int dH, - int padW, int padH, - float scale); - -TH_API void THNN_CudaSpatialConvolutionLocal_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, - THCudaTensor *bias, - THCudaTensor *finput, - THCudaTensor *fgradInput, - int kW, int kH, - int dW, int dH, - int padW, int padH, - long inputWidth, long inputHeight, - long outputWidth, long outputHeight); -TH_API void THNN_CudaSpatialConvolutionLocal_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *weight, - THCudaTensor *finput, - THCudaTensor *fgradInput, - int kW, int kH, - int dW, int dH, - int padW, int padH, - long inputWidth, long inputHeight, - long outputWidth, long outputHeight); -TH_API void THNN_CudaSpatialConvolutionLocal_accGradParameters( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, - THCudaTensor *finput, - THCudaTensor 
*fgradInput, - int kW, int kH, - int dW, int dH, - int padW, int padH, - long inputWidth, long inputHeight, - long outputWidth, long outputHeight, - float scale); - -TH_API void THNN_CudaSpatialFullConvolution_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, - THCudaTensor *bias, // [OPTIONAL] - THCudaTensor *columns, - THCudaTensor *ones, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int adjW, int adjH); -TH_API void THNN_CudaSpatialFullConvolution_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *weight, - THCudaTensor *gradColumns, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int adjW, int adjH); -TH_API void THNN_CudaSpatialFullConvolution_accGradParameters( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, // [OPTIONAL] - THCudaTensor *columns, - THCudaTensor *ones, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int adjW, int adjH, - float scale); - -TH_API void THNN_CudaSpatialDilatedConvolution_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, - THCudaTensor *bias, // [OPTIONAL] - THCudaTensor *columns, - THCudaTensor *ones, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int dilationW, int dilationH); - -TH_API void THNN_CudaSpatialDilatedConvolution_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *weight, - THCudaTensor *gradColumns, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int dilationW, int dilationH ); - -TH_API void THNN_CudaSpatialDilatedConvolution_accGradParameters( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, // [OPTIONAL] - THCudaTensor *columns, - THCudaTensor 
*ones, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int dilationW, int dilationH, - float scale); - -TH_API void THNN_CudaSpatialCrossMapLRN_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *scale, - int size, - float alpha, - float beta, - float k); -TH_API void THNN_CudaSpatialCrossMapLRN_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *scale, - THCudaTensor *output, - int size, - float alpha, - float beta, - float k); - -TH_API void THNN_CudaSpatialAdaptiveMaxPooling_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *indices, - int nOutputCols, - int nOutputRows); -TH_API void THNN_CudaSpatialAdaptiveMaxPooling_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *indices); - -TH_API void THNN_CudaSpatialAveragePooling_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - int kW, int kH, - int dW, int dH, - int padW, int padH, - bool ceil_mode, - bool count_include_pad); -TH_API void THNN_CudaSpatialAveragePooling_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - int kW, int kH, - int dW, int dH, - int padW, int padH, - bool ceil_mode, - bool count_include_pad); - -TH_API void THNN_CudaSpatialMaxPooling_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - bool ceil_mode); -TH_API void THNN_CudaSpatialMaxPooling_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - bool ceil_mode); - -TH_API void THNN_CudaSpatialDilatedMaxPooling_updateOutput( - THCState 
*state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int dilationW, int dilationH, - bool ceil_mode); -TH_API void THNN_CudaSpatialDilatedMaxPooling_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *indices, - int kW, int kH, - int dW, int dH, - int padW, int padH, - int dilationW, int dilationH, - bool ceil_mode); - -TH_API void THNN_CudaSpatialMaxUnpooling_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *indices, - int owidth, int oheight); -TH_API void THNN_CudaSpatialMaxUnpooling_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *indices, - int owidth, int oheight); - -TH_API void THNN_CudaSpatialFractionalMaxPooling_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - int outputW, int outputH, - int poolSizeW, int poolSizeH, - THCudaTensor *indices, - THCudaTensor *randomSamples); -TH_API void THNN_CudaSpatialFractionalMaxPooling_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - int outputW, int outputH, - int poolSizeW, int poolSizeH, - THCudaTensor *indices); - -TH_API void THNN_CudaSpatialSubSampling_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, - THCudaTensor *bias, - int kW, int kH, - int dW, int dH); -TH_API void THNN_CudaSpatialSubSampling_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *weight, - int kW, int kH, - int dW, int dH); -TH_API void THNN_CudaSpatialSubSampling_accGradParameters( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, - int kW, int kH, - int 
dW, int dH, - float scale); - -TH_API void THNN_CudaSpatialUpSamplingNearest_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - int scale_factor); -TH_API void THNN_CudaSpatialUpSamplingNearest_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - int scale_factor); - -TH_API void THNN_CudaSpatialUpSamplingBilinear_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - int outputHeight, - int outputWidth); -TH_API void THNN_CudaSpatialUpSamplingBilinear_updateGradInput( - THCState *state, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - int nbatch, - int nchannels, - int inputHeight, - int inputWidth, - int outputHeight, - int outputWidth); - -TH_API void THNN_CudaVolumetricAveragePooling_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - int kT, int kW, int kH, - int dT, int dW, int dH); -TH_API void THNN_CudaVolumetricAveragePooling_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - int kT, int kW, int kH, - int dT, int dW, int dH); - -TH_API void THNN_CudaVolumetricConvolution_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, - THCudaTensor *bias, - THCudaTensor *finput, - THCudaTensor *fgradInput, - int dT, int dW, int dH, - int padT, int padW, int padH); -TH_API void THNN_CudaVolumetricConvolution_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *weight, - THCudaTensor *finput, - int dT, int dW, int dH, - int padT, int padW, int padH); -TH_API void THNN_CudaVolumetricConvolution_accGradParameters( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, - THCudaTensor *finput, - THCudaTensor *fgradInput, - int dT, int dW, int dH, - int padT, 
int padW, int padH, - float scale); - -TH_API void THNN_CudaVolumetricFullConvolution_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, - THCudaTensor *bias, - THCudaTensor *finput, - THCudaTensor *fgradInput, - int dT, int dW, int dH, - int padT, int padW, int padH, - int adjT, int adjW, int adjH); -TH_API void THNN_CudaVolumetricFullConvolution_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *weight, - THCudaTensor *finput, - THCudaTensor *fgradInput, - int dT, int dW, int dH, - int padT, int padW, int padH, - int adjT, int adjW, int adjH); -TH_API void THNN_CudaVolumetricFullConvolution_accGradParameters( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, - THCudaTensor *finput, - THCudaTensor *fgradInput, - int dT, int dW, int dH, - int padT, int padW, int padH, - int adjT, int adjW, int adjH, - float scale); - -TH_API void THNN_CudaVolumetricDilatedConvolution_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, - THCudaTensor *bias, - THCudaTensor *columns, - THCudaTensor *ones, - int kT, int kW, int kH, - int dT, int dW, int dH, - int padT, int padW, int padH, - int dilationT, int dilationW, int dilationH); - -TH_API void THNN_CudaVolumetricDilatedConvolution_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *weight, - THCudaTensor *gradColumns, - int kT, int kW, int kH, - int dT, int dW, int dH, - int padT, int padW, int padH, - int dilationT, int dilationW, int dilationH); - -TH_API void THNN_CudaVolumetricDilatedConvolution_accGradParameters( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, - THCudaTensor *columns, - THCudaTensor *ones, - int kT, int 
kW, int kH, - int dT, int dW, int dH, - int padT, int padW, int padH, - int dilationT, int dilationW, int dilationH, - float scale); - -TH_API void THNN_CudaVolumetricMaxPooling_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *indices, - int kT, int kW, int kH, - int dT, int dW, int dH, - int padT, int padW, int padH, - bool ceilMode); -TH_API void THNN_CudaVolumetricMaxPooling_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *indices, - int dT, int dW, int dH, - int padT, int padW, int padH); - -TH_API void THNN_CudaVolumetricDilatedMaxPooling_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *indices, - int kT, int kW, int kH, - int dT, int dW, int dH, - int padT, int padW, int padH, - int dilationT, int dilationW, int dilationH, - bool ceilMode); -TH_API void THNN_CudaVolumetricDilatedMaxPooling_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *indices, - int dT, int dW, int dH, - int padT, int padW, int padH, - int dilationT, int dilationW, int dilationH); - -TH_API void THNN_CudaVolumetricMaxUnpooling_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *indices, - int outputTime, int outputWidth, int outputHeight, - int dT, int dW, int dH, - int padT, int padW, int padH); -TH_API void THNN_CudaVolumetricMaxUnpooling_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *indices, - int outputTime, int outputWidth, int outputHeight, - int dT, int dW, int dH, - int padT, int padW, int padH); - -TH_API void THNN_CudaSpatialReflectionPadding_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - int padL, int padR, - int padT, int padB); -TH_API void 
THNN_CudaSpatialReflectionPadding_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - int padL, int padR, - int padT, int padB); - -TH_API void THNN_CudaSpatialReplicationPadding_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - int padL, int padR, - int padT, int padB); -TH_API void THNN_CudaSpatialReplicationPadding_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - int padL, int padR, - int padT, int padB); - -TH_API void THNN_CudaVolumetricReplicationPadding_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - int pleft, int pright, - int ptop, int pbottom, - int pfront, int pback); -TH_API void THNN_CudaVolumetricReplicationPadding_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - int pleft, int pright, - int ptop, int pbottom, - int pfront, int pback); +#include "generic/THCUNN.h" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/Tanh.cu b/lib/THCUNN/Tanh.cu index 726169a..0bd1ee8 100644 --- a/lib/THCUNN/Tanh.cu +++ b/lib/THCUNN/Tanh.cu @@ -1,32 +1,24 @@ #include "THCUNN.h" -#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +template <typename T> struct tanhupdateOutput_functor { - __device__ void operator()(float *output, const float *input) const + __device__ void operator()(T *output, const T *input) const { *output = tanh(*input); } }; -void THNN_CudaTanh_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output) -{ - THCUNN_assertSameGPU(state, 2, input, output); - THCudaTensor_resizeAs(state, output, input); - THC_pointwiseApply2(state, output, input, tanhupdateOutput_functor()); -} - +template <typename T> struct tanhupdateGradInput_functor { - __device__ void operator()(float *gradInput, const float *output, const float *gradOutput) const + 
__device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const { *gradInput = *gradOutput * (1 - *output * *output); } }; -void THNN_CudaTanh_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *output) -{ - THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput); - THCudaTensor_resizeAs(state, gradInput, output); - THC_pointwiseApply3(state, gradInput, output, gradOutput, tanhupdateGradInput_functor()); -} +#include "generic/Tanh.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/TemporalConvolution.cu b/lib/THCUNN/TemporalConvolution.cu index 078fd7b..f4e9c69 100644 --- a/lib/THCUNN/TemporalConvolution.cu +++ b/lib/THCUNN/TemporalConvolution.cu @@ -1,345 +1,7 @@ #include "THCUNN.h" #include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" -void THNN_CudaTemporalConvolution_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, - THCudaTensor *bias, - int kW, int dW, - int inputFrameSize, - int outputFrameSize) { - - THCudaTensor *outputWindow, *inputWindow; - int nInputFrame, nOutputFrame; - long k, i; - - int dimS = 0; // sequence dimension - int dimF = 1; // feature dimension - - THCUNN_assertSameGPU(state, 4, input, output, weight, bias); - THArgCheck( input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected"); - - if (input->nDimension == 3) - { - dimS = 1; - dimF = 2; - } - THArgCheck( input->size[dimF] == inputFrameSize, 2, "invalid input frame size"); - THArgCheck( input->size[dimS] >= kW, 2, "input sequence smaller than kernel size"); - - input = THCudaTensor_newContiguous(state, input); - outputWindow = THCudaTensor_new(state); - inputWindow = THCudaTensor_new(state); - - nInputFrame = input->size[dimS]; - nOutputFrame = (nInputFrame - kW) / dW + 1; - - if (input->nDimension == 2) - { - THCudaTensor_resize2d(state, output, - nOutputFrame, - 
outputFrameSize); - - /* bias first */ - for(k = 0; k < nOutputFrame; k++) - { - THCudaTensor_select(state, outputWindow, output, 0, k); - THCudaTensor_copy(state, outputWindow, bias); - } - - - /* ouch */ - for(k = 0; nOutputFrame > 0; k++) - { - long outputFrameStride = (kW-1)/dW+1; - long inputFrameStride = outputFrameStride*dW; - long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; - nOutputFrame -= nFrame; - - THCudaTensor_setStorage2d(state, inputWindow, input->storage, - input->storageOffset+k*dW*input->size[1], - nFrame, inputFrameStride*input->size[1], - kW*input->size[1], 1); - - THCudaTensor_setStorage2d(state, outputWindow, output->storage, - output->storageOffset + k*output->size[1], - nFrame, outputFrameStride*output->size[1], - output->size[1], 1); - - THCudaTensor_transpose(state, weight, NULL, 0, 1); - THCudaTensor_addmm(state, outputWindow, 1, outputWindow, 1, inputWindow, weight); - THCudaTensor_transpose(state, weight, NULL, 0, 1); - } - } - else - { - THCudaTensor *outputSample = THCudaTensor_new(state); - THCudaTensor *inputSample = THCudaTensor_new(state); - int nBatchFrame = input->size[0]; - - THCudaTensor_resize3d(state, output, - nBatchFrame, - nOutputFrame, - outputFrameSize); - - for(i = 0; i < nBatchFrame; i++) - { - THCudaTensor_select(state, outputSample, output, 0, i); - THCudaTensor_select(state, inputSample, input, 0, i); - long nOutputSampleFrame = nOutputFrame; - - /* bias first */ - for(k = 0; k < nOutputFrame; k++) - { - THCudaTensor_select(state, outputWindow, outputSample, 0, k); - THCudaTensor_copy(state, outputWindow, bias); - } - - /* ouch */ - for(k = 0; nOutputSampleFrame > 0; k++) - { - long outputFrameStride = (kW-1)/dW+1; - long inputFrameStride = outputFrameStride*dW; - long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; - nOutputSampleFrame -= nFrame; - - THCudaTensor_setStorage2d(state, inputWindow, inputSample->storage, - inputSample->storageOffset+k*dW*inputSample->size[1], - nFrame, 
inputFrameStride*inputSample->size[1], - kW*inputSample->size[1], 1); - - THCudaTensor_setStorage2d(state, outputWindow, outputSample->storage, - outputSample->storageOffset + k*outputSample->size[1], - nFrame, outputFrameStride*outputSample->size[1], - outputSample->size[1], 1); - - THCudaTensor_transpose(state, weight, NULL, 0, 1); - THCudaTensor_addmm(state, outputWindow, 1, outputWindow, 1, inputWindow, weight); - THCudaTensor_transpose(state, weight, NULL, 0, 1); - } - } - THCudaTensor_free(state, outputSample); - THCudaTensor_free(state, inputSample); - } - - THCudaTensor_free(state, outputWindow); - THCudaTensor_free(state, inputWindow); - THCudaTensor_free(state, input); - -} - -void THNN_CudaTemporalConvolution_updateGradInput( - THCState* state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *weight, - int kW, int dW) { - - long nInputFrame; - long nOutputFrame; - - THCudaTensor *gradOutputWindow; - THCudaTensor *gradInputWindow; - long k, i; - - int dimS = 0; // sequence dimension - - THCUNN_assertSameGPU(state, 4, input, gradOutput, weight, gradInput); - - if (gradOutput->nDimension == 3) - { - dimS = 1; - } - - nInputFrame = input->size[dimS]; - nOutputFrame = gradOutput->size[dimS]; - - - /* Not necessary with partial backprop: */ - gradOutputWindow = THCudaTensor_new(state); - gradInputWindow = THCudaTensor_new(state); - - THCudaTensor_resizeAs(state, gradInput, input); - THCudaTensor_zero(state, gradInput); - - if (gradOutput->nDimension == 2) - { - /* ouch */ - for(k = 0; nOutputFrame > 0; k++) - { - long outputFrameStride = (kW-1)/dW+1; - long inputFrameStride = outputFrameStride*dW; - long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; - nOutputFrame -= nFrame; - - THCudaTensor_setStorage2d(state, gradOutputWindow, gradOutput->storage, - gradOutput->storageOffset + k*gradOutput->size[1], - nFrame, outputFrameStride*gradOutput->size[1], - gradOutput->size[1], 1); - - 
THCudaTensor_setStorage2d(state, gradInputWindow, gradInput->storage, - gradInput->storageOffset+k*dW*gradInput->size[1], - nFrame, inputFrameStride*gradInput->size[1], - kW*gradInput->size[1], 1); - - THCudaTensor_addmm(state, gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight); - } - } - else - { - THCudaTensor *gradOutputSample = THCudaTensor_new(state); - THCudaTensor *gradInputSample = THCudaTensor_new(state); - long nBatchFrame = input->size[0]; - for(i = 0; i < nBatchFrame; i++) - { - THCudaTensor_select(state, gradOutputSample, gradOutput, 0, i); - THCudaTensor_select(state, gradInputSample, gradInput, 0, i); - long nOutputSampleFrame = nOutputFrame; - - /* ouch */ - for(k = 0; nOutputSampleFrame > 0; k++) - { - long outputFrameStride = (kW-1)/dW+1; - long inputFrameStride = outputFrameStride*dW; - long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; - nOutputSampleFrame -= nFrame; - - THCudaTensor_setStorage2d(state, gradOutputWindow, gradOutputSample->storage, - gradOutputSample->storageOffset + k*gradOutputSample->size[1], - nFrame, outputFrameStride*gradOutputSample->size[1], - gradOutputSample->size[1], 1); - - THCudaTensor_setStorage2d(state, gradInputWindow, gradInputSample->storage, - gradInputSample->storageOffset+k*dW*gradInputSample->size[1], - nFrame, inputFrameStride*gradInputSample->size[1], - kW*gradInputSample->size[1], 1); - - THCudaTensor_addmm(state, gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight); - } - } - THCudaTensor_free(state, gradOutputSample); - THCudaTensor_free(state, gradInputSample); - } - - THCudaTensor_free(state, gradOutputWindow); - THCudaTensor_free(state, gradInputWindow); - -} - -void THNN_CudaTemporalConvolution_accGradParameters( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, - int kW, int dW, - float scale) { - - long nInputFrame; - long nOutputFrame; - - THCudaTensor *gradOutputWindow; - THCudaTensor 
*inputWindow; - long k, i; - - int dimS = 0; // sequence dimension - - if (gradOutput->nDimension == 3) - { - dimS = 1; - } - - nInputFrame = input->size[dimS]; - nOutputFrame = gradOutput->size[dimS]; - - /* Not necessary with partial backprop: */ - input = THCudaTensor_newContiguous(state, input); - gradOutputWindow = THCudaTensor_new(state); - inputWindow = THCudaTensor_new(state); - - if (input->nDimension == 2) - { - /* bias first */ - for(k = 0; k < nOutputFrame; k++) - { - THCudaTensor_select(state, gradOutputWindow, gradOutput, 0, k); - THCudaTensor_cadd(state, gradBias, gradBias, scale, gradOutputWindow); - } - - /* ouch */ - for(k = 0; nOutputFrame > 0; k++) - { - long outputFrameStride = (kW-1)/dW+1; - long inputFrameStride = outputFrameStride*dW; - long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; - nOutputFrame -= nFrame; - - THCudaTensor_setStorage2d(state, inputWindow, input->storage, - input->storageOffset+k*dW*input->size[1], - nFrame, inputFrameStride*input->size[1], - kW*input->size[1], 1); - - THCudaTensor_setStorage2d(state, gradOutputWindow, gradOutput->storage, - gradOutput->storageOffset + k*gradOutput->size[1], - nFrame, outputFrameStride*gradOutput->size[1], - gradOutput->size[1], 1); - - THCudaTensor_transpose(state, gradOutputWindow, NULL, 0, 1); - THCudaTensor_addmm(state, gradWeight, 1, gradWeight, scale, gradOutputWindow, inputWindow); - THCudaTensor_transpose(state, gradOutputWindow, NULL, 0, 1); - } - } - else - { - THCudaTensor *gradOutputSample = THCudaTensor_new(state); - THCudaTensor *inputSample = THCudaTensor_new(state); - long nBatchFrame = input->size[0]; - - for(i = 0; i < nBatchFrame; i++) - { - THCudaTensor_select(state, gradOutputSample, gradOutput, 0, i); - THCudaTensor_select(state, inputSample, input, 0, i); - long nOutputSampleFrame = nOutputFrame; - - /* bias first */ - for(k = 0; k < nOutputFrame; k++) - { - THCudaTensor_select(state, gradOutputWindow, gradOutputSample, 0, k); - THCudaTensor_cadd(state, 
gradBias, gradBias, scale, gradOutputWindow); - } - - /* ouch */ - for(k = 0; nOutputSampleFrame > 0; k++) - { - long outputFrameStride = (kW-1)/dW+1; - long inputFrameStride = outputFrameStride*dW; - long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; - nOutputSampleFrame -= nFrame; - - THCudaTensor_setStorage2d(state, inputWindow, inputSample->storage, - inputSample->storageOffset+k*dW*inputSample->size[1], - nFrame, inputFrameStride*inputSample->size[1], - kW*inputSample->size[1], 1); - - THCudaTensor_setStorage2d(state, gradOutputWindow, gradOutputSample->storage, - gradOutputSample->storageOffset + k*gradOutputSample->size[1], - nFrame, outputFrameStride*gradOutputSample->size[1], - gradOutputSample->size[1], 1); - - THCudaTensor_transpose(state, gradOutputWindow, NULL, 0, 1); - THCudaTensor_addmm(state, gradWeight, 1, gradWeight, scale, gradOutputWindow, inputWindow); - THCudaTensor_transpose(state, gradOutputWindow, NULL, 0, 1); - } - } - THCudaTensor_free(state, gradOutputSample); - THCudaTensor_free(state, inputSample); - } - - THCudaTensor_free(state, gradOutputWindow); - THCudaTensor_free(state, inputWindow); - THCudaTensor_free(state, input); - -} +#include "generic/TemporalConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/TemporalMaxPooling.cu b/lib/THCUNN/TemporalMaxPooling.cu index 0b31c60..384e409 100644 --- a/lib/THCUNN/TemporalMaxPooling.cu +++ b/lib/THCUNN/TemporalMaxPooling.cu @@ -1,28 +1,32 @@ #include "THCUNN.h" #include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" #define TEMPORAL_MAX_POOLING_THREADS 1024 -__global__ void cunn_TemporalMaxPooling_updateOutputKernel(float *input, float *output, float *indices, int input_w, int input_n, int output_w, int kW, int dW) { +template <typename Dtype> +__global__ void cunn_TemporalMaxPooling_updateOutputKernel(Dtype *input, Dtype *output, THCIndex_t *indices, int input_w, int input_n, int output_w, int kW, int dW) { // 
Block idx is the batch index, thread idx + block idx y * MAX_THREADS is the time index - float *input_data = input + blockIdx.x * input_w * input_n + ( + Dtype *input_data = input + blockIdx.x * input_w * input_n + ( threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n * dW; - float *output_data = output + blockIdx.x * output_w * input_n + ( + Dtype *output_data = output + blockIdx.x * output_w * input_n + ( threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; - float *indices_data = indices + blockIdx.x * output_w * input_n + ( + THCIndex_t *indices_data = indices + blockIdx.x * output_w * input_n + ( threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; int feat = 0; int time = 0; int max_time = input_n * kW; - float max_value; - float max_index = 0.0; + Dtype max_value; + THCIndex_t max_index = 0; if (threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS < output_w) { // For all features for (feat = 0; feat < input_n; ++feat) { - max_value = -FLT_MAX; + max_value = THCNumerics<Dtype>::min(); // For all values in the kernel space for (time = 0; time < max_time; time += input_n) { if (max_value < input_data[time + feat]) { @@ -31,18 +35,19 @@ __global__ void cunn_TemporalMaxPooling_updateOutputKernel(float *input, float * } } output_data[feat] = max_value; - indices_data[feat] = (float)max_index; + indices_data[feat] = max_index; } } } -__global__ void cunn_TemporalMaxPooling_updateGradInputKernel(float *gradInput, float *gradOutput, float *indices, int input_w, int input_n, int output_w, int kW, int dW) { +template <typename Dtype> +__global__ void cunn_TemporalMaxPooling_updateGradInputKernel(Dtype *gradInput, Dtype *gradOutput, THCIndex_t *indices, int input_w, int input_n, int output_w, int kW, int dW) { // Block idx is the batch index, thread idx + block idx y * MAX_THREADS is the time index - float *gradInput_data = gradInput + blockIdx.x * input_w * input_n + ( + Dtype *gradInput_data = gradInput + blockIdx.x * 
input_w * input_n + ( threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n * dW; - float *gradOutput_data = gradOutput + blockIdx.x * output_w * input_n + ( + Dtype *gradOutput_data = gradOutput + blockIdx.x * output_w * input_n + ( threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; - float *indices_data = indices + blockIdx.x * output_w * input_n + ( + THCIndex_t *indices_data = indices + blockIdx.x * output_w * input_n + ( threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; int feat = 0; @@ -50,18 +55,19 @@ __global__ void cunn_TemporalMaxPooling_updateGradInputKernel(float *gradInput, if (threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS < output_w) { // For all features for (feat = 0; feat < input_n; ++feat) { - gradInput_data[(int)indices_data[feat] * input_n + feat] += gradOutput_data[feat]; + gradInput_data[indices_data[feat] * input_n + feat] += gradOutput_data[feat]; } } } -__global__ void cunn_TemporalMaxPooling_updateGradInputKernelAtomic(float *gradInput, float *gradOutput, float *indices, int input_w, int input_n, int output_w, int kW, int dW) { +template <typename Dtype> +__global__ void cunn_TemporalMaxPooling_updateGradInputKernelAtomic(Dtype *gradInput, Dtype *gradOutput, THCIndex_t *indices, int input_w, int input_n, int output_w, int kW, int dW) { // Block idx is the batch index, thread idx + block idx y * MAX_THREADS is the time index - float *gradInput_data = gradInput + blockIdx.x * input_w * input_n + ( + Dtype *gradInput_data = gradInput + blockIdx.x * input_w * input_n + ( threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n * dW; - float *gradOutput_data = gradOutput + blockIdx.x * output_w * input_n + ( + Dtype *gradOutput_data = gradOutput + blockIdx.x * output_w * input_n + ( threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; - float *indices_data = indices + blockIdx.x * output_w * input_n + ( + THCIndex_t *indices_data = indices + blockIdx.x * output_w 
* input_n + ( threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; int feat = 0; @@ -69,153 +75,10 @@ __global__ void cunn_TemporalMaxPooling_updateGradInputKernelAtomic(float *gradI if (threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS < output_w) { // For all features for (feat = 0; feat < input_n; ++feat) { - atomicAdd(&gradInput_data[(int)indices_data[feat] * input_n + feat], gradOutput_data[feat]); + atomicAdd(&gradInput_data[indices_data[feat] * input_n + feat], gradOutput_data[feat]); } } } -void THNN_CudaTemporalMaxPooling_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *indices, - int kW, int dW) { - - int dimT = 0; // Temporal dimension - int dimF = 1; // Feature dimension - - int batch = 1; - int input_w; - int input_n; - int output_w; - int nthreads; - - float *input_data; - float *output_data; - float *indices_data; - - THCUNN_assertSameGPU(state, 3, input, output, indices); - THArgCheck( input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected"); - - if (input->nDimension == 3) - { - dimT = 1; - dimF = 2; - batch = input->size[0]; - } - THArgCheck( input->size[dimT] >= kW, 2, "input sequence smaller than kernel size"); - - input = THCudaTensor_newContiguous(state, input); - - input_w = input->size[dimT]; - input_n = input->size[dimF]; - output_w = (input_w - kW) / dW + 1; - - if (input->nDimension == 2) - { - THCudaTensor_resize2d(state, output, output_w, input->size[dimF]); - THCudaTensor_resize2d(state, indices, output_w, input->size[dimF]); - } - else - { - THCudaTensor_resize3d(state, output, batch, output_w, input->size[dimF]); - THCudaTensor_resize3d(state, indices, batch, output_w, input->size[dimF]); - } - - input_data = THCudaTensor_data(state, input); - output_data = THCudaTensor_data(state, output); - indices_data = THCudaTensor_data(state, indices); - - dim3 blocks(batch); - nthreads = (output_w / 32) * 32; - if (output_w % 32 > 0) { - 
nthreads += 32; - } - - if (nthreads > TEMPORAL_MAX_POOLING_THREADS) { - blocks.y = nthreads / TEMPORAL_MAX_POOLING_THREADS; - if (nthreads % TEMPORAL_MAX_POOLING_THREADS > 0) { - blocks.y += 1; - } - nthreads = TEMPORAL_MAX_POOLING_THREADS; - } - - dim3 threads(nthreads); - cunn_TemporalMaxPooling_updateOutputKernel <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>( - input_data, output_data, indices_data, input_w, input_n, output_w, kW, dW); - THCudaCheck(cudaGetLastError()); - THCudaTensor_free(state, input); - -} - -void THNN_CudaTemporalMaxPooling_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *indices, - int kW, int dW) { - - int dimT = 0; // Temporal dimension - int dimF = 1; // Feature dimension - - int batch = 1; - int input_w; - int input_n; - int output_w; - int nthreads; - - float *gradInput_data; - float *gradOutput_data; - float *indices_data; - - THCUNN_assertSameGPU(state, 4, input, gradOutput, gradInput, indices); - THArgCheck( input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected"); - - THCudaTensor_resizeAs(state, gradInput, input); - THCudaTensor_zero(state, gradInput); - - if (input->nDimension == 3) - { - dimT = 1; - dimF = 2; - batch = input->size[0]; - } - THArgCheck( input->size[dimT] >= kW, 2, "input sequence smaller than kernel size"); - - gradOutput = THCudaTensor_newContiguous(state, gradOutput); - - input_w = input->size[dimT]; - input_n = input->size[dimF]; - output_w = (input_w - kW) / dW + 1; - - gradInput_data = THCudaTensor_data(state, gradInput); - gradOutput_data = THCudaTensor_data(state, gradOutput); - indices_data = THCudaTensor_data(state, indices); - - dim3 blocks(batch); - nthreads = (output_w / 32) * 32; - if (output_w % 32 > 0) { - nthreads += 32; - } - - if (nthreads > TEMPORAL_MAX_POOLING_THREADS) { - blocks.y = nthreads / TEMPORAL_MAX_POOLING_THREADS; - if (nthreads % 
TEMPORAL_MAX_POOLING_THREADS > 0) { - blocks.y += 1; - } - nthreads = TEMPORAL_MAX_POOLING_THREADS; - } - - dim3 threads(nthreads); - if (kW <= dW) { - cunn_TemporalMaxPooling_updateGradInputKernel <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>( - gradInput_data, gradOutput_data, indices_data, input_w, input_n, output_w, kW, dW); - } else { - cunn_TemporalMaxPooling_updateGradInputKernelAtomic <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>( - gradInput_data, gradOutput_data, indices_data, input_w, input_n, output_w, kW, dW); - } - THCudaCheck(cudaGetLastError()); - THCudaTensor_free(state, gradOutput); - -} +#include "generic/TemporalMaxPooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/Threshold.cu b/lib/THCUNN/Threshold.cu index 623add8..47fe794 100644 --- a/lib/THCUNN/Threshold.cu +++ b/lib/THCUNN/Threshold.cu @@ -1,112 +1,74 @@ #include "THCUNN.h" -#include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +template <typename T> struct ThresholdUpdateOutput { - const float threshold_; - const float val_; + const T threshold_; + const T val_; - ThresholdUpdateOutput(float threshold, float val) + ThresholdUpdateOutput(T threshold, T val) : threshold_(threshold) , val_(val) {} - __device__ __forceinline__ void operator()(float *out, float *in) + __device__ __forceinline__ void operator()(T *out, T *in) { - float x = *in; + T x = *in; *out = (x > threshold_) ? x : val_; } }; // in-place variant +template <typename T> struct ThresholdUpdateOutputIP { - const float threshold_; - const float val_; + const T threshold_; + const T val_; - ThresholdUpdateOutputIP(float threshold, float val) + ThresholdUpdateOutputIP(T threshold, T val) : threshold_(threshold) , val_(val) {} - __device__ __forceinline__ void operator()(float *x) + __device__ __forceinline__ void operator()(T *x) { *x = (*x > threshold_) ? 
*x : val_; } }; -void THNN_CudaThreshold_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, - double threshold, double val, bool inplace) -{ - THCUNN_assertSameGPU(state, 2, input, output); - - if (inplace) - { - THC_pointwiseApply1(state, input, - ThresholdUpdateOutputIP(threshold, val) - ); - THCudaTensor_set(state, output, input); - } - else - { - THCudaTensor_resizeAs(state, output, input); - THC_pointwiseApply2(state, output, input, - ThresholdUpdateOutput(threshold, val) - ); - } - - THCudaCheck(cudaGetLastError()); -} - +template <typename T> struct ThresholdUpdateGradInput { - const float threshold_; + const T threshold_; - ThresholdUpdateGradInput(float threshold) + ThresholdUpdateGradInput(T threshold) : threshold_(threshold) {} __device__ __forceinline__ void operator()( - float *gradInput, float *input, float *gradOutput) const + T *gradInput, T *input, T *gradOutput) const { - *gradInput = (*input > threshold_) ? *gradOutput : 0; + *gradInput = (*input > threshold_) ? *gradOutput : ScalarConvert<int, T>::to(0); } }; +template <typename T> struct ThresholdUpdateGradInputIP { - const float threshold_; + const T threshold_; - ThresholdUpdateGradInputIP(float threshold) + ThresholdUpdateGradInputIP(T threshold) : threshold_(threshold) {} __device__ __forceinline__ void operator()( - float *gradOutput, float *input) const + T *gradOutput, T *input) const { - *gradOutput = (*input > threshold_) ? *gradOutput : 0; + *gradOutput = (*input > threshold_) ? 
*gradOutput : ScalarConvert<int, T>::to(0); } }; -void THNN_CudaThreshold_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, - THCudaTensor *gradInput, double threshold, double val, bool inplace) -{ - THCUNN_assertSameGPU(state, 3, input, gradInput, gradOutput); - - if (inplace) - { - THC_pointwiseApply2(state, gradOutput, input, - ThresholdUpdateGradInputIP(threshold) - ); - THCudaTensor_set(state, gradInput, gradOutput); - } - else - { - THCudaTensor_resizeAs(state, gradInput, input); - THC_pointwiseApply3(state, gradInput, input, gradOutput, - ThresholdUpdateGradInput(threshold) - ); - } - - THCudaCheck(cudaGetLastError()); -} +#include "generic/Threshold.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/VolumetricAveragePooling.cu b/lib/THCUNN/VolumetricAveragePooling.cu index 9542232..f584dcf 100644 --- a/lib/THCUNN/VolumetricAveragePooling.cu +++ b/lib/THCUNN/VolumetricAveragePooling.cu @@ -3,10 +3,14 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" +template <typename Dtype, typename Acctype> __global__ void cuda_VolumetricAveragePooling_updateOutput( - THCDeviceTensor<float, 4> input, THCDeviceTensor<float, 4> output, - int kT, int kH, int kW, int dT, int dH, int dW, float normFactor, int offsetZ) + THCDeviceTensor<Dtype, 4> input, THCDeviceTensor<Dtype, 4> output, + int kT, int kH, int kW, int dT, int dH, int dW, Acctype normFactor, int offsetZ) { int oCol = blockIdx.x * blockDim.x + threadIdx.x; int oRow = blockIdx.y * blockDim.y + threadIdx.y; @@ -15,7 +19,7 @@ __global__ void cuda_VolumetricAveragePooling_updateOutput( if (oRow < output.getSize(2) && oCol < output.getSize(3)) { - float sum = 0.0; + Acctype sum = 0.0; int iColumn = oCol * dW; int iRow = oRow * dH; @@ -33,7 +37,7 @@ __global__ void cuda_VolumetricAveragePooling_updateOutput( { if (iColumn + column < 
input.getSize(3)) { - float val = input[slice][iFrame + frame][iRow + row][iColumn + column]; + Dtype val = input[slice][iFrame + frame][iRow + row][iColumn + column]; sum += val; } } @@ -42,17 +46,17 @@ __global__ void cuda_VolumetricAveragePooling_updateOutput( } } - output[slice][oFrame][oRow][oCol] = sum * normFactor; + output[slice][oFrame][oRow][oCol] = ScalarConvert<Acctype, Dtype>::to(sum * normFactor); } } // Inner-most loop size (kW) passed as template parameter for // performance reasons. // -template<int KERNEL_WIDTH> +template<int KERNEL_WIDTH, typename Dtype, typename Acctype> __global__ void cuda_VolumetricAveragePooling_updateOutput( - THCDeviceTensor<float, 4> input, THCDeviceTensor<float, 4> output, - int kT, int kH, int dT, int dH, int dW, float normFactor, int offsetZ) + THCDeviceTensor<Dtype, 4> input, THCDeviceTensor<Dtype, 4> output, + int kT, int kH, int dT, int dH, int dW, Acctype normFactor, int offsetZ) { int oCol = blockIdx.x * blockDim.x + threadIdx.x; int oRow = blockIdx.y * blockDim.y + threadIdx.y; @@ -61,7 +65,7 @@ __global__ void cuda_VolumetricAveragePooling_updateOutput( if (oRow < output.getSize(2) && oCol < output.getSize(3)) { - float sum = 0.0; + Acctype sum = 0.0; int iColumn = oCol * dW; int iRow = oRow * dH; @@ -79,7 +83,7 @@ __global__ void cuda_VolumetricAveragePooling_updateOutput( { if (iColumn + column < input.getSize(3)) { - float val = input[slice][iFrame + frame][iRow + row][iColumn + column]; + Dtype val = input[slice][iFrame + frame][iRow + row][iColumn + column]; sum += val; } } @@ -88,7 +92,7 @@ __global__ void cuda_VolumetricAveragePooling_updateOutput( } } - output[slice][oFrame][oRow][oCol] = sum * normFactor; + output[slice][oFrame][oRow][oCol] = ScalarConvert<Acctype, Dtype>::to(sum * normFactor); } } @@ -97,127 +101,11 @@ __global__ void cuda_VolumetricAveragePooling_updateOutput( cudaInput, cudaOutput, kT, kH, dT, dH, dW, normFactor, offsetZ); \ break - -void 
THNN_CudaVolumetricAveragePooling_updateOutput( - THCState *state, THCudaTensor *input, THCudaTensor *output, - int kT, int kW, int kH, - int dT, int dW, int dH) -{ - int batchSize; - int inputSlices; - int inputTime; - int inputHeight; - int inputWidth; - - if (THCudaTensor_nDimension(state, input) == 4) - { - THArgCheck( - THCudaTensor_size(state, input, 1) >= kT && - THCudaTensor_size(state, input, 2) >= kH && - THCudaTensor_size(state, input, 3) >= kW, 2, - "input image smaller than kernel size" - ); - - /* sizes */ - batchSize = 1; - inputSlices = THCudaTensor_size(state, input, 0); - inputTime = THCudaTensor_size(state, input, 1); - inputHeight = THCudaTensor_size(state, input, 2); - inputWidth = THCudaTensor_size(state, input, 3); - } - else if (THCudaTensor_nDimension(state, input) == 5) - { - THArgCheck( - THCudaTensor_size(state, input, 2) >= kT && - THCudaTensor_size(state, input, 3) >= kH && - THCudaTensor_size(state, input, 4) >= kW, 2, - "input image smaller than kernel size" - - ); - /* sizes */ - batchSize = THCudaTensor_size(state, input, 0); - inputSlices = THCudaTensor_size(state, input, 1); - inputTime = THCudaTensor_size(state, input, 2); - inputHeight = THCudaTensor_size(state, input, 3); - inputWidth = THCudaTensor_size(state, input, 4); - } - else - { - THArgCheck(false, 2, "4D or 5D tensor expected"); - } - - int outputTime = (inputTime - kT) / dT + 1; - int outputHeight = (inputHeight - kH) / dH + 1; - int outputWidth = (inputWidth - kW) / dW + 1; - - if (input->nDimension == 4) /* 4D */ - { - /* resize output */ - THCudaTensor_resize4d(state, output, inputSlices, - outputTime, outputHeight, outputWidth); - } - else /* 5D */ - { - THCudaTensor_resize5d(state, output, batchSize, inputSlices, - outputTime, outputHeight, outputWidth); - } - - input = THCudaTensor_newContiguous(state, input); - - // Collapse batch and feature dimensions - THCDeviceTensor<float, 4> cudaInput; - THCDeviceTensor<float, 4> cudaOutput; - if 
(THCudaTensor_nDimension(state, input) == 4) - { - cudaInput = toDeviceTensor<float, 4>(state, input); - cudaOutput = toDeviceTensor<float, 4>(state, output); - } - else - { - cudaInput = toDeviceTensor<float, 5>(state, input).downcastOuter<4>(); - cudaOutput = toDeviceTensor<float, 5>(state, output).downcastOuter<4>(); - } - - int totalZ = outputTime * inputSlices * batchSize; - int offsetZ = 0; - dim3 block(32, 8); - while (totalZ > 0) { - dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)), - THCCeilDiv(outputHeight, static_cast<int>(block.y)), - totalZ > 65535 ? 65535 : totalZ); - - float normFactor = 1.0f / static_cast<float>(kT * kH * kW); - switch (kW) - { - LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(1); - LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(2); - LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(3); - LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(4); - LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(5); - LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(6); - LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(7); - default: - cuda_VolumetricAveragePooling_updateOutput<<<grid, block>>>( - cudaInput, - cudaOutput, - kT, kH, kW, - dT, dH, dW, - normFactor, - offsetZ - ); - break; - } - totalZ -= 65535; - offsetZ += 65535; - THCudaCheck(cudaGetLastError()); - } - THCudaTensor_free(state, input); -} - +template <typename Dtype, typename Acctype> __global__ void cuda_VolumetricAveragePooling_updateGradInput_Stride1( - THCDeviceTensor<float, 4> gradOutput, - THCDeviceTensor<float, 4> gradInput, - int kT, int kH, int kW, float normFactor, int offsetZ) + THCDeviceTensor<Dtype, 4> gradOutput, + THCDeviceTensor<Dtype, 4> gradInput, + int kT, int kH, int kW, Acctype normFactor, int offsetZ) { int iCol = blockIdx.x * blockDim.x + threadIdx.x; int iRow = blockIdx.y * blockDim.y + threadIdx.y; @@ -227,8 +115,8 @@ __global__ void cuda_VolumetricAveragePooling_updateGradInput_Stride1( // guard against over-tiled threads if (iRow < gradInput.getSize(2) && iCol < gradInput.getSize(3)) { - float sum = 0.0; - float *gOut = &gradOutput[slice][max(0, 
iFrame - kT + 1)] + Acctype sum = 0.0; + Dtype *gOut = &gradOutput[slice][max(0, iFrame - kT + 1)] [max(0, iRow - kH + 1)][max(0, iCol - kW + 1)]; int frameOffset = 0; for (int oFrame = max(0, iFrame - kT + 1); @@ -252,13 +140,14 @@ __global__ void cuda_VolumetricAveragePooling_updateGradInput_Stride1( } frameOffset += gradOutput.getSize(2) * gradOutput.getSize(3); } - gradInput[slice][iFrame][iRow][iCol] = sum * normFactor; + gradInput[slice][iFrame][iRow][iCol] = ScalarConvert<Acctype, Dtype>::to(sum * normFactor); } } +template <typename Dtype, typename Acctype> __global__ void cuda_VolumetricAveragePooling_updateGradInput_atomicAdd( - THCDeviceTensor<float, 4> gradOutput, - THCDeviceTensor<float, 4> gradInput, + THCDeviceTensor<Dtype, 4> gradOutput, + THCDeviceTensor<Dtype, 4> gradInput, int kT, int kH, int kW, int dT, int dH, int dW, int offsetZ) { int oCol = blockIdx.x * blockDim.x + threadIdx.x; @@ -269,7 +158,8 @@ __global__ void cuda_VolumetricAveragePooling_updateGradInput_atomicAdd( // guard against over-tiled threads if (oRow < gradOutput.getSize(2) && oCol < gradOutput.getSize(3)) { - float val = gradOutput[slice][oFrame][oRow][oCol] / (kT * kH * kW); + Dtype val = ScalarConvert<Acctype, Dtype>::to( + ScalarConvert<Dtype, Acctype>::to(gradOutput[slice][oFrame][oRow][oCol]) / (kT * kH * kW)); for (int iFrame = oFrame * dT; iFrame < oFrame * dT + kT; ++iFrame) { for (int iRow = oRow * dH; iRow < oRow * dH + kH; ++iRow) @@ -283,9 +173,10 @@ __global__ void cuda_VolumetricAveragePooling_updateGradInput_atomicAdd( } } +template <typename Dtype, typename Acctype> __global__ void cuda_VolumetricAveragePooling_updateGradInput( - THCDeviceTensor<float, 4> gradOutput, - THCDeviceTensor<float, 4> gradInput, + THCDeviceTensor<Dtype, 4> gradOutput, + THCDeviceTensor<Dtype, 4> gradInput, int kT, int kH, int kW, int dT, int dH, int dW, int offsetZ) { @@ -297,7 +188,8 @@ __global__ void cuda_VolumetricAveragePooling_updateGradInput( // guard against over-tiled threads 
if (oRow < gradOutput.getSize(2) && oCol < gradOutput.getSize(3)) { - float val = gradOutput[slice][oFrame][oRow][oCol] / (kT * kH * kW); + Dtype val = ScalarConvert<Acctype, Dtype>::to( + ScalarConvert<Dtype, Acctype>::to(gradOutput[slice][oFrame][oRow][oCol]) / (kT * kH * kW)); for (int iFrame = oFrame * dT; iFrame < oFrame * dT + kT; ++iFrame) { for (int iRow = oRow * dH; iRow < oRow * dH + kH; ++iRow) @@ -311,116 +203,5 @@ __global__ void cuda_VolumetricAveragePooling_updateGradInput( } } -void THNN_CudaVolumetricAveragePooling_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - int kT, int kW, int kH, - int dT, int dW, int dH) -{ - bool kernelsOverlap = (dT < kT) || (dH < kH) || (dW < kW); - - // Resize and initialize result tensor. - THCudaTensor_resizeAs(state, gradInput, input); - THCudaTensor_zero(state, gradInput); - - int batchSize; - int inputSlices; - int inputTime; - int inputHeight; - int inputWidth; - - int outputTime; - int outputHeight; - int outputWidth; - - if (THCudaTensor_nDimension(state, input) == 4) /* 4D */ - { - batchSize = 1; - inputSlices = THCudaTensor_size(state, input, 0); - inputTime = THCudaTensor_size(state, input, 1); - inputHeight = THCudaTensor_size(state, input, 2); - inputWidth = THCudaTensor_size(state, input, 3); - - outputTime = THCudaTensor_size(state, gradOutput, 1); - outputHeight = THCudaTensor_size(state, gradOutput, 2); - outputWidth = THCudaTensor_size(state, gradOutput, 3); - } - else - { - batchSize = THCudaTensor_size(state, input, 0); - inputSlices = THCudaTensor_size(state, input, 1); - inputTime = THCudaTensor_size(state, input, 2); - inputHeight = THCudaTensor_size(state, input, 3); - inputWidth = THCudaTensor_size(state, input, 4); - - outputTime = THCudaTensor_size(state, gradOutput, 2); - outputHeight = THCudaTensor_size(state, gradOutput, 3); - outputWidth = THCudaTensor_size(state, gradOutput, 4); - } - - gradOutput = 
THCudaTensor_newContiguous(state, gradOutput); - - // Collapse batch and feature dimensions - THCDeviceTensor<float, 4> cudaGradInput; - THCDeviceTensor<float, 4> cudaGradOutput; - if (THCudaTensor_nDimension(state, input) == 4) - { - cudaGradInput = toDeviceTensor<float, 4>(state, gradInput); - cudaGradOutput = toDeviceTensor<float, 4>(state, gradOutput); - } - else - { - cudaGradInput = - toDeviceTensor<float, 5>(state, gradInput).downcastOuter<4>(); - cudaGradOutput = - toDeviceTensor<float, 5>(state, gradOutput).downcastOuter<4>(); - } - - dim3 block(32, 8); - - // Optimizing for stride 1 is probably only of limited value, but this - // specialization yields 3x speedup over the atomicAdd implementation. - if (dT == 1 && dH == 1 && dW == 1) - { - int totalZ = inputTime * inputSlices * batchSize; - int offsetZ = 0; - while (totalZ > 0) { - dim3 grid(THCCeilDiv(inputWidth, static_cast<int>(block.x)), - THCCeilDiv(inputHeight, static_cast<int>(block.y)), - totalZ > 65535 ? 65535 : totalZ); - cuda_VolumetricAveragePooling_updateGradInput_Stride1<<<grid, block>>>( - cudaGradOutput, cudaGradInput, kT, kH, kW, 1.0f/(kT * kH * kW), offsetZ); - THCudaCheck(cudaGetLastError()); - totalZ -= 65535; - offsetZ += 65535; - } - } - else - { - int totalZ = outputTime * inputSlices * batchSize; - int offsetZ = 0; - while (totalZ > 0) { - - dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)), - THCCeilDiv(outputHeight, static_cast<int>(block.y)), - totalZ > 65535 ? 
65535 : totalZ); - if (kernelsOverlap) - { - cuda_VolumetricAveragePooling_updateGradInput_atomicAdd<<<grid, block>>>( - cudaGradOutput, cudaGradInput, kT, kH, kW, dT, dH, dW, offsetZ); - } - else - { - cuda_VolumetricAveragePooling_updateGradInput<<<grid, block>>>( - cudaGradOutput, cudaGradInput, kT, kH, kW, dT, dH, dW, offsetZ); - } - THCudaCheck(cudaGetLastError()); - totalZ -= 65535; - offsetZ += 65535; - } - } - - THCudaTensor_free(state, gradOutput); -} +#include "generic/VolumetricAveragePooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/VolumetricConvolution.cu b/lib/THCUNN/VolumetricConvolution.cu index 8576ae8..78f45f9 100644 --- a/lib/THCUNN/VolumetricConvolution.cu +++ b/lib/THCUNN/VolumetricConvolution.cu @@ -1,16 +1,19 @@ #include "THCUNN.h" #include "common.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" // Kernel for fast unfold+copy // Borrowed from Theano // Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter, Nicolas Ballas -__global__ void im3d2col_kernel(const int n, const float* data_im, +template <typename Dtype> +__global__ void im3d2col_kernel(const int n, const Dtype* data_im, const int height, const int width, const int depth, const int kernel_h, const int kernel_w, const int kernel_d, const int pad_h, const int pad_w, const int pad_d, const int stride_h, const int stride_w, const int stride_d, const int height_col, const int width_col, const int depth_col, - float* data_col) + Dtype* data_col) { CUDA_KERNEL_LOOP(index, n) { @@ -29,11 +32,11 @@ __global__ void im3d2col_kernel(const int n, const float* data_im, int w_in = w_out * stride_w - pad_w; int d_in = d_out * stride_d - pad_d; - float* data_col_ptr = data_col; + Dtype* data_col_ptr = data_col; data_col_ptr += channel_out * (height_col * width_col * depth_col) + h_out * (width_col * depth_col) + w_out * depth_col + d_out; - const float* data_im_ptr = data_im; + const Dtype* data_im_ptr = data_im; data_im_ptr += channel_in * (height * width * depth) 
+ h_in * (width * depth) + w_in * depth + d_in; @@ -48,7 +51,7 @@ __global__ void im3d2col_kernel(const int n, const float* data_im, int d = d_in + k; *data_col_ptr = (h >= 0 && w >= 0 && d >= 0 && h < height && w < width && d < depth) ? - data_im_ptr[i * (width * depth) + j *depth + k] : 0; + data_im_ptr[i * (width * depth) + j *depth + k] : ScalarConvert<int, Dtype>::to(0); data_col_ptr += height_col * width_col * depth_col; } } @@ -56,12 +59,13 @@ __global__ void im3d2col_kernel(const int n, const float* data_im, } } -void im3d2col(cudaStream_t stream, const float* data_im, const int channels, +template <typename Dtype> +void im3d2col(cudaStream_t stream, const Dtype* data_im, const int channels, const int height, const int width, const int depth, const int kernel_h, const int kernel_w, const int kernel_d, const int pad_h, const int pad_w, const int pad_d, const int stride_h, const int stride_w, const int stride_d, - float* data_col) + Dtype* data_col) { // We are going to launch channels * height_col * width_col * depth_col kernels, each // kernel responsible for copying a single-channel grid. 
@@ -80,19 +84,19 @@ void im3d2col(cudaStream_t stream, const float* data_im, const int channels, THCudaCheck(cudaGetLastError()); } - -__global__ void col2im3d_kernel(const int n, const float* data_col, +template <typename Dtype, typename Acctype> +__global__ void col2im3d_kernel(const int n, const Dtype* data_col, const int height, const int width, const int depth, const int channels, const int patch_h, const int patch_w, const int patch_d, const int pad_h, const int pad_w, const int pad_d, const int stride_h, const int stride_w, const int stride_d, const int height_col, const int width_col, const int depth_col, - float* data_im) + Dtype* data_im) { CUDA_KERNEL_LOOP(index, n) { - float val = 0; + Acctype val = 0; int d = index % depth + pad_d; int w_index = index / depth; int w = w_index % width + pad_w; @@ -120,16 +124,17 @@ __global__ void col2im3d_kernel(const int n, const float* data_col, val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col]; } } - data_im[index] = val; + data_im[index] = ScalarConvert<Acctype, Dtype>::to(val); } } -void col2im3d(cudaStream_t stream, const float* data_col, const int channels, +template <typename Dtype, typename Acctype> +void col2im3d(cudaStream_t stream, const Dtype* data_col, const int channels, const int height, const int width, const int depth, const int patch_h, const int patch_w, const int patch_d, const int pad_h, const int pad_w, const int pad_d, const int stride_h, const int stride_w, const int stride_d, - float* data_im) + Dtype* data_im) { int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; @@ -138,7 +143,7 @@ void col2im3d(cudaStream_t stream, const float* data_col, const int channels, // To avoid involving atomic operations, we will launch one kernel per // bottom dimension, and then in the kernel add up the top dimensions. 
- col2im3d_kernel<<<GET_BLOCKS(num_kernels), + col2im3d_kernel<Dtype, Acctype><<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>>(num_kernels, data_col, height, width, depth, channels, patch_h, patch_w, patch_d, @@ -149,371 +154,5 @@ void col2im3d(cudaStream_t stream, const float* data_col, const int channels, THCudaCheck(cudaGetLastError()); } -void THNN_CudaVolumetricConvolution_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, - THCudaTensor *bias, - THCudaTensor *finput, - THCudaTensor *fgradInput, - int dT, int dW, int dH, - int padT, int padW, int padH) -{ - THCudaTensor *columns = finput; - THCudaTensor *ones = fgradInput; - THCUNN_assertSameGPU(state, 6, input, output, weight, bias, columns, ones); - - THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, - "4D or 5D (batch mode) tensor is expected" - ); - - THArgCheck(weight->nDimension == 5, 4, - "5D weight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)" - ); - - int nOutputPlane = (int)weight->size[0]; - int nInputPlane = (int)weight->size[1]; - int kT = (int)weight->size[2]; - int kH = (int)weight->size[3]; - int kW = (int)weight->size[4]; - - int batch = 1; - if (input->nDimension == 4) - { - // Force batch - batch = 0; - THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], - input->size[2], input->size[3]); - } - - long inputWidth = input->size[3]; - long inputHeight = input->size[2]; - long inputDepth = input->size[4]; - long outputWidth = (inputWidth + 2*padH - kH) / dH + 1; - long outputHeight = (inputHeight + 2*padT - kT) / dT + 1; - long outputDepth = (inputDepth + 2*padW - kW) / dW + 1; - - // Batch size + input planes - long batchSize = input->size[0]; - - // Resize output - THCudaTensor_resize5d(state, output, batchSize, nOutputPlane, - outputHeight, outputWidth, outputDepth); - - // Resize temporary columns - THCudaTensor_resize2d(state, columns, nInputPlane*kW*kH*kT, 
outputDepth*outputHeight*outputWidth); - - // Define a buffer of ones, for bias accumulation - // Note: this buffer can be shared with other modules, it only ever gets increased, - // and always contains ones. - if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) - { - // Resize plane and fill with ones... - THCudaTensor_resize3d(state, ones, outputHeight, outputWidth, outputDepth); - THCudaTensor_fill(state, ones, 1); - } - - // Helpers - THCudaTensor *input_n = THCudaTensor_new(state); - THCudaTensor *output_n = THCudaTensor_new(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) - { - // Matrix mulitply per output: - THCudaTensor_select(state, input_n, input, 0, elt); - THCudaTensor_select(state, output_n, output, 0, elt); - - // Do Bias first: - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m_ = nOutputPlane; - long n_ = outputDepth * outputHeight * outputWidth; - long k_ = 1; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - THCudaBlas_Sgemm( - state, - 't', 'n', - n_, m_, k_, - 1, - THCudaTensor_data(state, ones), k_, - THCudaTensor_data(state, bias), k_, - 0, - THCudaTensor_data(state, output_n), n_ - ); - - // Extract columns: - im3d2col( - THCState_getCurrentStream(state), - THCudaTensor_data(state, input_n), - nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW, - THCudaTensor_data(state, columns) - ); - - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m = weight->size[0]; - long n = columns->size[1]; - long k = weight->size[1]*weight->size[2]*weight->size[3]*weight->size[4]; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - THCudaBlas_Sgemm( - state, - 'n', 'n', - n, m, k, - 1, - THCudaTensor_data(state, columns), n, 
- THCudaTensor_data(state, weight), k, - 1, - THCudaTensor_data(state, output_n), n - ); - } - - // Free - THCudaTensor_free(state, input_n); - THCudaTensor_free(state, output_n); - - // Resize output - if (batch == 0) - { - THCudaTensor_resize4d(state, output, nOutputPlane, outputHeight, outputWidth, outputDepth); - THCudaTensor_resize4d(state, input, nInputPlane, inputHeight, inputWidth, inputDepth); - } -} - -void THNN_CudaVolumetricConvolution_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *weight, - THCudaTensor *finput, - int dT, int dW, int dH, - int padT, int padW, int padH) -{ - THArgCheck(weight->nDimension == 5, 4, - "5D weight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)" - ); - - int nOutputPlane = (int)weight->size[0]; - int nInputPlane = (int)weight->size[1]; - int kT = (int)weight->size[2]; - int kH = (int)weight->size[3]; - int kW = (int)weight->size[4]; - - THCudaTensor *gradColumns = finput; - - THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, gradColumns, gradInput); - THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, - "4D or 5D (batch mode) tensor is expected" - ); - - int batch = 1; - if (input->nDimension == 4) - { - // Force batch - batch = 0; - THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THCudaTensor_resize5d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); - } - - long inputWidth = input->size[3]; - long inputHeight = input->size[2]; - long inputDepth = input->size[4]; - long outputWidth = (inputWidth + 2*padH - kH) / dH + 1; - long outputHeight = (inputHeight + 2*padT - kT) / dT + 1; - long outputDepth = (inputDepth + 2*padW - kW) / dW + 1; - - // Batch size + input planes - long batchSize = input->size[0]; - - // Resize output - THCudaTensor_resize5d(state, gradInput, batchSize, nInputPlane, 
inputHeight, inputWidth, inputDepth); - - // Resize temporary columns - THCudaTensor_resize2d(state, gradColumns, nInputPlane*kH*kT*kW, outputDepth*outputHeight*outputWidth); - - // Helpers - THCudaTensor *input_n = THCudaTensor_new(state); - THCudaTensor *gradInput_n = THCudaTensor_new(state); - THCudaTensor *gradOutput_n = THCudaTensor_new(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) - { - // Matrix mulitply per sample: - THCudaTensor_select(state, input_n, input, 0, elt); - THCudaTensor_select(state, gradInput_n, gradInput, 0, elt); - THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); - - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m = weight->size[1]*weight->size[2]*weight->size[3]*weight->size[4]; - long n = gradColumns->size[1]; - long k = weight->size[0]; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - THCudaBlas_Sgemm( - state, - 'n', 't', - n, m, k, - 1, - THCudaTensor_data(state, gradOutput_n), n, - THCudaTensor_data(state, weight), m, - 0, - THCudaTensor_data(state, gradColumns), n - ); - - // Unpack columns back into input: - col2im3d( - THCState_getCurrentStream(state), - THCudaTensor_data(state, gradColumns), - nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW, - THCudaTensor_data(state, gradInput_n) - ); - } - - // Free - THCudaTensor_free(state, input_n); - THCudaTensor_free(state, gradInput_n); - THCudaTensor_free(state, gradOutput_n); - - // Resize output - if (batch == 0) - { - THCudaTensor_resize4d(state, gradOutput, nOutputPlane, outputHeight, outputWidth, outputDepth); - THCudaTensor_resize4d(state, input, nInputPlane, inputHeight, inputWidth, inputDepth); - THCudaTensor_resize4d(state, gradInput, nInputPlane, inputHeight, inputWidth, inputDepth); - } -} - -void THNN_CudaVolumetricConvolution_accGradParameters( - THCState *state, - 
THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, - THCudaTensor *finput, - THCudaTensor *fgradInput, - int dT, int dW, int dH, - int padT, int padW, int padH, - float scale) -{ - THCudaTensor *columns = finput; - THCudaTensor *ones = fgradInput; - THCUNN_assertSameGPU(state, 6, input, gradOutput, gradWeight, gradBias, columns, ones); - - THArgCheck(gradWeight->nDimension == 5, 4, - "5D gradWeight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)" - ); - - int nOutputPlane = (int)gradWeight->size[0]; - int nInputPlane = (int)gradWeight->size[1]; - int kT = (int)gradWeight->size[2]; - int kH = (int)gradWeight->size[3]; - int kW = (int)gradWeight->size[4]; - - THArgCheck( - input->nDimension == 4 || input->nDimension == 5, 2, - "3D or 4D (batch mode) tensor is expected" - ); - - int batch = 1; - if (input->nDimension == 4) - { - // Force batch - batch = 0; - THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THCudaTensor_resize5d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); - } - - long inputWidth = input->size[3]; - long inputHeight = input->size[2]; - long inputDepth = input->size[4]; - long outputWidth = (inputWidth + 2*padH - kH) / dH + 1; - long outputHeight = (inputHeight + 2*padT - kT) / dT + 1; - long outputDepth = (inputDepth + 2*padW - kW) / dW + 1; - - // Batch size + input planes - long batchSize = input->size[0]; - - // Define a buffer of ones, for bias accumulation - if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) - { - // Resize plane and fill with ones... 
- THCudaTensor_resize3d(state, ones, outputHeight, outputWidth, outputDepth); - THCudaTensor_fill(state, ones, 1); - } - - // Resize temporary columns - THCudaTensor_resize2d(state, columns, nInputPlane*kH*kT*kW, outputDepth*outputHeight*outputWidth); - - // Helpers - THCudaTensor *input_n = THCudaTensor_new(state); - THCudaTensor *gradOutput_n = THCudaTensor_new(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) - { - // Matrix mulitply per output: - THCudaTensor_select(state, input_n, input, 0, elt); - THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); - - // Extract columns: - im3d2col( - THCState_getCurrentStream(state), - THCudaTensor_data(state, input_n), - nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW, - THCudaTensor_data(state, columns) - ); - - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m = gradWeight->size[0]; - long n = gradWeight->size[1]*gradWeight->size[2]*gradWeight->size[3]*gradWeight->size[4]; - long k = columns->size[1]; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - THCudaBlas_Sgemm( - state, - 't', 'n', - n, m, k, - scale, - THCudaTensor_data(state, columns), k, - THCudaTensor_data(state, gradOutput_n), k, - 1, - THCudaTensor_data(state, gradWeight), n - ); - - // Do Bias: - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m_ = nOutputPlane; - long k_ = outputDepth * outputHeight * outputWidth; - - // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) - THCudaBlas_Sgemv( - state, - 't', - k_, m_, - scale, - THCudaTensor_data(state, gradOutput_n), k_, - THCudaTensor_data(state, ones), 1, - 1, - THCudaTensor_data(state, gradBias), 1 - ); - } - - // Free - THCudaTensor_free(state, input_n); - THCudaTensor_free(state, gradOutput_n); - - // Resize - if (batch 
== 0) - { - THCudaTensor_resize4d(state, gradOutput, nOutputPlane, outputHeight, outputWidth, outputDepth); - THCudaTensor_resize4d(state, input, nInputPlane, inputHeight, inputWidth, inputDepth); - } -} +#include "generic/VolumetricConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/VolumetricDilatedConvolution.cu b/lib/THCUNN/VolumetricDilatedConvolution.cu index 72caba7..d82e02d 100644 --- a/lib/THCUNN/VolumetricDilatedConvolution.cu +++ b/lib/THCUNN/VolumetricDilatedConvolution.cu @@ -1,367 +1,8 @@ #include "THCUNN.h" #include "common.h" #include "vol2col.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" - -void THNN_CudaVolumetricDilatedConvolution_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, - THCudaTensor *bias, - THCudaTensor *columns, - THCudaTensor *ones, - int kT, int kW, int kH, - int dT, int dW, int dH, - int padT, int padW, int padH, - int dilationT, int dilationW, int dilationH) { - - THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones); - if (bias) { - THCUNN_assertSameGPU(state, 2, weight, bias); - } - THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected, but got: %d", input->nDimension); - THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)"); - THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias"); - THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero"); - THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero"); - - // Params: - int nInputPlane = weight->size[1]; - int nOutputPlane = weight->size[0]; - - int batch = 1; - if (input->nDimension == 4) { - THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); - // Force batch - batch = 0; - THCudaTensor_resize5d(state, input, 1, input->size[0], 
input->size[1], input->size[2], input->size[3]); - } else { - THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); - } - - long inputDepth = input->size[2]; - long inputHeight = input->size[3]; - long inputWidth = input->size[4]; - long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; - long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; - long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; - - if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1) - THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small", - nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth); - - // Batch size + input planes - long batchSize = input->size[0]; - - // Resize output - THCudaTensor_resize5d(state, output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); - - // Resize temporary columns - THCudaTensor_resize2d(state, columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); - - // Define a buffer of ones, for bias accumulation - // Note: this buffer can be shared with other modules, it only ever gets increased, - // and always contains ones. - if (ones->nDimension != 2 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { - // Resize plane and fill with ones... 
- THCudaTensor_resize3d(state, ones, outputDepth, outputHeight, outputWidth); - THCudaTensor_fill(state, ones, 1); - } - - // Helpers - THCudaTensor *input_n = THCudaTensor_new(state); - THCudaTensor *output_n = THCudaTensor_new(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per output: - THCudaTensor_select(state, input_n, input, 0, elt); - THCudaTensor_select(state, output_n, output, 0, elt); - - // Do Bias first: - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m_ = nOutputPlane; - long n_ = outputDepth * outputHeight * outputWidth; - long k_ = 1; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - if (bias) { - THCudaBlas_Sgemm( - state, - 't', 'n', - n_, m_, k_, - 1, - THCudaTensor_data(state, ones), k_, - THCudaTensor_data(state, bias), k_, - 0, - THCudaTensor_data(state, output_n), n_ - ); - } else { - THCudaTensor_zero(state, output_n); - } - - // Extract columns: - vol2col( - THCState_getCurrentStream(state), - THCudaTensor_data(state, input_n), - nInputPlane, inputDepth, inputHeight, inputWidth, - kT, kH, kW, padT, padH, padW, dT, dH, dW, - dilationT, dilationH, dilationW, - THCudaTensor_data(state, columns) - ); - - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m = nOutputPlane; - long n = columns->size[1]; - long k = nInputPlane*kT*kH*kW; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - THCudaBlas_Sgemm( - state, - 'n', 'n', - n, m, k, - 1, - THCudaTensor_data(state, columns), n, - THCudaTensor_data(state, weight), k, - 1, - THCudaTensor_data(state, output_n), n - ); - } - - // Free - THCudaTensor_free(state, input_n); - THCudaTensor_free(state, output_n); - - // Resize output - if (batch == 0) { - THCudaTensor_resize4d(state, output, nOutputPlane, outputDepth, outputHeight, 
outputWidth); - THCudaTensor_resize4d(state, input, nInputPlane, inputDepth, inputHeight, inputWidth); - } -} - -void THNN_CudaVolumetricDilatedConvolution_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *weight, - THCudaTensor *gradColumns, - int kT, int kW, int kH, - int dT, int dW, int dH, - int padT, int padW, int padH, - int dilationT, int dilationW, int dilationH) { - - THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, - gradColumns, gradInput); - THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected"); - THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected"); - THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)"); - THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero"); - THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero"); - - // Params - int nInputPlane = weight->size[1]; - int nOutputPlane = weight->size[0]; - - int batch = 1; - if (input->nDimension == 4) { - // Force batch - batch = 0; - THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THCudaTensor_resize5d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); - } - - long inputDepth = input->size[2]; - long inputWidth = input->size[4]; - long inputHeight = input->size[3]; - long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; - long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; - long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; - - // Batch size + input planes - long batchSize = input->size[0]; - - // Resize output - THCudaTensor_resize5d(state, gradInput, batchSize, nInputPlane, inputDepth, 
inputHeight, inputWidth); - - // Resize temporary columns - THCudaTensor_resize2d(state, gradColumns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); - - // Helpers - THCudaTensor *gradInput_n = THCudaTensor_new(state); - THCudaTensor *gradOutput_n = THCudaTensor_new(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per sample: - THCudaTensor_select(state, gradInput_n, gradInput, 0, elt); - THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); - - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m = nInputPlane*kT*kW*kH; - long n = gradColumns->size[1]; - long k = nOutputPlane; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - THCudaBlas_Sgemm( - state, - 'n', 't', - n, m, k, - 1, - THCudaTensor_data(state, gradOutput_n), n, - THCudaTensor_data(state, weight), m, - 0, - THCudaTensor_data(state, gradColumns), n - ); - - // Unpack columns back into input: - col2vol( - THCState_getCurrentStream(state), - THCudaTensor_data(state, gradColumns), - nInputPlane, inputDepth, inputHeight, inputWidth, - kT, kH, kW, padT, padH, padW, dT, dH, dW, - dilationT, dilationH, dilationW, - THCudaTensor_data(state, gradInput_n) - ); - } - - // Free - THCudaTensor_free(state, gradInput_n); - THCudaTensor_free(state, gradOutput_n); - - // Resize output - if (batch == 0) { - THCudaTensor_resize4d(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); - THCudaTensor_resize4d(state, input, nInputPlane, inputDepth, inputHeight, inputWidth); - THCudaTensor_resize4d(state, gradInput, nInputPlane, inputDepth, inputHeight, inputWidth); - } -} - -void THNN_CudaVolumetricDilatedConvolution_accGradParameters( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, - THCudaTensor *columns, - THCudaTensor *ones, - int kT, int kW, int 
kH, - int dT, int dW, int dH, - int padT, int padW, int padH, - int dilationT, int dilationW, int dilationH, - float scale) { - - THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, columns, ones); - if (gradBias) { - THCUNN_assertSameGPU(state, 2, gradWeight, gradBias); - } - THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected"); - THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected"); - THArgCheck(gradWeight->nDimension == 5, 4, "gradWeight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)"); - THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero"); - THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero"); - - // Params - int nInputPlane = gradWeight->size[1]; - int nOutputPlane = gradWeight->size[0]; - - int batch = 1; - if (input->nDimension == 4) { - // Force batch - batch = 0; - THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THCudaTensor_resize5d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); - } - - long inputDepth = input->size[2]; - long inputWidth = input->size[4]; - long inputHeight = input->size[3]; - long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; - long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; - long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; - - // Batch size + input planes - long batchSize = input->size[0]; - - // Define a buffer of ones, for bias accumulation - if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { - // Resize plane and fill with ones... 
- THCudaTensor_resize3d(state, ones, outputDepth, outputHeight, outputWidth); - THCudaTensor_fill(state, ones, 1); - } - - // Resize temporary columns - THCudaTensor_resize2d(state, columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); - - // Helpers - THCudaTensor *input_n = THCudaTensor_new(state); - THCudaTensor *gradOutput_n = THCudaTensor_new(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per output: - THCudaTensor_select(state, input_n, input, 0, elt); - THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); - - // Extract columns: - vol2col( - THCState_getCurrentStream(state), - THCudaTensor_data(state, input_n), - nInputPlane, inputDepth, inputHeight, inputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW, - dilationT, dilationH, dilationW, - THCudaTensor_data(state, columns) - ); - - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m = nOutputPlane; - long n = nInputPlane*kT*kW*kH; - long k = columns->size[1]; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - THCudaBlas_Sgemm( - state, - 't', 'n', - n, m, k, - scale, - THCudaTensor_data(state, columns), k, - THCudaTensor_data(state, gradOutput_n), k, - 1, - THCudaTensor_data(state, gradWeight), n - ); - - // Do Bias: - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m_ = nOutputPlane; - long k_ = outputDepth * outputHeight * outputWidth; - - // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) - if (gradBias) { - THCudaBlas_Sgemv( - state, - 't', - k_, m_, - scale, - THCudaTensor_data(state, gradOutput_n), k_, - THCudaTensor_data(state, ones), 1, - 1, - THCudaTensor_data(state, gradBias), 1 - ); - } - } - - // Free - THCudaTensor_free(state, input_n); - THCudaTensor_free(state, gradOutput_n); - - // Resize output - if (batch == 
0) { - THCudaTensor_resize4d(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); - THCudaTensor_resize4d(state, input, nInputPlane, inputDepth, inputHeight, inputWidth); - } -} +#include "generic/VolumetricDilatedConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/VolumetricDilatedMaxPooling.cu b/lib/THCUNN/VolumetricDilatedMaxPooling.cu index 49c56e7..0cff32c 100644 --- a/lib/THCUNN/VolumetricDilatedMaxPooling.cu +++ b/lib/THCUNN/VolumetricDilatedMaxPooling.cu @@ -3,16 +3,20 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" #include <cfloat> +template <typename Dtype> __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput( - THCDeviceTensor<float, 4> input, - THCDeviceTensor<float, 4> indices, - THCDeviceTensor<float, 4> output, + THCDeviceTensor<Dtype, 4> input, + THCDeviceTensor<THCIndex_t, 4> indices, + THCDeviceTensor<Dtype, 4> output, int kT, int kH, int kW, int dT, int dH, int dW, - int padT, int padH, int padW, + int padT, int padH, int padW, int dilationT, int dilationH, int dilationW, int offsetZ) { @@ -31,7 +35,7 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput( int maxRow = 0; int maxFrame = 0; - float max = -FLT_MAX; + Dtype max = THCNumerics<Dtype>::min(); for (int frame = 0; frame < kT; ++frame) { @@ -45,7 +49,7 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput( { if (iColumn + column * dilationW < input.getSize(3) && iColumn + column * dilationW >= 0) { - float val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW]; + Dtype val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW]; if (max < val) { @@ -62,7 +66,7 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput( } output[slice][oFrame][oRow][oColumn] = max; - float *idx = 
&indices[slice][oFrame][oRow][oColumn]; + THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn]; ((unsigned char*)(idx))[0] = maxFrame; ((unsigned char*)(idx))[1] = maxRow; ((unsigned char*)(idx))[2] = maxColumn; @@ -70,13 +74,13 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput( } } -template <int KERNEL_WIDTH> +template <int KERNEL_WIDTH, typename Dtype> __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput( - THCDeviceTensor<float, 4> input, THCDeviceTensor<float, 4> indices, - THCDeviceTensor<float, 4> output, + THCDeviceTensor<Dtype, 4> input, THCDeviceTensor<THCIndex_t, 4> indices, + THCDeviceTensor<Dtype, 4> output, int kT, int kH, int dT, int dH, int dW, - int padT, int padH, int padW, + int padT, int padH, int padW, int dilationT, int dilationH, int dilationW, int offsetZ) { @@ -95,7 +99,7 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput( int maxRow = 0; int maxFrame; - float max = -FLT_MAX; + Dtype max = THCNumerics<Dtype>::min(); for (int frame = 0; frame < kT; ++frame) { @@ -109,7 +113,7 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput( { if (iColumn + column * dilationW < input.getSize(3) && iColumn + column * dilationW >= 0) { - float val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW]; + Dtype val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW]; if (max < val) { @@ -126,7 +130,7 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput( } output[slice][oFrame][oRow][oColumn] = max; - float *idx = &indices[slice][oFrame][oRow][oColumn]; + THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn]; ((unsigned char*)(idx))[0] = maxFrame; ((unsigned char*)(idx))[1] = maxRow; ((unsigned char*)(idx))[2] = maxColumn; @@ -134,193 +138,13 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput( } } -#define UPDATE_OUTPUT_KERNEL_WIDTH(KW) case KW: \ - 
cuda_VolumetricDilatedMaxPooling_updateOutput<KW><<<grid, block, \ - 0, THCState_getCurrentStream(state)>>>( \ - cudaInput, cudaIndices, cudaOutput, kT, kH, dT, dH, dW, padT, padH, padW,\ - dilationT, dilationH, dilationW, offsetZ); \ - break - - -void THNN_CudaVolumetricDilatedMaxPooling_updateOutput( - THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices, - int kT, int kW, int kH, - int dT, int dW, int dH, - int padT, int padW, int padH, - int dilationT, int dilationW, int dilationH, - bool ceilMode) -{ - int batchSize; - int inputSlices; - int inputTime; - int inputHeight; - int inputWidth; - int outputTime; - int outputHeight; - int outputWidth; - - THCUNN_assertSameGPU(state, 3, input, indices, output); - - if (THCudaTensor_nDimension(state, input) == 4) - { - THArgCheck( - THCudaTensor_size(state, input, 1) >= kT && - THCudaTensor_size(state, input, 2) >= kH && - THCudaTensor_size(state, input, 3) >= kW, 2, - "input image smaller than kernel size" - ); - - /* sizes */ - batchSize = 1; - inputSlices = THCudaTensor_size(state, input, 0); - inputTime = THCudaTensor_size(state, input, 1); - inputHeight = THCudaTensor_size(state, input, 2); - inputWidth = THCudaTensor_size(state, input, 3); - } - else if (THCudaTensor_nDimension(state, input) == 5) - { - THArgCheck( - THCudaTensor_size(state, input, 4) >= kW && - THCudaTensor_size(state, input, 3) >= kH && - THCudaTensor_size(state, input, 2) >= kT, 2, - "input image smaller than kernel size" - ); - - /* sizes */ - batchSize = THCudaTensor_size(state, input, 0); - inputSlices = THCudaTensor_size(state, input, 1); - inputTime = THCudaTensor_size(state, input, 2); - inputHeight = THCudaTensor_size(state, input, 3); - inputWidth = THCudaTensor_size(state, input, 4); - } - else - { - THArgCheck(false, 2, "4D or 5D tensor expected"); - } - - THArgCheck(kT/2 >= padT && kW/2 >= padW && kH/2 >= padH, 2, - "pad should be smaller than half of kernel size" - ); - - if (ceilMode) - { - outputTime 
= (int)(ceil((float)(inputTime - (dilationT * (kT - 1) + 1) + 2*padT) / dT)) + 1; - outputHeight = (int)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; - outputWidth = (int)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; - } - else - { - outputTime = (int)(floor((float)(inputTime - (dilationT * (kT - 1) + 1) + 2*padT) / dT)) + 1; - outputHeight = (int)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; - outputWidth = (int)(floor((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; - } - - if (outputTime < 1 || outputHeight < 1 || outputWidth < 1) - THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small", - inputSlices,inputTime,inputHeight,inputWidth,inputSlices,outputTime,outputHeight,outputWidth); - - if (padT || padW || padH) - { - if ((outputTime - 1)*dT >= inputTime + padT) - --outputTime; - if ((outputHeight - 1)*dH >= inputHeight + padH) - --outputHeight; - if ((outputWidth - 1)*dW >= inputWidth + padW) - --outputWidth; - } - - if (input->nDimension == 4) /* 4D */ - { - /* resize output */ - THCudaTensor_resize4d(state, output, inputSlices, - outputTime, outputHeight, outputWidth); - /* indices pack ti,i,j locations for each output point as uchar into - each float of the tensor */ - THCudaTensor_resize4d(state, indices, inputSlices, - outputTime, outputHeight, outputWidth); - } - else - { /* 5D */ - THCudaTensor_resize5d(state, output, batchSize, inputSlices, - outputTime, outputHeight, outputWidth); - // Index tensor packs index offsets as uchars into floats - THCudaTensor_resize5d(state, indices, batchSize, inputSlices, - outputTime, outputHeight, outputWidth); - } - - input = THCudaTensor_newContiguous(state, input); - - // Collapse batch and feature dimensions - THCDeviceTensor<float, 4> cudaInput; - THCDeviceTensor<float, 4> cudaOutput; - if (THCudaTensor_nDimension(state, input) == 4) - { - cudaInput = 
toDeviceTensor<float, 4>(state, input); - cudaOutput = toDeviceTensor<float, 4>(state, output); - } - else - { - cudaInput = toDeviceTensor<float, 5>(state, input).downcastOuter<4>(); - cudaOutput = toDeviceTensor<float, 5>(state, output).downcastOuter<4>(); - } - - THLongStorage *indicesSize = THLongStorage_newWithSize(4); - long indicesSizeRaw[4] = { batchSize * inputSlices, - outputTime, outputHeight, outputWidth }; - THLongStorage_rawCopy(indicesSize, indicesSizeRaw); - - THCudaTensor *indices1 = THCudaTensor_newWithStorage( - state, THCudaTensor_storage(state, indices), - THCudaTensor_storageOffset(state, indices), - indicesSize, NULL); - - THLongStorage_free(indicesSize); - - THCDeviceTensor<float, 4> cudaIndices = - toDeviceTensor<float, 4>(state, indices1); - - int totalZ = outputTime * inputSlices * batchSize; - int offsetZ = 0; - dim3 block(32, 8); - - while (totalZ > 0) { - dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)), - THCCeilDiv(outputHeight, static_cast<int>(block.y)), - totalZ > 65535 ? 
65535 : totalZ); - - switch (kW) - { - UPDATE_OUTPUT_KERNEL_WIDTH(1); - UPDATE_OUTPUT_KERNEL_WIDTH(2); - UPDATE_OUTPUT_KERNEL_WIDTH(3); - UPDATE_OUTPUT_KERNEL_WIDTH(4); - UPDATE_OUTPUT_KERNEL_WIDTH(5); - UPDATE_OUTPUT_KERNEL_WIDTH(6); - UPDATE_OUTPUT_KERNEL_WIDTH(7); - default: - cuda_VolumetricDilatedMaxPooling_updateOutput<<<grid, block, - 0, THCState_getCurrentStream(state)>>>( - cudaInput, cudaIndices, cudaOutput, - kT, kH, kW, dT, dH, dW, - padT, padH, padW, dilationT, dilationH, dilationW, offsetZ); - } - THCudaCheck(cudaGetLastError()); - totalZ -= 65535; - offsetZ += 65535; - } - - THCudaTensor_free(state, input); - THCudaTensor_free(state, indices1); -} - -#undef UPDATE_OUTPUT_KERNEL_WIDTH - +template <typename Dtype> __global__ void cuda_VolumetricDilatedMaxPooling_updateGradInput( - THCDeviceTensor<float, 4> gradOutput, - THCDeviceTensor<float, 4> indices, - THCDeviceTensor<float, 4> gradInput, + THCDeviceTensor<Dtype, 4> gradOutput, + THCDeviceTensor<THCIndex_t, 4> indices, + THCDeviceTensor<Dtype, 4> gradInput, int dT, int dH, int dW, - int padT, int padH, int padW, + int padT, int padH, int padW, int dilationT, int dilationH, int dilationW, int offsetZ) { @@ -331,7 +155,7 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateGradInput( if (oRow < gradOutput.getSize(2) && oColumn < gradOutput.getSize(3)) { - float *idx = &indices[slice][oFrame][oRow][oColumn]; + THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn]; int iFrame = ((unsigned char*)(idx))[0] * dilationT + oFrame * dT - padT; int iRow = ((unsigned char*)(idx))[1] * dilationH + oRow * dH - padH; int iColumn = ((unsigned char*)(idx))[2] * dilationW + oColumn * dW - padW; @@ -340,98 +164,5 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateGradInput( } } -void THNN_CudaVolumetricDilatedMaxPooling_updateGradInput( - THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, - THCudaTensor *indices, - int dT, int dW, int dH, - int padT, int padW, int 
padH, - int dilationT, int dilationW, int dilationH) -{ - // Resize and initialize result tensor. - THCudaTensor_resizeAs(state, gradInput, input); - THCudaTensor_zero(state, gradInput); - - int batchSize; - int inputSlices; - - int outputTime; - int outputHeight; - int outputWidth; - - THCUNN_assertSameGPU(state, 4, input, indices, gradOutput, gradInput); - - if (THCudaTensor_nDimension(state, input) == 4) /* 4D */ - { - batchSize = 1; - inputSlices = THCudaTensor_size(state, input, 0); - - outputTime = THCudaTensor_size(state, gradOutput, 1); - outputHeight = THCudaTensor_size(state, gradOutput, 2); - outputWidth = THCudaTensor_size(state, gradOutput, 3); - } - else - { - batchSize = THCudaTensor_size(state, input, 0); - inputSlices = THCudaTensor_size(state, input, 1); - - outputTime = THCudaTensor_size(state, gradOutput, 2); - outputHeight = THCudaTensor_size(state, gradOutput, 3); - outputWidth = THCudaTensor_size(state, gradOutput, 4); - } - - gradOutput = THCudaTensor_newContiguous(state, gradOutput); - - // Collapse batch and feature dimensions - THCDeviceTensor<float, 4> cudaGradInput; - THCDeviceTensor<float, 4> cudaGradOutput; - if (THCudaTensor_nDimension(state, input) == 4) - { - cudaGradInput = toDeviceTensor<float, 4>(state, gradInput); - cudaGradOutput = toDeviceTensor<float, 4>(state, gradOutput); - } - else - { - cudaGradInput = - toDeviceTensor<float, 5>(state, gradInput).downcastOuter<4>(); - cudaGradOutput = - toDeviceTensor<float, 5>(state, gradOutput).downcastOuter<4>(); - } - - THLongStorage *indicesSize = THLongStorage_newWithSize(4); - long indicesSizeRaw[4] = { batchSize * inputSlices, - outputTime, outputHeight, outputWidth }; - THLongStorage_rawCopy(indicesSize, indicesSizeRaw); - THCudaTensor *indices1 = THCudaTensor_newWithStorage( - state, THCudaTensor_storage(state, indices), - THCudaTensor_storageOffset(state, indices), indicesSize, NULL); - THLongStorage_free(indicesSize); - - THCDeviceTensor<float, 4> cudaIndices = - 
toDeviceTensor<float, 4>(state, indices1); - - int totalZ = outputTime * inputSlices * batchSize; - int offsetZ = 0; - dim3 block(32, 8); - - while (totalZ > 0) { - dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)), - THCCeilDiv(outputHeight, static_cast<int>(block.y)), - totalZ > 65535 ? 65535 : totalZ); - - cuda_VolumetricDilatedMaxPooling_updateGradInput<<<grid, block, - 0, THCState_getCurrentStream(state)>>>( - cudaGradOutput, - cudaIndices, - cudaGradInput, - dT, dH, dW, - padT, padH, padW, - dilationT, dilationH, dilationW, offsetZ); - THCudaCheck(cudaGetLastError()); - totalZ -= 65535; - offsetZ += 65535; - } - - // cleanup - THCudaTensor_free(state, gradOutput); - THCudaTensor_free(state, indices1); -} +#include "generic/VolumetricDilatedMaxPooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/VolumetricFullConvolution.cu b/lib/THCUNN/VolumetricFullConvolution.cu index ae414b7..93c4c0f 100644 --- a/lib/THCUNN/VolumetricFullConvolution.cu +++ b/lib/THCUNN/VolumetricFullConvolution.cu @@ -1,356 +1,8 @@ #include "THCUNN.h" #include "common.h" #include "vol2col.h" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" - -void THNN_CudaVolumetricFullConvolution_updateOutput( - THCState *state, - THCudaTensor *input, - THCudaTensor *output, - THCudaTensor *weight, - THCudaTensor *bias, - THCudaTensor *finput, - THCudaTensor *fgradInput, - int dT, int dW, int dH, - int padT, int padW, int padH, - int adjT, int adjW, int adjH) -{ - - THCudaTensor *columns = finput; - THCudaTensor *ones = fgradInput; - - int nInputPlane = THCudaTensor_size(state, weight, 0); - int nOutputPlane = THCudaTensor_size(state, weight, 1); - const int kT = (int)weight->size[2]; - const int kH = (int)weight->size[3]; - const int kW = (int)weight->size[4]; - - THCUNN_assertSameGPU(state, 6, input, output, weight, - bias, columns, ones); - THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected"); - - int batch = 
1; - if (input->nDimension == 4) { - THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); - // Force batch - batch = 0; - THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - } else { - THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); - } - - long inputWidth = input->size[4]; - long inputHeight = input->size[3]; - long inputDepth = input->size[2]; - long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; - long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; - long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT; - - // Batch size + input planes - long batchSize = input->size[0]; - - // Resize output - THCudaTensor_resize5d(state, output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); - - // Resize temporary columns - THCudaTensor_resize2d(state, columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); - - // Define a buffer of ones, for bias accumulation - // Note: this buffer can be shared with other modules, it only ever gets increased, - // and always contains ones. - if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { - // Resize plane and fill with ones... 
- THCudaTensor_resize3d(state, ones, outputDepth, outputHeight, outputWidth); - THCudaTensor_fill(state, ones, 1); - } - - // Helpers - THCudaTensor *input_n = THCudaTensor_new(state); - THCudaTensor *output_n = THCudaTensor_new(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per output: - THCudaTensor_select(state, input_n, input, 0, elt); - THCudaTensor_select(state, output_n, output, 0, elt); - - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; - long n = columns->size[1]; - long k = weight->size[0]; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - THCudaBlas_Sgemm( - state, - 'n', 't', - n, m, k, - 1, - THCudaTensor_data(state, input_n), n, - THCudaTensor_data(state, weight), m, - 0, - THCudaTensor_data(state, columns), n - ); - - // Unpack columns back into input: - col2vol( - THCState_getCurrentStream(state), - THCudaTensor_data(state, columns), - nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW, - 1,1,1, - THCudaTensor_data(state, output_n) - ); - - // Do Bias after: - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m_ = nOutputPlane; - long n_ = outputDepth * outputHeight * outputWidth; - long k_ = 1; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - THCudaBlas_Sgemm( - state, - 't', 'n', - n_, m_, k_, - 1, - THCudaTensor_data(state, ones), k_, - THCudaTensor_data(state, bias), k_, - 1, - THCudaTensor_data(state, output_n), n_ - ); - - } - - // Free - THCudaTensor_free(state, input_n); - THCudaTensor_free(state, output_n); - - // Resize output - if (batch == 0) { - THCudaTensor_resize4d(state, output, nOutputPlane, outputDepth, outputHeight, outputWidth); - 
THCudaTensor_resize4d(state, input, nInputPlane, inputDepth, inputHeight, inputWidth); - } -} - -void THNN_CudaVolumetricFullConvolution_updateGradInput( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradInput, - THCudaTensor *weight, - THCudaTensor *finput, - THCudaTensor *fgradInput, - int dT, int dW, int dH, - int padT, int padW, int padH, - int adjT, int adjW, int adjH) -{ - THCudaTensor *gradColumns = finput; - - int nInputPlane = THCudaTensor_size(state, weight, 0); - int nOutputPlane = THCudaTensor_size(state, weight, 1); - const int kT = (int)weight->size[2]; - const int kH = (int)weight->size[3]; - const int kW = (int)weight->size[4]; - - THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, - gradColumns, gradInput); - THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected"); - - int batch = 1; - if (input->nDimension == 4) { - // Force batch - batch = 0; - THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THCudaTensor_resize5d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); - } - - long inputWidth = input->size[4]; - long inputHeight = input->size[3]; - long inputDepth = input->size[2]; - long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; - long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; - long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT; - - // Batch size + input planes - long batchSize = input->size[0]; - - // Resize output - THCudaTensor_resize5d(state, gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); - - // Resize temporary columns - THCudaTensor_resize2d(state, gradColumns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); - - // Helpers - THCudaTensor *gradInput_n = THCudaTensor_new(state); - THCudaTensor *gradOutput_n = THCudaTensor_new(state); - - // For each elt in 
batch, do: - for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per sample: - THCudaTensor_select(state, gradInput_n, gradInput, 0, elt); - THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); - - // Extract columns: - vol2col( - THCState_getCurrentStream(state), - THCudaTensor_data(state, gradOutput_n), - nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW, - 1,1,1, - THCudaTensor_data(state, gradColumns) - ); - - - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m = weight->size[0]; - long n = gradColumns->size[1]; - long k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - THCudaBlas_Sgemm( - state, - 'n', 'n', - n, m, k, - 1, - THCudaTensor_data(state, gradColumns), n, - THCudaTensor_data(state, weight), k, - 0, - THCudaTensor_data(state, gradInput_n), n - ); - } - - - // Free - THCudaTensor_free(state, gradInput_n); - THCudaTensor_free(state, gradOutput_n); - - // Resize output - if (batch == 0) { - THCudaTensor_resize4d(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); - THCudaTensor_resize4d(state, input, nInputPlane, inputDepth, inputHeight, inputWidth); - THCudaTensor_resize4d(state, gradInput, nInputPlane, inputDepth, inputHeight, inputWidth); - } -} - - -void THNN_CudaVolumetricFullConvolution_accGradParameters( - THCState *state, - THCudaTensor *input, - THCudaTensor *gradOutput, - THCudaTensor *gradWeight, - THCudaTensor *gradBias, - THCudaTensor *finput, - THCudaTensor *fgradInput, - int dT, int dW, int dH, - int padT, int padW, int padH, - int adjT, int adjW, int adjH, - float scale) -{ - THCudaTensor *columns = finput; - THCudaTensor *ones = fgradInput; - - int nInputPlane = THCudaTensor_size(state, gradWeight, 0); - int nOutputPlane = THCudaTensor_size(state, gradWeight, 1); - const int 
kT = (int)gradWeight->size[2]; - const int kH = (int)gradWeight->size[3]; - const int kW = (int)gradWeight->size[4]; - - THCUNN_assertSameGPU(state, 6, input, gradOutput, gradWeight, - gradBias, columns, ones); - THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected"); - - int batch = 1; - if (input->nDimension == 4) { - // Force batch - batch = 0; - THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); - THCudaTensor_resize5d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); - } - - long inputWidth = input->size[4]; - long inputHeight = input->size[3]; - long inputDepth = input->size[2]; - long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; - long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; - long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT; - - // Batch size + input planes - long batchSize = input->size[0]; - - // Define a buffer of ones, for bias accumulation - if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { - // Resize plane and fill with ones... 
- THCudaTensor_resize3d(state, ones, outputDepth, outputHeight, outputWidth); - THCudaTensor_fill(state, ones, 1); - } - - // Resize temporary columns - THCudaTensor_resize2d(state, columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); - - // Helpers - THCudaTensor *input_n = THCudaTensor_new(state); - THCudaTensor *gradOutput_n = THCudaTensor_new(state); - - // For each elt in batch, do: - for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per output: - THCudaTensor_select(state, input_n, input, 0, elt); - THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); - - // Extract columns: - vol2col( - THCState_getCurrentStream(state), - THCudaTensor_data(state, gradOutput_n), - nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW, - 1,1,1, - THCudaTensor_data(state, columns) - ); - - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long n = columns->size[0]; // nOutputPlane * kt * kh * kw - long m = input_n->size[0]; // nInputPlane - long k = columns->size[1]; // inputHeight * inputWidth - - // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) - THCudaBlas_Sgemm( - state, - 't', 'n', - n, m, k, - scale, - THCudaTensor_data(state, columns), k, - THCudaTensor_data(state, input_n), k, - 1, - THCudaTensor_data(state, gradWeight), n - ); - - // Do Bias: - // M,N,K are dims of matrix A and B - // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) - long m_ = nOutputPlane; - long k_ = outputDepth * outputHeight * outputWidth; - - // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) - THCudaBlas_Sgemv( - state, - 't', - k_, m_, - scale, - THCudaTensor_data(state, gradOutput_n), k_, - THCudaTensor_data(state, ones), 1, - 1, - THCudaTensor_data(state, gradBias), 1 - ); - } - - // Free - THCudaTensor_free(state, input_n); - THCudaTensor_free(state, gradOutput_n); - - // 
Resize - if (batch == 0) { - THCudaTensor_resize4d(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); - THCudaTensor_resize4d(state, input, nInputPlane, inputDepth, inputHeight, inputWidth); - } -} +#include "generic/VolumetricFullConvolution.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/VolumetricMaxPooling.cu b/lib/THCUNN/VolumetricMaxPooling.cu index 892bef9..2f7de7b 100644 --- a/lib/THCUNN/VolumetricMaxPooling.cu +++ b/lib/THCUNN/VolumetricMaxPooling.cu @@ -6,27 +6,5 @@ #include <cfloat> -void THNN_CudaVolumetricMaxPooling_updateOutput( - THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices, - int kT, int kW, int kH, - int dT, int dW, int dH, - int padT, int padW, int padH, - bool ceilMode) -{ - THNN_CudaVolumetricDilatedMaxPooling_updateOutput( - state, input, output, indices, - kT, kW, kH, dT, dW, dH, padT, padW, padH, 1, 1, 1, ceilMode); - -} - -void THNN_CudaVolumetricMaxPooling_updateGradInput( - THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, - THCudaTensor *indices, - int dT, int dW, int dH, - int padT, int padW, int padH) -{ - THNN_CudaVolumetricDilatedMaxPooling_updateGradInput( - state, input, gradOutput, gradInput, indices, - dT, dW, dH, padT, padW, padH, 1, 1, 1); - -} +#include "generic/VolumetricMaxPooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/VolumetricMaxUnpooling.cu b/lib/THCUNN/VolumetricMaxUnpooling.cu index b583211..5d2b8b4 100644 --- a/lib/THCUNN/VolumetricMaxUnpooling.cu +++ b/lib/THCUNN/VolumetricMaxUnpooling.cu @@ -3,13 +3,16 @@ #include "THCDeviceTensor.cuh" #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" #include <cfloat> +template <typename Dtype> __global__ void cuda_VolumetricMaxUnpooling_updateOutput( - THCDeviceTensor<float, 4> input, - THCDeviceTensor<float, 4> indices, - THCDeviceTensor<float, 4> output, + 
THCDeviceTensor<Dtype, 4> input, + THCDeviceTensor<THCIndex_t, 4> indices, + THCDeviceTensor<Dtype, 4> output, int dT, int dH, int dW, int padT, int padH, int padW, int offsetZ) { @@ -24,9 +27,9 @@ __global__ void cuda_VolumetricMaxUnpooling_updateOutput( long start_h = iRow * dH - padH; long start_w = iColumn * dW - padW; - float val = input[slice][iFrame][iRow][iColumn]; - - float *idx = &indices[slice][iFrame][iRow][iColumn]; + Dtype val = input[slice][iFrame][iRow][iColumn]; + + THCIndex_t *idx = &indices[slice][iFrame][iRow][iColumn]; long maxz = ((unsigned char*)(idx))[0]; long maxy = ((unsigned char*)(idx))[1]; long maxx = ((unsigned char*)(idx))[2]; @@ -34,104 +37,11 @@ __global__ void cuda_VolumetricMaxUnpooling_updateOutput( } } -void THNN_CudaVolumetricMaxUnpooling_updateOutput( - THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices, - int outputTime, int outputWidth, int outputHeight, - int dT, int dW, int dH, - int padT, int padW, int padH) -{ - int batchSize; - int inputSlices; - int inputTime; - int inputHeight; - int inputWidth; - - THCUNN_assertSameGPU(state, 3, input, indices, output); - - if (THCudaTensor_nDimension(state, input) == 4) - { - /* sizes */ - batchSize = 1; - inputSlices = THCudaTensor_size(state, input, 0); - inputTime = THCudaTensor_size(state, input, 1); - inputHeight = THCudaTensor_size(state, input, 2); - inputWidth = THCudaTensor_size(state, input, 3); - } - else if (THCudaTensor_nDimension(state, input) == 5) - { - /* sizes */ - batchSize = THCudaTensor_size(state, input, 0); - inputSlices = THCudaTensor_size(state, input, 1); - inputTime = THCudaTensor_size(state, input, 2); - inputHeight = THCudaTensor_size(state, input, 3); - inputWidth = THCudaTensor_size(state, input, 4); - } - else - { - THArgCheck(false, 2, "4D or 5D tensor expected"); - } - - if (input->nDimension == 4) /* 4D */ - { - /* resize output */ - THCudaTensor_resize4d(state, output, inputSlices, - outputTime, outputHeight, 
outputWidth); - } - else - { /* 5D */ - THCudaTensor_resize5d(state, output, batchSize, inputSlices, - outputTime, outputHeight, outputWidth); - } - - input = THCudaTensor_newContiguous(state, input); - indices = THCudaTensor_newContiguous(state, indices); - THCudaTensor_zero(state, output); - - // Collapse batch and feature dimensions - THCDeviceTensor<float, 4> cudaInput; - THCDeviceTensor<float, 4> cudaOutput; - THCDeviceTensor<float, 4> cudaIndices; - - if (THCudaTensor_nDimension(state, input) == 4) - { - cudaInput = toDeviceTensor<float, 4>(state, input); - cudaOutput = toDeviceTensor<float, 4>(state, output); - cudaIndices = toDeviceTensor<float, 4>(state, indices); - } - else - { - cudaInput = toDeviceTensor<float, 5>(state, input).downcastOuter<4>(); - cudaOutput = toDeviceTensor<float, 5>(state, output).downcastOuter<4>(); - cudaIndices = toDeviceTensor<float, 5>(state, indices).downcastOuter<4>(); - } - - int totalZ = inputTime * inputSlices * batchSize; - int offsetZ = 0; - dim3 block(32, 8); - - while (totalZ > 0) { - dim3 grid(THCCeilDiv(inputWidth, static_cast<int>(block.x)), - THCCeilDiv(inputHeight, static_cast<int>(block.y)), - totalZ > 65535 ? 
65535 : totalZ); - - cuda_VolumetricMaxUnpooling_updateOutput<<<grid, block, - 0, THCState_getCurrentStream(state)>>>( - cudaInput, cudaIndices, cudaOutput, - dT, dH, dW, - padT, padH, padW, offsetZ); - THCudaCheck(cudaGetLastError()); - totalZ -= 65535; - offsetZ += 65535; - } - - THCudaTensor_free(state, input); - THCudaTensor_free(state, indices); -} - +template <typename Dtype> __global__ void cuda_VolumetricMaxUnpooling_updateGradInput( - THCDeviceTensor<float, 4> gradOutput, - THCDeviceTensor<float, 4> indices, - THCDeviceTensor<float, 4> gradInput, + THCDeviceTensor<Dtype, 4> gradOutput, + THCDeviceTensor<THCIndex_t, 4> indices, + THCDeviceTensor<Dtype, 4> gradInput, int dT, int dH, int dW, int padT, int padH, int padW, int offsetZ) { @@ -142,105 +52,21 @@ __global__ void cuda_VolumetricMaxUnpooling_updateGradInput( if (iRow < gradInput.getSize(2) && iColumn < gradInput.getSize(3)) { - + long start_t = iFrame * dT - padT; long start_h = iRow * dH - padH; long start_w = iColumn * dW - padW; - float *idx = &indices[slice][iFrame][iRow][iColumn]; + THCIndex_t *idx = &indices[slice][iFrame][iRow][iColumn]; long maxz = ((unsigned char*)(idx))[0]; long maxy = ((unsigned char*)(idx))[1]; long maxx = ((unsigned char*)(idx))[2]; - float grad_val = gradOutput[slice][start_t + maxz][start_h + maxy][start_w + maxx]; + Dtype grad_val = gradOutput[slice][start_t + maxz][start_h + maxy][start_w + maxx]; gradInput[slice][iFrame][iRow][iColumn] = grad_val; } } -void THNN_CudaVolumetricMaxUnpooling_updateGradInput( - THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, - THCudaTensor *indices, - int outputTime, int outputWidth, int outputHeight, - int dT, int dW, int dH, - int padT, int padW, int padH) -{ - - int batchSize; - int inputSlices; - int inputTime; - int inputHeight; - int inputWidth; - - THCUNN_assertSameGPU(state, 4, input, indices, gradOutput, gradInput); - - if (THCudaTensor_nDimension(state, input) == 4) /* 4D */ - { - 
batchSize = 1; - inputSlices = THCudaTensor_size(state, input, 0); - inputTime = THCudaTensor_size(state, input, 1); - inputHeight = THCudaTensor_size(state, input, 2); - inputWidth = THCudaTensor_size(state, input, 3); - } - else - { - batchSize = THCudaTensor_size(state, input, 0); - inputSlices = THCudaTensor_size(state, input, 1); - inputTime = THCudaTensor_size(state, input, 2); - inputHeight = THCudaTensor_size(state, input, 3); - inputWidth = THCudaTensor_size(state, input, 4); - } - - input = THCudaTensor_newContiguous(state, input); - indices = THCudaTensor_newContiguous(state, indices); - gradOutput = THCudaTensor_newContiguous(state, gradOutput); - THCudaTensor_resizeAs(state, gradInput, input); - THCudaTensor_zero(state, gradInput); - - // Collapse batch and feature dimensions - THCDeviceTensor<float, 4> cudaGradInput; - THCDeviceTensor<float, 4> cudaGradOutput; - THCDeviceTensor<float, 4> cudaIndices; - - if (THCudaTensor_nDimension(state, input) == 4) - { - cudaGradInput = toDeviceTensor<float, 4>(state, gradInput); - cudaGradOutput = toDeviceTensor<float, 4>(state, gradOutput); - cudaIndices = toDeviceTensor<float, 4>(state, indices); - } - else - { - cudaGradInput = - toDeviceTensor<float, 5>(state, gradInput).downcastOuter<4>(); - cudaGradOutput = - toDeviceTensor<float, 5>(state, gradOutput).downcastOuter<4>(); - cudaIndices = - toDeviceTensor<float, 5>(state, indices).downcastOuter<4>(); - } - - int totalZ = inputTime * inputSlices * batchSize; - int offsetZ = 0; - dim3 block(32, 8); - - while (totalZ > 0) { - dim3 grid(THCCeilDiv(inputWidth, static_cast<int>(block.x)), - THCCeilDiv(inputHeight, static_cast<int>(block.y)), - totalZ > 65535 ? 
65535 : totalZ); - - cuda_VolumetricMaxUnpooling_updateGradInput<<<grid, block, - 0, THCState_getCurrentStream(state)>>>( - cudaGradOutput, - cudaIndices, - cudaGradInput, - dT, dH, dW, - padT, padH, padW, offsetZ); - THCudaCheck(cudaGetLastError()); - totalZ -= 65535; - offsetZ += 65535; - } - - // cleanup - THCudaTensor_free(state, input); - THCudaTensor_free(state, gradOutput); - THCudaTensor_free(state, indices); -}
\ No newline at end of file +#include "generic/VolumetricMaxUnpooling.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/VolumetricReplicationPadding.cu b/lib/THCUNN/VolumetricReplicationPadding.cu index cca51d5..39c37b4 100644 --- a/lib/THCUNN/VolumetricReplicationPadding.cu +++ b/lib/THCUNN/VolumetricReplicationPadding.cu @@ -4,10 +4,14 @@ #include "THCDeviceTensorUtils.cuh" #include "THCDeviceUtils.cuh" #include "THCReduceApplyUtils.cuh" +#include "THCHalf.h" +#include "THCHalfAutoNumerics.cuh" +#include "THCAtomics.cuh" +template <typename Dtype> __global__ void VolumetricReplicationPadding_updateOutput( - THCDeviceTensor<float, 5> input, - THCDeviceTensor<float, 5> output, + THCDeviceTensor<Dtype, 5> input, + THCDeviceTensor<Dtype, 5> output, int pfront, int pback, int ptop, int pbottom, int pleft, int pright) { int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; @@ -35,76 +39,15 @@ __global__ void VolumetricReplicationPadding_updateOutput( int inputPointZ = min(max(pfront, outputPointZ), input.getSize(2) + pfront - 1) - oStartZ + iStartZ; - float valueToCopy = + Dtype valueToCopy = input[batch][plane][inputPointZ][inputPointY][inputPointX]; output[batch][plane][outputPointZ][outputPointY][outputPointX] = valueToCopy; } -void THNN_CudaVolumetricReplicationPadding_updateOutput(THCState *state, - THCudaTensor *input, - THCudaTensor *output, - int pleft, int pright, - int ptop, int pbottom, - int pfront, int pback) { - THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input), 2, - "input tensor must fit into 32-bit index math"); - - int planeDim = 0; - int dimd = 1; - int dimh = 2; - int dimw = 3; - int numBatch = 1; - - int numInputDims = THCudaTensor_nDimension(state, input); - THArgCheck(numInputDims == 4 || numInputDims == 5, 2, - "input must be 4 or 5-dimensional"); - - if (numInputDims == 5) { - numBatch = THCudaTensor_size(state, input, 0); - planeDim++; - dimd++; - dimh++; - dimw++; - } - - int numPlanes = 
THCudaTensor_size(state, input, planeDim); - int inputD = THCudaTensor_size(state, input, dimd); - int inputH = THCudaTensor_size(state, input, dimh); - int inputW = THCudaTensor_size(state, input, dimw); - int outputD = inputD + pfront + pback; - int outputH = inputH + ptop + pbottom; - int outputW = inputW + pleft + pright; - - THCDeviceTensor<float, 5> devInput; - THCDeviceTensor<float, 5> devOutput; - - if (numInputDims == 4) { - THCudaTensor_resize4d(state, output, numPlanes, outputD, outputH, outputW); - - devInput = toDeviceTensor<float, 4>(state, input).upcastOuter<5>(); - devOutput = toDeviceTensor<float, 4>(state, output).upcastOuter<5>(); - } else { - THCudaTensor_resize5d(state, output, numBatch, numPlanes, outputD, outputH, - outputW); - - devInput = toDeviceTensor<float, 5>(state, input); - devOutput = toDeviceTensor<float, 5>(state, output); - } - - int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3) * - devOutput.getSize(4); - dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), - devOutput.getSize(1), - devOutput.getSize(0)); - dim3 blockSize(outputPlaneSize > 256 ? 
256 : outputPlaneSize); - - VolumetricReplicationPadding_updateOutput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>( - devInput, devOutput, pfront, pback, ptop, pbottom, pleft, pright); -} - +template <typename Dtype> __global__ void VolumetricReplicationPadding_updateGradInput( - THCDeviceTensor<float, 5> gradInput, - THCDeviceTensor<float, 5> gradOutput, + THCDeviceTensor<Dtype, 5> gradInput, + THCDeviceTensor<Dtype, 5> gradOutput, int pfront, int pback, int ptop, int pbottom, int pleft, int pright) { int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; int plane = blockIdx.y; @@ -134,56 +77,12 @@ __global__ void VolumetricReplicationPadding_updateGradInput( int inputPointZ = min(max(pfront, outputPointZ), gradInput.getSize(2) + pfront - 1) - oStartZ + iStartZ; - float valueToCopy = + Dtype valueToCopy = gradOutput[batch][plane][outputPointZ][outputPointY][outputPointX]; atomicAdd(&gradInput[batch][plane][inputPointZ][inputPointY][inputPointX], valueToCopy); } -void THNN_CudaVolumetricReplicationPadding_updateGradInput( - THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, - THCudaTensor *gradInput, int pleft, int pright, int ptop, int pbottom, - int pfront, int pback) { - THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input), 2, - "input tensor must fit into 32-bit index math"); - THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, gradOutput), - 3, "output gradient tensor must fit into 32-bit index math"); - - int planeDim = 0; - int dimd = 1; - int dimh = 2; - int dimw = 3; - - int numInputDims = THCudaTensor_nDimension(state, input); - if (numInputDims == 5) { - planeDim++; - dimd++; - dimh++; - dimw++; - } - - THCudaTensor_resizeAs(state, gradInput, input); - THCudaTensor_zero(state, gradInput); - THCDeviceTensor<float, 5> devGradInput; - THCDeviceTensor<float, 5> devGradOutput; - - if (numInputDims == 4) { - devGradInput = toDeviceTensor<float, 4>(state, gradInput).upcastOuter<5>(); - 
devGradOutput = - toDeviceTensor<float, 4>(state, gradOutput).upcastOuter<5>(); - } else { - devGradInput = toDeviceTensor<float, 5>(state, gradInput); - devGradOutput = toDeviceTensor<float, 5>(state, gradOutput); - } - - int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3) * - devGradOutput.getSize(4); - dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), - devGradOutput.getSize(1), - devGradOutput.getSize(0)); - dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); - - VolumetricReplicationPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>( - devGradInput, devGradOutput, pfront, pback, ptop, pbottom, pleft, pright); -} +#include "generic/VolumetricReplicationPadding.cu" +#include "THCGenerateFloatTypes.h" diff --git a/lib/THCUNN/common.h b/lib/THCUNN/common.h index e097535..12fbdd1 100644 --- a/lib/THCUNN/common.h +++ b/lib/THCUNN/common.h @@ -8,6 +8,10 @@ #define THCUNN_assertSameGPU(...) THAssertMsg(THCudaTensor_checkGPU(__VA_ARGS__), \ "Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.") +// _generic can be removed once everything is genericized +#define THCUNN_assertSameGPU_generic(...) THAssertMsg(THCTensor_(checkGPU)(__VA_ARGS__), \ + "Some of weight/gradient/input tensors are located on different GPUs. 
Please move them to a single one.") + // Use 1024 threads per block, which requires cuda sm_2x or above const int CUDA_NUM_THREADS = 1024; @@ -17,4 +21,12 @@ inline int GET_BLOCKS(const int N) return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; } +#define THCUNN_resizeAs_indices(STATE, I1, I2) \ + THLongStorage *size2 = THCTensor_(newSizeOf)(STATE, I2); \ + if (!THCudaLongTensor_isSize(STATE, I1, size2)) \ + { \ + THCudaLongTensor_resize(STATE, I1, size2, NULL); \ + } \ + THLongStorage_free(size2); + #endif diff --git a/lib/THCUNN/generic/Abs.cu b/lib/THCUNN/generic/Abs.cu new file mode 100644 index 0000000..dc1e9a2 --- /dev/null +++ b/lib/THCUNN/generic/Abs.cu @@ -0,0 +1,28 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/Abs.cu" +#else + +#include "../common.h" + +void THNN_(Abs_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output) +{ + THCUNN_assertSameGPU_generic(state, 2, input, output); + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, absupdateOutput_functor<real>()); +} + +void THNN_(Abs_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput) +{ + THCUNN_assertSameGPU_generic(state, 3, input, gradOutput, gradInput); + THCTensor_(resizeAs)(state, gradInput, input); + THC_pointwiseApply3(state, gradInput, input, gradOutput, absupdateGradInput_functor<real>()); +} + +#endif diff --git a/lib/THCUNN/generic/AbsCriterion.cu b/lib/THCUNN/generic/AbsCriterion.cu new file mode 100644 index 0000000..904fbba --- /dev/null +++ b/lib/THCUNN/generic/AbsCriterion.cu @@ -0,0 +1,59 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/AbsCriterion.cu" +#else + +void THNN_(AbsCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + bool sizeAverage) +{ + THCUNN_assertSameGPU_generic(state, 2, input, target); + + long size = THCTensor_(nElement)(state, input); + + input = 
THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + thrust::device_ptr<real> input_data(THCTensor_(data)(state, input)); + thrust::device_ptr<real> target_data(THCTensor_(data)(state, target)); + accreal sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal)0, thrust::plus<accreal>(), abs_functor<real, accreal>()); + + if (sizeAverage) + sum /= size; + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + + THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum)); +} + +void THNN_(AbsCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradInput, + bool sizeAverage) +{ + THCUNN_assertSameGPU_generic(state, 3, input, target, gradInput); + + long size = THCTensor_(nElement)(state, input); + real norm = ScalarConvert<double, real>::to(sizeAverage ? 1./size : 1.); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + THCTensor_(resizeAs)(state, gradInput, input); + + thrust::device_ptr<real> input_data(THCTensor_(data)(state, input)); + thrust::device_ptr<real> target_data(THCTensor_(data)(state, target)); + thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput)); + + thrust::transform(input_data, input_data+size, target_data, gradInput_data, abs_updateGradInput_functor<real>(norm)); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); +} + +#endif diff --git a/lib/THCUNN/generic/BCECriterion.cu b/lib/THCUNN/generic/BCECriterion.cu new file mode 100644 index 0000000..a6f76d6 --- /dev/null +++ b/lib/THCUNN/generic/BCECriterion.cu @@ -0,0 +1,99 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/BCECriterion.cu" +#else + +void THNN_(BCECriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + bool sizeAverage, + THCTensor *weights) +{ + 
THCUNN_assertSameGPU_generic(state, 3, input, target, weights); + + long size = THCTensor_(nElement)(state, input); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + thrust::device_ptr<real> input_data(THCTensor_(data)(state, input)); + thrust::device_ptr<real> target_data(THCTensor_(data)(state, target)); + + accreal sum; + if (weights) { + weights = THCTensor_(newContiguous)(state, weights); + thrust::device_ptr<real> weights_data(THCTensor_(data)(state, weights)); + sum = thrust::transform_reduce( + thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data, weights_data)), + thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size, weights_data+size)), + bce_functor_weights<real, accreal>(), + (accreal) 0, + thrust::plus<accreal>() + ); + THCTensor_(free)(state, weights); + } else { + sum = thrust::transform_reduce( + thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data)), + thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size)), + bce_functor<real, accreal>(), + (accreal) 0, + thrust::plus<accreal>() + ); + } + + if (sizeAverage) + sum /= size; + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + + THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum)); +} + +void THNN_(BCECriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradInput, + bool sizeAverage, + THCTensor *weights) +{ + THCUNN_assertSameGPU_generic(state, 4, input, target, gradInput, weights); + + long size = THCTensor_(nElement)(state, input); + real norm = ScalarConvert<accreal, real>::to(sizeAverage ? 
accreal(1)/size : accreal(1)); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + THCTensor_(resizeAs)(state, gradInput, input); + + thrust::device_ptr<real> input_data(THCTensor_(data)(state, input)); + thrust::device_ptr<real> target_data(THCTensor_(data)(state, target)); + thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput)); + + if (weights) { + weights = THCTensor_(newContiguous)(state, weights); + thrust::device_ptr<real> weights_data(THCTensor_(data)(state, weights)); + thrust::transform( + thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data, weights_data)), + thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size, weights_data+size)), + gradInput_data, + bce_updateGradInput_functor_weights<real, accreal>(norm) + ); + THCTensor_(free)(state, weights); + } else { + thrust::transform( + thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data)), + thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size)), + gradInput_data, + bce_updateGradInput_functor<real, accreal>(norm) + ); + } + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); +} + +#endif diff --git a/lib/THCUNN/generic/BatchNormalization.cu b/lib/THCUNN/generic/BatchNormalization.cu new file mode 100644 index 0000000..80bb937 --- /dev/null +++ b/lib/THCUNN/generic/BatchNormalization.cu @@ -0,0 +1,97 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/BatchNormalization.cu" +#else + +#define DeviceTensor3 THCDeviceTensor<real, 3> +#define DeviceTensor1 THCDeviceTensor<real, 1> + +template <int Dim> +static THCDeviceTensor<real, Dim> devicetensor(THCState *state, THCTensor *t) { + if (!t) { + return THCDeviceTensor<real, Dim>(); + } + + int inDim = THCTensor_(nDimension)(state, t); + if (inDim == Dim) { + return toDeviceTensor<real, Dim>(state, t); + } + + // View in which the last dimensions are collapsed or expanded 
as needed + THAssert(THCTensor_(isContiguous)(state, t)); + int size[Dim]; + for (int i = 0; i < Dim || i < inDim; ++i) { + if (i < Dim && i < inDim) { + size[i] = t->size[i]; + } else if (i < Dim) { + size[i] = 1; + } else { + size[Dim - 1] *= t->size[i]; + } + } + return THCDeviceTensor<real, Dim>(THCTensor_(data)(state, t), size); +} + +void THNN_(BatchNormalization_updateOutput)( + THCState *state, THCTensor *input_, THCTensor *output_, + THCTensor *weight_, THCTensor *bias_, THCTensor *runningMean_, + THCTensor *runningVar_, THCTensor *saveMean_, THCTensor *saveStd_, + bool train, double momentum, double eps) { + + DeviceTensor3 input = devicetensor<3>(state, input_); + DeviceTensor3 output = devicetensor<3>(state, output_); + DeviceTensor1 weight = devicetensor<1>(state, weight_); + DeviceTensor1 bias = devicetensor<1>(state, bias_); + DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_); + DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_); + DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_); + DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_); + + cudaStream_t s = THCState_getCurrentStream(state); + cudaDeviceProp *prop = THCState_getCurrentDeviceProperties(state); + + if (!train) { + dim3 blocks(input.getSize(1)); + dim3 threads(getNumThreads(input.getSize(2))); + BatchNormalizationUpdateOutputInference_kernel<real, accreal, DeviceTensor1, DeviceTensor3> <<<blocks, threads, 0, s>>>( + input, output, runningMean, runningVar, weight, bias, eps); + } else { + dim3 blocks(input.getSize(1)); + dim3 threads(getNumThreads(input.getSize(2))); + BatchNormalizationUpdateOutput_kernel<real, accreal, DeviceTensor1, DeviceTensor3> <<<blocks, threads, 0, s>>>( + input, output, weight, bias, eps, momentum, runningMean, runningVar, + saveMean, saveStd); + } + THCudaCheck(cudaGetLastError()); +} + +void THNN_(BatchNormalization_backward)( + THCState *state, THCTensor *input_, THCTensor *gradOutput_, + THCTensor *gradInput_, THCTensor 
*gradWeight_, THCTensor *gradBias_, + THCTensor *weight_, THCTensor *runningMean_, THCTensor *runningVar_, + THCTensor *saveMean_, THCTensor *saveStd_, bool train, float scale, double eps) { + + DeviceTensor3 input = devicetensor<3>(state, input_); + DeviceTensor3 gradOutput = devicetensor<3>(state, gradOutput_); + DeviceTensor3 gradInput = devicetensor<3>(state, gradInput_); + DeviceTensor1 gradWeight = devicetensor<1>(state, gradWeight_); + DeviceTensor1 gradBias = devicetensor<1>(state, gradBias_); + DeviceTensor1 weight = devicetensor<1>(state, weight_); + DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_); + DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_); + DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_); + DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_); + + cudaStream_t s = THCState_getCurrentStream(state); + + dim3 blocks(gradOutput.getSize(1)); + dim3 threads(getNumThreads(gradOutput.getSize(2))); + BatchNormalizationBackward_kernel<real, accreal, DeviceTensor1, DeviceTensor3> <<<blocks, threads, 0, s>>>( + input, gradOutput, gradInput, gradWeight, gradBias, weight, runningMean, runningVar, + saveMean, saveStd, train, scale, eps); + THCudaCheck(cudaGetLastError()); +} + +#undef DeviceTensor3 +#undef DeviceTensor1 + +#endif diff --git a/lib/THCUNN/generic/ClassNLLCriterion.cu b/lib/THCUNN/generic/ClassNLLCriterion.cu new file mode 100644 index 0000000..62925a8 --- /dev/null +++ b/lib/THCUNN/generic/ClassNLLCriterion.cu @@ -0,0 +1,156 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/ClassNLLCriterion.cu" +#else + +void THNN_(ClassNLLCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + bool sizeAverage, + THCTensor *weights, + THCTensor *total_weight) { + if (THCIndexTensor_(nDimension)(state, target) > 1) { + THError("multi-target not supported"); + } + + int n_dims = THCTensor_(nDimension)(state, input); + int n_classes = 
THCTensor_(size)(state, input, n_dims - 1); + + if (weights) { + THCUNN_assertSameGPU_generic( + state, 5, input, target, weights, output, total_weight + ); + } else { + THCUNN_assertSameGPU_generic( + state, 4, input, target, output, total_weight + ); + } + + if (THCTensor_(nDimension)(state, input) > 2) { + THArgCheck(0, 2, "vector or matrix expected"); + } + if (weights && THCTensor_(nElement)(state, weights) != n_classes) { + THError("weight tensor should be defined either for all or no classes"); + } + + input = THCTensor_(newContiguous)(state, input); + weights = weights ? THCTensor_(newContiguous)(state, weights) : NULL; + target = THCIndexTensor_(newContiguous)(state, target); + + real *input_data = THCTensor_(data)(state, input); + real *weights_data = weights ? THCTensor_(data)(state, weights) : NULL; + THCIndex_t *target_data = THCIndexTensor_(data)(state, target); + real *output_data = THCTensor_(data)(state, output); + real *total_weight_data = THCTensor_(data)(state, total_weight); + + if (THCTensor_(nDimension)(state, input) == 1) { + cunn_ClassNLLCriterion_updateOutput_kernel1<real> + <<<1, 1, 0, THCState_getCurrentStream(state)>>>( + output_data, + total_weight_data, + input_data, + target_data, + weights_data, + sizeAverage, + n_classes + ); + + } else if (THCTensor_(nDimension)(state, input) == 2) { + cunn_ClassNLLCriterion_updateOutput_kernel<real, accreal> + <<<1, NTHREADS, 0, THCState_getCurrentStream(state)>>>( + output_data, + total_weight_data, + input_data, + target_data, + weights_data, + sizeAverage, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + n_classes + ); + } + THCudaCheck(cudaGetLastError()); + + if (weights) { + THCTensor_(free)(state, weights); + } + THCIndexTensor_(free)(state, target); + THCTensor_(free)(state, input); +} + +void THNN_(ClassNLLCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *gradInput, + bool sizeAverage, + THCTensor 
*weights, + THCTensor *total_weight) { + if (THCIndexTensor_(nDimension)(state, target) > 1) { + THError("multi-target not supported"); + } + + int n_dims = THCTensor_(nDimension)(state, input); + int n_classes = THCTensor_(size)(state, input, n_dims - 1); + + THArgCheck(THCTensor_(isContiguous)(state, gradInput), 4, "gradInput must be contiguous"); + + if (weights) { + THCUNN_assertSameGPU( + state, 5, weights, input, target, gradInput, total_weight + ); + } + else { + THCUNN_assertSameGPU( + state, 4, input, target, gradInput, total_weight + ); + } + + if (THCTensor_(nDimension)(state, input) > 2) { + THArgCheck(0, 2, "vector or matrix expected"); + } + if (weights && THCTensor_(nElement)(state, weights) != n_classes) { + THError("weight tensor should be defined either for all or no classes"); + } + + weights = weights ? THCTensor_(newContiguous)(state, weights) : NULL; + target = THCIndexTensor_(newContiguous)(state, target); + + real *weights_data = weights ? THCTensor_(data)(state, weights) : NULL; + real *gradInput_data = THCTensor_(data)(state, gradInput); + THCIndex_t *target_data = THCIndexTensor_(data)(state, target); + real *total_weight_data = THCTensor_(data)(state, total_weight); + + if (THCTensor_(nDimension)(state, input) == 1) { + cunn_ClassNLLCriterion_updateGradInput_kernel1<real> + <<<1, 1, 0, THCState_getCurrentStream(state)>>>( + gradInput_data, + weights_data, + target_data, + total_weight_data, + sizeAverage, + n_classes + ); + } else { + cunn_ClassNLLCriterion_updateGradInput_kernel<real> + <<<1, NTHREADS, 0, THCState_getCurrentStream(state)>>>( + gradInput_data, + target_data, + weights_data, + total_weight_data, + sizeAverage, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + n_classes + ); + } + THCudaCheck(cudaGetLastError()); + + if (weights) { + THCTensor_(free)(state, weights); + } + THCIndexTensor_(free)(state, target); +} + +#endif diff --git a/lib/THCUNN/generic/DistKLDivCriterion.cu 
b/lib/THCUNN/generic/DistKLDivCriterion.cu new file mode 100644 index 0000000..e06fe9e --- /dev/null +++ b/lib/THCUNN/generic/DistKLDivCriterion.cu @@ -0,0 +1,67 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/DistKLDivCriterion.cu" +#else + +void THNN_(DistKLDivCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + bool sizeAverage) +{ + THCUNN_assertSameGPU_generic(state, 2, input, target); + + THArgCheck(THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2, + "input and target need to have the same number of elements"); + + accreal sum; + + long size = THCTensor_(nElement)(state, input); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + thrust::device_ptr<real> input_data(THCTensor_(data)(state, input)); + thrust::device_ptr<real> target_data(THCTensor_(data)(state, target)); + sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal) 0, thrust::plus<accreal>(), kl_functor<real, accreal>()); + + if (sizeAverage) + sum /= size; + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + + THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum)); +} + +void THNN_(DistKLDivCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradInput, + bool sizeAverage) +{ + THCUNN_assertSameGPU_generic(state, 3, input, target, gradInput); + + THArgCheck(THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2, + "input and target need to have the same number of elements"); + + long size = THCTensor_(nElement)(state, input); + real norm = (sizeAverage ? 
ScalarConvert<accreal, real>::to(accreal(1)/size) : ScalarConvert<int, real>::to(1)); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + THCTensor_(resizeAs)(state, gradInput, input); + + thrust::device_ptr<real> input_data(THCTensor_(data)(state, input)); + thrust::device_ptr<real> target_data(THCTensor_(data)(state, target)); + thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput)); + + thrust::transform(input_data, input_data+size, target_data, gradInput_data, kl_updateGradInput_functor<real>(norm)); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); +} + +#endif diff --git a/lib/THCUNN/generic/ELU.cu b/lib/THCUNN/generic/ELU.cu new file mode 100644 index 0000000..291f26e --- /dev/null +++ b/lib/THCUNN/generic/ELU.cu @@ -0,0 +1,53 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/ELU.cu" +#else + +#include "../common.h" + + +void THNN_(ELU_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + real alpha, + bool inplace) +{ + THCUNN_assertSameGPU_generic(state, 2, input, output); + + if (inplace) + { + THC_pointwiseApply1(state, input, ELUupdateOutputIP_functor<real>(alpha)); + THCTensor_(set)(state, output, input); + } + else + { + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, ELUupdateOutput_functor<real>(alpha)); + } +} + + +void THNN_(ELU_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output, + real alpha, + bool inplace) +{ + THCUNN_assertSameGPU_generic(state, 3, output, gradOutput, gradInput); + + if (inplace) + { + THC_pointwiseApply2(state, gradOutput, output, ELUupdateGradInputIP_functor<real>(alpha)); + THCTensor_(set)(state, gradInput, gradOutput); + } + else + { + THCTensor_(resizeAs)(state, gradInput, output); + THC_pointwiseApply3(state, gradInput, output, gradOutput, 
ELUupdateGradInput_functor<real>(alpha)); + } +} + +#endif diff --git a/lib/THCUNN/generic/HardTanh.cu b/lib/THCUNN/generic/HardTanh.cu new file mode 100644 index 0000000..8899476 --- /dev/null +++ b/lib/THCUNN/generic/HardTanh.cu @@ -0,0 +1,54 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/HardTanh.cu" +#else + +#include "../common.h" + +void THNN_(HardTanh_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + real min_val, + real max_val, + bool inplace) +{ + THCUNN_assertSameGPU_generic(state, 2, input, output); + if(inplace) + { + THCTensor_(set)(state, output, input); + THC_pointwiseApply1(state, output, hardtanhupdateOutput_functor<real>(min_val, max_val)); + } + else + { + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, + hardtanhupdateOutput_functor<real>(min_val, max_val)); + } +} + +void THNN_(HardTanh_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + real min_val, + real max_val, + bool inplace) +{ + THCUNN_assertSameGPU_generic(state, 3, input, gradOutput, gradInput); + + if (inplace) + { + THCTensor_(set)(state, gradInput, gradOutput); + THC_pointwiseApply2(state, gradInput, input, + hardtanhupdateGradInput_functor<real>(min_val, max_val)); + } + else + { + THCTensor_(resizeAs)(state, gradInput, input); + THC_pointwiseApply3(state, gradInput, input, gradOutput, + hardtanhupdateGradInput_functor<real>(min_val, max_val)); + } +} + +#endif diff --git a/lib/THCUNN/generic/L1Cost.cu b/lib/THCUNN/generic/L1Cost.cu new file mode 100644 index 0000000..de15189 --- /dev/null +++ b/lib/THCUNN/generic/L1Cost.cu @@ -0,0 +1,42 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/L1Cost.cu" +#else + +void THNN_(L1Cost_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output) +{ + THCUNN_assertSameGPU_generic(state, 1, input); + accreal sum; + long size = THCTensor_(nElement)(state, input); + input = 
THCTensor_(newContiguous)(state, input); + thrust::device_ptr<real> input_data(THCTensor_(data)(state, input)); + sum = thrust::transform_reduce(input_data, input_data+size, l1cost_functor<real, accreal>(), accreal(0), thrust::plus<accreal>()); + + THCTensor_(free)(state, input); + + THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum)); +} + +void THNN_(L1Cost_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput) +{ + THCUNN_assertSameGPU_generic(state, 2, input, gradInput); + long size = THCTensor_(nElement)(state, input); + + input = THCTensor_(newContiguous)(state, input); + THCTensor_(resizeAs)(state, gradInput, input); + + thrust::device_ptr<real> input_data(THCTensor_(data)(state, input)); + thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput)); + + thrust::transform(input_data, input_data+size, gradInput_data, l1cost_updateGradInput_functor<real>()); + + THCTensor_(free)(state, input); +} + +#endif diff --git a/lib/THCUNN/generic/LeakyReLU.cu b/lib/THCUNN/generic/LeakyReLU.cu new file mode 100644 index 0000000..20fb778 --- /dev/null +++ b/lib/THCUNN/generic/LeakyReLU.cu @@ -0,0 +1,54 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/LeakyReLU.cu" +#else + +#include "../common.h" + +void THNN_(LeakyReLU_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + real negval, + bool inplace) +{ + THCUNN_assertSameGPU_generic(state, 2, input, output); + + if (inplace) + { + THC_pointwiseApply1(state, input, LeakyReLUUpdateOutputIP<real>(negval)); + THCTensor_(set)(state, output, input); + } + else + { + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, LeakyReLUUpdateOutput<real>(negval)); + } + + THCudaCheck(cudaGetLastError()); +} + +void THNN_(LeakyReLU_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + real negval, + bool inplace) +{ + 
THCUNN_assertSameGPU_generic(state, 3, input, gradInput, gradOutput); + + if (inplace) + { + THC_pointwiseApply2(state, gradOutput, input, LeakyReLUUpdateGradInputIP<real>(negval)); + THCTensor_(set)(state, gradInput, gradOutput); + } + else + { + THCTensor_(resizeAs)(state, gradInput, input); + THC_pointwiseApply3(state, gradInput, input, gradOutput, LeakyReLUUpdateGradInput<real>(negval)); + } + + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/lib/THCUNN/generic/LogSigmoid.cu b/lib/THCUNN/generic/LogSigmoid.cu new file mode 100644 index 0000000..4a6a4c9 --- /dev/null +++ b/lib/THCUNN/generic/LogSigmoid.cu @@ -0,0 +1,30 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/LogSigmoid.cu" +#else + +#include "../common.h" + +void THNN_(LogSigmoid_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *buffer) +{ + THCUNN_assertSameGPU_generic(state, 2, input, output); + THCTensor_(resizeAs)(state, output, input); + THC_pointwiseApply2(state, output, input, logSigmoid_updateOutput_functor<real>()); +} + +void THNN_(LogSigmoid_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *buffer) +{ + THCUNN_assertSameGPU_generic(state, 3, input, gradOutput, gradInput); + THCTensor_(resizeAs)(state, gradInput, input); + THC_pointwiseApply3(state, gradInput, input, gradOutput, logSigmoid_updateGradInput_functor<real>()); +} + +#endif diff --git a/lib/THCUNN/generic/LogSoftMax.cu b/lib/THCUNN/generic/LogSoftMax.cu new file mode 100644 index 0000000..5755ea2 --- /dev/null +++ b/lib/THCUNN/generic/LogSoftMax.cu @@ -0,0 +1,238 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/LogSoftMax.cu" +#else + +#include "../common.h" + +void THNN_(LogSoftMax_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output) +{ + THCUNN_assertSameGPU_generic(state, 2, input, output); + + THCTensor_(resizeAs)(state, output, input); + + bool spatial = 
false; + int batchSize = 1; + int classSize = 0; + int height = 0; + int width = 0; + + int ndims = THCTensor_(nDimension)(state, input); + + if (ndims == 1) + { + classSize = THCTensor_(size)(state, input, 0); + input = THCTensor_(newContiguous)(state, input); + } + else if (ndims == 2) + { + batchSize = THCTensor_(size)(state, input, 0); + classSize = THCTensor_(size)(state, input, 1); + input = THCTensor_(newContiguous)(state, input); + } + else if (ndims == 3) + { + spatial = true; + classSize = THCTensor_(size)(state, input, 0); + height = THCTensor_(size)(state, input, 1); + width = THCTensor_(size)(state, input, 2); + + // create contiguous tensor with cuda layout from tensor with torch layout + // C x H x W -> W x H x C + THCTensor_(transpose)(state, input, input, 0, 2); + // W x H x C -> H x W x C + THCTensor_(transpose)(state, input, input, 0, 1); + THCTensor *transposedInput = THCTensor_(newContiguous)(state, input); + THCTensor_(transpose)(state, input, input, 0, 1); + THCTensor_(transpose)(state, input, input, 0, 2); + input = transposedInput; + } + else if (ndims == 4) + { + spatial = true; + batchSize = THCTensor_(size)(state, input, 0); + classSize = THCTensor_(size)(state, input, 1); + height = THCTensor_(size)(state, input, 2); + width = THCTensor_(size)(state, input, 3); + + // create contiguous tensor with cuda layout from tensor with torch layout + // B x C x H x W -> B x W x H x C + THCTensor_(transpose)(state, input, input, 1, 3); + // B x W x H x C -> B x H x W x C + THCTensor_(transpose)(state, input, input, 1, 2); + THCTensor *transposedInput = THCTensor_(newContiguous)(state, input); + THCTensor_(transpose)(state, input, input, 1, 2); + THCTensor_(transpose)(state, input, input, 1, 3); + input = transposedInput; + } + else + { + THError("1D, 2D, 3D or 4D Tensor expected"); + } + + if (!spatial) + { + dim3 grid(batchSize); + dim3 block(1024); + + cunn_LogSoftMax_updateOutput_kernel<2, real, accreal> + <<<grid, block, block.x * 
sizeof(accreal), THCState_getCurrentStream(state)>>>( + THCTensor_(data)(state, output), + THCTensor_(data)(state, input), + classSize + ); + } + else + { + dim3 grid(batchSize); + dim3 block(1024); + + cunn_SpatialLogSoftMax_updateOutput_kernel<real, accreal> + <<<grid, block, 0, THCState_getCurrentStream(state)>>>( + THCTensor_(data)(state, output), + THCTensor_(data)(state, input), + classSize, height, width + ); + } + + cudaError errcode = cudaGetLastError(); + if (errcode != cudaSuccess) + { + THError(cudaGetErrorString(errcode)); + } + + THCTensor_(free)(state, input); +} + +void THNN_(LogSoftMax_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output) +{ + THCUNN_assertSameGPU_generic(state, 3, output, gradOutput, gradInput); + + THCTensor_(resizeAs)(state, gradInput, output); + + bool spatial = false; + int batchSize = 1; + int classSize = 0; + int height = 0; + int width = 0; + + int ndims = THCTensor_(nDimension)(state, input); + + if (ndims == 1) + { + classSize = THCTensor_(size)(state, gradInput, 0); + output = THCTensor_(newContiguous)(state, output); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + } + else if (ndims == 2) + { + batchSize = THCTensor_(size)(state, gradInput, 0); + classSize = THCTensor_(size)(state, gradInput, 1); + output = THCTensor_(newContiguous)(state, output); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + } + else if (ndims == 3) + { + spatial = true; + classSize = THCTensor_(size)(state, input, 0); + height = THCTensor_(size)(state, input, 1); + width = THCTensor_(size)(state, input, 2); + + // create contiguous tensor with cuda layout from tensor with torch layout + // C x H x W -> W x H x C + THCTensor_(transpose)(state, output, output, 0, 2); + // W x H x C -> H x W x C + THCTensor_(transpose)(state, output, output, 0, 1); + THCTensor *transposedOutput = THCTensor_(newContiguous)(state, output); + 
THCTensor_(transpose)(state, output, output, 0, 1); + THCTensor_(transpose)(state, output, output, 0, 2); + output = transposedOutput; + + // create contiguous tensor with cuda layout from tensor with torch layout + // C x H x W -> W x H x C + THCTensor_(transpose)(state, gradOutput, gradOutput, 0, 2); + // W x H x C -> H x W x C + THCTensor_(transpose)(state, gradOutput, gradOutput, 0, 1); + THCTensor *transposedGradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCTensor_(transpose)(state, gradOutput, gradOutput, 0, 1); + THCTensor_(transpose)(state, gradOutput, gradOutput, 0, 2); + gradOutput = transposedGradOutput; + } + else if (ndims == 4) + { + spatial = true; + batchSize = THCTensor_(size)(state, gradInput, 0); + classSize = THCTensor_(size)(state, input, 1); + height = THCTensor_(size)(state, input, 2); + width = THCTensor_(size)(state, input, 3); + + // create contiguous tensor with cuda layout from tensor with torch layout + // B x C x H x W -> B x W x H x C + THCTensor_(transpose)(state, output, output, 1, 3); + // B x W x H x C -> B x H x W x C + THCTensor_(transpose)(state, output, output, 1, 2); + THCTensor *transposedOutput = THCTensor_(newContiguous)(state, output); + THCTensor_(transpose)(state, output, output, 1, 2); + THCTensor_(transpose)(state, output, output, 1, 3); + output = transposedOutput; + + // create contiguous tensor with cuda layout from tensor with torch layout + // B x C x H x W -> B x W x H x C + THCTensor_(transpose)(state, gradOutput, gradOutput, 1, 3); + // B x W x H x C -> B x H x W x C + THCTensor_(transpose)(state, gradOutput, gradOutput, 1, 2); + THCTensor *transposedGradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCTensor_(transpose)(state, gradOutput, gradOutput, 1, 2); + THCTensor_(transpose)(state, gradOutput, gradOutput, 1, 3); + gradOutput = transposedGradOutput; + } + else + { + THError("1D, 2D, 3D or 4D Tensor expected"); + } + + if (!spatial) + { + dim3 grid(batchSize); + dim3 block(1024); 
+ + cunn_LogSoftMax_updateGradInput_kernel<2, real, accreal> + <<<grid, block, block.x * sizeof(accreal), THCState_getCurrentStream(state)>>>( + THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, output), + THCTensor_(data)(state, gradOutput), + classSize + ); + } + else + { + dim3 grid(batchSize); + dim3 block(1024); + + cunn_SpatialLogSoftMax_updateGradInput_kernel<real, accreal> + <<<grid, block, 0, THCState_getCurrentStream(state)>>>( + THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, output), + THCTensor_(data)(state, gradOutput), + classSize, height, width + ); + } + + cudaError errcode = cudaGetLastError(); + if (errcode != cudaSuccess) + { + THError(cudaGetErrorString(errcode)); + } + + THCTensor_(free)(state, gradOutput); + THCTensor_(free)(state, output); +} + +#endif diff --git a/lib/THCUNN/generic/LookupTable.cu b/lib/THCUNN/generic/LookupTable.cu new file mode 100644 index 0000000..2027425 --- /dev/null +++ b/lib/THCUNN/generic/LookupTable.cu @@ -0,0 +1,157 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/LookupTable.cu" +#else + +void THNN_(LookupTable_accGradParameters)( + THCState *state, + THCIndexTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCIndexTensor *count, + THCIndexTensor *sorted, + THCIndexTensor *indices, + bool scaleGradByFreq, + int paddingValue, + real scale) +{ + THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, gradWeight, sorted, indices); + if (!(THCIndexTensor_(isContiguous)(state, input) && + THCTensor_(isContiguous)(state, gradOutput) && + THCTensor_(isContiguous)(state, gradWeight))) + { + THError("Tensors must be contiguous"); + } + + int nDim = THCIndexTensor_(nDimension)(state, input); + if (nDim != 1 && nDim != 2) + THError("input must be a vector or matrix"); + + long numel = THCIndexTensor_(nElement)(state, input); + long stride = gradWeight->stride[0]; + + cudaStream_t stream = THCState_getCurrentStream(state); + + if (numel <= 768 && !scaleGradByFreq) 
{ + cunn_LookupTable_accGradParametersKernelByFeature<<<DIVUP(stride,4), 128, 0, stream>>>( + THCIndexTensor_(data)(state, input), + THCTensor_(data)(state, gradOutput), + THCTensor_(data)(state, gradWeight), + scale, + numel, + stride, + paddingValue); + THCudaCheck(cudaGetLastError()); + return; + } + + THLongStorage *inputSize = THCIndexTensor_(newSizeOf)(state, input); + THCIndexTensor_(resize)(state, sorted, inputSize, NULL); + THCIndexTensor_(resize)(state, indices, inputSize, NULL); + THLongStorage_free(inputSize); + + // Sort the inputs into sorted with the corresponding indices + THCIndexTensor_(sort)(state, sorted, indices, input, 0, 0); + + THCIndex_t *sorted_data = THCIndexTensor_(data)(state, sorted); + THCIndex_t *indices_data = THCIndexTensor_(data)(state, indices); + THCIndex_t *count_data = NULL; + + if (scaleGradByFreq) + { + THCIndexTensor_(resizeAs)(state, count, input); + count_data = THCIndexTensor_(data)(state, count); + + thrust::device_ptr<THCIndex_t> sorted_ptr(sorted_data); + thrust::device_ptr<THCIndex_t> count_ptr(count_data); + + // Compute an increasing sequence per unique item in sorted: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 1 2 3 1 2 1 1 2 + thrust::inclusive_scan_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par.on(THCState_getCurrentStream(state)), +#endif + sorted_ptr, + sorted_ptr + numel, + thrust::make_constant_iterator(1), + count_ptr + ); + + // Take the maximum of each count per unique key in reverse: + // sorted: 2 5 5 5 7 7 8 9 9 + // count: 1 3 3 3 2 2 1 2 2 + thrust::inclusive_scan_by_key( +#if CUDA_VERSION >= 7000 + thrust::cuda::par.on(THCState_getCurrentStream(state)), +#endif + thrust::make_reverse_iterator(sorted_ptr + numel), + thrust::make_reverse_iterator(sorted_ptr), + thrust::make_reverse_iterator(count_ptr + numel), + thrust::make_reverse_iterator(count_ptr + numel), + thrust::equal_to<long>(), + thrust::maximum<long>() + ); + } + + dim3 grid(DIVUP(numel,4), DIVUP(stride,128)); + dim3 block(32, 4); 
+ cunn_LookupTable_accGradParametersKernel<real, accreal><<<grid, block, 0, stream>>>( + sorted_data, + indices_data, + THCTensor_(data)(state, gradOutput), + THCTensor_(data)(state, gradWeight), + count_data, + scale, + numel, + stride, + paddingValue + ); + THCudaCheck(cudaGetLastError()); +} + +/* Rescales in place every row of `weight` referenced by `idx` whose normType-norm exceeds maxNorm, multiplying the row by maxNorm / (norm + 1e-7). */ +/* `idx` must be a contiguous vector and `weight` contiguous; thrust::unique compacts `idx` in place, so the contents of `idx` are modified (only adjacent duplicates are collapsed). */ +void THNN_(LookupTable_renorm)( + THCState *state, + THCIndexTensor *idx, + THCTensor *weight, + real maxNorm, + real normType) +{ + THCUNN_assertSameGPU_generic(state, 2, idx, weight); + if (!(THCIndexTensor_(isContiguous)(state, idx) && + THCTensor_(isContiguous)(state, weight))) + { + THError("Tensors must be contiguous"); + } + if (THCIndexTensor_(nDimension)(state, idx) != 1) + THError("idx must be a vector"); + if (normType <= 0) + THError("non-positive-norm not supported"); + + THCIndex_t numel = THCIndexTensor_(nElement)(state, idx); + long stride = weight->stride[0]; + + // get the unique indices + thrust::device_ptr<real> weight_ptr(THCTensor_(data)(state, weight)); + thrust::device_ptr<THCIndex_t> idx_ptr(THCIndexTensor_(data)(state, idx)); + thrust::device_ptr<THCIndex_t> end_ptr = thrust::unique(idx_ptr, idx_ptr+numel); + numel = end_ptr - idx_ptr; + + pow_v<real, accreal> unary_pow(normType); + thrust::plus<accreal> binary_plus; + // numel << stride, since idx usually contains sparse row indices + for (long i = 0; i < numel; i++) + { + THCIndex_t k = idx_ptr[i] - TH_INDEX_BASE; + thrust::device_ptr<real> row_ptr = weight_ptr + k * stride; + /* The init argument determines the result type of transform_reduce, so it must be accreal: a bare int 0 makes the reduction return an int and truncates the norm. */ + accreal norm = thrust::transform_reduce(row_ptr, row_ptr + stride, + unary_pow, (accreal) 0, binary_plus); + norm = std::pow(norm, (accreal) (1.0 / normType)); + if (norm > ScalarConvert<real, accreal>::to(maxNorm)) + { + multiply_s<real> unary_mul(ScalarConvert<accreal, real>::to(maxNorm / (norm + 1e-7))); + thrust::transform(row_ptr, row_ptr + stride, row_ptr, unary_mul); + } + } +} + +#endif diff --git a/lib/THCUNN/generic/MSECriterion.cu b/lib/THCUNN/generic/MSECriterion.cu new file mode 100644 index 0000000..f4a38de ---
/dev/null +++ b/lib/THCUNN/generic/MSECriterion.cu @@ -0,0 +1,75 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/MSECriterion.cu" +#else + +void THNN_(MSECriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + bool sizeAverage) +{ + THCUNN_assertSameGPU_generic(state, 2, input, target); + THArgCheck(THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2, + "input and target need to have the same number of elements" + ); + + long size = THCTensor_(nElement)(state, input); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + thrust::device_ptr<real> input_data(THCTensor_(data)(state, input)); + thrust::device_ptr<real> target_data(THCTensor_(data)(state, target)); + accreal sum = thrust::inner_product( +#if CUDA_VERSION >= 7000 + thrust::cuda::par.on(THCState_getCurrentStream(state)), +#endif + input_data, input_data+size, target_data, (accreal) 0, + thrust::plus<accreal>(), mse_functor<real, accreal>()); + + if (sizeAverage) + sum /= size; + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + + THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum)); +} + +void THNN_(MSECriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradInput, + bool sizeAverage) +{ + THCUNN_assertSameGPU_generic(state, 3, input, target, gradInput); + THArgCheck(THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2, + "input and target need to have the same number of elements" + ); + + long size = THCTensor_(nElement)(state, input); + accreal norm = sizeAverage ? 
(accreal)(2)/size : (accreal)(2); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + THCTensor_(resizeAs)(state, gradInput, input); + + thrust::device_ptr<real> input_data(THCTensor_(data)(state, input)); + thrust::device_ptr<real> target_data(THCTensor_(data)(state, target)); + thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput)); + + thrust::transform( +#if CUDA_VERSION >= 7000 + thrust::cuda::par.on(THCState_getCurrentStream(state)), +#endif + input_data, input_data+size, target_data, gradInput_data, + mse_updateGradInput_functor<real, accreal>(norm)); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); +} + +#endif diff --git a/lib/THCUNN/generic/MarginCriterion.cu b/lib/THCUNN/generic/MarginCriterion.cu new file mode 100644 index 0000000..4b7e85f --- /dev/null +++ b/lib/THCUNN/generic/MarginCriterion.cu @@ -0,0 +1,64 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/MarginCriterion.cu" +#else + +void THNN_(MarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + bool sizeAverage, + real margin) +{ + THCUNN_assertSameGPU_generic(state, 2, input, target); + + long size = THCTensor_(nElement)(state, input); + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + thrust::device_ptr<real> input_data(THCTensor_(data)(state, input)); + thrust::device_ptr<real> target_data(THCTensor_(data)(state, target)); + accreal sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal) 0, thrust::plus<accreal>(), + margin_functor<real, accreal>(ScalarConvert<real, accreal>::to(margin))); + + if (sizeAverage) + sum /= size; + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); + + THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum)); +} + + +void THNN_(MarginCriterion_updateGradInput)( + THCState 
*state, + THCTensor *input, + THCTensor *target, + THCTensor *gradInput, + bool sizeAverage, + real margin) +{ + THCUNN_assertSameGPU_generic(state, 3, input, target, gradInput); + + long size = THCTensor_(nElement)(state, input); + accreal norm = sizeAverage ? 1.f/size : 1; + + input = THCTensor_(newContiguous)(state, input); + target = THCTensor_(newContiguous)(state, target); + + THCTensor_(resizeAs)(state, gradInput, input); + + thrust::device_ptr<real> input_data(THCTensor_(data)(state, input)); + thrust::device_ptr<real> target_data(THCTensor_(data)(state, target)); + thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput)); + + thrust::transform(input_data, input_data+size, target_data, gradInput_data, + margin_updateGradInput_functor<real, accreal>(ScalarConvert<real, accreal>::to(margin), norm)); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, target); +} + +#endif diff --git a/lib/THCUNN/generic/MultiLabelMarginCriterion.cu b/lib/THCUNN/generic/MultiLabelMarginCriterion.cu new file mode 100644 index 0000000..c9a56c4 --- /dev/null +++ b/lib/THCUNN/generic/MultiLabelMarginCriterion.cu @@ -0,0 +1,111 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/MultiLabelMarginCriterion.cu" +#else + +void THNN_(MultiLabelMarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + THCTensor *istarget, + bool sizeaverage) +{ + input = THCTensor_(newContiguous)(state, input); + target = THCIndexTensor_(newContiguous)(state, target); + istarget = THCTensor_(newContiguous)(state, istarget); + THCTensor_(resizeAs)(state, istarget, input); + + if(input->nDimension == 1) + { + THCTensor_(resize1d)(state, output, 1); + + dim3 blocks(1); + dim3 threads(MULTILABELMARGIN_THREADS); + + cunn_MultiLabelMarginCriterion_updateOutput_kernel<real, accreal> <<<blocks,threads>>>( + THCTensor_(data)(state, output), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), 
+ THCTensor_(data)(state, istarget), + 1, input->size[0], + sizeaverage + ); + THCudaCheck(cudaGetLastError()); + } + else if(input->nDimension == 2) + { + THCTensor *output_tmp = THCTensor_(newWithSize1d)(state, input->size[0]); + + dim3 blocks(input->size[0]); + dim3 threads(MULTILABELMARGIN_THREADS); + + cunn_MultiLabelMarginCriterion_updateOutput_kernel<real, accreal> <<<blocks,threads>>>( + THCTensor_(data)(state, output_tmp), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + THCTensor_(data)(state, istarget), + input->size[0], input->size[1], + sizeaverage + ); + THCudaCheck(cudaGetLastError()); + THCTensor_(resize1d)(state, output, 1); + THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(THCTensor_(sumall)(state, output_tmp))); + THCTensor_(free)(state, output_tmp); + } + else + THError("vector or matrix expected"); + + THCTensor_(free)(state, input); + THCIndexTensor_(free)(state, target); + THCTensor_(free)(state, istarget); +} + +void THNN_(MultiLabelMarginCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *gradInput, + THCTensor *istarget, + bool sizeaverage) +{ + input = THCTensor_(newContiguous)(state, input); + target = THCIndexTensor_(newContiguous)(state, target); + istarget = THCTensor_(newContiguous)(state, istarget); + THCTensor_(resizeAs)(state, gradInput, input); + + if(gradInput->nDimension == 1) + { + dim3 blocks(1); + dim3 threads(MULTILABELMARGIN_THREADS); + + cunn_MultiLabelMarginCriterion_updateGradInput_kernel<real, accreal> <<<blocks,threads>>>(THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + THCTensor_(data)(state, istarget), + 1, gradInput->size[0], + sizeaverage); + + } + else if(gradInput->nDimension == 2) + { + dim3 blocks(gradInput->size[0]); + dim3 threads(MULTILABELMARGIN_THREADS); + + cunn_MultiLabelMarginCriterion_updateGradInput_kernel<real, accreal> 
<<<blocks,threads>>>(THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + THCTensor_(data)(state, istarget), + gradInput->size[0], gradInput->size[1], + sizeaverage); + } + else + THError("vector or matrix expected"); + + THCudaCheck(cudaGetLastError()); + + THCTensor_(free)(state, input); + THCIndexTensor_(free)(state, target); + THCTensor_(free)(state, istarget); +} + +#endif diff --git a/lib/THCUNN/generic/MultiMarginCriterion.cu b/lib/THCUNN/generic/MultiMarginCriterion.cu new file mode 100644 index 0000000..303c9a9 --- /dev/null +++ b/lib/THCUNN/generic/MultiMarginCriterion.cu @@ -0,0 +1,181 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/MultiMarginCriterion.cu" +#else + +void THNN_(MultiMarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + bool sizeAverage, + int p, + THCTensor *weights, + real margin) +{ + THCUNN_assertSameGPU_generic(state, 2, input, target); + input = THCTensor_(newContiguous)(state, input); + if(weights) + weights = THCTensor_(newContiguous)(state, weights); + if (input->nDimension == 1) + { + dim3 blocks(1); + dim3 threads(MULTIMARGIN_THREADS); + if (p == 1) + { + cunn_MultiMarginCriterion_updateOutput_kernel<1, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>( + THCTensor_(data)(state, output), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + 1, input->size[0], + sizeAverage, + margin + ); + } + else if (p == 2) + { + cunn_MultiMarginCriterion_updateOutput_kernel<2, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>( + THCTensor_(data)(state, output), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? 
THCTensor_(data)(state, weights) : NULL, + 1, input->size[0], + sizeAverage, + margin + ); + } + THCudaCheck(cudaGetLastError()); + } + else if (input->nDimension == 2) + { + THCTensor *output_ = THCTensor_(newWithSize1d)(state, input->size[0]); // tmp output buffer + dim3 blocks(input->size[0]); + dim3 threads(MULTIMARGIN_THREADS); + if (p == 1) + { + cunn_MultiMarginCriterion_updateOutput_kernel<1, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>( + THCTensor_(data)(state, output_), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + input->size[0], input->size[1], + sizeAverage, + margin + ); + } + else if (p == 2) + { + cunn_MultiMarginCriterion_updateOutput_kernel<2, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>( + THCTensor_(data)(state, output_), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + input->size[0], input->size[1], + sizeAverage, + margin + ); + } + THCudaCheck(cudaGetLastError()); + /* Sum the per-row losses held in output_ and keep the total in accreal: a float temporary would round it before the accreal -> real conversion, which loses precision when real is double. */ + accreal sum = THCTensor_(sumall)(state, output_); + THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum)); + THCTensor_(free)(state, output_); + } + else + { + THError("vector or matrix expected"); + } + + THCTensor_(free)(state, input); + if(weights) + THCTensor_(free)(state, weights); +} + +void THNN_(MultiMarginCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *gradInput, + bool sizeAverage, + int p, + THCTensor *weights, + real margin) +{ + THCUNN_assertSameGPU_generic(state, 3, input, gradInput, target); + input = THCTensor_(newContiguous)(state, input); + THCTensor_(resizeAs)(state, gradInput, input); + if(weights) + weights = THCTensor_(newContiguous)(state, weights); + + if (input->nDimension == 1) + { + dim3 blocks(1); + dim3 threads(MULTIMARGIN_THREADS); + + if (p == 1) + { +
cunn_MultiMarginCriterion_updateGradInput_kernel<1, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>( + THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + 1, gradInput->size[0], + sizeAverage, + margin + ); + } + else if (p == 2) + { + cunn_MultiMarginCriterion_updateGradInput_kernel<2, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>( + THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + 1, gradInput->size[0], + sizeAverage, + margin + ); + } + THCudaCheck(cudaGetLastError()); + } + else if (input->nDimension == 2) + { + dim3 blocks(gradInput->size[0]); + dim3 threads(MULTIMARGIN_THREADS); + + if (p == 1) + { + cunn_MultiMarginCriterion_updateGradInput_kernel<1, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>( + THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? THCTensor_(data)(state, weights) : NULL, + gradInput->size[0], gradInput->size[1], + sizeAverage, + margin + ); + } + else if (p == 2) + { + cunn_MultiMarginCriterion_updateGradInput_kernel<2, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>( + THCTensor_(data)(state, gradInput), + THCTensor_(data)(state, input), + THCIndexTensor_(data)(state, target), + weights ? 
THCTensor_(data)(state, weights) : NULL,
+        gradInput->size[0], gradInput->size[1],
+        sizeAverage,
+        margin
+      );
+    }
+    THCudaCheck(cudaGetLastError());
+  }
+  else
+  {
+    THError("vector or matrix expected");
+  }
+
+  THCTensor_(free)(state, input);
+  if(weights)
+    THCTensor_(free)(state, weights);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/PReLU.cu b/lib/THCUNN/generic/PReLU.cu
new file mode 100644
index 0000000..8f2f670
--- /dev/null
+++ b/lib/THCUNN/generic/PReLU.cu
@@ -0,0 +1,154 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/PReLU.cu"
+#else
+
+// PReLU forward pass: resizes `output` like `input` and fills it.
+//   nOutputPlane == 0 : the PReLUUpdateOutput functor is applied pointwise
+//                       with the raw weight pointer.
+//   nOutputPlane != 0 : the preluForward kernel is launched over a contiguous
+//                       copy of `input`; mapSize is the product of the last two
+//                       sizes for 3D/4D input and 1 otherwise.
+void THNN_(PReLU_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCTensor *weight,
+           long nOutputPlane)
+{
+  THCTensor_(resizeAs)(state, output, input);
+
+  real *w = THCTensor_(data)(state, weight);
+
+  if (nOutputPlane == 0)
+  {
+    THC_pointwiseApply2(state, output, input, PReLUUpdateOutput<real>(w));
+  }
+  else
+  {
+    int ndim = THCTensor_(nDimension)(state, input);
+    // newContiguous returns a tensor that must be released below.
+    input = THCTensor_(newContiguous)(state, input);
+
+    // NOTE(review): the element count is narrowed to int here.
+    int n = THCTensor_(nElement)(state, input);
+    int mapSize = 1;
+    if (ndim == 3)
+      mapSize = (input->size[1] * input->size[2]);
+    else if (ndim == 4)
+      mapSize = (input->size[2] * input->size[3]);
+    int nElemsPerSample = nOutputPlane * mapSize;
+    preluForward<<<GET_BLOCKS(n), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
+      THCTensor_(data)(state, output),
+      THCTensor_(data)(state, input),
+      w,
+      n, nElemsPerSample, mapSize
+    );
+    THCudaCheck(cudaGetLastError());
+    THCTensor_(free)(state, input);
+  }
+}
+
+// PReLU backward pass w.r.t. the input: resizes `gradInput` like `input` and
+// fills it, using the same two paths as updateOutput (pointwise functor when
+// nOutputPlane == 0, preluBackward kernel on contiguous copies otherwise).
+void THNN_(PReLU_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *weight,
+           long nOutputPlane)
+{
+  THCTensor_(resizeAs)(state, gradInput, input);
+
+  real *w = THCTensor_(data)(state, weight);
+  if (nOutputPlane == 0)
+  {
+    THC_pointwiseApply3(state, gradInput, gradOutput, input, PReLUUpdateGradInput<real>(w));
+  }
+  else
+  {
+    int ndim = THCTensor_(nDimension)(state, input);
+    // Both contiguous copies are released after the kernel launch.
+    input = THCTensor_(newContiguous)(state, input);
+    gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+    int n = THCTensor_(nElement)(state, input);
+    int mapSize = 1;
+    if (ndim == 3)
+      mapSize = (input->size[1] * input->size[2]);
+    else if (ndim == 4)
+      mapSize = (input->size[2] * input->size[3]);
+    int nElemsPerSample = nOutputPlane * mapSize;
+    preluBackward<<<GET_BLOCKS(n), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
+      THCTensor_(data)(state, gradInput),
+      THCTensor_(data)(state, input),
+      w,
+      THCTensor_(data)(state, gradOutput),
+      n, nElemsPerSample, mapSize
+    );
+    THCudaCheck(cudaGetLastError());
+    THCTensor_(free)(state, input);
+    THCTensor_(free)(state, gradOutput);
+  }
+}
+
+// Accumulates the PReLU weight gradient into `gradWeight`.
+// `gradInput` is overwritten as scratch space and then recomputed by calling
+// PReLU_updateGradInput again (except on the 1D per-plane path, which writes
+// straight into gradWeight). gradWeightBuf / gradWeightBuf2 are used as
+// reduction buffers on the per-plane path.
+void THNN_(PReLU_accGradParameters)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *weight,
+           THCTensor *gradWeight,
+           THCTensor *gradWeightBuf,
+           THCTensor *gradWeightBuf2,
+           long nOutputPlane,
+           real scale)
+{
+  // use grad input for temporary storage, then call updateGradInput again
+
+  if (nOutputPlane == 0)
+  {
+    THC_pointwiseApply3(state, gradInput, input, gradOutput, PReLUAccGradParametersShared<real>());
+
+    // introduces a sync point
+    real sum = ScalarConvert<accreal, real>::to(THCTensor_(sumall)(state, gradInput));
+    // Only element 0 of gradWeight is read and updated on this path.
+    real w = THCTensor_(get1d)(state, gradWeight, 0);
+    THCTensor_(set1d)(state, gradWeight, 0, w + sum * scale);
+
+    // restore gradInput
+    THNN_(PReLU_updateGradInput)(state, input, gradOutput, gradInput, weight, nOutputPlane);
+  }
+  else
+  {
+    int ndim = THCTensor_(nDimension)(state, input);
+
+    if (ndim == 1)
+    {
+      THC_pointwiseApply3(state, gradWeight, input, gradOutput, PReLUAccGradParameters1to1<real>(scale));
+    }
+    else
+    {
+      // NOTE(review): `scale` is passed to this functor and again to the cadd
+      // calls below; the functor body is not visible here -- confirm the
+      // scale is not applied twice.
+      THC_pointwiseApply3(state, gradInput, input, gradOutput, PReLUAccGradParameters<real>(scale));
+      THCTensor *sumbuf = gradWeightBuf2;
+      THCTensor_(resizeAs)(state, gradWeightBuf, gradWeight);
+
+      if (ndim == 2)
+      {
+        THCTensor_(sum)(state, gradWeightBuf, gradInput, 0);
+        THCTensor_(cadd)(state, gradWeight, gradWeight, scale, gradWeightBuf);
+      }
+      else if (ndim == 3)
+      {
+        // View the scratch data as (plane, pixels) and reduce over pixels.
+        THCTensor *buffer = THCTensor_(newContiguous)(state, gradInput);
+        THCTensor_(resize2d)(state, buffer, nOutputPlane, input->size[1] * input->size[2]);
+        THCTensor_(sum)(state, gradWeightBuf, buffer, 1);
+        THCTensor_(cadd)(state, gradWeight, gradWeight, scale, gradWeightBuf);
+        THCTensor_(free)(state, buffer);
+      }
+      else if (ndim == 4)
+      {
+        // View as (batch, plane, pixels); reduce over pixels into sumbuf,
+        // then over the batch into gradWeightBuf.
+        THCTensor *buffer = THCTensor_(newContiguous)(state, gradInput);
+        THCTensor_(resize3d)(state, buffer, input->size[0], nOutputPlane, input->size[2] * input->size[3]);
+        THCTensor_(resize2d)(state, sumbuf, input->size[0], nOutputPlane);
+        THCTensor_(sum)(state, sumbuf, buffer, 2);
+        THCTensor_(sum)(state, gradWeightBuf, sumbuf, 0);
+        THCTensor_(cadd)(state, gradWeight, gradWeight, scale, gradWeightBuf);
+        THCTensor_(free)(state, buffer);
+      }
+
+      // restore gradInput
+      THNN_(PReLU_updateGradInput)(state, input, gradOutput, gradInput, weight, nOutputPlane);
+    }
+  }
+}
+
+#endif
diff --git a/lib/THCUNN/generic/RReLU.cu b/lib/THCUNN/generic/RReLU.cu
new file mode 100644
index 0000000..93ad99b
--- /dev/null
+++ b/lib/THCUNN/generic/RReLU.cu
@@ -0,0 +1,108 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/RReLU.cu"
+#else
+
+#include "../common.h"
+
+// RReLU forward pass.
+//   train == true : launches rreluUpdateOutputTrain on a contiguous copy of
+//                   `input`, filling `noise` (resized like input); with
+//                   `inplace` the kernel writes into that copy and `output`
+//                   is set to it.
+//   train == false: applies a pointwise functor with the fixed slope
+//                   (lower + upper) / 2.
+// NOTE(review): `generator` is not referenced in this body; the random states
+// come from THCRandom_generatorStates(state).
+void THNN_(RReLU_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCTensor *noise,
+           double lower,
+           double upper,
+           bool train,
+           bool inplace,
+           void *generator)
+{
+  THCUNN_assertSameGPU_generic(state, 3, input, output, noise);
+  struct curandStateMtgp32* gen_states = THCRandom_generatorStates(state);
+
+  if (train)
+  {
+    input = THCTensor_(newContiguous)(state, input);
+    THCTensor_(resizeAs)(state, noise, input);
+    real *input_data = THCTensor_(data)(state, input);
+    real *noise_data = THCTensor_(data)(state, noise);
+    long n = THCTensor_(nElement)(state, input);
+    if (inplace)
+    {
+      // input_data is passed as both source and destination.
+      rreluUpdateOutputTrain<<<NUM_BLOCKS(n), BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
+        n, gen_states, input_data, noise_data, input_data, lower, upper);
+      THCTensor_(set)(state, output, input);
+    }
+    else
+    {
+      THCTensor_(resizeAs)(state, output, input);
+      real *output_data = THCTensor_(data)(state, output);
+      rreluUpdateOutputTrain<<<NUM_BLOCKS(n), BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
+        n, gen_states, input_data, noise_data, output_data, lower, upper);
+    }
+    THCudaCheck(cudaGetLastError());
+    THCTensor_(free)(state, input);
+  }
+  else
+  {
+    const real negSlope = ScalarConvert<double, real>::to((lower + upper) / 2);
+    if (inplace)
+    {
+      THC_pointwiseApply1(state, input, RReLUUpdateOutputEvalIP_functor<real>(negSlope));
+      THCTensor_(set)(state, output, input);
+    }
+    else
+    {
+      THCTensor_(resizeAs)(state, output, input);
+      THC_pointwiseApply2(state, output, input, RReLUUpdateOutputEval_functor<real>(negSlope));
+    }
+  }
+}
+
+// RReLU backward pass w.r.t. the input.
+// In training mode with upper - lower > 1E-6 the gradient is gradOutput
+// multiplied elementwise by `noise`; otherwise a pointwise functor with the
+// constant slope (lower + upper) / 2 is used. With `inplace` the result is
+// written into the contiguous gradOutput and `gradInput` is set to it.
+void THNN_(RReLU_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *noise,
+           double lower,
+           double upper,
+           bool train,
+           bool inplace)
+{
+  THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, gradInput, noise);
+
+  // Released at the end of the function.
+  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+  if (train && upper - lower > 1E-6)    // e.g. if upper == lower, RReLU behaves like LeakyReLU
+  {
+    // multiply the gradient by the noise tensor
+    if (inplace)
+    {
+      THCTensor_(cmul)(state, gradOutput, gradOutput, noise);
+      THCTensor_(set)(state, gradInput, gradOutput);
+    }
+    else
+    {
+      THCTensor_(resizeAs)(state, gradInput, input);
+      THCTensor_(cmul)(state, gradInput, gradOutput, noise);
+    }
+  }
+  else
+  {
+    // use constant factor for negative input values
+    const real negSlope = ScalarConvert<double, real>::to((lower + upper) / 2);
+    if (inplace)
+    {
+      THC_pointwiseApply2(state, gradOutput, input, RReLUupdateGradInputEvalIP_functor<real>(negSlope));
+      THCTensor_(set)(state, gradInput, gradOutput);
+    }
+    else
+    {
+      THCTensor_(resizeAs)(state, gradInput, input);
+      THC_pointwiseApply3(state, gradInput, gradOutput, input, RReLUupdateGradInputEval_functor<real>(negSlope));
+    }
+  }
+
+  THCTensor_(free)(state, gradOutput);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/Sigmoid.cu b/lib/THCUNN/generic/Sigmoid.cu
new file mode 100644
index 0000000..7bc5739
--- /dev/null
+++ b/lib/THCUNN/generic/Sigmoid.cu
@@ -0,0 +1,29 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/Sigmoid.cu"
+#else
+
+#include "../common.h"
+
+// Sigmoid forward pass: resizes `output` like `input` and applies
+// sigmoidupdateOutput_functor pointwise.
+void THNN_(Sigmoid_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output)
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+  THCTensor_(resizeAs)(state, output, input);
+  THC_pointwiseApply2(state, output, input, sigmoidupdateOutput_functor<real>());
+}
+
+// Sigmoid backward pass: computed from the forward `output` and `gradOutput`;
+// `input` is not read in this body.
+void THNN_(Sigmoid_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *output)
+{
+  THCUNN_assertSameGPU_generic(state, 3, output, gradOutput, gradInput);
+  THCTensor_(resizeAs)(state, gradInput, output);
+  THC_pointwiseApply3(state, gradInput, output, gradOutput, sigmoidupdateGradInput_functor<real>());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SmoothL1Criterion.cu b/lib/THCUNN/generic/SmoothL1Criterion.cu
new file mode 100644
index 
0000000..8bf6865
--- /dev/null
+++ b/lib/THCUNN/generic/SmoothL1Criterion.cu
@@ -0,0 +1,79 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SmoothL1Criterion.cu"
+#else
+
+// Smooth-L1 loss forward pass.
+// Reduces smoothl1_functor over all (input, target) element pairs with thrust
+// on the current THC stream (when CUDA_VERSION >= 7000), divides by the element
+// count if `sizeAverage` is set, and stores the scalar in output[0].
+// Raises an argument error when input and target differ in element count.
+void THNN_(SmoothL1Criterion_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *target,
+           THCTensor *output,
+           bool sizeAverage)
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, target);
+  THArgCheck(
+    THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2,
+    "input and target need to have the same number of elements"
+  );
+
+  long size = THCTensor_(nElement)(state, input);
+
+  // Contiguous copies so the raw device pointers can be walked linearly;
+  // both are released below.
+  input = THCTensor_(newContiguous)(state, input);
+  target = THCTensor_(newContiguous)(state, target);
+
+  thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+  thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
+  accreal sum = thrust::inner_product(
+#if CUDA_VERSION >= 7000
+    thrust::cuda::par.on(THCState_getCurrentStream(state)),
+#endif
+    input_data, input_data+size, target_data, (accreal) 0,
+    thrust::plus<accreal>(), smoothl1_functor<real, accreal>()
+  );
+
+  if (sizeAverage)
+    sum /= size;
+
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, target);
+
+  THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
+}
+
+// Smooth-L1 loss backward pass.
+// Resizes `gradInput` like `input` and fills it with
+// smoothl1_updateGradInput_functor(norm), where norm is 1/size when
+// `sizeAverage` is set and 1 otherwise.
+void THNN_(SmoothL1Criterion_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *target,
+           THCTensor *gradInput,
+           bool sizeAverage)
+{
+  THCUNN_assertSameGPU_generic(state, 3, input, target, gradInput);
+  THArgCheck(
+    THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2,
+    "input and target need to have the same number of elements"
+  );
+
+  long size = THCTensor_(nElement)(state, input);
+  real norm = ScalarConvert<accreal, real>::to(sizeAverage ? accreal(1)/size : accreal(1));
+
+  input = THCTensor_(newContiguous)(state, input);
+  target = THCTensor_(newContiguous)(state, target);
+
+  THCTensor_(resizeAs)(state, gradInput, input);
+
+  thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+  thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
+  thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
+
+  thrust::transform(
+#if CUDA_VERSION >= 7000
+    thrust::cuda::par.on(THCState_getCurrentStream(state)),
+#endif
+    input_data, input_data+size, target_data, gradInput_data,
+    smoothl1_updateGradInput_functor<real>(norm)
+  );
+
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, target);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SoftMarginCriterion.cu b/lib/THCUNN/generic/SoftMarginCriterion.cu
new file mode 100644
index 0000000..b5b5329
--- /dev/null
+++ b/lib/THCUNN/generic/SoftMarginCriterion.cu
@@ -0,0 +1,60 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SoftMarginCriterion.cu"
+#else
+
+// Soft-margin loss forward pass.
+// Reduces softmargin_functor over all (input, target) element pairs, divides
+// by the element count if `sizeAverage` is non-zero, and stores the scalar in
+// output[0].
+// Raises an argument error when input and target differ in element count.
+void THNN_(SoftMarginCriterion_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *target,
+           THCTensor *output,
+           int sizeAverage)
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, target);
+  // thrust walks `size` elements of BOTH buffers; a shorter target would be
+  // read out of bounds, so reject mismatched sizes up front (same check as
+  // SmoothL1Criterion).
+  THArgCheck(
+    THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2,
+    "input and target need to have the same number of elements"
+  );
+  accreal sum;
+
+  long size = THCTensor_(nElement)(state, input);
+
+  input = THCTensor_(newContiguous)(state, input);
+  target = THCTensor_(newContiguous)(state, target);
+
+  thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+  thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
+  // NOTE(review): unlike SmoothL1Criterion, no execution policy is passed, so
+  // this does not run on THCState_getCurrentStream(state) -- confirm whether
+  // that is intended.
+  sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal) 0, thrust::plus<accreal>(), softmargin_functor<real, accreal>());
+
+  if(sizeAverage)
+    sum /= size;
+
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, target);
+
+  THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
+}
+
+// Soft-margin loss backward pass.
+// Resizes `gradInput` like `input` and fills it with
+// softmargin_updateGradInput_functor(norm), where norm is 1/size when
+// `sizeAverage` is non-zero and 1 otherwise.
+// Raises an argument error when input and target differ in element count.
+void THNN_(SoftMarginCriterion_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *target,
+           THCTensor *gradInput,
+           int sizeAverage)
+{
+  THCUNN_assertSameGPU_generic(state, 3, input, target, gradInput);
+  // Same out-of-bounds guard as in updateOutput.
+  THArgCheck(
+    THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2,
+    "input and target need to have the same number of elements"
+  );
+
+  long size = THCTensor_(nElement)(state, input);
+  accreal norm = (sizeAverage ? 1./size : 1.);
+
+  input = THCTensor_(newContiguous)(state, input);
+  target = THCTensor_(newContiguous)(state, target);
+
+  THCTensor_(resizeAs)(state, gradInput, input);
+
+  thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+  thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
+  thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
+
+  // NOTE(review): runs without a stream execution policy, see updateOutput.
+  thrust::transform(input_data, input_data+size, target_data, gradInput_data, softmargin_updateGradInput_functor<real, accreal>(norm));
+
+  THCTensor_(free)(state, input);
+  THCTensor_(free)(state, target);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SoftMax.cu b/lib/THCUNN/generic/SoftMax.cu
new file mode 100644
index 0000000..917e3f0
--- /dev/null
+++ b/lib/THCUNN/generic/SoftMax.cu
@@ -0,0 +1,153 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SoftMax.cu"
+#else
+
+#include "../common.h"
+
+// SoftMax forward pass for 1D, 2D, 3D or 4D input.
+// Derives (batchSize, dim, stride0, stride1) and the launch grid from the
+// shape of a contiguous copy of `input`:
+//   1D: dim = size[0]                      2D: batch = size[0], dim = size[1]
+//   3D: dim = size[0], grid over size[1..2]
+//   4D: batch = size[0], dim = size[1], grid over size[2..3]
+// Any other dimensionality raises an error.
+void THNN_(SoftMax_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output)
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+
+  input = THCTensor_(newContiguous)(state, input);
+  THCTensor_(resizeAs)(state, output, input);
+  long batchSize, dim, stride0, stride1 = 1;
+  long blocksY = 1, blocksZ = 1;
+
+  if (input->nDimension == 1)
+  {
+    batchSize = 1;
+    dim = input->size[0];
+    stride0 = 1;
+  }
+  else if (input->nDimension == 2)
+  {
+    batchSize = input->size[0];
+    dim = input->size[1];
+    stride0 = 1;
+  }
+  else if (input->nDimension == 3)
+  {
+    batchSize = 1;
+    dim = input->size[0];
+    blocksY = input->size[1];
+    blocksZ = input->size[2];
+    stride0 = blocksY * blocksZ;
+    stride1 = blocksZ;
+  }
+  else if (input->nDimension == 4)
+  {
+    batchSize = input->size[0];
+    dim = input->size[1];
+    blocksY = input->size[2];
+    blocksZ = input->size[3];
+    stride0 = blocksY * blocksZ;
+    stride1 = blocksZ;
+  }
+  else
+  {
+    THError("1D, 2D, 3D or 4D tensor expected");
+  }
+
+  // when possible use only 2d grid of thread blocks to stay compatible with compute capability 2.X devices.
+  if (blocksY * blocksZ < 65536)
+  {
+    blocksY *= blocksZ;
+    blocksZ = 1;
+    // The grid's Y axis now enumerates every spatial position, so the strides
+    // are recomputed for the flattened layout.
+    if (input->nDimension == 3 || input->nDimension == 4) {
+      stride0 = blocksY * blocksZ;
+      stride1 = blocksZ;
+    }
+  }
+
+  dim3 blocks(batchSize, blocksY, blocksZ);
+  dim3 threads(SOFTMAX_THREADS);
+  cunn_SoftMax_updateOutput_kernel<real, accreal><<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(
+    THCTensor_(data)(state, output),
+    THCTensor_(data)(state, input),
+    batchSize, dim, stride0, stride1
+  );
+  THCudaCheck(cudaGetLastError());
+
+  THCTensor_(free)(state, input);
+}
+
+// SoftMax backward pass.
+// Computed from contiguous copies of `output` and `gradOutput`; `gradInput` is
+// resized like `output` and its shape drives the launch configuration exactly
+// as in updateOutput. `input` is not read.
+void THNN_(SoftMax_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *output)
+{
+  THCUNN_assertSameGPU_generic(state, 3, output, gradOutput, gradInput);
+
+  output = THCTensor_(newContiguous)(state, output);
+  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+  THCTensor_(resizeAs)(state, gradInput, output);
+  long batchSize, dim, stride0, stride1 = 1;
+  long blocksY = 1, blocksZ = 1;
+
+  if (gradInput->nDimension == 1)
+  {
+    batchSize = 1;
+    dim = gradInput->size[0];
+    stride0 = 1;
+  }
+  else if (gradInput->nDimension == 2)
+  {
+    batchSize = gradInput->size[0];
+    dim = gradInput->size[1];
+    stride0 = 1;
+  }
+  else if (gradInput->nDimension == 3)
+  {
+    batchSize = 1;
+    dim = gradInput->size[0];
+    blocksY = gradInput->size[1];
+    blocksZ = gradInput->size[2];
+    stride0 = blocksY * blocksZ;
+    stride1 = blocksZ;
+  }
+  else if (gradInput->nDimension == 4)
+  {
+    batchSize = gradInput->size[0];
+    dim = gradInput->size[1];
+    blocksY = gradInput->size[2];
+    blocksZ = gradInput->size[3];
+    stride0 = blocksY * blocksZ;
+    stride1 = blocksZ;
+  }
+  else
+  {
+    THError("1D, 2D, 3D or 4D tensor expected");
+  }
+
+  // when possible use only 2d grid of thread blocks to stay compatible with compute capability 2.X devices.
+  if (blocksY * blocksZ < 65536)
+  {
+    blocksY *= blocksZ;
+    blocksZ = 1;
+    // Test gradInput, the tensor every other shape decision in this function
+    // is based on (the original tested the otherwise unused `input`).
+    if (gradInput->nDimension == 3 || gradInput->nDimension == 4) {
+      stride0 = blocksY * blocksZ;
+      stride1 = blocksZ;
+    }
+  }
+
+  dim3 blocks(batchSize, blocksY, blocksZ);
+  dim3 threads(SOFTMAX_THREADS);
+  cunn_SoftMax_updateGradInput_kernel<real, accreal><<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(
+    THCTensor_(data)(state, gradInput),
+    THCTensor_(data)(state, output),
+    THCTensor_(data)(state, gradOutput),
+    batchSize, dim, stride0, stride1
+  );
+  THCudaCheck(cudaGetLastError());
+
+  THCTensor_(free)(state, gradOutput);
+  THCTensor_(free)(state, output);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SoftPlus.cu b/lib/THCUNN/generic/SoftPlus.cu
new file mode 100644
index 0000000..39794b0
--- /dev/null
+++ b/lib/THCUNN/generic/SoftPlus.cu
@@ -0,0 +1,33 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SoftPlus.cu"
+#else
+
+#include "../common.h"
+
+// SoftPlus forward pass: resizes `output` like `input` and applies
+// softPlusupdateOutput_functor(threshold, beta) pointwise.
+void THNN_(SoftPlus_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           real beta,
+           real threshold)
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+  THCTensor_(resizeAs)(state, output, input);
+  THC_pointwiseApply2(state, output, input, softPlusupdateOutput_functor<real>(threshold, beta));
+}
+
+// SoftPlus backward pass: resizes `gradInput` like `output` and applies
+// softPlusupdateGradInput_functor(threshold, beta) to (output, gradOutput).
+void THNN_(SoftPlus_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *output,
+           real beta,
+           real threshold)
+{
+  THCUNN_assertSameGPU_generic(state, 4, input, output, gradOutput, gradInput);
+  THCTensor_(resizeAs)(state, gradInput, output);
+  THC_pointwiseApply3(state, gradInput, output, gradOutput, softPlusupdateGradInput_functor<real>(threshold, beta));
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SoftShrink.cu b/lib/THCUNN/generic/SoftShrink.cu
new file mode 100644
index 0000000..aa530bb
--- /dev/null
+++ 
b/lib/THCUNN/generic/SoftShrink.cu
@@ -0,0 +1,32 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SoftShrink.cu"
+#else
+
+#include "../common.h"
+
+// SoftShrink forward pass: resizes `output` like `input` and applies
+// SoftShrinkUpdateOutput(lambda) pointwise.
+void THNN_(SoftShrink_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           real lambda)
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+  THCTensor_(resizeAs)(state, output, input);
+  THC_pointwiseApply2(state, output, input, SoftShrinkUpdateOutput<real>(lambda));
+  THCudaCheck(cudaGetLastError());
+}
+
+// SoftShrink backward pass: resizes `gradInput` like `input` and applies
+// SoftShrinkUpdateGradInput(lambda) to (input, gradOutput).
+void THNN_(SoftShrink_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           real lambda)
+{
+  THCUNN_assertSameGPU_generic(state, 3, input, gradOutput, gradInput);
+  THCTensor_(resizeAs)(state, gradInput, input);
+  THC_pointwiseApply3(state, gradInput, input, gradOutput, SoftShrinkUpdateGradInput<real>(lambda));
+  THCudaCheck(cudaGetLastError());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SparseLinear.cu b/lib/THCUNN/generic/SparseLinear.cu
new file mode 100644
index 0000000..438a8e4
--- /dev/null
+++ b/lib/THCUNN/generic/SparseLinear.cu
@@ -0,0 +1,269 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SparseLinear.cu"
+#else
+
+// True when `t` is a 2D tensor with exactly 3 columns (the nnz x 3 sparse
+// input layout: columns 0 and 1 are copied into int index buffers, column 2
+// into the value buffer).
+static bool checkInput(THCTensor* t)
+{
+  return t->nDimension == 2 && t->size[1] == 3;
+}
+
+// True when `t` is 2D with sizes (size0, size1).
+static bool checkSize2D(THCTensor* t, long size0, long size1)
+{
+  return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1;
+}
+
+// True when `t` is 1D with size size0.
+static bool checkSize1D(THCTensor* t, long size0)
+{
+  return t->nDimension == 1 && t->size[0] == size0;
+}
+
+// Copies a tensor of the current real type into a CudaInt tensor, dispatching
+// on the THC_REAL_IS_* macro to the matching copy routine.
+static inline void copyCudaFloatingType(THCState *state, THCudaIntTensor *buf, THCTensor *t) {
+  #ifdef THC_REAL_IS_FLOAT
+  THCudaIntTensor_copyCudaFloat(state, buf, t);
+  #elif defined(THC_REAL_IS_DOUBLE)
+  THCudaIntTensor_copyCudaDouble(state, buf, t);
+  #elif defined(THC_REAL_IS_HALF)
+  THCudaIntTensor_copyCudaHalf(state, buf, t);
+  #endif
+}
+
+// SparseLinear forward pass: output = bias + W * x for a sparse batch.
+// `input` is nnz x 3 (see checkInput); `output` must already be 2D
+// (batchsize x outputsize) because the batch size is read from it.
+// The product is computed with cusparse<S|D>csrmm into a temporary buffer that
+// is pre-filled with the bias, then transposed and copied into `output`.
+// NOTE(review): only the float and double cusparse calls are provided; there
+// is no branch for THC_REAL_IS_HALF.
+void THNN_(SparseLinear_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCTensor *weight,
+           THCTensor *bias)
+{
+  THAssert(THCTensor_(checkGPU)(state, 4, input, output, weight, bias));
+
+  long h;
+  long outDim = THCTensor_(size)(state, weight, 0);
+  long inDim = THCTensor_(size)(state, weight, 1);
+
+  THArgCheck(checkInput(input), 2, "input size must be nnz x 3");
+  THArgCheck(THCTensor_(nDimension)(state, output) == 2, 3, "output must be batchsize x outputsize");
+  THArgCheck(checkSize1D(bias, outDim), 5, "bias size wrong");
+
+  long batchnum = THCTensor_(size)(state, output, 0);
+  long nnz = THCTensor_(size)(state, input, 0);
+
+  // Temporaries; all released at the end of the function.
+  THCTensor *buffer = THCTensor_(new)(state);
+  THCTensor *sel = THCTensor_(new)(state);
+  THCTensor *values = THCTensor_(new)(state);
+  THCudaIntTensor *rowbuf = THCudaIntTensor_new(state);
+  THCudaIntTensor *csrPtrs = THCudaIntTensor_new(state);
+  THCudaIntTensor *colInds = THCudaIntTensor_new(state);
+
+  THCTensor_(resize1d)(state, values, nnz);
+  THCudaIntTensor_resize1d(state, rowbuf, nnz);
+  THCudaIntTensor_resize1d(state, colInds, nnz);
+  THCudaIntTensor_resize1d(state, csrPtrs, batchnum+1);
+
+  // Get data ready for cusparse, need CudaInt buffers
+  // We do not need to sort, since rows are already in order
+  // If rows might get out of order in future implementations, or if cusparse
+  //    complains with an illegal memory access, sort like we do in AccGradParameters
+  THCTensor_(select)(state, sel, input, 1, 0);
+  copyCudaFloatingType(state, rowbuf, sel);
+  THCTensor_(select)(state, sel, input, 1, 1);
+  copyCudaFloatingType(state, colInds, sel);
+  THCTensor_(select)(state, sel, input, 1, 2);
+  THCTensor_(copyCuda)(state, values, sel);
+
+  init_cusparse();
+  cusparseXcoo2csr(cusparse_handle,
+      THCudaIntTensor_data(state, rowbuf), nnz, batchnum,
+      THCudaIntTensor_data(state, csrPtrs), CUSPARSE_INDEX_BASE_ONE);
+
+  // output = bias
+  THCTensor_(resize2d)(state, buffer, outDim, batchnum);
+  THCTensor_(zero)(state, buffer);
+  for (h=0; h<batchnum; h++) {
+    THCTensor_(select)(state, sel, buffer, 1, h);
+    THCTensor_(copy)(state, sel, bias);
+  }
+
+  // output = W * x
+  real one = ScalarConvert<int, real>::to(1);
+  cusparseMatDescr_t descr = 0;
+  cusparseCreateMatDescr(&descr);
+  cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
+  cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
+  #ifdef THC_REAL_IS_FLOAT
+  cusparseScsrmm(cusparse_handle,
+  #elif defined(THC_REAL_IS_DOUBLE)
+  cusparseDcsrmm(cusparse_handle,
+  #endif
+      CUSPARSE_OPERATION_NON_TRANSPOSE,
+      batchnum, outDim, inDim, nnz,
+      &one,
+      descr,
+      THCTensor_(data)(state, values),
+      THCudaIntTensor_data(state, csrPtrs),
+      THCudaIntTensor_data(state, colInds),
+      THCTensor_(data)(state, weight), inDim,
+      &one, THCTensor_(data)(state, buffer), batchnum
+  );
+  THCTensor_(transpose)(state, buffer, NULL, 0, 1);
+
+  // We do work in the buffer to keep the output contiguous
+  THCTensor_(copy)(state, output, buffer);
+
+  cusparseDestroyMatDescr(descr);
+  descr = 0;
+  THCTensor_(free)(state, buffer);
+  THCTensor_(free)(state, sel);
+  THCTensor_(free)(state, values);
+  THCudaIntTensor_free(state, rowbuf);
+  THCudaIntTensor_free(state, colInds);
+  THCudaIntTensor_free(state, csrPtrs);
+}
+
+// SparseLinear parameter-gradient accumulation.
+// `input` is nnz x 3 (see checkInput); `gradOutput` must be contiguous.
+// The entries are sorted by (column * batchnum + row) so the COO data can be
+// compressed by column, then cusparse<S|D>csrmm accumulates into gradWeight.
+// gradBias receives scale * (gradOutput summed over dimension 0). When
+// weightDecay != 0, weightDecay * weight / bias is added to the gradients.
+void THNN_(SparseLinear_accGradParameters)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradWeight,
+           THCTensor *gradBias,
+           THCTensor *weight,
+           THCTensor *bias,
+           double weightDecay,
+           double scale)
+{
+  long outDim = THCTensor_(size)(state, weight, 0);
+  long inDim = THCTensor_(size)(state, weight, 1);
+
+  // Message matches what checkInput actually enforces (2D with 3 columns) and
+  // the wording used by updateOutput.
+  THArgCheck(checkInput(input), 2, "input size must be nnz x 3");
+  THArgCheck(checkSize2D(gradWeight, outDim, inDim), 4, "gradWeight size wrong");
+  THArgCheck(checkSize1D(gradBias, outDim), 5, "gradBias size wrong");
+  THArgCheck(THCTensor_(isContiguous)(state, gradOutput), 3,
+             "gradOutput must be contiguous");
+
+  long nnz = THCTensor_(size)(state, input, 0);
+  long batchnum = THCTensor_(size)(state, gradOutput, 0);
+
+  // Temporaries; all released at the end of the function.
+  THCTensor *buf = THCTensor_(new)(state);
+  THCTensor *cols = THCTensor_(new)(state);
+  THCTensor *sel = THCTensor_(new)(state);
+  THCudaLongTensor *inds = THCudaLongTensor_new(state);
+  THCTensor *values = THCTensor_(new)(state);
+  THCudaIntTensor *colbuf = THCudaIntTensor_new(state);
+  THCudaIntTensor *colPtrs = THCudaIntTensor_new(state);
+  THCudaIntTensor *rowInds = THCudaIntTensor_new(state);
+
+  THCTensor_(select)(state, sel, input, 1, 0); // rowInds
+  THCTensor_(select)(state, cols, input, 1, 1); // colInds
+  THCTensor_(cadd)(state, buf, sel, batchnum, cols); // colInds * batchnum + rowInds
+  THCTensor_(sort)(state, buf, inds, buf, 0, 0); // Indices are now in inds
+  THCTensor_(indexSelect)(state, buf, input, 0, inds);
+
+  THCTensor_(resize1d)(state, values, nnz);
+  THCudaIntTensor_resize1d(state, colbuf, nnz);
+  THCudaIntTensor_resize1d(state, rowInds, nnz);
+  THCudaIntTensor_resize1d(state, colPtrs, inDim+1);
+
+  // Get data ready for cusparse, need CudaInt buffers
+  THCTensor_(select)(state, sel, buf, 1, 0);
+  copyCudaFloatingType(state, rowInds, sel);
+  THCTensor_(select)(state, sel, buf, 1, 1);
+  copyCudaFloatingType(state, colbuf, sel);
+  THCTensor_(select)(state, sel, buf, 1, 2);
+  THCTensor_(copyCuda)(state, values, sel);
+
+  init_cusparse();
+  // Secretly coo2csc
+  cusparseXcoo2csr(cusparse_handle,
+      THCudaIntTensor_data(state, colbuf), nnz, inDim,
+      THCudaIntTensor_data(state, colPtrs), CUSPARSE_INDEX_BASE_ONE);
+
+  // FORTRAN expects contiguous col-major matrices
+  THCTensor_(transpose)(state, gradOutput, NULL, 0, 1);
+  THCTensor_(resize2d)(state, buf, batchnum, outDim);
+  THCTensor_(copy)(state, buf, gradOutput);
+  THCTensor_(transpose)(state, gradOutput, NULL, 0, 1); // Restore gradOutput
+
+  // NOTE(review): alpha is `one`, so `scale` is applied to gradBias below but
+  // not to gradWeight -- confirm whether that is intended.
+  real one = ScalarConvert<int, real>::to(1);
+  cusparseMatDescr_t descr = 0;
+  cusparseCreateMatDescr(&descr);
+  cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
+  cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
+  #ifdef THC_REAL_IS_FLOAT
+  cusparseScsrmm(cusparse_handle,
+  #elif defined(THC_REAL_IS_DOUBLE)
+  cusparseDcsrmm(cusparse_handle,
+  #endif
+      CUSPARSE_OPERATION_NON_TRANSPOSE,
+      inDim, outDim, batchnum, nnz,
+      &one,
+      descr,
+      THCTensor_(data)(state, values),
+      THCudaIntTensor_data(state, colPtrs),
+      THCudaIntTensor_data(state, rowInds),
+      THCTensor_(data)(state, buf), batchnum,
+      &one, THCTensor_(data)(state, gradWeight), inDim
+  );
+
+  THCTensor_(sum)(state, buf, gradOutput, 0);
+  THCTensor_(resize1d)(state, buf, outDim);
+  THCTensor_(cadd)(state, gradBias, gradBias, scale, buf);
+
+  if (weightDecay != 0)
+  {
+    THCTensor_(cadd)(state, gradWeight, gradWeight, weightDecay, weight);
+    THCTensor_(cadd)(state, gradBias, gradBias, weightDecay, bias);
+  }
+
+  THCTensor_(free)(state, buf);
+  THCTensor_(free)(state, sel);
+  THCTensor_(free)(state, cols);
+  THCudaLongTensor_free(state, inds);
+  THCTensor_(free)(state, values);
+  THCudaIntTensor_free(state, colbuf);
+  THCudaIntTensor_free(state, rowInds);
+  THCudaIntTensor_free(state, colPtrs);
+}
+
+// Legacy input format is not implemented on CUDA: always raises an error.
+void THNN_(SparseLinear_legacyUpdateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCTensor *weight,
+           THCTensor *bias) {
+  THError("CUDA does not support legacy input format, please use a table of nnz x 2 vectors");
+}
+// Legacy input format is not implemented on CUDA: always raises an error.
+void THNN_(SparseLinear_legacyAccGradParameters)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradWeight,
+           THCTensor *gradBias,
+           THCTensor *weight,
+           THCTensor *bias,
+           double weightDecay,
+           double scale) {
+  THError("CUDA does not support legacy input format, please use a table of nnz x 2 vectors");
+}
+
+// Dense updates are pretty fast on the GPU
+// Zeroes gradWeight and gradBias in full; `lastInput` is not used.
+void THNN_(SparseLinear_zeroGradParameters)(
+           THCState *state,
+           THCTensor *gradWeight,
+           THCTensor *gradBias,
+           THCTensor *lastInput) {
+  THCTensor_(zero)(state, gradWeight);
+  THCTensor_(zero)(state, gradBias);
+}
+
+// Dense gradient step: weight -= learningRate * gradWeight and
+// bias -= learningRate * gradBias; `lastInput` is not used.
+void THNN_(SparseLinear_updateParameters)(
+           THCState *state,
+           THCTensor *weight,
+           THCTensor *bias,
+           THCTensor *gradWeight,
+           THCTensor *gradBias,
+           THCTensor *lastInput,
+           double learningRate) {
+  THCTensor_(cadd)(state, weight, weight, -learningRate, gradWeight);
+  THCTensor_(cadd)(state, bias, bias, -learningRate, gradBias);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialAdaptiveMaxPooling.cu b/lib/THCUNN/generic/SpatialAdaptiveMaxPooling.cu
new file mode 100644
index 0000000..fc6fe6d
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialAdaptiveMaxPooling.cu
@@ -0,0 +1,188 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialAdaptiveMaxPooling.cu"
+#else
+
+#include "../common.h"
+
+// Adaptive max pooling forward pass for 3D (plane, rows, cols) or 4D (batch,
+// plane, rows, cols) input.
+// `output` is resized to the requested nOutputRows x nOutputCols per plane and
+// `indices` to the same shape with a leading dimension of 2; the two halves of
+// `indices` are handed to the kernel as separate pointers.
+// The 3D path passes the input's own strides to the kernel; the 4D path works
+// on a contiguous copy of the input.
+void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCIndexTensor *indices,
+           int nOutputCols,
+           int nOutputRows)
+{
+  THCUNN_assertSameGPU_generic(state, 3, input, output, indices);
+
+  THCIndex_t *indices_data;
+  real *output_data;
+  real *input_data;
+
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
+
+  if (input->nDimension == 3) {
+    long nInputCols = input->size[2];
+    long nInputRows = input->size[1];
+    long nInputPlane = input->size[0];
+
+    long istride_d = input->stride[0];
+    long istride_h = input->stride[1];
+    long istride_w = input->stride[2];
+
+    input_data = THCTensor_(data)(state, input);
+
+    THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols);
+    THCIndexTensor_(resize4d)(state, indices, 2, nInputPlane, nOutputRows, nOutputCols);
+
+    indices_data = THCIndexTensor_(data)(state, indices);
+    output_data = THCTensor_(data)(state, output);
+
+    // cuda blocks & threads:
+    int yblocks = (int)(16L / nInputPlane);
+    yblocks = yblocks < 1 ? 1 : yblocks;
+    dim3 blocks(nInputPlane,yblocks);
+    dim3 threads(32,8);
+
+    // run maxpool kernel
+    adaptivemaxpool <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (input_data, output_data,
+                                   indices_data+nInputPlane*nOutputCols*nOutputRows, indices_data,
+                                   nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+                                   istride_h, istride_w, istride_d);
+    THCudaCheck(cudaGetLastError());
+
+  } else {
+    // Take the contiguous copy BEFORE reading strides: the kernel receives the
+    // copy's data pointer, so the strides must describe the copy and not the
+    // (possibly non-contiguous) original tensor.
+    input = THCTensor_(newContiguous)(state, input);
+
+    long nInputCols = input->size[3];
+    long nInputRows = input->size[2];
+    long nInputPlane = input->size[1];
+    long nbatch = input->size[0];
+
+    long istride_d = input->stride[1];
+    long istride_h = input->stride[2];
+    long istride_w = input->stride[3];
+
+    input_data = THCTensor_(data)(state, input);
+
+    THCTensor_(resize4d)(state, output, nbatch, nInputPlane, nOutputRows, nOutputCols);
+    THCIndexTensor_(resize5d)(state, indices, 2, nbatch, nInputPlane, nOutputRows, nOutputCols);
+
+    indices_data = THCIndexTensor_(data)(state, indices);
+    output_data = THCTensor_(data)(state, output);
+
+    // cuda blocks & threads:
+    int yblocks = (int)(16L / nInputPlane);
+    yblocks = yblocks < 1 ? 1 : yblocks;
+    dim3 blocks(nInputPlane*nbatch,yblocks);
+    dim3 threads(32,8);
+
+    // run maxpool kernel
+    adaptivemaxpool <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (input_data, output_data,
+                                   indices_data+nbatch*nInputPlane*nOutputCols*nOutputRows, indices_data,
+                                   nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+                                   istride_h, istride_w, istride_d);
+    THCudaCheck(cudaGetLastError());
+    // clean
+    THCTensor_(free)(state, input);
+  }
+}
+
+// Adaptive max pooling backward pass.
+// `gradInput` is resized like `input` and zeroed, then the gradient kernel
+// distributes `gradOutput` through the stored `indices`. `atomic` is
+// hard-coded to true, so the atomic kernel is the one that runs; the
+// non-atomic branches are kept for reference.
+void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCIndexTensor *indices)
+{
+  bool atomic = true; // suboptimal, but without atomic it doesn't pass the tests
+
+  THCUNN_assertSameGPU_generic(state, 4, input, indices, gradOutput, gradInput);
+
+  THCIndex_t *indices_data;
+  real *gradInput_data;
+  real *gradOutput_data;
+
+  // Released at the end of the function.
+  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+  if (input->nDimension == 3) {
+    long nInputCols = input->size[2];
+    long nInputRows = input->size[1];
+    long nInputPlane = input->size[0];
+    long nOutputCols = gradOutput->size[2];
+    long nOutputRows = gradOutput->size[1];
+
+    //bool atomic = (nInputCols%nOutputCols != 0) || (nInputRows%nOutputRows != 0);
+
+    THCTensor_(resizeAs)(state, gradInput, input);
+    THCTensor_(zero)(state, gradInput);
+
+    indices_data = THCIndexTensor_(data)(state, indices);
+    gradOutput_data = THCTensor_(data)(state, gradOutput);
+    gradInput_data = THCTensor_(data)(state, gradInput);
+
+    // cuda blocks & threads:
+    int yblocks = (int)(16L / nInputPlane);
+    yblocks = yblocks < 1 ? 1 : yblocks;
+    dim3 blocks(nInputPlane,yblocks);
+    dim3 threads(32,8);
+
+    if(atomic)
+    {
+      // run updateGradInput kernel, accumulate gradients atomically
+      atomicadaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data,
+                                          indices_data+nInputPlane*nOutputCols*nOutputRows, indices_data,
+                                          nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
+    }
+    else
+    {
+      // run updateGradInput kernel (non-atomic variant, as in the batch path;
+      // the original launched the atomic kernel here as well)
+      adaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data,
+                                          indices_data+nInputPlane*nOutputCols*nOutputRows, indices_data,
+                                          nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
+    }
+    THCudaCheck(cudaGetLastError());
+  } else {
+    long nInputCols = input->size[3];
+    long nInputRows = input->size[2];
+    long nInputPlane = input->size[1];
+    long nbatch = input->size[0];
+    long nOutputCols = gradOutput->size[3];
+    long nOutputRows = gradOutput->size[2];
+
+    //bool atomic = //(nInputCols%nOutputCols != 0) || (nInputRows%nOutputRows != 0);
+
+    THCTensor_(resizeAs)(state, gradInput, input);
+    THCTensor_(zero)(state, gradInput);
+
+    indices_data = THCIndexTensor_(data)(state, indices);
+    gradOutput_data = THCTensor_(data)(state, gradOutput);
+    gradInput_data = THCTensor_(data)(state, gradInput);
+
+    // cuda blocks & threads:
+    int yblocks = (int)(16L / nInputPlane);
+    yblocks = yblocks < 1 ? 1 : yblocks;
+    dim3 blocks(nInputPlane*nbatch,yblocks);
+    dim3 threads(32,8);
+
+    if(atomic)
+    {
+      // run updateGradInput kernel, accumulate gradients atomically
+      atomicadaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data,
+                                          indices_data+nbatch*nInputPlane*nOutputCols*nOutputRows, indices_data,
+                                          nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
+    }
+    else
+    {
+      // run updateGradInput kernel (non-atomic variant)
+      adaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data,
+                                          indices_data+nbatch*nInputPlane*nOutputCols*nOutputRows, indices_data,
+                                          nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
+    }
+    THCudaCheck(cudaGetLastError());
+  }
+
+  // clean
+  THCTensor_(free)(state,gradOutput);
+
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialAveragePooling.cu b/lib/THCUNN/generic/SpatialAveragePooling.cu
new file mode 100644
index 0000000..09f539e
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialAveragePooling.cu
@@ -0,0 +1,164 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialAveragePooling.cu"
+#else
+
+void THNN_(SpatialAveragePooling_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           int kW, int kH,
+           int dW, int dH,
+           int padW, int padH,
+           bool ceil_mode,
+           bool count_include_pad)
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
+
+  long nInputCols, nInputRows, nInputPlane, batchSize;
+  long nOutputCols, nOutputRows;
+
+  if (input->nDimension == 3) {
+    nInputCols = input->size[2];
+    nInputRows = input->size[1];
+    nInputPlane = input->size[0];
+    batchSize = 1;
+  }
+  else
+  {
+    nInputCols = input->size[3];
+    nInputRows = input->size[2];
+    nInputPlane = input->size[1];
+    batchSize = input->size[0];
+  }
+
+  THArgCheck(nInputCols >= kW - 2*padW && nInputRows 
>= kH - 2*padH, 2, "input image smaller than kernel size"); + THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size"); + + if(ceil_mode) { + nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1; + nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1; + } + else { + nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1; + nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1; + } + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((nOutputRows - 1)*dH >= nInputRows + padH) + --nOutputRows; + if ((nOutputCols - 1)*dW >= nInputCols + padW) + --nOutputCols; + } + + input = THCTensor_(newContiguous)(state, input); + real* input_data = THCTensor_(data)(state, input); + + THCTensor_(resize4d)(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols); + + real* output_data = THCTensor_(data)(state, output); + + int count = THCTensor_(nElement)(state, output); + + if(count_include_pad) + AvePoolForward<real, accreal, true> + <<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>( + count, input_data, + batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, + kH, kW, dH, dW, padH, padW, output_data); + else + AvePoolForward<real, accreal, false> + <<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>( + count, input_data, + batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, + kH, kW, dH, dW, padH, padW, output_data); + THCudaCheck(cudaGetLastError()); + + if(input->nDimension == 3) + THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols); + + THCTensor_(free)(state, input); + +} + 
/*
 * Backward pass of spatial average pooling.
 *
 * Recomputes the pooled output size from the input size and the pooling
 * parameters (same formulas as the forward pass), resizes gradInput like
 * input and launches AvePoolBackward over every element of the input.
 *
 * input             3D (plane x rows x cols) or 4D (batch x plane x rows x cols);
 *                   only its sizes are read here
 * gradOutput        gradient w.r.t. the pooled output; a contiguous copy is used
 * gradInput         resized like input and written by the kernel
 * kW,kH / dW,dH     kernel size / stride
 * padW,padH         padding
 * ceil_mode         round the output size up instead of down
 * count_include_pad selects the AvePoolBackward template variant
 */
void THNN_(SpatialAveragePooling_updateGradInput)(
           THCState *state,
           THCTensor *input,
           THCTensor *gradOutput,
           THCTensor *gradInput,
           int kW, int kH,
           int dW, int dH,
           int padW, int padH,
           bool ceil_mode,
           bool count_include_pad)
{
  THCUNN_assertSameGPU_generic(state, 3, input, gradOutput, gradInput);
  // Same rank check as updateOutput; without it the 4D branch below would
  // index size[3] on tensors of any other rank.
  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");

  // input is not made contiguous: nothing below touches its data, and its
  // sizes do not depend on the memory layout.
  gradOutput = THCTensor_(newContiguous)(state, gradOutput);

  long nInputCols, nInputRows, nInputPlane, batchSize;
  long nOutputCols, nOutputRows;

  if (input->nDimension == 3) {
    nInputCols = input->size[2];
    nInputRows = input->size[1];
    nInputPlane = input->size[0];
    batchSize = 1;
  }
  else
  {
    nInputCols = input->size[3];
    nInputRows = input->size[2];
    nInputPlane = input->size[1];
    batchSize = input->size[0];
  }

  if(ceil_mode) {
    nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
    nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
  }
  else {
    nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
    nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
  }
  if (padW || padH)
  {
    // ensure that the last pooling starts inside the image
    // needed to avoid problems in ceil mode
    if ((nOutputRows - 1)*dH >= nInputRows + padH)
      --nOutputRows;
    if ((nOutputCols - 1)*dW >= nInputCols + padW)
      --nOutputCols;
  }

  THCTensor_(resizeAs)(state, gradInput, input);

  // one kernel work item per input element
  int count = THCTensor_(nElement)(state, input);

  if(count_include_pad)
    AvePoolBackward<real, accreal, true>
      <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
        (count,
        THCTensor_(data)(state, gradOutput),
        batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
        kH, kW, dH, dW, padH, padW,
        THCTensor_(data)(state, gradInput));
  else
    AvePoolBackward<real, accreal, false>
      <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
        (count,
        THCTensor_(data)(state, gradOutput),
        batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
        kH, kW, dH, dW, padH, padW,
        THCTensor_(data)(state, gradInput));
  THCudaCheck(cudaGetLastError());

  // clean
  THCTensor_(free)(state, gradOutput);
}

+#endif diff --git a/lib/THCUNN/generic/SpatialClassNLLCriterion.cu b/lib/THCUNN/generic/SpatialClassNLLCriterion.cu new file mode 100644 index 0000000..edb3a0c --- /dev/null +++ b/lib/THCUNN/generic/SpatialClassNLLCriterion.cu @@ -0,0 +1,126 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialClassNLLCriterion.cu" +#else + +void THNN_(SpatialClassNLLCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + bool sizeAverage, + THCTensor *weights, + THCTensor *total_weight) +{ + THArgCheck(THCIndexTensor_(nDimension)(state, target) == 3, 1, + "only batches of spatial targets supported (3D tensors)"); + THArgCheck(THCTensor_(nDimension)(state, input) == 4, 2, + "only batches of spatial inputs supported (4D tensors)"); + if (weights && THCTensor_(nElement)(state, weights) != THCTensor_(size)(state, input, 1)) { + THError("weight tensor should be defined either for all or no classes"); + } + + if (weights) + THCUNN_assertSameGPU_generic(state, 5, input, target, weights, output, total_weight); + else + THCUNN_assertSameGPU_generic(state, 4, input, target, output, total_weight); + + input = THCTensor_(newContiguous)(state, input); + weights = weights ? THCTensor_(newContiguous)(state, weights) : NULL; + target = THCIndexTensor_(newContiguous)(state, target); + + real *input_data = THCTensor_(data)(state, input); + real *weights_data = weights ? 
THCTensor_(data)(state, weights) : NULL; + THCIndex_t *target_data = THCIndexTensor_(data)(state, target); + real *output_data = THCTensor_(data)(state, output); + real *total_weight_data = THCTensor_(data)(state, total_weight); + + THCIndex_t batch_size = THCIndexTensor_(size)(state, target, 0); + THCIndex_t map_nelem = THCIndexTensor_(nElement)(state, target) / batch_size; + int blocks_per_sample = GET_BLOCKS(map_nelem) / 128; + blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample; + int total_blocks = blocks_per_sample * batch_size; + + THCTensor_(fill)(state, output, ScalarConvert<int, real>::to(0)); + THCTensor_(fill)(state, total_weight, ScalarConvert<int, real>::to(0)); + + cunn_SpatialClassNLLCriterion_updateOutput_kernel<real, accreal> + <<<total_blocks, CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>( + output_data, + total_weight_data, + input_data, + target_data, + weights_data, + sizeAverage, + THCTensor_(size)(state, input, 0), + THCTensor_(size)(state, input, 1), + THCTensor_(size)(state, input, 2) * THCTensor_(size)(state, input, 3), + blocks_per_sample + ); + THCudaCheck(cudaGetLastError()); + + if (weights) + THCTensor_(free)(state, weights); + THCIndexTensor_(free)(state, target); + THCTensor_(free)(state, input); +} + 
/*
 * Backward pass of the spatial class-NLL criterion.
 *
 * Launches the updateGradInput kernel, which writes into gradInput using the
 * targets, the optional per-class weights and total_weight.
 *
 * input        4D tensor; only its sizes are read here
 * target       3D index tensor; a contiguous copy is used
 * gradInput    must already be contiguous; it is neither resized nor zeroed
 *              by this function
 * sizeAverage  forwarded to the kernel
 * weights      optional (may be NULL); when given it must have as many
 *              elements as input has along dimension 1
 * total_weight forwarded to the kernel; presumably the value produced by
 *              updateOutput (NOTE(review): confirm against the caller)
 */
void THNN_(SpatialClassNLLCriterion_updateGradInput)(
           THCState *state,
           THCTensor *input,
           THCIndexTensor *target,
           THCTensor *gradInput,
           bool sizeAverage,
           THCTensor *weights,
           THCTensor *total_weight)
{
  THArgCheck(THCIndexTensor_(nDimension)(state, target) == 3, 1,
               "only batches of spatial targets supported (3D tensors)");
  THArgCheck(THCTensor_(nDimension)(state, input) == 4, 2,
               "only batches of spatial inputs supported (4D tensors)");
  THArgCheck(THCTensor_(isContiguous)(state, gradInput), 4,
               "gradInput must be contiguous");
  if (weights && THCTensor_(nElement)(state, weights) != THCTensor_(size)(state, input, 1)) {
    THError("weight tensor should be defined either for all or no classes");
  }

  if (weights)
    THCUNN_assertSameGPU_generic(state, 5, weights, input, target, gradInput, total_weight);
  else
    THCUNN_assertSameGPU_generic(state, 4, input, target, gradInput, total_weight);

  // input is not made contiguous: its data is never passed to the kernel,
  // only its sizes, and those do not depend on the memory layout.
  weights = weights ? THCTensor_(newContiguous)(state, weights) : NULL;
  target = THCIndexTensor_(newContiguous)(state, target);

  real *weights_data = weights ? THCTensor_(data)(state, weights) : NULL;
  real *gradInput_data = THCTensor_(data)(state, gradInput);
  THCIndex_t *target_data = THCIndexTensor_(data)(state, target);
  real *total_weight_data = THCTensor_(data)(state, total_weight);

  // Launch geometry: at least one block per sample.
  THCIndex_t batch_size = THCIndexTensor_(size)(state, target, 0);
  THCIndex_t map_nelem = THCIndexTensor_(nElement)(state, target) / batch_size;
  int blocks_per_sample = GET_BLOCKS(map_nelem) / 128;
  blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample;
  int total_blocks = blocks_per_sample * batch_size;

  cunn_SpatialClassNLLCriterion_updateGradInput_kernel
    <<<total_blocks, CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
      gradInput_data,
      target_data,
      weights_data,
      total_weight_data,
      sizeAverage,
      THCTensor_(size)(state, input, 0),
      THCTensor_(size)(state, input, 1),
      THCTensor_(size)(state, input, 2) *THCTensor_(size)(state, input, 3),
      blocks_per_sample
  );
  THCudaCheck(cudaGetLastError());

  if (weights)
    THCTensor_(free)(state, weights);
  THCIndexTensor_(free)(state, target);
}

+#endif diff --git a/lib/THCUNN/generic/SpatialConvolutionLocal.cu b/lib/THCUNN/generic/SpatialConvolutionLocal.cu new file mode 100644 index 0000000..d317fe2 --- /dev/null +++ b/lib/THCUNN/generic/SpatialConvolutionLocal.cu @@ -0,0 +1,324 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialConvolutionLocal.cu" +#else + +void THNN_(SpatialConvolutionLocal_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor 
*output, + THCTensor *weight, + THCTensor *bias, + THCTensor *finput, + THCTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + long inputWidth, long inputHeight, + long outputWidth, long outputHeight) +{ + THCUNN_assertSameGPU_generic(state, 5, input, output, weight, + bias, finput); + + long nInputPlane = THCTensor_(size)(state,weight,2)/(kW*kH); + long nOutputPlane = THCTensor_(size)(state,weight,1); + + int batch = 1; + if (input->nDimension == 3) { + // Force batch + batch = 0; + THCTensor_(resize4d)(state, input, 1, nInputPlane, inputHeight, inputWidth); + } + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); + + // Augment the input + THCTensor_(resize3d)(state, finput, batchSize, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *finput_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + THCTensor *finput3d, *output3d; + THCTensor *wslice = THCTensor_(new)(state); + THCTensor *islice = THCTensor_(new)(state); + THCTensor *oslice = THCTensor_(new)(state); + + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, finput_n, finput, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + 1, 1, THCTensor_(data)(state, finput_n) + ); + + output3d = THCTensor_(newWithStorage3d)(state, output_n->storage, output_n->storageOffset, + outputHeight*outputWidth, 1, + nOutputPlane, outputHeight*outputWidth, + 1, nOutputPlane*outputHeight*outputWidth); + + finput3d = THCTensor_(newWithStorage3d)(state, 
finput_n->storage, finput_n->storageOffset, + outputHeight*outputWidth, 1, + kW*kH*nInputPlane, outputHeight*outputWidth, + 1, kW*kH*nInputPlane*outputHeight*outputWidth); + + THCTensor_(copy)(state, output_n, bias); + + for (int i = 0; i < outputHeight; i++) { + for(int j = 0; j < outputWidth; j++) { + int sliceidx = i * outputWidth + j; + THCTensor_(select)(state, wslice, weight, 0, sliceidx); + THCTensor_(select)(state, islice, finput3d, 0, sliceidx); + THCTensor_(select)(state, oslice, output3d, 0, sliceidx); + THCTensor_(addmm)(state, oslice, ScalarConvert<int, real>::to(1), oslice, ScalarConvert<int, real>::to(1), wslice, islice); + } + } + + + // weight: oH*oW x nOutputPlane x nInputPlane*kH*kW + // finput3d: oH*oW x nInputPlane*kH*kW x 1 + // THCTensor_(baddbmm)(state, output3d, 1.0, output3d, 1.0, weight, finput3d); + // output3d: oH*oW x nOutputPlane x 1 + + THCTensor_(free)(state, output3d); + THCTensor_(free)(state, finput3d); + THCTensor_(free)(state, wslice); + THCTensor_(free)(state, islice); + THCTensor_(free)(state, oslice); + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, finput_n); + THCTensor_(free)(state, output_n); + + // Resize output + if (batch == 0) { + THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + } +} + +void THNN_(SpatialConvolutionLocal_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + THCTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + long inputWidth, long inputHeight, + long outputWidth, long outputHeight) +{ + THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight, + fgradInput, gradInput); + + long nInputPlane = THCTensor_(size)(state,weight,2)/(kW*kH); + long nOutputPlane = THCTensor_(size)(state,weight,1); + + int batch = 1; + if (input->nDimension == 3) 
{ + // Force batch + batch = 0; + THCTensor_(resize4d)(state, input, 1, nInputPlane, inputHeight, inputWidth); + THCTensor_(resize4d)(state, gradOutput, 1, nOutputPlane, outputHeight, outputWidth); + } + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); + + // Resize temporary columns + THCTensor_(resize3d)(state, fgradInput, batchSize, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *fgradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + THCTensor_(transpose)(state, weight, weight, 1, 2); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + THCTensor *gradOutput3d, *fgradInput3d; + THCTensor *wslice = THCTensor_(new)(state); + THCTensor *gislice = THCTensor_(new)(state); + THCTensor *goslice = THCTensor_(new)(state); + + // Matrix mulitply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, fgradInput_n, fgradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + gradOutput3d = THCTensor_(newWithStorage3d)(state, gradOutput_n->storage, gradOutput_n->storageOffset, + outputHeight*outputWidth, 1, + nOutputPlane, outputHeight*outputWidth, + 1, nOutputPlane*outputHeight*outputWidth); + fgradInput3d = THCTensor_(newWithStorage3d)(state, fgradInput_n->storage, fgradInput_n->storageOffset, + outputHeight*outputWidth, 1, + kW*kH*nInputPlane, outputHeight*outputWidth, + 1, kW*kH*nInputPlane*outputHeight*outputWidth); + + for (int i = 0; i < outputHeight; i++) { + for(int j = 0; j < outputWidth; j++) { + int sliceidx = i * outputWidth + j; + THCTensor_(select)(state, wslice, weight, 0, sliceidx); + THCTensor_(select)(state, gislice, fgradInput3d, 0, sliceidx); + THCTensor_(select)(state, goslice, gradOutput3d, 0, sliceidx); + 
THCTensor_(addmm)(state, gislice, ScalarConvert<int, real>::to(0), gislice, ScalarConvert<int, real>::to(1), wslice, goslice); + } + } + + // weight: oH*oW x nInputPlane*kH*kW x nOutputPlane + // gradOutput3d: oH*oW x nOutputPlane x 1 + //THCTensor_(baddbmm)(state, fgradInput3d, 0.0, fgradInput3d, 1.0, weight, gradOutput3d); + // fgradInput3d: oH*oW x nInputPlane*kH*kW x 1 + + // Unpack columns back into input: + col2im<real, accreal>( + THCState_getCurrentStream(state), + THCTensor_(data)(state, fgradInput_n), + nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + 1, 1, THCTensor_(data)(state, gradInput_n) + ); + + THCTensor_(free)(state, gradOutput3d); + THCTensor_(free)(state, fgradInput3d); + THCTensor_(free)(state, wslice); + THCTensor_(free)(state, gislice); + THCTensor_(free)(state, goslice); + } + + // Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, fgradInput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (batch == 0) { + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth); + } + + THCTensor_(transpose)(state, weight, weight, 1, 2); +} + +void THNN_(SpatialConvolutionLocal_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *finput, + THCTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + long inputWidth, long inputHeight, + long outputWidth, long outputHeight, + real scale) +{ + THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, gradWeight, + gradBias, finput); + + long nInputPlane = THCTensor_(size)(state,gradWeight,2)/(kW*kH); + long nOutputPlane = THCTensor_(size)(state,gradWeight,1); + + int batch = 1; + if (input->nDimension == 3) { + // Force batch + batch = 0; + 
THCTensor_(resize4d)(state, input, 1, nInputPlane, inputHeight, inputWidth); + THCTensor_(resize4d)(state, gradOutput, 1, nOutputPlane, outputHeight, outputWidth); + } + + // Batch size + input planes + long batchSize = input->size[0]; + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *finput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + THCTensor *gradOutput3d, *finput3d; + THCTensor *gwslice = THCTensor_(new)(state); + THCTensor *islice = THCTensor_(new)(state); + THCTensor *goslice = THCTensor_(new)(state); + + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, finput_n, finput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + gradOutput3d = THCTensor_(newWithStorage3d)(state, gradOutput_n->storage, gradOutput_n->storageOffset, + outputHeight*outputWidth, 1, + nOutputPlane, outputHeight*outputWidth, + 1, nOutputPlane*outputHeight*outputWidth); + finput3d = THCTensor_(newWithStorage3d)(state, finput_n->storage, finput_n->storageOffset, + outputHeight*outputWidth, 1, + 1, kW*kH*nInputPlane*outputHeight*outputWidth, + kW*kH*nInputPlane, outputHeight*outputWidth); + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + 1, 1, THCTensor_(data)(state, finput_n) + ); + + for (int i = 0; i < outputHeight; i++) { + for(int j = 0; j < outputWidth; j++) { + int sliceidx = i * outputWidth + j; + THCTensor_(select)(state, gwslice, gradWeight, 0, sliceidx); + THCTensor_(select)(state, goslice, gradOutput3d, 0, sliceidx); + THCTensor_(select)(state, islice, finput3d, 0, sliceidx); + THCTensor_(addmm)(state, gwslice, ScalarConvert<int, real>::to(1), gwslice, scale, goslice, islice); + } + } + // gradOutput3d: oH*oW x nOutputPlane x 1 + 
// finput3d: oH*oW x 1 x kW*kH*nInputPlane + //THCTensor_(baddbmm)(state, gradWeight, 1.0, gradWeight, scale, gradOutput3d, finput3d); + // gradWeight: oH*oW x nOutputPlane x kW*kH*nInputPlane + + THCTensor_(cadd)(state, gradBias, gradBias, scale, gradOutput_n); + + THCTensor_(free)(state, gradOutput3d); + THCTensor_(free)(state, finput3d); + THCTensor_(free)(state, gwslice); + THCTensor_(free)(state, goslice); + THCTensor_(free)(state, islice); + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, finput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize + if (batch == 0) { + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + } +} + +#endif diff --git a/lib/THCUNN/generic/SpatialConvolutionMM.cu b/lib/THCUNN/generic/SpatialConvolutionMM.cu new file mode 100644 index 0000000..46a62fb --- /dev/null +++ b/lib/THCUNN/generic/SpatialConvolutionMM.cu @@ -0,0 +1,427 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialConvolutionMM.cu" +#else + 
/*
 * Forward pass of the matrix-multiply based spatial convolution.
 *
 * For every sample: initialise the output plane with the bias (GEMM of the
 * `ones` buffer against `bias`, or zero when bias is NULL), unfold the input
 * into `columns` with im2col, then accumulate weight x columns into the output
 * with a second GEMM.
 *
 * input    3D (plane x h x w) or 4D (batch x plane x h x w); a 3D input is
 *          temporarily viewed as 4D and restored before returning
 * output   resized to (batch x) nOutputPlane x outputHeight x outputWidth
 * weight   2D (nOutputPlane x nInputPlane*kH*kW) or 4D; a 4D weight is used
 *          through a temporary 2D view on the same storage
 * bias     optional (may be NULL); size[0] must equal weight->size[0]
 * columns  scratch buffer, resized to nInputPlane*kW*kH x outputHeight*outputWidth
 * ones     scratch buffer of ones, grown when too small
 * kW,kH    kernel size (> 0)
 * dW,dH    stride (> 0)
 * padW,padH padding
 *
 * All argument validation runs before anything is reshaped or allocated, so a
 * rejected call leaves the caller's tensors untouched.
 */
void THNN_(SpatialConvolutionMM_updateOutput)(
           THCState *state,
           THCTensor *input,
           THCTensor *output,
           THCTensor *weight,
           THCTensor *bias,
           THCTensor *columns,
           THCTensor *ones,
           int kW, int kH,
           int dW, int dH,
           int padW, int padH) {

  THCUNN_assertSameGPU_generic(state, 5, input, output, weight, columns, ones);
  if (bias) {
    THCUNN_assertSameGPU_generic(state, 2, weight, bias);
  }
  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
  THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
  THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
  THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
  THArgCheck(weight->nDimension == 2 || weight->nDimension == 4, 4, "weight tensor should be 2D or 4D");

  // Params:
  int nInputPlane = weight->nDimension == 2 ? weight->size[1]/(kH*kW) : weight->size[1];
  int nOutputPlane = weight->size[0];

  // batch == 1 for 4D input, 0 for 3D; it is also the index of the plane
  // dimension, which lets the geometry be read before input is reshaped.
  int batch = (input->nDimension == 4) ? 1 : 0;
  THArgCheck(input->size[batch] == nInputPlane, 2, "input channels and nInputPlane dont match");

  long inputHeight  = input->size[batch + 1];
  long inputWidth   = input->size[batch + 2];
  long outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
  long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;

  // The heights/widths are long, so they must be printed with %ld.
  if (outputWidth < 1 || outputHeight < 1)
    THError("Given input size: (%ldx%ldx%ld). Calculated output size: (%ldx%ldx%ld). Output size is too small",
        (long)nInputPlane, inputHeight, inputWidth, (long)nOutputPlane, outputHeight, outputWidth);

  // Validation is done; only now allocate the 2D weight view and reshape input.
  int freeWeight = 0;
  if (weight->nDimension == 4) {
    long s1 = weight->size[0];
    long s2 = weight->size[1] * weight->size[2] * weight->size[3];
    weight = THCTensor_(newWithStorage2d)(state, weight->storage, weight->storageOffset, s1, -1, s2, -1);
    freeWeight = 1;
  }

  if (batch == 0) {
    // Force batch
    THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
  }

  // Batch size + input planes
  long batchSize = input->size[0];

  // Resize output
  THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth);

  // Resize temporary columns
  THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth);

  // Define a buffer of ones, for bias accumulation
  // Note: this buffer can be shared with other modules, it only ever gets increased,
  // and always contains ones.
  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
    // Resize plane and fill with ones...
    THCTensor_(resize2d)(state, ones, outputHeight, outputWidth);
    THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
  }

  // Helpers
  THCTensor *input_n = THCTensor_(new)(state);
  THCTensor *output_n = THCTensor_(new)(state);

  // For each elt in batch, do:
  for (int elt = 0; elt < batchSize; elt ++) {
    // Matrix multiply per output:
    THCTensor_(select)(state, input_n, input, 0, elt);
    THCTensor_(select)(state, output_n, output, 0, elt);

    // Do Bias first:
    // M,N,K are dims of matrix A and B
    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
    long m_ = nOutputPlane;
    long n_ = outputHeight * outputWidth;
    long k_ = 1;

    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
    if (bias) {
      #ifdef THC_REAL_IS_FLOAT
      THCudaBlas_Sgemm(
      #elif defined(THC_REAL_IS_HALF)
      THCudaBlas_Hgemm(
      #elif defined(THC_REAL_IS_DOUBLE)
      THCudaBlas_Dgemm(
      #endif
          state,
          't', 'n',
          n_, m_, k_,
          ScalarConvert<int, real>::to(1),
          THCTensor_(data)(state, ones), k_,
          THCTensor_(data)(state, bias), k_,
          ScalarConvert<int, real>::to(0),
          THCTensor_(data)(state, output_n), n_
      );
    } else {
      THCTensor_(zero)(state, output_n);
    }

    // Extract columns:
    im2col(
      THCState_getCurrentStream(state),
      THCTensor_(data)(state, input_n),
      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
      1, 1, THCTensor_(data)(state, columns)
    );

    // M,N,K are dims of matrix A and B
    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
    long m = nOutputPlane;
    long n = columns->size[1];
    long k = nInputPlane*kH*kW;

    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
    // beta is 1 so the product is added on top of the bias written above.
    #ifdef THC_REAL_IS_FLOAT
    THCudaBlas_Sgemm(
    #elif defined(THC_REAL_IS_HALF)
    THCudaBlas_Hgemm(
    #elif defined(THC_REAL_IS_DOUBLE)
    THCudaBlas_Dgemm(
    #endif
        state,
        'n', 'n',
        n, m, k,
        ScalarConvert<int, real>::to(1),
        THCTensor_(data)(state, columns), n,
        THCTensor_(data)(state, weight), k,
        ScalarConvert<int, real>::to(1),
        THCTensor_(data)(state, output_n), n
    );
  }

  // Free
  THCTensor_(free)(state, input_n);
  THCTensor_(free)(state, output_n);
  if (freeWeight)
    THCTensor_(free)(state, weight);

  // Resize output
  if (batch == 0) {
    THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth);
    THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
  }
}

+void THNN_(SpatialConvolutionMM_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradColumns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH) { + + THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight, + gradColumns, gradInput); + THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); + THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero"); + THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero"); + THArgCheck(weight->nDimension == 2 || weight->nDimension == 4, 4, "weight tensor should be 2D or 4D"); + + // Params + int nInputPlane = weight->nDimension == 2 ? 
weight->size[1]/(kW*kH) : weight->size[1]; + int nOutputPlane = weight->size[0]; + + int freeWeight = 0; + if (weight->nDimension == 4) { + long s1 = weight->size[0]; + long s2 = weight->size[1] * weight->size[2] * weight->size[3]; + weight = THCTensor_(newWithStorage2d)(state, weight->storage, weight->storageOffset, s1, -1, s2, -1); + freeWeight = 1; + } + + int batch = 1; + if (input->nDimension == 3) { + // Force batch + batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + long inputWidth = input->size[3]; + long inputHeight = input->size[2]; + long outputWidth = (inputWidth + 2*padW - kW) / dW + 1; + long outputHeight = (inputHeight + 2*padH - kH) / dH + 1; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m = nInputPlane*kW*kH; + long n = gradColumns->size[1]; + long k = nOutputPlane; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 't', + n, m, k, + ScalarConvert<int, 
real>::to(1), + THCTensor_(data)(state, gradOutput_n), n, + THCTensor_(data)(state, weight), m, + ScalarConvert<int, real>::to(0), + THCTensor_(data)(state, gradColumns), n + ); + + // Unpack columns back into input: + col2im<real, accreal>( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradColumns), + nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + 1, 1, THCTensor_(data)(state, gradInput_n) + ); + } + + // Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, gradOutput_n); + if (freeWeight) + THCTensor_(free)(state, weight); + + // Resize output + if (batch == 0) { + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth); + } +} + +void THNN_(SpatialConvolutionMM_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + real scale) { + + THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, gradWeight, columns, ones); + if (gradBias) { + THCUNN_assertSameGPU_generic(state, 2, gradWeight, gradBias); + } + THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); + THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias"); + THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero"); + THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero"); + THArgCheck(gradWeight->nDimension == 2 || gradWeight->nDimension == 4, 4, "gradWeight tensor should be 2D or 4D"); + + // Params + int nInputPlane = gradWeight->nDimension == 2 ? 
gradWeight->size[1]/(kW*kH) : gradWeight->size[1]; + int nOutputPlane = gradWeight->size[0]; + + int freeWeight = 0; + if (gradWeight->nDimension == 4) { + long s1 = gradWeight->size[0]; + long s2 = gradWeight->size[1] * gradWeight->size[2] * gradWeight->size[3]; + gradWeight = THCTensor_(newWithStorage2d)(state, gradWeight->storage, gradWeight->storageOffset, s1, -1, s2, -1); + freeWeight = 1; + } + + int batch = 1; + if (input->nDimension == 3) { + // Force batch + batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + long inputWidth = input->size[3]; + long inputHeight = input->size[2]; + long outputWidth = (inputWidth + 2*padW - kW) / dW + 1; + long outputHeight = (inputHeight + 2*padH - kH) / dH + 1; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
+ THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1)); + } + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + 1, 1, THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m = nOutputPlane; + long n = nInputPlane*kW*kH; + long k = columns->size[1]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n, m, k, + scale, + THCTensor_(data)(state, columns), k, + THCTensor_(data)(state, gradOutput_n), k, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradWeight), n + ); + + // Do Bias: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m_ = nOutputPlane; + long k_ = outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + if (gradBias) { + #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv( + #endif + state, + 't', + k_, m_, + scale, + THCTensor_(data)(state, 
gradOutput_n), k_, + THCTensor_(data)(state, ones), 1, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradBias), 1 + ); + #endif + #ifdef THC_REAL_IS_HALF + THCudaBlas_Hgemm( + state, + 't', 'n', + m_, 1, k_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), k_, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradBias), m_ + ); + #endif + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, gradOutput_n); + if (freeWeight) + THCTensor_(free)(state, gradWeight); + + // Resize + if (batch == 0) { + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + } +} + +#endif diff --git a/lib/THCUNN/generic/SpatialCrossMapLRN.cu b/lib/THCUNN/generic/SpatialCrossMapLRN.cu new file mode 100644 index 0000000..a09ea0b --- /dev/null +++ b/lib/THCUNN/generic/SpatialCrossMapLRN.cu @@ -0,0 +1,114 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialCrossMapLRN.cu" +#else + +void LRNforward(THCState* state, THCTensor* input, THCTensor* output, + THCTensor* scale, int local_size, real alpha, real beta, real k) +{ + THCTensor_(resizeAs)(state, output, input); + THCTensor_(resizeAs)(state, scale, input); + + int batchSize; + int nInputPlane; + int imsize_h; + int imsize_w; + + if (input->nDimension == 3) { + batchSize = 1; + nInputPlane = input->size[0]; + imsize_h = input->size[1]; + imsize_w = input->size[2]; + } + else + { + batchSize = input->size[0]; + nInputPlane = input->size[1]; + imsize_h = input->size[2]; + imsize_w = input->size[3]; + } + + input = THCTensor_(newContiguous)(state, input); + + int n_threads = batchSize * imsize_h * imsize_w; + LRNFillScale<real, accreal> <<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>( + n_threads, THCTensor_(data)(state, input), batchSize, nInputPlane, imsize_h, imsize_w, local_size, + alpha / 
local_size, k, THCTensor_(data)(state, scale)); + n_threads *= nInputPlane; + THCudaCheck(cudaGetLastError()); + LRNComputeOutput<<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>( + n_threads, THCTensor_(data)(state, input), THCTensor_(data)(state, scale), -beta, THCTensor_(data)(state, output)); + THCudaCheck(cudaGetLastError()); + + THCTensor_(free)(state, input); +} + + +void LRNbackward(THCState* state, THCTensor* input, THCTensor* output, + THCTensor* gradOutput, THCTensor* gradInput, THCTensor* scale, + int local_size, real alpha, real beta, real k) +{ + THCTensor_(resizeAs)(state, gradInput, input); + + int batchSize; + int nInputPlane; + int imsize_h; + int imsize_w; + + if (input->nDimension == 3) { + batchSize = 1; + nInputPlane = input->size[0]; + imsize_h = input->size[1]; + imsize_w = input->size[2]; + } + else + { + batchSize = input->size[0]; + nInputPlane = input->size[1]; + imsize_h = input->size[2]; + imsize_w = input->size[3]; + } + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + int n_threads = batchSize * imsize_h * imsize_w; + LRNComputeDiff<real, accreal> <<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>( + n_threads, THCTensor_(data)(state, input), THCTensor_(data)(state, output), + THCTensor_(data)(state, scale), THCTensor_(data)(state, gradOutput), batchSize, nInputPlane, imsize_h, imsize_w, + local_size, -beta, ScalarConvert<int, real>::to(2) * alpha * beta / local_size, + THCTensor_(data)(state, gradInput)); + THCudaCheck(cudaGetLastError()); + + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +void THNN_(SpatialCrossMapLRN_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *scale, + int size, + real alpha, + real beta, + real k) +{ + LRNforward(state, input, output, scale, size, alpha, beta, k); +} + +void 
THNN_(SpatialCrossMapLRN_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *scale, + THCTensor *output, + int size, + real alpha, + real beta, + real k) +{ + LRNbackward(state, input, output, gradOutput, gradInput, scale, size, alpha, beta, k); +} + +#endif diff --git a/lib/THCUNN/generic/SpatialDilatedConvolution.cu b/lib/THCUNN/generic/SpatialDilatedConvolution.cu new file mode 100644 index 0000000..fb5fe51 --- /dev/null +++ b/lib/THCUNN/generic/SpatialDilatedConvolution.cu @@ -0,0 +1,402 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialDilatedConvolution.cu" +#else + +void THNN_(SpatialDilatedConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH) { + + THCUNN_assertSameGPU_generic(state, 5, input, output, weight, columns, ones); + if (bias) { + THCUNN_assertSameGPU_generic(state, 2, weight, bias); + } + THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); + THArgCheck(weight->nDimension == 4, 4, "weight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)"); + THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias"); + THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero"); + THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero"); + THArgCheck(dilationW > 0 && dilationH > 0, 14, "dilation should be greater than 0"); + + // Params: + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + + int batch = 1; + if (input->nDimension == 3) { + THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); + // Force batch + batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], 
input->size[1], input->size[2]); + } else { + THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); + } + + long inputWidth = input->size[3]; + long inputHeight = input->size[2]; + long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + if (outputWidth < 1 || outputHeight < 1) + THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small", + nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth); + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
+ THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1)); + } + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // Do Bias first: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m_ = nOutputPlane; + long n_ = outputHeight * outputWidth; + long k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n_, m_, k_, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, ones), k_, + THCTensor_(data)(state, bias), k_, + ScalarConvert<int, real>::to(0), + THCTensor_(data)(state, output_n), n_ + ); + } else { + THCTensor_(zero)(state, output_n); + } + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m = nOutputPlane; + long n = columns->size[1]; + long k = nInputPlane*kH*kW; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 'n', + n, m, k, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, 
columns), n, + THCTensor_(data)(state, weight), k, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, output_n), n + ); + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + + // Resize output + if (batch == 0) { + THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + } +} + +void THNN_(SpatialDilatedConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH) { + + THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight, + gradColumns, gradInput); + THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); + THArgCheck(weight->nDimension == 4, 4, "weight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)"); + THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero"); + THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero"); + + // Params + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + + int batch = 1; + if (input->nDimension == 3) { + // Force batch + batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + long inputWidth = input->size[3]; + long inputHeight = input->size[2]; + long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); + + // Resize temporary columns + 
THCTensor_(resize2d)(state, gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m = nInputPlane*kW*kH; + long n = gradColumns->size[1]; + long k = nOutputPlane; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 't', + n, m, k, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradOutput_n), n, + THCTensor_(data)(state, weight), m, + ScalarConvert<int, real>::to(0), + THCTensor_(data)(state, gradColumns), n + ); + + // Unpack columns back into input: + col2im<real, accreal>( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradColumns), + nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THCTensor_(data)(state, gradInput_n) + ); + } + + // Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (batch == 0) { + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth); + } +} + +void THNN_(SpatialDilatedConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *columns, + THCTensor *ones, 
+ int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + real scale) { + + THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, gradWeight, columns, ones); + if (gradBias) { + THCUNN_assertSameGPU_generic(state, 2, gradWeight, gradBias); + } + THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); + THArgCheck(gradWeight->nDimension == 4, 4, "gradWeight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)"); + THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias"); + THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero"); + THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero"); + + // Params + int nInputPlane = gradWeight->size[1]; + int nOutputPlane = gradWeight->size[0]; + + int batch = 1; + if (input->nDimension == 3) { + // Force batch + batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + long inputWidth = input->size[3]; + long inputHeight = input->size[2]; + long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
+ THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1)); + } + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m = nOutputPlane; + long n = nInputPlane*kW*kH; + long k = columns->size[1]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n, m, k, + scale, + THCTensor_(data)(state, columns), k, + THCTensor_(data)(state, gradOutput_n), k, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradWeight), n + ); + + // Do Bias: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m_ = nOutputPlane; + long k_ = outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + if (gradBias) { + #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv( + #endif + state, + 't', + k_, m_, + scale, + 
THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), 1, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradBias), 1 + ); + #endif + #ifdef THC_REAL_IS_HALF + THCudaBlas_Hgemm( + state, + 't', 'n', + m_, 1, k_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), k_, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradBias), m_ + ); + #endif + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize + if (batch == 0) { + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + } +} + +#endif diff --git a/lib/THCUNN/generic/SpatialDilatedMaxPooling.cu b/lib/THCUNN/generic/SpatialDilatedMaxPooling.cu new file mode 100644 index 0000000..b99b5e6 --- /dev/null +++ b/lib/THCUNN/generic/SpatialDilatedMaxPooling.cu @@ -0,0 +1,155 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialDilatedMaxPooling.cu" +#else + +#include "../common.h" + +void THNN_(SpatialDilatedMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + bool ceil_mode) +{ + + THCUNN_assertSameGPU_generic(state, 3, input, output, indices); + THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected"); + + long nInputCols, nInputRows, nInputPlane, batchSize; + long nOutputCols, nOutputRows; + + if (input->nDimension == 3) { + nInputCols = input->size[2]; + nInputRows = input->size[1]; + nInputPlane = input->size[0]; + batchSize = 1; + } + else + { + nInputCols = input->size[3]; + nInputRows = input->size[2]; + nInputPlane = input->size[1]; + batchSize = input->size[0]; + } + + THArgCheck(nInputCols >= kW - padW && nInputRows >= kH - padH, 2, "input image smaller than kernel 
size"); + THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size"); + THArgCheck(dilationW > 0 && dilationH > 0, 11, "dilation should be greater than 0"); + if(ceil_mode) { + nOutputCols = ceil(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1; + nOutputRows = ceil(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1; + } + else { + nOutputCols = floor(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1; + nOutputRows = floor(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1; + } + +if (nOutputCols < 1 || nOutputRows < 1) + THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small", + nInputPlane,nInputRows,nInputCols,nInputPlane,nOutputRows,nOutputCols); + +if (padW || padH) + { + // ensure that the last pooling starts inside the image + if ((nOutputRows - 1)*dH >= nInputRows + padH) + --nOutputRows; + if ((nOutputCols - 1)*dW >= nInputCols + padW) + --nOutputCols; + } + + input = THCTensor_(newContiguous)(state, input); + real* input_data = THCTensor_(data)(state, input); + + THCTensor_(resize4d)(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols); + THCUNN_resizeAs_indices(state, indices, output); + + THCIndex_t* indices_data = THCIndexTensor_(data)(state, indices); + real* output_data = THCTensor_(data)(state, output); + + int count = THCTensor_(nElement)(state, output); + + MaxPoolForward<real, accreal> <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> + (count, input_data, + batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, output_data, indices_data); + THCudaCheck(cudaGetLastError()); + + if(input->nDimension == 3) + THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols); + + THCTensor_(free)(state, input); +} + +void 
THNN_(SpatialDilatedMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + bool ceil_mode) +{ + THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, indices, gradInput); + + input = THCTensor_(newContiguous)(state, input); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + long nInputCols, nInputRows, nInputPlane, batchSize; + long nOutputCols, nOutputRows; + + if (input->nDimension == 3) { + nInputCols = input->size[2]; + nInputRows = input->size[1]; + nInputPlane = input->size[0]; + batchSize = 1; + } + else + { + nInputCols = input->size[3]; + nInputRows = input->size[2]; + nInputPlane = input->size[1]; + batchSize = input->size[0]; + } + + if(ceil_mode) { + nOutputCols = ceil(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1; + nOutputRows = ceil(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1; + } + else { + nOutputCols = floor(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1; + nOutputRows = floor(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1; + } + + if (nOutputCols < 1 || nOutputRows < 1) + THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). 
Output size is too small", + nInputPlane,nInputRows,nInputCols,nInputPlane,nOutputRows,nOutputCols); + + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCTensor_(resizeAs)(state, gradInput, input); + + int count = THCTensor_(nElement)(state, input); + + MaxPoolBackward<real, accreal> <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> + (count, + THCTensor_(data)(state, gradOutput), + THCIndexTensor_(data)(state, indices), + batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + THCTensor_(data)(state, gradInput)); + THCudaCheck(cudaGetLastError()); + + THCTensor_(free)(state, gradOutput); + + // clean + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/lib/THCUNN/generic/SpatialFractionalMaxPooling.cu b/lib/THCUNN/generic/SpatialFractionalMaxPooling.cu new file mode 100644 index 0000000..6e7fc99 --- /dev/null +++ b/lib/THCUNN/generic/SpatialFractionalMaxPooling.cu @@ -0,0 +1,155 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialFractionalMaxPooling.cu" +#else + +void THNN_(SpatialFractionalMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THCIndexTensor *indices, + THCTensor *randomSamples) +{ + int planeDim = 0; + int dimh = 1; + int dimw = 2; + long numBatch = 1; + + long numInputDims = THCTensor_(nDimension)(state, input); + THArgCheck(numInputDims == 3 || numInputDims == 4, 2, + "3D or 4D (batch mode) tensor expected"); + + if (numInputDims == 4) { + numBatch = THCTensor_(size)(state, input, 0); + planeDim++; + dimh++; + dimw++; + } + + /* sizes */ + long numPlanes = THCTensor_(size)(state, input, planeDim); + long inputH = THCTensor_(size)(state, input, dimh); + long inputW = THCTensor_(size)(state, input, dimw); + + THArgCheck(outputH + poolSizeH - 1 < inputH, 6, + "poolSizeH too 
large relative to input height"); + THArgCheck(outputW + poolSizeW - 1 < inputW, 5, + "poolSizeW too large relative to input width"); + + THCDeviceTensor<real, 4> devInput; + THCDeviceTensor<real, 4> devOutput; + THCDeviceTensor<THCIndex_t, 4> devIndices; + THCDeviceTensor<real, 3> devSamples = + toDeviceTensor<real, 3>(state, randomSamples); + + if (numInputDims == 3) { + /* resize output */ + THCTensor_(resize3d)(state, output, numPlanes, outputH, outputW); + /* indices will contain the locations for each output point */ + THCIndexTensor_(resize3d)(state, indices, numPlanes, outputH, outputW); + + devInput = toDeviceTensor<real, 3>(state, input).upcastOuter<4>(); + devOutput = toDeviceTensor<real, 3>(state, output).upcastOuter<4>(); + devIndices = toDeviceTensor<THCIndex_t, 3>(state, indices).upcastOuter<4>(); + } else { + THCTensor_(resize4d)(state, output, numBatch, numPlanes, outputH, outputW); + /* indices will contain the locations for each output point */ + THCIndexTensor_(resize4d)(state, indices, numBatch, numPlanes, outputH, outputW); + + devInput = toDeviceTensor<real, 4>(state, input); + devOutput = toDeviceTensor<real, 4>(state, output); + devIndices = toDeviceTensor<THCIndex_t, 4>(state, indices); + } + + // block is limited to 4 warps + // grid handles overflow per each plane + int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3); + dim3 grid(THCCeilDiv(outputPlaneSize, 128), + devInput.getSize(1), + devInput.getSize(0)); + dim3 block(outputPlaneSize > 128 ? 
128 : outputPlaneSize); + +#define SFMP_UPDATE_OUTPUT(POOL_W) \ + SpatialFractionalMaxPooling_updateOutput<POOL_W, real, accreal> \ + <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \ + devInput, devOutput, devIndices, devSamples, poolSizeW, poolSizeH); + +#define SFMP_UPDATE_OUTPUT_CASE(POOL_W) \ + case POOL_W: SFMP_UPDATE_OUTPUT(POOL_W); break + + switch (poolSizeW) { + SFMP_UPDATE_OUTPUT_CASE(2); + SFMP_UPDATE_OUTPUT_CASE(3); + SFMP_UPDATE_OUTPUT_CASE(4); + SFMP_UPDATE_OUTPUT_CASE(5); + SFMP_UPDATE_OUTPUT_CASE(6); + SFMP_UPDATE_OUTPUT_CASE(7); + default: + // dynamic pool width + SFMP_UPDATE_OUTPUT_CASE(-1); + } + THCudaCheck(cudaGetLastError()); +} + +void THNN_(SpatialFractionalMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THCIndexTensor *indices) +{ + int dimh = 1; + int dimw = 2; + + long numInputDims = THCTensor_(nDimension)(state, input); + if (numInputDims == 4) { + dimh++; + dimw++; + } + + /* sizes */ + long inputH = THCTensor_(size)(state, input, dimh); + long inputW = THCTensor_(size)(state, input, dimw); + + THArgCheck(outputH == THCTensor_(size)(state, gradOutput, dimh), 3, + "gradOutput height unexpected"); + THArgCheck(outputW == THCTensor_(size)(state, gradOutput, dimw), 3, + "gradOutput width unexpected"); + + /* resize */ + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + THCDeviceTensor<real, 4> devGradInput; + THCDeviceTensor<real, 4> devGradOutput; + THCDeviceTensor<THCIndex_t, 4> devIndices; + + /* backprop */ + if (numInputDims == 3) { + devGradInput = toDeviceTensor<real, 3>(state, gradInput).upcastOuter<4>(); + devGradOutput = toDeviceTensor<real, 3>(state, gradOutput).upcastOuter<4>(); + devIndices = toDeviceTensor<THCIndex_t, 3>(state, indices).upcastOuter<4>(); + } else { + devGradInput = toDeviceTensor<real, 4>(state, gradInput); + devGradOutput = 
toDeviceTensor<real, 4>(state, gradOutput); + devIndices = toDeviceTensor<THCIndex_t, 4>(state, indices); + } + + // block is limited to 4 warps + // grid handles overflow per each plane + int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3); + dim3 grid(THCCeilDiv(outputPlaneSize, 128), + devGradInput.getSize(1), + devGradInput.getSize(0)); + dim3 block(outputPlaneSize > 128 ? 128 : outputPlaneSize); + + SpatialFractionalMaxPooling_updateGradInput + <<<grid, block, 0, THCState_getCurrentStream(state)>>>( + devGradInput, devGradOutput, devIndices); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/lib/THCUNN/generic/SpatialFullConvolution.cu b/lib/THCUNN/generic/SpatialFullConvolution.cu new file mode 100644 index 0000000..7f0ebc7 --- /dev/null +++ b/lib/THCUNN/generic/SpatialFullConvolution.cu @@ -0,0 +1,381 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialFullConvolution.cu" +#else + +void THNN_(SpatialFullConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH) +{ + + int nInputPlane = THCTensor_(size)(state, weight, 0); + int nOutputPlane = THCTensor_(size)(state, weight, 1); + + THCUNN_assertSameGPU_generic(state, 6, input, output, weight, + bias, columns, ones); + + THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); + + int batch = 1; + if (input->nDimension == 3) { + THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); + // Force batch + batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + } else { + THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); + } + + long inputWidth = input->size[3]; + long inputHeight = input->size[2]; + long 
outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; + long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH, inputHeight*inputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... + THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1)); + } + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m = weight->size[1] * weight->size[2] * weight->size[3]; + long n = columns->size[1]; + long k = weight->size[0]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 't', + n, m, k, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, input_n), n, + THCTensor_(data)(state, weight), m, + ScalarConvert<int, real>::to(0), + THCTensor_(data)(state, columns), n + ); + + // Unpack columns back into input: + col2im<real, accreal>( + THCState_getCurrentStream(state), + 
THCTensor_(data)(state, columns), + nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, + 1, 1, THCTensor_(data)(state, output_n) + ); + + // Do Bias after: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m_ = nOutputPlane; + long n_ = outputHeight * outputWidth; + long k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n_, m_, k_, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, ones), k_, + THCTensor_(data)(state, bias), k_, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, output_n), n_ + ); + } + + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + + // Resize output + if (batch == 0) { + THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + } +} + +void THNN_(SpatialFullConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH) +{ + int nInputPlane = THCTensor_(size)(state, weight, 0); + int nOutputPlane = THCTensor_(size)(state, weight, 1); + + THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight, + gradColumns, gradInput); + THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); + + int batch = 1; + if (input->nDimension == 3) { + // Force batch + batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], 
gradOutput->size[1], gradOutput->size[2]); + } + + long inputWidth = input->size[3]; + long inputHeight = input->size[2]; + long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; + long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradOutput_n), + nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, + 1, 1, THCTensor_(data)(state, gradColumns) + ); + + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m = weight->size[0]; + long n = gradColumns->size[1]; + long k = weight->size[1] * weight->size[2] * weight->size[3]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 'n', + n, m, k, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradColumns), n, + THCTensor_(data)(state, weight), k, + ScalarConvert<int, real>::to(0), + THCTensor_(data)(state, gradInput_n), n + ); + } + + + // Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (batch == 0) 
{ + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth); + } +} + + +void THNN_(SpatialFullConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH, + real scale) +{ + int nInputPlane = THCTensor_(size)(state, gradWeight, 0); + int nOutputPlane = THCTensor_(size)(state, gradWeight, 1); + + THCUNN_assertSameGPU_generic(state, 6, input, gradOutput, gradWeight, + gradBias, columns, ones); + + THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected"); + + int batch = 1; + if (input->nDimension == 3) { + // Force batch + batch = 0; + THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]); + THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + long inputWidth = input->size[3]; + long inputHeight = input->size[2]; + long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; + long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
+ THCTensor_(resize2d)(state, ones, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1)); + } + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH, inputHeight*inputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + im2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradOutput_n), + nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, + 1, 1, THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long n = columns->size[0]; // nOutputPlane * kh * kw + long m = input_n->size[0]; // nInputPlane + long k = columns->size[1]; // inputHeight * inputWidth + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n, m, k, + scale, + THCTensor_(data)(state, columns), k, + THCTensor_(data)(state, input_n), k, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradWeight), n + ); + + // Do Bias: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m_ = nOutputPlane; + long k_ = outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + if (gradBias) { + #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv( + #elif defined(THC_REAL_IS_DOUBLE) + 
THCudaBlas_Dgemv( + #endif + state, + 't', + k_, m_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), 1, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradBias), 1 + ); + #endif + #ifdef THC_REAL_IS_HALF + THCudaBlas_Hgemm( + state, + 't', 'n', + m_, 1, k_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), k_, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradBias), m_ + ); + #endif + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize + if (batch == 0) { + THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth); + THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth); + } +} + +#endif diff --git a/lib/THCUNN/generic/SpatialMaxPooling.cu b/lib/THCUNN/generic/SpatialMaxPooling.cu new file mode 100644 index 0000000..6be838d --- /dev/null +++ b/lib/THCUNN/generic/SpatialMaxPooling.cu @@ -0,0 +1,40 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialMaxPooling.cu" +#else + +#include "../common.h" + +void THNN_(SpatialMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode) +{ + THNN_(SpatialDilatedMaxPooling_updateOutput)( + state, input, output, indices, + kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode); + +} + +void THNN_(SpatialMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode) +{ + THNN_(SpatialDilatedMaxPooling_updateGradInput)( + state, input, gradOutput, gradInput, indices, + kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode); + +} + +#endif diff --git a/lib/THCUNN/generic/SpatialMaxUnpooling.cu b/lib/THCUNN/generic/SpatialMaxUnpooling.cu new file mode 
100644 index 0000000..8b0c15e --- /dev/null +++ b/lib/THCUNN/generic/SpatialMaxUnpooling.cu @@ -0,0 +1,93 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialMaxUnpooling.cu" +#else + +void THNN_(SpatialMaxUnpooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int owidth, int oheight) +{ + THCUNN_assertSameGPU_generic(state, 3, input, output, indices); + THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected"); + + long nInputCols, nInputRows, nInputPlane, batchSize; + + if (input->nDimension == 3) { + nInputCols = input->size[2]; + nInputRows = input->size[1]; + nInputPlane = input->size[0]; + batchSize = 1; + } + else + { + nInputCols = input->size[3]; + nInputRows = input->size[2]; + nInputPlane = input->size[1]; + batchSize = input->size[0]; + } + + input = THCTensor_(newContiguous)(state, input); + indices = THCIndexTensor_(newContiguous)(state, indices); + THCTensor_(resize4d)(state, output, batchSize, nInputPlane, oheight, owidth); + THCTensor_(zero)(state, output); + + int count = THCTensor_(nElement)(state, input); + + MaxUnpoolForward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> + (count, THCTensor_(data)(state, input), THCIndexTensor_(data)(state, indices), + batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCTensor_(data)(state, output)); + THCudaCheck(cudaGetLastError()); + + if(input->nDimension == 3) + THCTensor_(resize3d)(state, output, nInputPlane, oheight, owidth); + + THCTensor_(free)(state, input); + +} + +void THNN_(SpatialMaxUnpooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int owidth, int oheight) +{ + THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, indices, gradInput); + + long nInputCols, nInputRows, nInputPlane, batchSize; + + if (input->nDimension == 3) { + 
nInputCols = input->size[2]; + nInputRows = input->size[1]; + nInputPlane = input->size[0]; + batchSize = 1; + } + else + { + nInputCols = input->size[3]; + nInputRows = input->size[2]; + nInputPlane = input->size[1]; + batchSize = input->size[0]; + } + + input = THCTensor_(newContiguous)(state, input); + indices = THCIndexTensor_(newContiguous)(state, indices); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCTensor_(resizeAs)(state, gradInput, input); + + int count = THCTensor_(nElement)(state, input); + + MaxUnpoolBackward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> + (count, THCTensor_(data)(state, gradOutput), THCIndexTensor_(data)(state, indices), + batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCTensor_(data)(state, gradInput)); + THCudaCheck(cudaGetLastError()); + + // clean + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); +} + +#endif diff --git a/lib/THCUNN/generic/SpatialReflectionPadding.cu b/lib/THCUNN/generic/SpatialReflectionPadding.cu new file mode 100644 index 0000000..e4efa91 --- /dev/null +++ b/lib/THCUNN/generic/SpatialReflectionPadding.cu @@ -0,0 +1,110 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialReflectionPadding.cu" +#else + +void THNN_(SpatialReflectionPadding_updateOutput)(THCState *state, + THCTensor *input, + THCTensor *output, + int padL, int padR, + int padT, int padB) { + THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimh = 1; + int dimw = 2; + int numBatch = 1; + + int numInputDims = THCTensor_(nDimension)(state, input); + THArgCheck(numInputDims == 3 || numInputDims == 4, 2, + "input must be 3 or 4-dimensional"); + + if (numInputDims == 4) { + numBatch = THCTensor_(size)(state, input, 0); + planeDim++; + dimh++; + dimw++; + } + + int numPlanes = THCTensor_(size)(state, input, planeDim); + int inputH = 
THCTensor_(size)(state, input, dimh); + int inputW = THCTensor_(size)(state, input, dimw); + int outputH = inputH + padT + padB; + int outputW = inputW + padL + padR; + + THCDeviceTensor<real, 4> devInput; + THCDeviceTensor<real, 4> devOutput; + + if (numInputDims == 3) { + THCTensor_(resize3d)(state, output, numPlanes, outputH, outputW); + + devInput = toDeviceTensor<real, 3>(state, input).upcastOuter<4>(); + devOutput = toDeviceTensor<real, 3>(state, output).upcastOuter<4>(); + } else { + THCTensor_(resize4d)(state, output, numBatch, numPlanes, outputH, outputW); + + devInput = toDeviceTensor<real, 4>(state, input); + devOutput = toDeviceTensor<real, 4>(state, output); + } + + int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devOutput.getSize(1), + devOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); + + SpatialReflectionPadding_updateOutput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>( + devInput, devOutput, padT, padB, padL, padR); + THCudaCheck(cudaGetLastError()); +} + +void THNN_(SpatialReflectionPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int padL, int padR, + int padT, int padB) { + + THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, gradOutput), 3, + "output gradient tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimh = 1; + int dimw = 2; + + int numInputDims = THCTensor_(nDimension)(state, input); + if (numInputDims == 4) { + planeDim++; + dimh++; + dimw++; + } + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + THCDeviceTensor<real, 4> devGradInput; + THCDeviceTensor<real, 4> devGradOutput; + + if (numInputDims == 3) { + devGradInput = toDeviceTensor<real, 
3>(state, gradInput).upcastOuter<4>(); + devGradOutput = toDeviceTensor<real, 3>(state, gradOutput).upcastOuter<4>(); + } else { + devGradInput = toDeviceTensor<real, 4>(state, gradInput); + devGradOutput = toDeviceTensor<real, 4>(state, gradOutput); + } + + int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devGradOutput.getSize(1), + devGradOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); + + SpatialReflectionPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>( + devGradInput, devGradOutput, padT, padB, padL, padR); + THCudaCheck(cudaGetLastError()); +} + +#endif diff --git a/lib/THCUNN/generic/SpatialReplicationPadding.cu b/lib/THCUNN/generic/SpatialReplicationPadding.cu new file mode 100644 index 0000000..cd775d3 --- /dev/null +++ b/lib/THCUNN/generic/SpatialReplicationPadding.cu @@ -0,0 +1,111 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialReplicationPadding.cu" +#else + +void THNN_(SpatialReplicationPadding_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int padL, int padR, + int padT, int padB) { + THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimh = 1; + int dimw = 2; + int numBatch = 1; + + int numInputDims = THCTensor_(nDimension)(state, input); + THArgCheck(numInputDims == 3 || numInputDims == 4, 2, + "input must be 3 or 4-dimensional"); + + if (numInputDims == 4) { + numBatch = THCTensor_(size)(state, input, 0); + planeDim++; + dimh++; + dimw++; + } + + int numPlanes = THCTensor_(size)(state, input, planeDim); + int inputH = THCTensor_(size)(state, input, dimh); + int inputW = THCTensor_(size)(state, input, dimw); + int outputH = inputH + padT + padB; + int outputW = inputW + padL + padR; + + THCDeviceTensor<real, 4> devInput; + THCDeviceTensor<real, 4> 
devOutput; + + if (numInputDims == 3) { + THCTensor_(resize3d)(state, output, numPlanes, outputH, outputW); + + devInput = toDeviceTensor<real, 3>(state, input).upcastOuter<4>(); + devOutput = toDeviceTensor<real, 3>(state, output).upcastOuter<4>(); + } else { + THCTensor_(resize4d)(state, output, numBatch, numPlanes, outputH, outputW); + + devInput = toDeviceTensor<real, 4>(state, input); + devOutput = toDeviceTensor<real, 4>(state, output); + } + + int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devOutput.getSize(1), + devOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); + + SpatialReplicationPadding_updateOutput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>( + devInput, devOutput, padT, padB, padL, padR); + +} + +void THNN_(SpatialReplicationPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int padL, int padR, + int padT, int padB) { + + THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, gradOutput), 3, + "output gradient tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimh = 1; + int dimw = 2; + + int numInputDims = THCTensor_(nDimension)(state, input); + if (numInputDims == 4) { + planeDim++; + dimh++; + dimw++; + } + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + THCDeviceTensor<real, 4> devGradInput; + THCDeviceTensor<real, 4> devGradOutput; + + if (numInputDims == 3) { + devGradInput = toDeviceTensor<real, 3>(state, gradInput).upcastOuter<4>(); + devGradOutput = toDeviceTensor<real, 3>(state, gradOutput).upcastOuter<4>(); + } else { + devGradInput = toDeviceTensor<real, 4>(state, gradInput); + devGradOutput = toDeviceTensor<real, 4>(state, gradOutput); + } + + int 
outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devGradOutput.getSize(1), + devGradOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); + + SpatialReplicationPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>( + devGradInput, devGradOutput, padT, padB, padL, padR); + +} + +#endif diff --git a/lib/THCUNN/generic/SpatialSubSampling.cu b/lib/THCUNN/generic/SpatialSubSampling.cu new file mode 100644 index 0000000..4b5f238 --- /dev/null +++ b/lib/THCUNN/generic/SpatialSubSampling.cu @@ -0,0 +1,230 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/SpatialSubSampling.cu" +#else + +#include "../common.h" + +void THNN_(SpatialSubSampling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + int kW, int kH, + int dW, int dH) +{ + real *weight_data = THCTensor_(data)(state, weight); + real *bias_data = THCTensor_(data)(state, bias); + real *output_data; + real *input_data; + + int nInputPlane = THCTensor_(size)(state, weight, 0); + + THCUNN_assertSameGPU_generic(state, 4, input, output, weight, bias); + THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected"); + + if (input->nDimension == 3) { + long nInputCols = input->size[2]; + long nInputRows = input->size[1]; + long nOutputCols = (nInputCols - kW) / dW + 1; + long nOutputRows = (nInputRows - kH) / dH + 1; + + THArgCheck(input->size[0] == nInputPlane, 2, "invalid number of input planes"); + THArgCheck(nInputCols >= kW && nInputRows >= kH, 2, "input image smaller than kernel size"); + + input = THCTensor_(newContiguous)(state, input); + input_data = THCTensor_(data)(state, input); + + THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols); + output_data = THCTensor_(data)(state, output); + + // cuda blocks & threads: + int yblocks = (int)(16L / 
nInputPlane); + yblocks = yblocks < 1 ? 1 : yblocks; + dim3 blocks(nInputPlane,yblocks); + dim3 threads(32,8); + + // run subsample kernel + subsample<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( + input_data, output_data, weight_data, bias_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); + THCudaCheck(cudaGetLastError()); + } else { + long nInputCols = input->size[3]; + long nInputRows = input->size[2]; + long nbatch = input->size[0]; + long nOutputCols = (nInputCols - kW) / dW + 1; + long nOutputRows = (nInputRows - kH) / dH + 1; + + THArgCheck(input->size[1] == nInputPlane, 2, "invalid number of input planes"); + THArgCheck(nInputCols >= kW && nInputRows >= kH, 2, "input image smaller than kernel size"); + + input = THCTensor_(newContiguous)(state, input); + input_data = THCTensor_(data)(state, input); + + THCTensor_(resize4d)(state, output, nbatch, nInputPlane, nOutputRows, nOutputCols); + output_data = THCTensor_(data)(state, output); + + // cuda blocks & threads: + int yblocks = (int)(16L / nInputPlane); + yblocks = yblocks < 1 ? 
1 : yblocks; + dim3 blocks(nInputPlane*nbatch,yblocks); + dim3 threads(32,8); + + // run subsample kernel + subsample<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( + input_data, output_data, weight_data, bias_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); + THCudaCheck(cudaGetLastError()); + } + + // clean + THCTensor_(free)(state, input); + +} + +void THNN_(SpatialSubSampling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + int kW, int kH, + int dW, int dH) +{ + THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, weight, gradInput); + + int nInputPlane = THCTensor_(size)(state, weight, 0); + + if (input->nDimension == 3) { + long nInputCols = input->size[2]; + long nInputRows = input->size[1]; + + real *weight_data = THCTensor_(data)(state, weight); + real *gradOutput_data = THCTensor_(data)(state, gradOutput); + real *gradInput_data; + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + gradInput_data = THCTensor_(data)(state, gradInput); + + // cuda blocks & threads: + int yblocks = (int)(16L / nInputPlane); + yblocks = yblocks < 1 ? 
1 : yblocks; + dim3 blocks(nInputPlane,yblocks); + dim3 threads(32,8); + + // run updateGradInput kernel + if (kH <= dH && kW <= dW) { + subgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( + gradInput_data, gradOutput_data, weight_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); + } else { + subgradinputAtomic <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( + gradInput_data, gradOutput_data, weight_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); + } + THCudaCheck(cudaGetLastError()); + } else { + long nInputCols = input->size[3]; + long nInputRows = input->size[2]; + long nbatch = input->size[0]; + + real *weight_data = THCTensor_(data)(state, weight); + real *gradOutput_data = THCTensor_(data)(state, gradOutput); + real *gradInput_data; + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + gradInput_data = THCTensor_(data)(state, gradInput); + + // cuda blocks & threads: + int yblocks = (int)(16L / nInputPlane); + yblocks = yblocks < 1 ? 
1 : yblocks; + dim3 blocks(nInputPlane*nbatch,yblocks); + dim3 threads(32,8); + + // run updateGradInput kernel + if (kH <= dH && kW <= dW) { + subgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( + gradInput_data, gradOutput_data, weight_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); + } else { + subgradinputAtomic <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( + gradInput_data, gradOutput_data, weight_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW); + } + THCudaCheck(cudaGetLastError()); + } +} + +void THNN_(SpatialSubSampling_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + int kW, int kH, + int dW, int dH, + float scale) +{ + THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, gradWeight, gradBias); + + int nInputPlane = THCTensor_(size)(state, gradWeight, 0); + + if (input->nDimension == 3) { + long nInputCols = input->size[2]; + long nInputRows = input->size[1]; + + real *gradWeight_data = THCTensor_(data)(state, gradWeight); + real *gradBias_data = THCTensor_(data)(state, gradBias); + real *gradOutput_data = THCTensor_(data)(state, gradOutput); + real *input_data; + + input = THCTensor_(newContiguous)(state, input); + input_data = THCTensor_(data)(state, input); + + // cuda blocks & threads: + dim3 blocks(nInputPlane); + dim3 threads(32,8); + + // run gradweight kernel + subgradweight<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> ( + input_data, gradOutput_data, gradWeight_data, gradBias_data, + nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale); + THCudaCheck(cudaGetLastError()); + } else { + long nInputCols = input->size[3]; + long nInputRows = input->size[2]; + long nbatch = input->size[0]; + + real *gradWeight_data = THCTensor_(data)(state, gradWeight); + real *gradBias_data = THCTensor_(data)(state, gradBias); + real *gradOutput_data = THCTensor_(data)(state, 
gradOutput);
+    real *input_data;
+
+    input = THCTensor_(newContiguous)(state, input);
+    input_data = THCTensor_(data)(state, input);
+
+    // cuda blocks & threads:
+    dim3 blocks(nInputPlane);
+    dim3 threads(32,8);
+
+    // run gradweight kernel
+    long sl;
+    for (sl=0; sl<nbatch; sl++) {
+      subgradweight<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+        input_data + sl*input->stride[0],
+        gradOutput_data + sl*gradOutput->stride[0],
+        gradWeight_data, gradBias_data,
+        nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale);
+    }
+    THCudaCheck(cudaGetLastError());
+  }
+
+  // clean
+  THCTensor_(free)(state, input);
+
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialUpSamplingBilinear.cu b/lib/THCUNN/generic/SpatialUpSamplingBilinear.cu
new file mode 100644
index 0000000..e21f269
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialUpSamplingBilinear.cu
@@ -0,0 +1,72 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialUpSamplingBilinear.cu"
+#else
+
+// Forward pass: launches caffe_gpu_interp2_kernel to fill `output` from `input`.
+// Both tensors are accessed as 4-D device tensors; sizes 2 and 3 are used as
+// height and width. This function does not resize `output`: the target size is
+// read from `output` itself and `outputHeight`/`outputWidth` are not used here.
+void THNN_(SpatialUpSamplingBilinear_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           int outputHeight,
+           int outputWidth)
+{
+  // Validate on the caller's tensors before any temporary is allocated.
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+  input = THCTensor_(newContiguous)(state, input);
+  // Work on a contiguous version of the output but keep the caller's tensor,
+  // so the result can be copied back if newContiguous had to allocate a copy.
+  THCTensor *output_c = THCTensor_(newContiguous)(state, output);
+  THCTensor_(zero)(state, output_c);
+  THCDeviceTensor<real, 4> idata = toDeviceTensor<real, 4>(state, input);
+  THCDeviceTensor<real, 4> odata = toDeviceTensor<real, 4>(state, output_c);
+  int height1 = idata.getSize(2);
+  int width1 = idata.getSize(3);
+  int height2 = odata.getSize(2);
+  int width2 = odata.getSize(3);
+  assert( height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0);
+  // Input/output coordinate ratios; 0 when the output extent is a single pixel.
+  const accreal rheight= (height2 > 1) ? (accreal)(height1 - 1)/(height2 - 1) : accreal(0);
+  const accreal rwidth = (width2 > 1) ? (accreal)(width1 - 1)/(width2 - 1) : accreal(0);
+  // One kernel work item per output (height, width) position.
+  const int num_kernels = height2 * width2;
+  const int num_threads =
+    THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock;
+  cudaStream_t stream = THCState_getCurrentStream(state);
+  caffe_gpu_interp2_kernel<real, accreal> <<<THCCeilDiv(num_kernels, num_threads), num_threads ,
+   0 , stream>>>(num_kernels, rheight, rwidth, idata, odata);
+  THCudaCheck(cudaGetLastError());
+  THCTensor_(free)(state, input);
+  // Copies into `output` only when output_c is a distinct tensor, then releases it.
+  THCTensor_(freeCopyTo)(state, output_c, output);
+}
+
+
+// Backward pass: launches caffe_gpu_interp2_kernel_backward to accumulate
+// `gradOutput` into a zeroed `gradInput`. Sizes are read from the tensors
+// themselves (dims 2 and 3); `nbatch`, `nchannels` and the four height/width
+// arguments are not used in this body, and `gradInput` is not resized here.
+void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
+           THCState *state,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           int nbatch,
+           int nchannels,
+           int inputHeight,
+           int inputWidth,
+           int outputHeight,
+           int outputWidth)
+{
+  // Validate on the caller's tensors before any temporary is allocated.
+  THCUNN_assertSameGPU_generic(state, 2, gradOutput, gradInput);
+  // Keep the caller's gradInput so the result reaches it even when a
+  // contiguous copy had to be made.
+  THCTensor *gradInput_c = THCTensor_(newContiguous)(state, gradInput);
+  gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+  THCTensor_(zero)(state, gradInput_c);
+  THCDeviceTensor<real, 4> data1 = toDeviceTensor<real, 4>(state, gradInput_c);
+  THCDeviceTensor<real, 4> data2 = toDeviceTensor<real, 4>(state, gradOutput);
+  int height1 = data1.getSize(2);
+  int width1 = data1.getSize(3);
+  int height2 = data2.getSize(2);
+  int width2 = data2.getSize(3);
+  assert(height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0);
+  const accreal rheight= (height2 > 1) ? (accreal)(height1 - 1)/(height2 - 1) : accreal(0);
+  const accreal rwidth = (width2 > 1) ? (accreal)(width1 - 1) / (width2 - 1) : accreal(0);
+  // One kernel work item per gradOutput (height, width) position.
+  const int num_kernels = height2 * width2;
+  const int num_threads =
+    THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock;
+  cudaStream_t stream = THCState_getCurrentStream(state);
+  caffe_gpu_interp2_kernel_backward<real ,accreal> <<<THCCeilDiv(num_kernels, num_threads),
+  num_threads, 0, stream>>>(num_kernels, rheight, rwidth, data1, data2);
+  THCudaCheck(cudaGetLastError());
+  // Copies into `gradInput` only when gradInput_c is a distinct tensor, then releases it.
+  THCTensor_(freeCopyTo)(state, gradInput_c, gradInput);
+  THCTensor_(free)(state, gradOutput);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialUpSamplingNearest.cu b/lib/THCUNN/generic/SpatialUpSamplingNearest.cu
new file mode 100644
index 0000000..5bc4ff9
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialUpSamplingNearest.cu
@@ -0,0 +1,115 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialUpSamplingNearest.cu"
+#else
+
+#include "../common.h"
+
+// Forward pass: zeroes `output`, makes `input` contiguous and launches the
+// `upscale` kernel over no_elements = (product of input sizes) * scale_factor^2
+// work items. d1..d3 are the last three sizes of `output` (dims 0..2 when the
+// input is 3-D, dims 1..3 otherwise). `output` is not resized here.
+void THNN_(SpatialUpSamplingNearest_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           int scale_factor)
+{
+  THCTensor_(zero)(state, output);
+
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+
+  input = THCTensor_(newContiguous)(state, input);
+  // This is for allocating output Tensor
+  long no_elements = 1;
+  for(int i = 0; i < input->nDimension; i++){
+    no_elements *= input->size[i];
+  }
+  no_elements *= scale_factor * scale_factor;
+
+  int d1;
+  int d2;
+  int d3;
+
+  if (input->nDimension == 3) {
+    d1 = output->size[0];
+    d2 = output->size[1];
+    d3 = output->size[2];
+  } else {
+    d1 = output->size[1];
+    d2 = output->size[2];
+    d3 = output->size[3];
+  }
+
+  real *input_data = THCTensor_(data)(state, input);
+  real *output_data = THCTensor_(data)(state, output);
+
+  // cuda blocks & threads:
+  long nthreads = 256;
+  // Max number of blocks: http://en.wikipedia.org/wiki/CUDA
+  // 65535 for SM 2.x, 2^32 -1 for >= 3.0
+  // TODO: When we move to SM 3.5 we should update this
+  long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1),
65535);
+  // Integer ceil-division: a float quotient loses precision once no_elements
+  // exceeds 2^24 and can round the block count down, leaving elements unprocessed.
+  long n_yblocks = (no_elements + n_xblocks * nthreads - 1) / (n_xblocks * nthreads);
+  if (n_yblocks > 65535) {
+    // THError raises and does not return here, so release the contiguous input first.
+    THCTensor_(free)(state, input);
+    THError("Input size is too large! aborting");
+  }
+  dim3 blocks(n_xblocks, n_yblocks);
+  dim3 threads(nthreads);
+
+  // kernel:
+  upscale<<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (input_data, output_data, no_elements, scale_factor, d1, d2, d3);
+  THCudaCheck(cudaGetLastError());
+
+  // final cut:
+  THCTensor_(free)(state, input);
+}
+
+// Backward pass: zeroes `gradInput` and launches the `downscale` kernel with one
+// work item per gradInput element. d1..d3 are the last three sizes of
+// `gradInput` (dims 0..2 when it is 3-D, dims 1..3 otherwise). `input` is not
+// read in this body, and raw data pointers are used without a contiguity check.
+void THNN_(SpatialUpSamplingNearest_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           int scale_factor)
+{
+  THCUNN_assertSameGPU_generic(state, 2, gradOutput, gradInput);
+
+  THCTensor_(zero)(state, gradInput);
+
+  real *gradInput_data = THCTensor_(data)(state, gradInput);
+  real *gradOutput_data = THCTensor_(data)(state, gradOutput);
+
+  long no_elements = 1;
+  for(int i = 0; i < gradInput->nDimension; i++){
+    no_elements *= gradInput->size[i];
+  }
+
+  int d1;
+  int d2;
+  int d3;
+
+  if (gradInput->nDimension == 3) {
+    d1 = gradInput->size[0];
+    d2 = gradInput->size[1];
+    d3 = gradInput->size[2];
+  } else {
+    d1 = gradInput->size[1];
+    d2 = gradInput->size[2];
+    d3 = gradInput->size[3];
+  }
+
+  // cuda blocks & threads:
+  long nthreads = 256;
+  // Max number of blocks: http://en.wikipedia.org/wiki/CUDA
+  // 65535 for SM 2.x, 2^32 -1 for >= 3.0
+  // TODO: When we move to SM 3.5 we should update this
+  long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
+  // Integer ceil-division: a float quotient loses precision once no_elements
+  // exceeds 2^24 and can round the block count down, leaving elements unprocessed.
+  long n_yblocks = (no_elements + n_xblocks * nthreads - 1) / (n_xblocks * nthreads);
+  if (n_yblocks > 65535) {
+    THError("Input size is too large! aborting");
+  }
+  dim3 blocks(n_xblocks, n_yblocks);
+  dim3 threads(nthreads);
+
+  // kernel:
+  downscale<real ,accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data, no_elements,
+    scale_factor, d1, d2, d3);
+  THCudaCheck(cudaGetLastError());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/Sqrt.cu b/lib/THCUNN/generic/Sqrt.cu
new file mode 100644
index 0000000..a49798f
--- /dev/null
+++ b/lib/THCUNN/generic/Sqrt.cu
@@ -0,0 +1,30 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/Sqrt.cu"
+#else
+
+#include "../common.h"
+
+// Resizes `output` to the shape of `input`, then applies
+// sqrtupdateOutput_functor<real>(eps) point-wise over (output, input).
+void THNN_(Sqrt_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           real eps)
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+  THCTensor_(resizeAs)(state, output, input);
+  THC_pointwiseApply2(state, output, input, sqrtupdateOutput_functor<real>(eps));
+}
+
+// Resizes `gradInput` to the shape of `output`, then applies
+// sqrtupdateGradInput_functor<real>() point-wise over (gradInput, output, gradOutput).
+// `input` is not read in this body.
+void THNN_(Sqrt_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *output)
+{
+  THCUNN_assertSameGPU_generic(state, 3, output, gradOutput, gradInput);
+  THCTensor_(resizeAs)(state, gradInput, output);
+  THC_pointwiseApply3(state, gradInput, output, gradOutput, sqrtupdateGradInput_functor<real>());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/Square.cu b/lib/THCUNN/generic/Square.cu
new file mode 100644
index 0000000..06018d7
--- /dev/null
+++ b/lib/THCUNN/generic/Square.cu
@@ -0,0 +1,28 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/Square.cu"
+#else
+
+#include "../common.h"
+
+// Resizes `output` to the shape of `input`, then applies
+// squareupdateOutput_functor<real>() point-wise over (output, input).
+void THNN_(Square_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output)
+{
+  THCUNN_assertSameGPU_generic(state, 2, input, output);
+  THCTensor_(resizeAs)(state, output, input);
+  THC_pointwiseApply2(state, output, input, squareupdateOutput_functor<real>());
+}
+
+// Resizes `gradInput` to the shape of `input`, then applies
+// squareupdateGradInput_functor<real>() point-wise over (gradInput, input, gradOutput).
+void THNN_(Square_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput)
+{
+  THCUNN_assertSameGPU_generic(state, 3, input, gradOutput, gradInput);
+  THCTensor_(resizeAs)(state, gradInput, input);
+  THC_pointwiseApply3(state, gradInput, input, gradOutput, squareupdateGradInput_functor<real>());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/THCUNN.h b/lib/THCUNN/generic/THCUNN.h
new file mode 100644
index 0000000..91d68ae
--- /dev/null
+++ b/lib/THCUNN/generic/THCUNN.h
@@ -0,0 +1,1171 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCUNN.h"
+#else
+
+TH_API void THNN_(Abs_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output);
+
+TH_API void THNN_(Abs_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput);
+
+TH_API void THNN_(AbsCriterion_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *target,
+           THCTensor *output,
+           bool sizeAverage);
+
+TH_API void THNN_(AbsCriterion_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *target,
+           THCTensor *gradInput,
+           bool sizeAverage);
+
+TH_API void THNN_(BatchNormalization_updateOutput)(
+           THCState *state,
+           THCTensor *input_,
+           THCTensor *output_,
+           THCTensor *weight_, // [OPTIONAL]
+           THCTensor *bias_, // [OPTIONAL]
+           THCTensor *runningMean_,
+           THCTensor *runningVar_,
+           THCTensor *saveMean_,
+           THCTensor *saveStd_,
+           bool train,
+           double momentum,
+           double eps);
+
+TH_API void THNN_(BatchNormalization_backward)(
+           THCState *state,
+           THCTensor *input_,
+           THCTensor *gradOutput_,
+           THCTensor *gradInput_, // [OPTIONAL]
+           THCTensor *gradWeight_, // [OPTIONAL]
+           THCTensor *gradBias_, // [OPTIONAL]
+           THCTensor *weight_, // [OPTIONAL]
+           THCTensor *runningMean_,
+           THCTensor *runningVar_,
+           THCTensor *saveMean_,
+           THCTensor *saveStd_,
+           bool train,
+           float scale,
+           double eps);
+
+TH_API void THNN_(BCECriterion_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *target,
+           THCTensor *output,
+           bool sizeAverage,
+           THCTensor *weights); // [OPTIONAL]
+
+TH_API void
THNN_(BCECriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradInput, + bool sizeAverage, + THCTensor *weights); // [OPTIONAL] + +TH_API void THNN_(ClassNLLCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + bool sizeAverage, + THCTensor *weights, // [OPTIONAL] + THCTensor *total_weight); + +TH_API void THNN_(ClassNLLCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *gradInput, + bool sizeAverage, + THCTensor *weights, // [OPTIONAL] + THCTensor *total_weight); + +TH_API void THNN_(DistKLDivCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + bool sizeAverage); + +TH_API void THNN_(DistKLDivCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradInput, + bool sizeAverage); + +TH_API void THNN_(ELU_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + real alpha, + bool inplace); + +TH_API void THNN_(ELU_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output, + real alpha, + bool inplace); + +TH_API void THNN_(HardTanh_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + real min_val, + real max_val, + bool inplace); + +TH_API void THNN_(HardTanh_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + real min_val, + real max_val, + bool inplace); + +TH_API void THNN_(LeakyReLU_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + real negval, + bool inplace); + +TH_API void THNN_(LeakyReLU_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + real negval, + bool inplace); + +TH_API void THNN_(LogSigmoid_updateOutput)( + THCState *state, + THCTensor *input, + 
THCTensor *output, + THCTensor *buffer); + +TH_API void THNN_(LogSigmoid_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *buffer); + +TH_API void THNN_(LogSoftMax_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output); + +TH_API void THNN_(LogSoftMax_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output); + +TH_API void THNN_(LookupTable_accGradParameters)( + THCState *state, + THCIndexTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCIndexTensor *count, + THCIndexTensor *sorted, // [OPTIONAL] + THCIndexTensor *indices, // [OPTIONAL] + bool scaleGradByFreq, + int paddingValue, + real scale); + +TH_API void THNN_(LookupTable_renorm)( + THCState *state, + THCIndexTensor *idx, + THCTensor *weight, + real maxNorm, + real normType); + +TH_API void THNN_(L1Cost_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output); + +TH_API void THNN_(L1Cost_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, // [OPTIONAL] + THCTensor *gradInput); + +TH_API void THNN_(MarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + bool sizeAverage, + real margin); + +TH_API void THNN_(MarginCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradInput, + bool sizeAverage, + real margin); + +TH_API void THNN_(MSECriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + bool sizeAverage); + +TH_API void THNN_(MSECriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradInput, + bool sizeAverage); + +TH_API void THNN_(MultiLabelMarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + THCTensor *istarget, + 
bool sizeaverage);
+
+TH_API void THNN_(MultiLabelMarginCriterion_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCIndexTensor *target,
+           THCTensor *gradInput,
+           THCTensor *istarget,
+           bool sizeaverage);
+
+TH_API void THNN_(MultiMarginCriterion_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCIndexTensor *target,
+           THCTensor *output,
+           bool sizeAverage,
+           int p,
+           THCTensor *weights, // [OPTIONAL]
+           real margin);
+
+TH_API void THNN_(MultiMarginCriterion_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCIndexTensor *target,
+           THCTensor *gradInput,
+           bool sizeAverage,
+           int p,
+           THCTensor *weights, // [OPTIONAL]
+           real margin);
+
+TH_API void THNN_(PReLU_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCTensor *weight,
+           long nOutputPlane);
+
+TH_API void THNN_(PReLU_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *weight,
+           long nOutputPlane);
+
+TH_API void THNN_(PReLU_accGradParameters)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *weight,
+           THCTensor *gradWeight,
+           THCTensor *gradWeightBuf,
+           THCTensor *gradWeightBuf2,
+           long nOutputPlane,
+           real scale);
+
+TH_API void THNN_(SmoothL1Criterion_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *target,
+           THCTensor *output,
+           bool sizeAverage);
+
+TH_API void THNN_(SmoothL1Criterion_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *target,
+           THCTensor *gradInput,
+           bool sizeAverage);
+
+TH_API void THNN_(SparseLinear_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCTensor *weight,
+           THCTensor *bias);
+
+TH_API void
THNN_(SparseLinear_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *weight, + THCTensor *bias, + double weightDecay, + double scale); + +TH_API void THNN_(SparseLinear_legacyUpdateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias); + +TH_API void THNN_(SparseLinear_legacyAccGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *weight, + THCTensor *bias, + double weightDecay, + double scale); + +TH_API void THNN_(SparseLinear_zeroGradParameters)( + THCState *state, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *lastInput); + +TH_API void THNN_(SparseLinear_updateParameters)( + THCState *state, + THCTensor *weight, + THCTensor *bias, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *lastInput, + double learningRate); + +TH_API void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int nOutputCols, + int nOutputRows); + +TH_API void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices); + +TH_API void THNN_(SpatialAveragePooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode, + bool count_include_pad); + +TH_API void THNN_(SpatialAveragePooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode, + bool count_include_pad); + +TH_API void THNN_(SpatialClassNLLCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *output, + bool 
sizeAverage, + THCTensor *weights, // [OPTIONAL] + THCTensor *total_weight); + +TH_API void THNN_(SpatialClassNLLCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCIndexTensor *target, + THCTensor *gradInput, + bool sizeAverage, + THCTensor *weights, // [OPTIONAL] + THCTensor *total_weight); + +TH_API void THNN_(SpatialConvolutionLocal_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *finput, + THCTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + long inputWidth, long inputHeight, + long outputWidth, long outputHeight); + +TH_API void THNN_(SpatialConvolutionLocal_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + THCTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + long inputWidth, long inputHeight, + long outputWidth, long outputHeight); + +TH_API void THNN_(SpatialConvolutionLocal_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *finput, + THCTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + long inputWidth, long inputHeight, + long outputWidth, long outputHeight, + real scale); + +TH_API void THNN_(SpatialConvolutionMM_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH); + +TH_API void THNN_(SpatialConvolutionMM_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradColumns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH); + +TH_API void THNN_(SpatialConvolutionMM_accGradParameters)( + THCState *state, + 
THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + real scale); + +TH_API void THNN_(SpatialCrossMapLRN_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *scale, + int size, + real alpha, + real beta, + real k); + +TH_API void THNN_(SpatialCrossMapLRN_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *scale, + THCTensor *output, + int size, + real alpha, + real beta, + real k); + +TH_API void THNN_(SpatialDilatedConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); + +TH_API void THNN_(SpatialDilatedConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); + +TH_API void THNN_(SpatialDilatedConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + real scale); + +TH_API void THNN_(SpatialDilatedMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + bool ceil_mode); + +TH_API void THNN_(SpatialDilatedMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor 
*indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + bool ceil_mode); + +TH_API void THNN_(SpatialFractionalMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THCIndexTensor *indices, + THCTensor *randomSamples); + +TH_API void THNN_(SpatialFractionalMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THCIndexTensor *indices); + +TH_API void THNN_(SpatialFullConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH); + +TH_API void THNN_(SpatialFullConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH); + +TH_API void THNN_(SpatialFullConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, // [OPTIONAL] + THCTensor *columns, + THCTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH, + real scale); + +TH_API void THNN_(SpatialMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode); + +TH_API void THNN_(SpatialMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode); + +TH_API void 
THNN_(SpatialMaxUnpooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int owidth, int oheight); + +TH_API void THNN_(SpatialMaxUnpooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int owidth, int oheight); + +TH_API void THNN_(SpatialReflectionPadding_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int padL, int padR, + int padT, int padB); + +TH_API void THNN_(SpatialReflectionPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int padL, int padR, + int padT, int padB); + +TH_API void THNN_(SpatialReplicationPadding_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int padL, int padR, + int padT, int padB); + +TH_API void THNN_(SpatialReplicationPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int padL, int padR, + int padT, int padB); + +TH_API void THNN_(SpatialSubSampling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + int kW, int kH, + int dW, int dH); + +TH_API void THNN_(SpatialSubSampling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + int kW, int kH, + int dW, int dH); + +TH_API void THNN_(SpatialSubSampling_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + int kW, int kH, + int dW, int dH, + float scale); + +TH_API void THNN_(SpatialUpSamplingBilinear_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int outputHeight, + int outputWidth); + +TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)( + THCState *state, + THCTensor *gradOutput, + THCTensor 
*gradInput, + int nbatch, + int nchannels, + int inputHeight, + int inputWidth, + int outputHeight, + int outputWidth); + +TH_API void THNN_(SpatialUpSamplingNearest_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int scale_factor); + +TH_API void THNN_(SpatialUpSamplingNearest_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int scale_factor); + +TH_API void THNN_(RReLU_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *noise, + double lower, + double upper, + bool train, + bool inplace, + void *generator); + +TH_API void THNN_(RReLU_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *noise, + double lower, + double upper, + bool train, + bool inplace); + +TH_API void THNN_(Sigmoid_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output); + +TH_API void THNN_(Sigmoid_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output); + +TH_API void THNN_(SoftMarginCriterion_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *output, + int sizeAverage); + +TH_API void THNN_(SoftMarginCriterion_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *target, + THCTensor *gradInput, + int sizeAverage); + +TH_API void THNN_(SoftMax_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output); + +TH_API void THNN_(SoftMax_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output); + +TH_API void THNN_(SoftPlus_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + real beta, + real threshold); + +TH_API void THNN_(SoftPlus_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output, + real 
beta, + real threshold); + +TH_API void THNN_(SoftShrink_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + real lambda); + +TH_API void THNN_(SoftShrink_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + real lambda); + +TH_API void THNN_(Square_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output); + +TH_API void THNN_(Square_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput); + +TH_API void THNN_(Sqrt_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + real eps); + +TH_API void THNN_(Sqrt_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output); + +TH_API void THNN_(Tanh_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output); + +TH_API void THNN_(Tanh_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *output); + +TH_API void THNN_(TemporalConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + int kW, int dW, + int inputFrameSize, + int outputFrameSize); + +TH_API void THNN_(TemporalConvolution_updateGradInput)( + THCState* state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + int kW, int dW); + +TH_API void THNN_(TemporalConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + int kW, int dW, + real scale); + +TH_API void THNN_(TemporalMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kW, int dW); + +TH_API void THNN_(TemporalMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + 
THCIndexTensor *indices,
+           int kW, int dW);
+
+TH_API void THNN_(Threshold_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           real threshold,
+           real val,
+           bool inplace);
+
+TH_API void THNN_(Threshold_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           real threshold,
+           real val,
+           bool inplace);
+
+TH_API void THNN_(VolumetricAveragePooling_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           int kT, int kW, int kH,
+           int dT, int dW, int dH);
+
+TH_API void THNN_(VolumetricAveragePooling_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           int kT, int kW, int kH,
+           int dT, int dW, int dH);
+
+TH_API void THNN_(VolumetricConvolution_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCTensor *weight,
+           THCTensor *bias,
+           THCTensor *finput,
+           THCTensor *fgradInput,
+           int dT, int dW, int dH,
+           int padT, int padW, int padH);
+
+TH_API void THNN_(VolumetricConvolution_updateGradInput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradInput,
+           THCTensor *weight,
+           THCTensor *finput,
+           int dT, int dW, int dH,
+           int padT, int padW, int padH);
+
+TH_API void THNN_(VolumetricConvolution_accGradParameters)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *gradOutput,
+           THCTensor *gradWeight,
+           THCTensor *gradBias,
+           THCTensor *finput,
+           THCTensor *fgradInput,
+           int dT, int dW, int dH,
+           int padT, int padW, int padH,
+           real scale);
+
+TH_API void THNN_(VolumetricDilatedConvolution_updateOutput)(
+           THCState *state,
+           THCTensor *input,
+           THCTensor *output,
+           THCTensor *weight,
+           THCTensor *bias,
+           THCTensor *columns,
+           THCTensor *ones,
+           int kT, int kW, int kH,
+ int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH); + +TH_API void THNN_(VolumetricDilatedConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradColumns, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH); + +TH_API void THNN_(VolumetricDilatedConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *columns, + THCTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + real scale); + +TH_API void THNN_(VolumetricDilatedMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + bool ceilMode); + +TH_API void THNN_(VolumetricDilatedMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH); + +TH_API void THNN_(VolumetricFullConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *finput, + THCTensor *fgradInput, + int dT, int dW, int dH, + int padT, int padW, int padH, + int adjT, int adjW, int adjH); + +TH_API void THNN_(VolumetricFullConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + THCTensor *fgradInput, + int dT, int dW, int dH, + int padT, int padW, int padH, + int adjT, int adjW, 
int adjH); + +TH_API void THNN_(VolumetricFullConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *finput, + THCTensor *fgradInput, + int dT, int dW, int dH, + int padT, int padW, int padH, + int adjT, int adjW, int adjH, + real scale); + +TH_API void THNN_(VolumetricMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceilMode); + +TH_API void THNN_(VolumetricMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int dT, int dW, int dH, + int padT, int padW, int padH); + +TH_API void THNN_(VolumetricMaxUnpooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int outputTime, int outputWidth, int outputHeight, + int dT, int dW, int dH, + int padT, int padW, int padH); + +TH_API void THNN_(VolumetricMaxUnpooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int outputTime, int outputWidth, int outputHeight, + int dT, int dW, int dH, + int padT, int padW, int padH); + +TH_API void THNN_(VolumetricReplicationPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback); + +TH_API void THNN_(VolumetricReplicationPadding_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback); + +#endif diff --git a/lib/THCUNN/generic/Tanh.cu b/lib/THCUNN/generic/Tanh.cu new file mode 100644 index 0000000..3db4ce0 --- /dev/null +++ b/lib/THCUNN/generic/Tanh.cu @@ -0,0 +1,29 @@ +#ifndef 
THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/Tanh.cu" +#else +

#include "../common.h"

/*
 * Tanh forward pass.
 *
 * Checks that `input` and `output` live on the same GPU, resizes `output` to
 * the shape of `input`, then applies tanhupdateOutput_functor<real> pointwise
 * with `output` as the first operand and `input` as the second.
 * (The functor itself is defined outside this generic file.)
 */
void THNN_(Tanh_updateOutput)(
  THCState *state,
  THCTensor *input,
  THCTensor *output)
{
  THCUNN_assertSameGPU_generic(state, 2, input, output);
  THCTensor_(resizeAs)(state, output, input);

  tanhupdateOutput_functor<real> forwardOp = tanhupdateOutput_functor<real>();
  THC_pointwiseApply2(state, output, input, forwardOp);
}

/*
 * Tanh backward pass.
 *
 * Works from the already computed `output` rather than from `input`:
 * `gradInput` is resized to the shape of `output` and filled by applying
 * tanhupdateGradInput_functor<real> pointwise over
 * (gradInput, output, gradOutput).  `input` is accepted for interface
 * symmetry but is not read here.
 */
void THNN_(Tanh_updateGradInput)(
  THCState *state,
  THCTensor *input,
  THCTensor *gradOutput,
  THCTensor *gradInput,
  THCTensor *output)
{
  THCUNN_assertSameGPU_generic(state, 3, output, gradOutput, gradInput);
  THCTensor_(resizeAs)(state, gradInput, output);

  tanhupdateGradInput_functor<real> backwardOp = tanhupdateGradInput_functor<real>();
  THC_pointwiseApply3(state, gradInput, output, gradOutput, backwardOp);
}

#endif
diff --git a/lib/THCUNN/generic/TemporalConvolution.cu b/lib/THCUNN/generic/TemporalConvolution.cu new file mode 100644 index 0000000..f074cb7 --- /dev/null +++ b/lib/THCUNN/generic/TemporalConvolution.cu @@ -0,0 +1,348 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/TemporalConvolution.cu" +#else + +void THNN_(TemporalConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + int kW, int dW, + int inputFrameSize, + int outputFrameSize) { + + THCTensor *outputWindow, *inputWindow; + int nInputFrame, nOutputFrame; + long k, i; + + int dimS = 0; // sequence dimension + int dimF = 1; // feature dimension + + THCUNN_assertSameGPU_generic(state, 4, input, output, weight, bias); + THArgCheck( input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected"); + + if (input->nDimension == 3) + { + dimS = 1; + dimF = 2; + } + THArgCheck( input->size[dimF] == inputFrameSize, 2, "invalid input frame size"); + THArgCheck( input->size[dimS] >= kW, 2, "input sequence smaller than kernel size"); + + input = THCTensor_(newContiguous)(state, input); + outputWindow = THCTensor_(new)(state); + inputWindow = 
THCTensor_(new)(state); + + nInputFrame = input->size[dimS]; + nOutputFrame = (nInputFrame - kW) / dW + 1; + + if (input->nDimension == 2) + { + THCTensor_(resize2d)(state, output, + nOutputFrame, + outputFrameSize); + + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THCTensor_(select)(state, outputWindow, output, 0, k); + THCTensor_(copy)(state, outputWindow, bias); + } + + + /* ouch */ + for(k = 0; nOutputFrame > 0; k++) + { + long outputFrameStride = (kW-1)/dW+1; + long inputFrameStride = outputFrameStride*dW; + long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputFrame -= nFrame; + + THCTensor_(setStorage2d)(state, inputWindow, input->storage, + input->storageOffset+k*dW*input->size[1], + nFrame, inputFrameStride*input->size[1], + kW*input->size[1], 1); + + THCTensor_(setStorage2d)(state, outputWindow, output->storage, + output->storageOffset + k*output->size[1], + nFrame, outputFrameStride*output->size[1], + output->size[1], 1); + + THCTensor_(transpose)(state, weight, NULL, 0, 1); + THCTensor_(addmm)(state, outputWindow, ScalarConvert<int, real>::to(1), outputWindow, ScalarConvert<int, real>::to(1), inputWindow, weight); + THCTensor_(transpose)(state, weight, NULL, 0, 1); + } + } + else + { + THCTensor *outputSample = THCTensor_(new)(state); + THCTensor *inputSample = THCTensor_(new)(state); + int nBatchFrame = input->size[0]; + + THCTensor_(resize3d)(state, output, + nBatchFrame, + nOutputFrame, + outputFrameSize); + + for(i = 0; i < nBatchFrame; i++) + { + THCTensor_(select)(state, outputSample, output, 0, i); + THCTensor_(select)(state, inputSample, input, 0, i); + long nOutputSampleFrame = nOutputFrame; + + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THCTensor_(select)(state, outputWindow, outputSample, 0, k); + THCTensor_(copy)(state, outputWindow, bias); + } + + /* ouch */ + for(k = 0; nOutputSampleFrame > 0; k++) + { + long outputFrameStride = (kW-1)/dW+1; + long inputFrameStride = outputFrameStride*dW; + long 
nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputSampleFrame -= nFrame; + + THCTensor_(setStorage2d)(state, inputWindow, inputSample->storage, + inputSample->storageOffset+k*dW*inputSample->size[1], + nFrame, inputFrameStride*inputSample->size[1], + kW*inputSample->size[1], 1); + + THCTensor_(setStorage2d)(state, outputWindow, outputSample->storage, + outputSample->storageOffset + k*outputSample->size[1], + nFrame, outputFrameStride*outputSample->size[1], + outputSample->size[1], 1); + + THCTensor_(transpose)(state, weight, NULL, 0, 1); + THCTensor_(addmm)(state, outputWindow, ScalarConvert<int, real>::to(1), outputWindow, ScalarConvert<int, real>::to(1), inputWindow, weight); + THCTensor_(transpose)(state, weight, NULL, 0, 1); + } + } + THCTensor_(free)(state, outputSample); + THCTensor_(free)(state, inputSample); + } + + THCTensor_(free)(state, outputWindow); + THCTensor_(free)(state, inputWindow); + THCTensor_(free)(state, input); + +} + +void THNN_(TemporalConvolution_updateGradInput)( + THCState* state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + int kW, int dW) { + + long nInputFrame; + long nOutputFrame; + + THCTensor *gradOutputWindow; + THCTensor *gradInputWindow; + long k, i; + + int dimS = 0; // sequence dimension + + THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, weight, gradInput); + + if (gradOutput->nDimension == 3) + { + dimS = 1; + } + + nInputFrame = input->size[dimS]; + nOutputFrame = gradOutput->size[dimS]; + + + /* Not necessary with partial backprop: */ + gradOutputWindow = THCTensor_(new)(state); + gradInputWindow = THCTensor_(new)(state); + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + if (gradOutput->nDimension == 2) + { + /* ouch */ + for(k = 0; nOutputFrame > 0; k++) + { + long outputFrameStride = (kW-1)/dW+1; + long inputFrameStride = outputFrameStride*dW; + long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + 
nOutputFrame -= nFrame; + + THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutput->storage, + gradOutput->storageOffset + k*gradOutput->size[1], + nFrame, outputFrameStride*gradOutput->size[1], + gradOutput->size[1], 1); + + THCTensor_(setStorage2d)(state, gradInputWindow, gradInput->storage, + gradInput->storageOffset+k*dW*gradInput->size[1], + nFrame, inputFrameStride*gradInput->size[1], + kW*gradInput->size[1], 1); + + THCTensor_(addmm)(state, gradInputWindow, ScalarConvert<int, real>::to(1), gradInputWindow, ScalarConvert<int, real>::to(1), gradOutputWindow, weight); + } + } + else + { + THCTensor *gradOutputSample = THCTensor_(new)(state); + THCTensor *gradInputSample = THCTensor_(new)(state); + long nBatchFrame = input->size[0]; + for(i = 0; i < nBatchFrame; i++) + { + THCTensor_(select)(state, gradOutputSample, gradOutput, 0, i); + THCTensor_(select)(state, gradInputSample, gradInput, 0, i); + long nOutputSampleFrame = nOutputFrame; + + /* ouch */ + for(k = 0; nOutputSampleFrame > 0; k++) + { + long outputFrameStride = (kW-1)/dW+1; + long inputFrameStride = outputFrameStride*dW; + long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputSampleFrame -= nFrame; + + THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutputSample->storage, + gradOutputSample->storageOffset + k*gradOutputSample->size[1], + nFrame, outputFrameStride*gradOutputSample->size[1], + gradOutputSample->size[1], 1); + + THCTensor_(setStorage2d)(state, gradInputWindow, gradInputSample->storage, + gradInputSample->storageOffset+k*dW*gradInputSample->size[1], + nFrame, inputFrameStride*gradInputSample->size[1], + kW*gradInputSample->size[1], 1); + + THCTensor_(addmm)(state, gradInputWindow, ScalarConvert<int, real>::to(1), gradInputWindow, ScalarConvert<int, real>::to(1), gradOutputWindow, weight); + } + } + THCTensor_(free)(state, gradOutputSample); + THCTensor_(free)(state, gradInputSample); + } + + THCTensor_(free)(state, gradOutputWindow); + THCTensor_(free)(state, 
gradInputWindow); + +} +

/*
 * Accumulates the parameter gradients of TemporalConvolution.
 *
 *   gradBias   += scale * (every output frame of gradOutput)
 *   gradWeight += scale * gradOutputWindow^T * inputWindow
 *
 * `input` / `gradOutput` are either 2D (frames x features) or 3D with a
 * leading batch dimension; the batch case simply repeats the 2D procedure on
 * each sample.
 *
 * The "ouch" loops do not copy anything: on pass k they build strided views
 * (setStorage2d) that treat kW consecutive input frames as a single row, and
 * pick every outputFrameStride-th output frame starting at frame k, so that
 * one addmm handles nFrame output frames at once.  The loop ends once all
 * output frames have been consumed.
 *
 * NOTE(review): the gradOutput views are built from size-derived strides,
 * which presumably assumes gradOutput is contiguous -- confirm with callers.
 */
void THNN_(TemporalConvolution_accGradParameters)(
  THCState *state,
  THCTensor *input,
  THCTensor *gradOutput,
  THCTensor *gradWeight,
  THCTensor *gradBias,
  int kW, int dW,
  real scale) {

  long nInputFrame;
  long nOutputFrame;

  THCTensor *gradOutputWindow;
  THCTensor *inputWindow;
  long k, i;

  int dimS = 0; // sequence dimension

  // Same device check as the other TemporalConvolution entry points.
  THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, gradWeight, gradBias);

  if (gradOutput->nDimension == 3)
  {
    dimS = 1;
  }

  nInputFrame = input->size[dimS];
  nOutputFrame = gradOutput->size[dimS];

  /* Not necessary with partial backprop: */
  input = THCTensor_(newContiguous)(state, input);
  gradOutputWindow = THCTensor_(new)(state);
  inputWindow = THCTensor_(new)(state);

  if (input->nDimension == 2)
  {
    /* bias first */
    for(k = 0; k < nOutputFrame; k++)
    {
      THCTensor_(select)(state, gradOutputWindow, gradOutput, 0, k);
      THCTensor_(cadd)(state, gradBias, gradBias, scale, gradOutputWindow);
    }

    /* ouch */
    for(k = 0; nOutputFrame > 0; k++)
    {
      long outputFrameStride = (kW-1)/dW+1;
      long inputFrameStride = outputFrameStride*dW;
      long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
      nOutputFrame -= nFrame;

      THCTensor_(setStorage2d)(state, inputWindow, input->storage,
                               input->storageOffset+k*dW*input->size[1],
                               nFrame, inputFrameStride*input->size[1],
                               kW*input->size[1], 1);

      THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutput->storage,
                               gradOutput->storageOffset + k*gradOutput->size[1],
                               nFrame, outputFrameStride*gradOutput->size[1],
                               gradOutput->size[1], 1);

      // Transpose the view for the product, then restore it.
      THCTensor_(transpose)(state, gradOutputWindow, NULL, 0, 1);
      THCTensor_(addmm)(state, gradWeight, ScalarConvert<int, real>::to(1), gradWeight, scale, gradOutputWindow, inputWindow);
      THCTensor_(transpose)(state, gradOutputWindow, NULL, 0, 1);
    }
  }
  else
  {
    THCTensor *gradOutputSample = THCTensor_(new)(state);
    THCTensor *inputSample = THCTensor_(new)(state);
    long nBatchFrame = input->size[0];

    for(i = 0; i < nBatchFrame; i++)
    {
      THCTensor_(select)(state, gradOutputSample, gradOutput, 0, i);
      THCTensor_(select)(state, inputSample, input, 0, i);
      // Per-sample countdown; nOutputFrame itself must survive for the next sample.
      long nOutputSampleFrame = nOutputFrame;

      /* bias first */
      for(k = 0; k < nOutputFrame; k++)
      {
        THCTensor_(select)(state, gradOutputWindow, gradOutputSample, 0, k);
        THCTensor_(cadd)(state, gradBias, gradBias, scale, gradOutputWindow);
      }

      /* ouch */
      for(k = 0; nOutputSampleFrame > 0; k++)
      {
        long outputFrameStride = (kW-1)/dW+1;
        long inputFrameStride = outputFrameStride*dW;
        long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
        nOutputSampleFrame -= nFrame;

        THCTensor_(setStorage2d)(state, inputWindow, inputSample->storage,
                                 inputSample->storageOffset+k*dW*inputSample->size[1],
                                 nFrame, inputFrameStride*inputSample->size[1],
                                 kW*inputSample->size[1], 1);

        THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutputSample->storage,
                                 gradOutputSample->storageOffset + k*gradOutputSample->size[1],
                                 nFrame, outputFrameStride*gradOutputSample->size[1],
                                 gradOutputSample->size[1], 1);

        THCTensor_(transpose)(state, gradOutputWindow, NULL, 0, 1);
        THCTensor_(addmm)(state, gradWeight, ScalarConvert<int, real>::to(1), gradWeight, scale, gradOutputWindow, inputWindow);
        THCTensor_(transpose)(state, gradOutputWindow, NULL, 0, 1);
      }
    }
    THCTensor_(free)(state, gradOutputSample);
    THCTensor_(free)(state, inputSample);
  }

  THCTensor_(free)(state, gradOutputWindow);
  THCTensor_(free)(state, inputWindow);
  THCTensor_(free)(state, input);  // releases the newContiguous reference taken above

}

#endif
diff --git a/lib/THCUNN/generic/TemporalMaxPooling.cu b/lib/THCUNN/generic/TemporalMaxPooling.cu new file mode 100644 index 0000000..d833c5a --- /dev/null +++ b/lib/THCUNN/generic/TemporalMaxPooling.cu @@ -0,0 +1,151 @@
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/TemporalMaxPooling.cu"
#else

/*
 * TemporalMaxPooling forward pass.
 *
 * `input` is 2D (time x features) or 3D (batch x time x features).  `output`
 * and `indices` are resized to the same layout with
 *     output_w = (input_w - kW) / dW + 1
 * time steps, and filled by cunn_TemporalMaxPooling_updateOutputKernel
 * (defined outside this generic file).
 *
 * Launch shape: grid.x = batch; the thread count is output_w rounded up to a
 * multiple of 32 and, if that exceeds TEMPORAL_MAX_POOLING_THREADS, it is
 * split over grid.y blocks of TEMPORAL_MAX_POOLING_THREADS threads each.
 */
void THNN_(TemporalMaxPooling_updateOutput)(
  THCState *state,
  THCTensor *input,
  THCTensor *output,
  THCIndexTensor *indices,
  int kW, int dW) {

  int dimT = 0; // Temporal dimension
  int dimF = 1; // Feature dimension

  int batch = 1;
  int input_w;
  int input_n;
  int output_w;
  int nthreads;

  real *input_data;
  real *output_data;
  THCIndex_t *indices_data;

  THCUNN_assertSameGPU_generic(state, 3, input, output, indices);
  THArgCheck( input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");

  if (input->nDimension == 3)
  {
    dimT = 1;
    dimF = 2;
    batch = input->size[0];
  }
  THArgCheck( input->size[dimT] >= kW, 2, "input sequence smaller than kernel size");

  // From here on `input` is an owned contiguous reference, freed at the end.
  input = THCTensor_(newContiguous)(state, input);

  input_w = input->size[dimT];
  input_n = input->size[dimF];
  output_w = (input_w - kW) / dW + 1;

  if (input->nDimension == 2)
  {
    THCTensor_(resize2d)(state, output, output_w, input->size[dimF]);
    THCIndexTensor_(resize2d)(state, indices, output_w, input->size[dimF]);
  }
  else
  {
    THCTensor_(resize3d)(state, output, batch, output_w, input->size[dimF]);
    THCIndexTensor_(resize3d)(state, indices, batch, output_w, input->size[dimF]);
  }

  input_data = THCTensor_(data)(state, input);
  output_data = THCTensor_(data)(state, output);
  indices_data = THCIndexTensor_(data)(state, indices);

  dim3 blocks(batch);
  // Round output_w up to the next multiple of 32.
  nthreads = (output_w / 32) * 32;
  if (output_w % 32 > 0) {
    nthreads += 32;
  }

  if (nthreads > TEMPORAL_MAX_POOLING_THREADS) {
    // Ceil-divide the work over grid.y and cap the block size.
    blocks.y = nthreads / TEMPORAL_MAX_POOLING_THREADS;
    if (nthreads % TEMPORAL_MAX_POOLING_THREADS > 0) {
      blocks.y += 1;
    }
    nthreads = TEMPORAL_MAX_POOLING_THREADS;
  }

  dim3 threads(nthreads);
  cunn_TemporalMaxPooling_updateOutputKernel <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>(
      input_data, output_data, indices_data, input_w, input_n, output_w, kW, dW);
  THCudaCheck(cudaGetLastError());
  THCTensor_(free)(state, input);

}

/*
 * TemporalMaxPooling backward pass.
 *
 * `gradInput` is resized like `input` and zeroed, then filled from
 * `gradOutput` using the `indices` recorded by the forward pass.  The launch
 * shape is computed exactly as in updateOutput.
 *
 * Kernel choice: with kW <= dW the pooling windows do not overlap and the
 * plain kernel is used; otherwise windows overlap and the *Atomic kernel is
 * launched instead (presumably because several outputs may then write the
 * same gradInput element -- the kernels are defined outside this file).
 */
void THNN_(TemporalMaxPooling_updateGradInput)(
  THCState *state,
  THCTensor *input,
  THCTensor *gradOutput,
  THCTensor *gradInput,
  THCIndexTensor *indices,
  int kW, int dW) {

  int dimT = 0; // Temporal dimension
  int dimF = 1; // Feature dimension

  int batch = 1;
  int input_w;
  int input_n;
  int output_w;
  int nthreads;

  real *gradInput_data;
  real *gradOutput_data;
  THCIndex_t *indices_data;

  THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, gradInput, indices);
  THArgCheck( input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");

  THCTensor_(resizeAs)(state, gradInput, input);
  THCTensor_(zero)(state, gradInput);

  if (input->nDimension == 3)
  {
    dimT = 1;
    dimF = 2;
    batch = input->size[0];
  }
  THArgCheck( input->size[dimT] >= kW, 2, "input sequence smaller than kernel size");

  // Owned contiguous reference, freed at the end.
  gradOutput = THCTensor_(newContiguous)(state, gradOutput);

  input_w = input->size[dimT];
  input_n = input->size[dimF];
  output_w = (input_w - kW) / dW + 1;

  gradInput_data = THCTensor_(data)(state, gradInput);
  gradOutput_data = THCTensor_(data)(state, gradOutput);
  indices_data = THCIndexTensor_(data)(state, indices);

  dim3 blocks(batch);
  // Round output_w up to the next multiple of 32.
  nthreads = (output_w / 32) * 32;
  if (output_w % 32 > 0) {
    nthreads += 32;
  }

  if (nthreads > TEMPORAL_MAX_POOLING_THREADS) {
    blocks.y = nthreads / TEMPORAL_MAX_POOLING_THREADS;
    if (nthreads % TEMPORAL_MAX_POOLING_THREADS > 0) {
      blocks.y += 1;
    }
    nthreads = TEMPORAL_MAX_POOLING_THREADS;
  }

  dim3 threads(nthreads);
  if (kW <= dW) {
    cunn_TemporalMaxPooling_updateGradInputKernel <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>(
        gradInput_data, gradOutput_data, indices_data, input_w, input_n, output_w, kW, dW);
  } else {
    cunn_TemporalMaxPooling_updateGradInputKernelAtomic <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>(
        gradInput_data, gradOutput_data, indices_data, input_w, input_n, output_w, kW, dW);
  }
  THCudaCheck(cudaGetLastError());
  THCTensor_(free)(state, gradOutput);

}

#endif
diff --git 
a/lib/THCUNN/generic/Threshold.cu b/lib/THCUNN/generic/Threshold.cu new file mode 100644 index 0000000..0ab84c1 --- /dev/null +++ b/lib/THCUNN/generic/Threshold.cu @@ -0,0 +1,64 @@
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/Threshold.cu"
#else

#include "../common.h"

/*
 * Threshold forward pass.
 *
 * inplace == true : ThresholdUpdateOutputIP<real>(threshold, val) is applied
 *                   directly to `input`, and `output` is then set to refer to
 *                   `input` (THCTensor_(set)).
 * inplace == false: `output` is resized like `input` and filled by
 *                   ThresholdUpdateOutput<real>(threshold, val).
 *
 * The functors are defined outside this generic file.
 */
void THNN_(Threshold_updateOutput)(
  THCState *state,
  THCTensor *input,
  THCTensor *output,
  real threshold,
  real val,
  bool inplace)
{
  THCUNN_assertSameGPU_generic(state, 2, input, output);

  if (inplace)
  {
    THC_pointwiseApply1(state, input,
      ThresholdUpdateOutputIP<real>(threshold, val)
    );
    THCTensor_(set)(state, output, input);
  }
  else
  {
    THCTensor_(resizeAs)(state, output, input);
    THC_pointwiseApply2(state, output, input,
      ThresholdUpdateOutput<real>(threshold, val)
    );
  }

  THCudaCheck(cudaGetLastError());
}

/*
 * Threshold backward pass.
 *
 * inplace == true : `gradOutput` is overwritten in place (functor applied
 *                   over gradOutput and input) and `gradInput` is set to
 *                   refer to it.
 * inplace == false: `gradInput` is resized like `input` and computed from
 *                   (input, gradOutput).
 *
 * Only `threshold` is forwarded to the functors; `val` is part of the
 * signature but is not used in this function.
 */
void THNN_(Threshold_updateGradInput)(
  THCState *state,
  THCTensor *input,
  THCTensor *gradOutput,
  THCTensor *gradInput,
  real threshold,
  real val,
  bool inplace)
{
  THCUNN_assertSameGPU_generic(state, 3, input, gradInput, gradOutput);

  if (inplace)
  {
    THC_pointwiseApply2(state, gradOutput, input,
      ThresholdUpdateGradInputIP<real>(threshold)
    );
    THCTensor_(set)(state, gradInput, gradOutput);
  }
  else
  {
    THCTensor_(resizeAs)(state, gradInput, input);
    THC_pointwiseApply3(state, gradInput, input, gradOutput,
      ThresholdUpdateGradInput<real>(threshold)
    );
  }

  THCudaCheck(cudaGetLastError());
}

#endif
diff --git a/lib/THCUNN/generic/VolumetricAveragePooling.cu b/lib/THCUNN/generic/VolumetricAveragePooling.cu new file mode 100644 index 0000000..62823a8 --- /dev/null +++ b/lib/THCUNN/generic/VolumetricAveragePooling.cu @@ -0,0 +1,237 @@
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/VolumetricAveragePooling.cu"
#else

/*
 * VolumetricAveragePooling forward pass.
 *
 * `input` is 4D (slices x time x height x width) or 5D with a leading batch
 * dimension; anything else raises "4D or 5D tensor expected".  Each spatial
 * extent must be at least the kernel extent.  `output` is resized to
 *     out = (in - k) / d + 1
 * along time, height and width.
 *
 * Note the parameter order: the signature takes (kT, kW, kH) / (dT, dW, dH)
 * while the kernels are invoked with (kT, kH, kW) / (dT, dH, dW).
 *
 * The batch and slice dimensions are collapsed into one, and the resulting
 * z-extent (outputTime * inputSlices * batchSize) is processed in chunks of at
 * most 65535 grid.z blocks, with `offsetZ` telling the kernel where the chunk
 * starts.
 *
 * The generic kernel is launched on the current THC stream, like the other
 * kernels in this library.  Kernel widths 1..7 go through
 * LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH, which is defined outside this file.
 */
void THNN_(VolumetricAveragePooling_updateOutput)(
  THCState *state,
  THCTensor *input,
  THCTensor *output,
  int kT, int kW, int kH,
  int dT, int dW, int dH)
{
  int batchSize;
  int inputSlices;
  int inputTime;
  int inputHeight;
  int inputWidth;

  if (THCTensor_(nDimension)(state, input) == 4)
  {
    THArgCheck(
      THCTensor_(size)(state, input, 1) >= kT &&
      THCTensor_(size)(state, input, 2) >= kH &&
      THCTensor_(size)(state, input, 3) >= kW, 2,
      "input image smaller than kernel size"
    );

    /* sizes */
    batchSize   = 1;
    inputSlices = THCTensor_(size)(state, input, 0);
    inputTime   = THCTensor_(size)(state, input, 1);
    inputHeight = THCTensor_(size)(state, input, 2);
    inputWidth  = THCTensor_(size)(state, input, 3);
  }
  else if (THCTensor_(nDimension)(state, input) == 5)
  {
    THArgCheck(
      THCTensor_(size)(state, input, 2) >= kT &&
      THCTensor_(size)(state, input, 3) >= kH &&
      THCTensor_(size)(state, input, 4) >= kW, 2,
      "input image smaller than kernel size"

    );
    /* sizes */
    batchSize   = THCTensor_(size)(state, input, 0);
    inputSlices = THCTensor_(size)(state, input, 1);
    inputTime   = THCTensor_(size)(state, input, 2);
    inputHeight = THCTensor_(size)(state, input, 3);
    inputWidth  = THCTensor_(size)(state, input, 4);
  }
  else
  {
    THArgCheck(false, 2, "4D or 5D tensor expected");
  }

  int outputTime   = (inputTime   - kT) / dT + 1;
  int outputHeight = (inputHeight - kH) / dH + 1;
  int outputWidth  = (inputWidth  - kW) / dW + 1;

  if (input->nDimension == 4) /* 4D */
  {
    /* resize output */
    THCTensor_(resize4d)(state, output, inputSlices,
                         outputTime, outputHeight, outputWidth);
  }
  else /* 5D */
  {
    THCTensor_(resize5d)(state, output, batchSize, inputSlices,
                         outputTime, outputHeight, outputWidth);
  }

  // Owned contiguous reference, freed at the end.
  input = THCTensor_(newContiguous)(state, input);

  // Collapse batch and feature dimensions
  THCDeviceTensor<real, 4> cudaInput;
  THCDeviceTensor<real, 4> cudaOutput;
  if (THCTensor_(nDimension)(state, input) == 4)
  {
    cudaInput  = toDeviceTensor<real, 4>(state, input);
    cudaOutput = toDeviceTensor<real, 4>(state, output);
  }
  else
  {
    cudaInput  = toDeviceTensor<real, 5>(state, input).downcastOuter<4>();
    cudaOutput = toDeviceTensor<real, 5>(state, output).downcastOuter<4>();
  }

  int totalZ = outputTime * inputSlices * batchSize;
  int offsetZ = 0;
  dim3 block(32, 8);
  while (totalZ > 0) {
    dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)),
              THCCeilDiv(outputHeight, static_cast<int>(block.y)),
              totalZ > 65535 ? 65535 : totalZ);

    // 1 / (kernel volume), computed in the accumulation type.
    accreal normFactor = ScalarConvert<int, accreal>::to(1) / static_cast<accreal>(kT * kH * kW);
    switch (kW)
    {
      LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(1);
      LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(2);
      LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(3);
      LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(4);
      LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(5);
      LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(6);
      LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(7);
    default:
      // Launch on the current THC stream rather than the default stream.
      cuda_VolumetricAveragePooling_updateOutput<real, accreal>
        <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
          cudaInput,
          cudaOutput,
          kT, kH, kW,
          dT, dH, dW,
          normFactor,
          offsetZ
        );
      break;
    }
    totalZ -= 65535;
    offsetZ += 65535;
    THCudaCheck(cudaGetLastError());
  }
  THCTensor_(free)(state, input);
}

/*
 * VolumetricAveragePooling backward pass.
 *
 * `gradInput` is resized like `input` and zeroed, then one of three kernels
 * (all defined outside this file) is launched in chunks of at most 65535
 * grid.z blocks:
 *
 *   - stride 1 in every dimension : ..._updateGradInput_Stride1, iterating
 *                                   over the *input* extents;
 *   - otherwise, windows overlap
 *     (some d < k)                : ..._updateGradInput_atomicAdd;
 *   - otherwise                   : ..._updateGradInput.
 *
 * The latter two iterate over the *output* extents, read from `gradOutput`.
 *
 * All launches go to the current THC stream, and the stride-1 normalisation
 * factor is computed in `accreal`, matching updateOutput, so the double
 * instantiation is not limited to single precision.
 */
void THNN_(VolumetricAveragePooling_updateGradInput)(
  THCState *state,
  THCTensor *input,
  THCTensor *gradOutput,
  THCTensor *gradInput,
  int kT, int kW, int kH,
  int dT, int dW, int dH)
{
  bool kernelsOverlap = (dT < kT) || (dH < kH) || (dW < kW);

  // Resize and initialize result tensor.
  THCTensor_(resizeAs)(state, gradInput, input);
  THCTensor_(zero)(state, gradInput);

  int batchSize;
  int inputSlices;
  int inputTime;
  int inputHeight;
  int inputWidth;

  int outputTime;
  int outputHeight;
  int outputWidth;

  if (THCTensor_(nDimension)(state, input) == 4) /* 4D */
  {
    batchSize   = 1;
    inputSlices = THCTensor_(size)(state, input, 0);
    inputTime   = THCTensor_(size)(state, input, 1);
    inputHeight = THCTensor_(size)(state, input, 2);
    inputWidth  = THCTensor_(size)(state, input, 3);

    outputTime   = THCTensor_(size)(state, gradOutput, 1);
    outputHeight = THCTensor_(size)(state, gradOutput, 2);
    outputWidth  = THCTensor_(size)(state, gradOutput, 3);
  }
  else
  {
    batchSize   = THCTensor_(size)(state, input, 0);
    inputSlices = THCTensor_(size)(state, input, 1);
    inputTime   = THCTensor_(size)(state, input, 2);
    inputHeight = THCTensor_(size)(state, input, 3);
    inputWidth  = THCTensor_(size)(state, input, 4);

    outputTime   = THCTensor_(size)(state, gradOutput, 2);
    outputHeight = THCTensor_(size)(state, gradOutput, 3);
    outputWidth  = THCTensor_(size)(state, gradOutput, 4);
  }

  // Owned contiguous reference, freed at the end.
  gradOutput = THCTensor_(newContiguous)(state, gradOutput);

  // Collapse batch and feature dimensions
  THCDeviceTensor<real, 4> cudaGradInput;
  THCDeviceTensor<real, 4> cudaGradOutput;
  if (THCTensor_(nDimension)(state, input) == 4)
  {
    cudaGradInput  = toDeviceTensor<real, 4>(state, gradInput);
    cudaGradOutput = toDeviceTensor<real, 4>(state, gradOutput);
  }
  else
  {
    cudaGradInput =
      toDeviceTensor<real, 5>(state, gradInput).downcastOuter<4>();
    cudaGradOutput =
      toDeviceTensor<real, 5>(state, gradOutput).downcastOuter<4>();
  }

  dim3 block(32, 8);

  // Optimizing for stride 1 is probably only of limited value, but this
  // specialization yields 3x speedup over the atomicAdd implementation.
  if (dT == 1 && dH == 1 && dW == 1)
  {
    // 1 / (kernel volume), in the accumulation type (was a float literal).
    accreal normFactor = ScalarConvert<int, accreal>::to(1) / static_cast<accreal>(kT * kH * kW);

    int totalZ = inputTime * inputSlices * batchSize;
    int offsetZ = 0;
    while (totalZ > 0) {
      dim3 grid(THCCeilDiv(inputWidth, static_cast<int>(block.x)),
                THCCeilDiv(inputHeight, static_cast<int>(block.y)),
                totalZ > 65535 ? 65535 : totalZ);
      cuda_VolumetricAveragePooling_updateGradInput_Stride1<real, accreal>
        <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
          cudaGradOutput, cudaGradInput, kT, kH, kW, normFactor, offsetZ);
      THCudaCheck(cudaGetLastError());
      totalZ -= 65535;
      offsetZ += 65535;
    }
  }
  else
  {
    int totalZ = outputTime * inputSlices * batchSize;
    int offsetZ = 0;
    while (totalZ > 0) {

      dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)),
                THCCeilDiv(outputHeight, static_cast<int>(block.y)),
                totalZ > 65535 ? 65535 : totalZ);
      if (kernelsOverlap)
      {
        cuda_VolumetricAveragePooling_updateGradInput_atomicAdd<real, accreal>
          <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
            cudaGradOutput, cudaGradInput, kT, kH, kW, dT, dH, dW, offsetZ);
      }
      else
      {
        cuda_VolumetricAveragePooling_updateGradInput<real, accreal>
          <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
            cudaGradOutput, cudaGradInput, kT, kH, kW, dT, dH, dW, offsetZ);
      }
      THCudaCheck(cudaGetLastError());
      totalZ -= 65535;
      offsetZ += 65535;
    }
  }

  THCTensor_(free)(state, gradOutput);
}

#endif
diff --git a/lib/THCUNN/generic/VolumetricConvolution.cu b/lib/THCUNN/generic/VolumetricConvolution.cu new file mode 100644 index 0000000..8ed0bca --- /dev/null +++ b/lib/THCUNN/generic/VolumetricConvolution.cu @@ -0,0 +1,416 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricConvolution.cu" +#else + +void THNN_(VolumetricConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *finput, + THCTensor *fgradInput, + int dT, int dW, int dH, + int padT, int padW, int padH) +{ + THCTensor *columns = finput; + THCTensor *ones = fgradInput; + THCUNN_assertSameGPU_generic(state, 6, input, output, weight, bias, 
columns, ones); + + THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, + "4D or 5D (batch mode) tensor is expected" + ); + + THArgCheck(weight->nDimension == 5, 4, + "5D weight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)" + ); + + int nOutputPlane = (int)weight->size[0]; + int nInputPlane = (int)weight->size[1]; + int kT = (int)weight->size[2]; + int kH = (int)weight->size[3]; + int kW = (int)weight->size[4]; + + int batch = 1; + if (input->nDimension == 4) + { + // Force batch + batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], + input->size[2], input->size[3]); + } + + long inputWidth = input->size[3]; + long inputHeight = input->size[2]; + long inputDepth = input->size[4]; + long outputWidth = (inputWidth + 2*padH - kH) / dH + 1; + long outputHeight = (inputHeight + 2*padT - kT) / dT + 1; + long outputDepth = (inputDepth + 2*padW - kW) / dW + 1; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THCTensor_(resize5d)(state, output, batchSize, nOutputPlane, + outputHeight, outputWidth, outputDepth); + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH*kT, outputDepth*outputHeight*outputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + { + // Resize plane and fill with ones... 
+ THCTensor_(resize3d)(state, ones, outputHeight, outputWidth, outputDepth); + THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1)); + } + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) + { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // Do Bias first: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m_ = nOutputPlane; + long n_ = outputDepth * outputHeight * outputWidth; + long k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n_, m_, k_, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, ones), k_, + THCTensor_(data)(state, bias), k_, + ScalarConvert<int, real>::to(0), + THCTensor_(data)(state, output_n), n_ + ); + + // Extract columns: + im3d2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW, + THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m = weight->size[0]; + long n = columns->size[1]; + long k = weight->size[1]*weight->size[2]*weight->size[3]*weight->size[4]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 'n', + n, m, k, + ScalarConvert<int, real>::to(1), + 
THCTensor_(data)(state, columns), n, + THCTensor_(data)(state, weight), k, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, output_n), n + ); + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + + // Resize output + if (batch == 0) + { + THCTensor_(resize4d)(state, output, nOutputPlane, outputHeight, outputWidth, outputDepth); + THCTensor_(resize4d)(state, input, nInputPlane, inputHeight, inputWidth, inputDepth); + } +} + +void THNN_(VolumetricConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + int dT, int dW, int dH, + int padT, int padW, int padH) +{ + THArgCheck(weight->nDimension == 5, 4, + "5D weight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)" + ); + + int nOutputPlane = (int)weight->size[0]; + int nInputPlane = (int)weight->size[1]; + int kT = (int)weight->size[2]; + int kH = (int)weight->size[3]; + int kW = (int)weight->size[4]; + + THCTensor *gradColumns = finput; + + THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight, gradColumns, gradInput); + THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, + "4D or 5D (batch mode) tensor is expected" + ); + + int batch = 1; + if (input->nDimension == 4) + { + // Force batch + batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + long inputWidth = input->size[3]; + long inputHeight = input->size[2]; + long inputDepth = input->size[4]; + long outputWidth = (inputWidth + 2*padH - kH) / dH + 1; + long outputHeight = (inputHeight + 2*padT - kT) / dT + 1; + long outputDepth = (inputDepth + 2*padW - kW) / dW + 1; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + 
THCTensor_(resize5d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth, inputDepth); + + // Resize temporary columns + THCTensor_(resize2d)(state, gradColumns, nInputPlane*kH*kT*kW, outputDepth*outputHeight*outputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) + { + // Matrix mulitply per sample: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m = weight->size[1]*weight->size[2]*weight->size[3]*weight->size[4]; + long n = gradColumns->size[1]; + long k = weight->size[0]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 't', + n, m, k, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradOutput_n), n, + THCTensor_(data)(state, weight), m, + ScalarConvert<int, real>::to(0), + THCTensor_(data)(state, gradColumns), n + ); + + // Unpack columns back into input: + col2im3d<real, accreal>( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradColumns), + nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW, + THCTensor_(data)(state, gradInput_n) + ); + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (batch == 0) + { + THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth, outputDepth); + 
+ THCTensor_(resize4d)(state, input, nInputPlane, inputHeight, inputWidth, inputDepth); + THCTensor_(resize4d)(state, gradInput, nInputPlane, inputHeight, inputWidth, inputDepth); + } +} + +void THNN_(VolumetricConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *finput, + THCTensor *fgradInput, + int dT, int dW, int dH, + int padT, int padW, int padH, + real scale) +{ + THCTensor *columns = finput; + THCTensor *ones = fgradInput; + THCUNN_assertSameGPU_generic(state, 6, input, gradOutput, gradWeight, gradBias, columns, ones); + + THArgCheck(gradWeight->nDimension == 5, 4, + "5D gradWeight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)" + ); + + int nOutputPlane = (int)gradWeight->size[0]; + int nInputPlane = (int)gradWeight->size[1]; + int kT = (int)gradWeight->size[2]; + int kH = (int)gradWeight->size[3]; + int kW = (int)gradWeight->size[4]; + + // Message matches the check below (4D, or 5D in batch mode) + THArgCheck( + input->nDimension == 4 || input->nDimension == 5, 2, + "4D or 5D (batch mode) tensor is expected" + ); + + int batch = 1; + if (input->nDimension == 4) + { + // Force batch + batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + long inputWidth = input->size[3]; + long inputHeight = input->size[2]; + long inputDepth = input->size[4]; + long outputWidth = (inputWidth + 2*padH - kH) / dH + 1; + long outputHeight = (inputHeight + 2*padT - kT) / dT + 1; + long outputDepth = (inputDepth + 2*padW - kW) / dW + 1; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + { + // Resize plane and fill with ones... 
+ THCTensor_(resize3d)(state, ones, outputHeight, outputWidth, outputDepth); + THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1)); + } + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kH*kT*kW, outputDepth*outputHeight*outputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) + { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + im3d2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW, + THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m = gradWeight->size[0]; + long n = gradWeight->size[1]*gradWeight->size[2]*gradWeight->size[3]*gradWeight->size[4]; + long k = columns->size[1]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n, m, k, + scale, + THCTensor_(data)(state, columns), k, + THCTensor_(data)(state, gradOutput_n), k, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradWeight), n + ); + + // Do Bias: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m_ = nOutputPlane; + long k_ = outputDepth * outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv( + #elif 
defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv( + #endif + state, + 't', + k_, m_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), 1, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradBias), 1 + ); + #endif + #ifdef THC_REAL_IS_HALF + THCudaBlas_Hgemm( + state, + 't', 'n', + m_, 1, k_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), k_, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradBias), m_ + ); + #endif + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize + if (batch == 0) + { + THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth, outputDepth); + THCTensor_(resize4d)(state, input, nInputPlane, inputHeight, inputWidth, inputDepth); + } +} + +#endif diff --git a/lib/THCUNN/generic/VolumetricDilatedConvolution.cu b/lib/THCUNN/generic/VolumetricDilatedConvolution.cu new file mode 100644 index 0000000..22536fe --- /dev/null +++ b/lib/THCUNN/generic/VolumetricDilatedConvolution.cu @@ -0,0 +1,411 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricDilatedConvolution.cu" +#else + +void THNN_(VolumetricDilatedConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *columns, + THCTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH) { + + THCUNN_assertSameGPU_generic(state, 5, input, output, weight, columns, ones); + if (bias) { + THCUNN_assertSameGPU_generic(state, 2, weight, bias); + } + THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected, but got: %d", input->nDimension); + THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)"); + THArgCheck(!bias || weight->size[0] == bias->size[0], 4, 
"nOutputPlane mismatch in weight and bias"); + THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero"); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero"); + THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 16, "dilation should be greater than 0"); + + // Params: + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + + int batch = 1; + if (input->nDimension == 4) { + THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); + // Force batch + batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + } else { + THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); + } + + long inputDepth = input->size[2]; + long inputHeight = input->size[3]; + long inputWidth = input->size[4]; + long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1) + THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small", + nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth); + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THCTensor_(resize5d)(state, output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. 
+ // The ones buffer is 3D (see resize3d below), so the reuse check must compare against 3 before reading size[2] + if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... + THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1)); + } + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix multiply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // Do Bias first: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m_ = nOutputPlane; + long n_ = outputDepth * outputHeight * outputWidth; + long k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n_, m_, k_, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, ones), k_, + THCTensor_(data)(state, bias), k_, + ScalarConvert<int, real>::to(0), + THCTensor_(data)(state, output_n), n_ + ); + } else { + THCTensor_(zero)(state, output_n); + } + + // Extract columns: + vol2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputDepth, inputHeight, inputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m = nOutputPlane; + long n = columns->size[1]; + long k = nInputPlane*kT*kH*kW; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + 
THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 'n', + n, m, k, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, columns), n, + THCTensor_(data)(state, weight), k, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, output_n), n + ); + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + + // Resize output + if (batch == 0) { + THCTensor_(resize4d)(state, output, nOutputPlane, outputDepth, outputHeight, outputWidth); + THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth); + } +} + +void THNN_(VolumetricDilatedConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *gradColumns, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH) { + + THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight, + gradColumns, gradInput); + THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected"); + THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected"); + THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)"); + THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero"); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero"); + + // Params + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + + int batch = 1; + if (input->nDimension == 4) { + // Force batch + batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], 
gradOutput->size[3]); + } + + long inputDepth = input->size[2]; + long inputWidth = input->size[4]; + long inputHeight = input->size[3]; + long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THCTensor_(resize5d)(state, gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, gradColumns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m = nInputPlane*kT*kW*kH; + long n = gradColumns->size[1]; + long k = nOutputPlane; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 't', + n, m, k, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradOutput_n), n, + THCTensor_(data)(state, weight), m, + ScalarConvert<int, real>::to(0), + THCTensor_(data)(state, gradColumns), n + ); + + // Unpack columns back into input: + col2vol<real, accreal>( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradColumns), + nInputPlane, inputDepth, inputHeight, inputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + 
dilationT, dilationH, dilationW, + THCTensor_(data)(state, gradInput_n) + ); + } + + // Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (batch == 0) { + THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth); + THCTensor_(resize4d)(state, gradInput, nInputPlane, inputDepth, inputHeight, inputWidth); + } +} + +void THNN_(VolumetricDilatedConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *columns, + THCTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + real scale) { + + THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, gradWeight, columns, ones); + if (gradBias) { + THCUNN_assertSameGPU_generic(state, 2, gradWeight, gradBias); + } + THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected"); + THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected"); + THArgCheck(gradWeight->nDimension == 5, 4, "gradWeight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)"); + THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero"); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero"); + + // Params + int nInputPlane = gradWeight->size[1]; + int nOutputPlane = gradWeight->size[0]; + + int batch = 1; + if (input->nDimension == 4) { + // Force batch + batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + long inputDepth = 
input->size[2]; + long inputWidth = input->size[4]; + long inputHeight = input->size[3]; + long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... + THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1)); + } + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + vol2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, input_n), + nInputPlane, inputDepth, inputHeight, inputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m = nOutputPlane; + long n = nInputPlane*kT*kW*kH; + long k = columns->size[1]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n, m, k, + 
scale, + THCTensor_(data)(state, columns), k, + THCTensor_(data)(state, gradOutput_n), k, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradWeight), n + ); + + // Do Bias: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m_ = nOutputPlane; + long k_ = outputDepth * outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + if (gradBias) { + #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv( + #endif + state, + 't', + k_, m_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), 1, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradBias), 1 + ); + #endif + #ifdef THC_REAL_IS_HALF + THCudaBlas_Hgemm( + state, + 't', 'n', + m_, 1, k_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), k_, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradBias), m_ + ); + #endif + } + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (batch == 0) { + THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth); + } +} + +#endif diff --git a/lib/THCUNN/generic/VolumetricDilatedMaxPooling.cu b/lib/THCUNN/generic/VolumetricDilatedMaxPooling.cu new file mode 100644 index 0000000..0d6d084 --- /dev/null +++ b/lib/THCUNN/generic/VolumetricDilatedMaxPooling.cu @@ -0,0 +1,290 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricDilatedMaxPooling.cu" +#else + +#define UPDATE_OUTPUT_KERNEL_WIDTH(KW) case KW: \ + cuda_VolumetricDilatedMaxPooling_updateOutput<KW><<<grid, block, \ + 0, THCState_getCurrentStream(state)>>>( \ + cudaInput, cudaIndices, 
cudaOutput, kT, kH, dT, dH, dW, padT, padH, padW,\ + dilationT, dilationH, dilationW, offsetZ); \ + break + +void THNN_(VolumetricDilatedMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + bool ceilMode) +{ + int batchSize; + int inputSlices; + int inputTime; + int inputHeight; + int inputWidth; + int outputTime; + int outputHeight; + int outputWidth; + + THCUNN_assertSameGPU_generic(state, 3, input, indices, output); + + if (THCTensor_(nDimension)(state, input) == 4) + { + THArgCheck( + THCTensor_(size)(state, input, 1) >= kT && + THCTensor_(size)(state, input, 2) >= kH && + THCTensor_(size)(state, input, 3) >= kW, 2, + "input image smaller than kernel size" + ); + + /* sizes */ + batchSize = 1; + inputSlices = THCTensor_(size)(state, input, 0); + inputTime = THCTensor_(size)(state, input, 1); + inputHeight = THCTensor_(size)(state, input, 2); + inputWidth = THCTensor_(size)(state, input, 3); + } + else if (THCTensor_(nDimension)(state, input) == 5) + { + THArgCheck( + THCTensor_(size)(state, input, 4) >= kW && + THCTensor_(size)(state, input, 3) >= kH && + THCTensor_(size)(state, input, 2) >= kT, 2, + "input image smaller than kernel size" + ); + + /* sizes */ + batchSize = THCTensor_(size)(state, input, 0); + inputSlices = THCTensor_(size)(state, input, 1); + inputTime = THCTensor_(size)(state, input, 2); + inputHeight = THCTensor_(size)(state, input, 3); + inputWidth = THCTensor_(size)(state, input, 4); + } + else + { + THArgCheck(false, 2, "4D or 5D tensor expected"); + } + + THArgCheck(kT/2 >= padT && kW/2 >= padW && kH/2 >= padH, 2, + "pad should be smaller than half of kernel size" + ); + THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 14, + "dilation should be greater than 0" + ); + + if (ceilMode) + { + outputTime = (int)(ceil((float)(inputTime - 
(dilationT * (kT - 1) + 1) + 2*padT) / dT)) + 1; + outputHeight = (int)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; + outputWidth = (int)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + else + { + outputTime = (int)(floor((float)(inputTime - (dilationT * (kT - 1) + 1) + 2*padT) / dT)) + 1; + outputHeight = (int)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; + outputWidth = (int)(floor((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + + if (outputTime < 1 || outputHeight < 1 || outputWidth < 1) + THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small", + inputSlices,inputTime,inputHeight,inputWidth,inputSlices,outputTime,outputHeight,outputWidth); + + if (padT || padW || padH) + { + if ((outputTime - 1)*dT >= inputTime + padT) + --outputTime; + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + if (input->nDimension == 4) /* 4D */ + { + /* resize output */ + THCTensor_(resize4d)(state, output, inputSlices, + outputTime, outputHeight, outputWidth); + /* indices pack ti,i,j locations for each output point as uchar into + each float of the tensor */ + THCIndexTensor_(resize4d)(state, indices, inputSlices, + outputTime, outputHeight, outputWidth); + } + else + { /* 5D */ + THCTensor_(resize5d)(state, output, batchSize, inputSlices, + outputTime, outputHeight, outputWidth); + // Index tensor packs index offsets as uchars into floats + THCIndexTensor_(resize5d)(state, indices, batchSize, inputSlices, + outputTime, outputHeight, outputWidth); + } + + input = THCTensor_(newContiguous)(state, input); + + // Collapse batch and feature dimensions + THCDeviceTensor<real, 4> cudaInput; + THCDeviceTensor<real, 4> cudaOutput; + if (THCTensor_(nDimension)(state, input) == 4) + { + cudaInput = toDeviceTensor<real, 4>(state, 
input); + cudaOutput = toDeviceTensor<real, 4>(state, output); + } + else + { + cudaInput = toDeviceTensor<real, 5>(state, input).downcastOuter<4>(); + cudaOutput = toDeviceTensor<real, 5>(state, output).downcastOuter<4>(); + } + + THLongStorage *indicesSize = THLongStorage_newWithSize(4); + long indicesSizeRaw[4] = { batchSize * inputSlices, + outputTime, outputHeight, outputWidth }; + THLongStorage_rawCopy(indicesSize, indicesSizeRaw); + + THCIndexTensor *indices1 = THCIndexTensor_(newWithStorage)( + state, THCIndexTensor_(storage)(state, indices), + THCIndexTensor_(storageOffset)(state, indices), + indicesSize, NULL); + + THLongStorage_free(indicesSize); + + THCDeviceTensor<THCIndex_t, 4> cudaIndices = + toDeviceTensor<THCIndex_t, 4>(state, indices1); + + int totalZ = outputTime * inputSlices * batchSize; + int offsetZ = 0; + dim3 block(32, 8); + + while (totalZ > 0) { + dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)), + THCCeilDiv(outputHeight, static_cast<int>(block.y)), + totalZ > 65535 ? 
65535 : totalZ); + + switch (kW) + { + UPDATE_OUTPUT_KERNEL_WIDTH(1); + UPDATE_OUTPUT_KERNEL_WIDTH(2); + UPDATE_OUTPUT_KERNEL_WIDTH(3); + UPDATE_OUTPUT_KERNEL_WIDTH(4); + UPDATE_OUTPUT_KERNEL_WIDTH(5); + UPDATE_OUTPUT_KERNEL_WIDTH(6); + UPDATE_OUTPUT_KERNEL_WIDTH(7); + default: + cuda_VolumetricDilatedMaxPooling_updateOutput<<<grid, block, + 0, THCState_getCurrentStream(state)>>>( + cudaInput, cudaIndices, cudaOutput, + kT, kH, kW, dT, dH, dW, + padT, padH, padW, dilationT, dilationH, dilationW, offsetZ); + } + THCudaCheck(cudaGetLastError()); + totalZ -= 65535; + offsetZ += 65535; + } + + THCTensor_(free)(state, input); + THCIndexTensor_(free)(state, indices1); +} + +#undef UPDATE_OUTPUT_KERNEL_WIDTH + +void THNN_(VolumetricDilatedMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH) +{ + // Resize and initialize result tensor. 
+ THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + int batchSize; + int inputSlices; + + int outputTime; + int outputHeight; + int outputWidth; + + THCUNN_assertSameGPU_generic(state, 4, input, indices, gradOutput, gradInput); + + if (THCTensor_(nDimension)(state, input) == 4) /* 4D */ + { + batchSize = 1; + inputSlices = THCTensor_(size)(state, input, 0); + + outputTime = THCTensor_(size)(state, gradOutput, 1); + outputHeight = THCTensor_(size)(state, gradOutput, 2); + outputWidth = THCTensor_(size)(state, gradOutput, 3); + } + else + { + batchSize = THCTensor_(size)(state, input, 0); + inputSlices = THCTensor_(size)(state, input, 1); + + outputTime = THCTensor_(size)(state, gradOutput, 2); + outputHeight = THCTensor_(size)(state, gradOutput, 3); + outputWidth = THCTensor_(size)(state, gradOutput, 4); + } + + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + + // Collapse batch and feature dimensions + THCDeviceTensor<real, 4> cudaGradInput; + THCDeviceTensor<real, 4> cudaGradOutput; + if (THCTensor_(nDimension)(state, input) == 4) + { + cudaGradInput = toDeviceTensor<real, 4>(state, gradInput); + cudaGradOutput = toDeviceTensor<real, 4>(state, gradOutput); + } + else + { + cudaGradInput = + toDeviceTensor<real, 5>(state, gradInput).downcastOuter<4>(); + cudaGradOutput = + toDeviceTensor<real, 5>(state, gradOutput).downcastOuter<4>(); + } + + THLongStorage *indicesSize = THLongStorage_newWithSize(4); + long indicesSizeRaw[4] = { batchSize * inputSlices, + outputTime, outputHeight, outputWidth }; + THLongStorage_rawCopy(indicesSize, indicesSizeRaw); + THCIndexTensor *indices1 = THCIndexTensor_(newWithStorage)( + state, THCIndexTensor_(storage)(state, indices), + THCIndexTensor_(storageOffset)(state, indices), indicesSize, NULL); + THLongStorage_free(indicesSize); + + THCDeviceTensor<THCIndex_t, 4> cudaIndices = + toDeviceTensor<THCIndex_t, 4>(state, indices1); + + int totalZ = outputTime * inputSlices * batchSize; 
+ int offsetZ = 0; + dim3 block(32, 8); + + while (totalZ > 0) { + dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)), + THCCeilDiv(outputHeight, static_cast<int>(block.y)), + totalZ > 65535 ? 65535 : totalZ); + + cuda_VolumetricDilatedMaxPooling_updateGradInput<<<grid, block, + 0, THCState_getCurrentStream(state)>>>( + cudaGradOutput, + cudaIndices, + cudaGradInput, + dT, dH, dW, + padT, padH, padW, + dilationT, dilationH, dilationW, offsetZ); + THCudaCheck(cudaGetLastError()); + totalZ -= 65535; + offsetZ += 65535; + } + + // cleanup + THCTensor_(free)(state, gradOutput); + THCIndexTensor_(free)(state, indices1); +} + +#endif diff --git a/lib/THCUNN/generic/VolumetricFullConvolution.cu b/lib/THCUNN/generic/VolumetricFullConvolution.cu new file mode 100644 index 0000000..7a1f788 --- /dev/null +++ b/lib/THCUNN/generic/VolumetricFullConvolution.cu @@ -0,0 +1,399 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricFullConvolution.cu" +#else + +void THNN_(VolumetricFullConvolution_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCTensor *weight, + THCTensor *bias, + THCTensor *finput, + THCTensor *fgradInput, + int dT, int dW, int dH, + int padT, int padW, int padH, + int adjT, int adjW, int adjH) +{ + + THCTensor *columns = finput; + THCTensor *ones = fgradInput; + + int nInputPlane = THCTensor_(size)(state, weight, 0); + int nOutputPlane = THCTensor_(size)(state, weight, 1); + const int kT = (int)weight->size[2]; + const int kH = (int)weight->size[3]; + const int kW = (int)weight->size[4]; + + THCUNN_assertSameGPU_generic(state, 6, input, output, weight, + bias, columns, ones); + THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected"); + + int batch = 1; + if (input->nDimension == 4) { + THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); + // Force batch + batch = 0; + THCTensor_(resize5d)(state, input, 1, 
input->size[0], input->size[1], input->size[2], input->size[3]); + } else { + THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); + } + + long inputWidth = input->size[4]; + long inputHeight = input->size[3]; + long inputDepth = input->size[2]; + long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; + long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; + long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THCTensor_(resize5d)(state, output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... 
+ THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1)); + } + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *output_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, output_n, output, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + long n = columns->size[1]; + long k = weight->size[0]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 't', + n, m, k, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, input_n), n, + THCTensor_(data)(state, weight), m, + ScalarConvert<int, real>::to(0), + THCTensor_(data)(state, columns), n + ); + + // Unpack columns back into input: + col2vol<real, accreal>( + THCState_getCurrentStream(state), + THCTensor_(data)(state, columns), + nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW, + 1,1,1, + THCTensor_(data)(state, output_n) + ); + + // Do Bias after: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m_ = nOutputPlane; + long n_ = outputDepth * outputHeight * outputWidth; + long k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n_, m_, 
k_, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, ones), k_, + THCTensor_(data)(state, bias), k_, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, output_n), n_ + ); + + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, output_n); + + // Resize output + if (batch == 0) { + THCTensor_(resize4d)(state, output, nOutputPlane, outputDepth, outputHeight, outputWidth); + THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth); + } +} + +void THNN_(VolumetricFullConvolution_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCTensor *weight, + THCTensor *finput, + THCTensor *fgradInput, + int dT, int dW, int dH, + int padT, int padW, int padH, + int adjT, int adjW, int adjH) +{ + THCTensor *gradColumns = finput; + + int nInputPlane = THCTensor_(size)(state, weight, 0); + int nOutputPlane = THCTensor_(size)(state, weight, 1); + const int kT = (int)weight->size[2]; + const int kH = (int)weight->size[3]; + const int kW = (int)weight->size[4]; + + THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight, + gradColumns, gradInput); + THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected"); + + int batch = 1; + if (input->nDimension == 4) { + // Force batch + batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + long inputWidth = input->size[4]; + long inputHeight = input->size[3]; + long inputDepth = input->size[2]; + long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; + long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; + long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT; + + // Batch size + input planes + long batchSize = input->size[0]; + + // 
Resize output + THCTensor_(resize5d)(state, gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); + + // Resize temporary columns + THCTensor_(resize2d)(state, gradColumns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); + + // Helpers + THCTensor *gradInput_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THCTensor_(select)(state, gradInput_n, gradInput, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + vol2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradOutput_n), + nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW, + 1,1,1, + THCTensor_(data)(state, gradColumns) + ); + + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m = weight->size[0]; + long n = gradColumns->size[1]; + long k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 'n', 'n', + n, m, k, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradColumns), n, + THCTensor_(data)(state, weight), k, + ScalarConvert<int, real>::to(0), + THCTensor_(data)(state, gradInput_n), n + ); + } + + + // Free + THCTensor_(free)(state, gradInput_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize output + if (batch == 0) { + THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth); + THCTensor_(resize4d)(state, gradInput, nInputPlane, inputDepth, 
inputHeight, inputWidth); + } +} + + +void THNN_(VolumetricFullConvolution_accGradParameters)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradWeight, + THCTensor *gradBias, + THCTensor *finput, + THCTensor *fgradInput, + int dT, int dW, int dH, + int padT, int padW, int padH, + int adjT, int adjW, int adjH, + real scale) +{ + THCTensor *columns = finput; + THCTensor *ones = fgradInput; + + int nInputPlane = THCTensor_(size)(state, gradWeight, 0); + int nOutputPlane = THCTensor_(size)(state, gradWeight, 1); + const int kT = (int)gradWeight->size[2]; + const int kH = (int)gradWeight->size[3]; + const int kW = (int)gradWeight->size[4]; + + THCUNN_assertSameGPU_generic(state, 6, input, gradOutput, gradWeight, + gradBias, columns, ones); + THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected"); + + int batch = 1; + if (input->nDimension == 4) { + // Force batch + batch = 0; + THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + long inputWidth = input->size[4]; + long inputHeight = input->size[3]; + long inputDepth = input->size[2]; + long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; + long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; + long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... 
+ THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth); + THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1)); + } + + // Resize temporary columns + THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); + + // Helpers + THCTensor *input_n = THCTensor_(new)(state); + THCTensor *gradOutput_n = THCTensor_(new)(state); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THCTensor_(select)(state, input_n, input, 0, elt); + THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + vol2col( + THCState_getCurrentStream(state), + THCTensor_(data)(state, gradOutput_n), + nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW, + 1,1,1, + THCTensor_(data)(state, columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long n = columns->size[0]; // nOutputPlane * kt * kh * kw + long m = input_n->size[0]; // nInputPlane + long k = columns->size[1]; // inputHeight * inputWidth + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + #ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemm( + #elif defined(THC_REAL_IS_HALF) + THCudaBlas_Hgemm( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm( + #endif + state, + 't', 'n', + n, m, k, + scale, + THCTensor_(data)(state, columns), k, + THCTensor_(data)(state, input_n), k, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradWeight), n + ); + + // Do Bias: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m_ = nOutputPlane; + long k_ = outputDepth * outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + #ifdef THC_REAL_IS_FLOAT + 
THCudaBlas_Sgemv( + #elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv( + #endif + state, + 't', + k_, m_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), 1, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradBias), 1 + ); + #endif + #ifdef THC_REAL_IS_HALF + THCudaBlas_Hgemm( + state, + 't', 'n', + m_, 1, k_, + scale, + THCTensor_(data)(state, gradOutput_n), k_, + THCTensor_(data)(state, ones), k_, + ScalarConvert<int, real>::to(1), + THCTensor_(data)(state, gradBias), m_ + ); + #endif + } + + // Free + THCTensor_(free)(state, input_n); + THCTensor_(free)(state, gradOutput_n); + + // Resize + if (batch == 0) { + THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth); + } +} + +#endif diff --git a/lib/THCUNN/generic/VolumetricMaxPooling.cu b/lib/THCUNN/generic/VolumetricMaxPooling.cu new file mode 100644 index 0000000..4a55a45 --- /dev/null +++ b/lib/THCUNN/generic/VolumetricMaxPooling.cu @@ -0,0 +1,36 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricMaxPooling.cu" +#else + +void THNN_(VolumetricMaxPooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + bool ceilMode) +{ + THNN_(VolumetricDilatedMaxPooling_updateOutput)( + state, input, output, indices, + kT, kW, kH, dT, dW, dH, padT, padW, padH, 1, 1, 1, ceilMode); + +} + +void THNN_(VolumetricMaxPooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int dT, int dW, int dH, + int padT, int padW, int padH) +{ + THNN_(VolumetricDilatedMaxPooling_updateGradInput)( + state, input, gradOutput, gradInput, indices, + dT, dW, dH, padT, padW, padH, 1, 1, 1); + +} + +#endif diff --git 
a/lib/THCUNN/generic/VolumetricMaxUnpooling.cu b/lib/THCUNN/generic/VolumetricMaxUnpooling.cu new file mode 100644 index 0000000..2a68094 --- /dev/null +++ b/lib/THCUNN/generic/VolumetricMaxUnpooling.cu @@ -0,0 +1,192 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricMaxUnpooling.cu" +#else + +void THNN_(VolumetricMaxUnpooling_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + THCIndexTensor *indices, + int outputTime, int outputWidth, int outputHeight, + int dT, int dW, int dH, + int padT, int padW, int padH) +{ + int batchSize; + int inputSlices; + int inputTime; + int inputHeight; + int inputWidth; + + THCUNN_assertSameGPU_generic(state, 3, input, indices, output); + + if (THCTensor_(nDimension)(state, input) == 4) + { + /* sizes */ + batchSize = 1; + inputSlices = THCTensor_(size)(state, input, 0); + inputTime = THCTensor_(size)(state, input, 1); + inputHeight = THCTensor_(size)(state, input, 2); + inputWidth = THCTensor_(size)(state, input, 3); + } + else if (THCTensor_(nDimension)(state, input) == 5) + { + /* sizes */ + batchSize = THCTensor_(size)(state, input, 0); + inputSlices = THCTensor_(size)(state, input, 1); + inputTime = THCTensor_(size)(state, input, 2); + inputHeight = THCTensor_(size)(state, input, 3); + inputWidth = THCTensor_(size)(state, input, 4); + } + else + { + THArgCheck(false, 2, "4D or 5D tensor expected"); + } + + if (input->nDimension == 4) /* 4D */ + { + /* resize output */ + THCTensor_(resize4d)(state, output, inputSlices, + outputTime, outputHeight, outputWidth); + } + else + { /* 5D */ + THCTensor_(resize5d)(state, output, batchSize, inputSlices, + outputTime, outputHeight, outputWidth); + } + + input = THCTensor_(newContiguous)(state, input); + indices = THCIndexTensor_(newContiguous)(state, indices); + THCTensor_(zero)(state, output); + + // Collapse batch and feature dimensions + THCDeviceTensor<real, 4> cudaInput; + THCDeviceTensor<real, 4> cudaOutput; + 
THCDeviceTensor<THCIndex_t, 4> cudaIndices; + + if (THCTensor_(nDimension)(state, input) == 4) + { + cudaInput = toDeviceTensor<real, 4>(state, input); + cudaOutput = toDeviceTensor<real, 4>(state, output); + cudaIndices = toDeviceTensor<THCIndex_t, 4>(state, indices); + } + else + { + cudaInput = toDeviceTensor<real, 5>(state, input).downcastOuter<4>(); + cudaOutput = toDeviceTensor<real, 5>(state, output).downcastOuter<4>(); + cudaIndices = toDeviceTensor<THCIndex_t, 5>(state, indices).downcastOuter<4>(); + } + + int totalZ = inputTime * inputSlices * batchSize; + int offsetZ = 0; + dim3 block(32, 8); + + while (totalZ > 0) { + dim3 grid(THCCeilDiv(inputWidth, static_cast<int>(block.x)), + THCCeilDiv(inputHeight, static_cast<int>(block.y)), + totalZ > 65535 ? 65535 : totalZ); + + cuda_VolumetricMaxUnpooling_updateOutput<<<grid, block, + 0, THCState_getCurrentStream(state)>>>( + cudaInput, cudaIndices, cudaOutput, + dT, dH, dW, + padT, padH, padW, offsetZ); + THCudaCheck(cudaGetLastError()); + totalZ -= 65535; + offsetZ += 65535; + } + + THCTensor_(free)(state, input); + THCIndexTensor_(free)(state, indices); +} + +void THNN_(VolumetricMaxUnpooling_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + THCIndexTensor *indices, + int outputTime, int outputWidth, int outputHeight, + int dT, int dW, int dH, + int padT, int padW, int padH) +{ + + int batchSize; + int inputSlices; + int inputTime; + int inputHeight; + int inputWidth; + + THCUNN_assertSameGPU_generic(state, 4, input, indices, gradOutput, gradInput); + + if (THCTensor_(nDimension)(state, input) == 4) /* 4D */ + { + batchSize = 1; + inputSlices = THCTensor_(size)(state, input, 0); + inputTime = THCTensor_(size)(state, input, 1); + inputHeight = THCTensor_(size)(state, input, 2); + inputWidth = THCTensor_(size)(state, input, 3); + } + else + { + batchSize = THCTensor_(size)(state, input, 0); + inputSlices = THCTensor_(size)(state, input, 1); + inputTime 
= THCTensor_(size)(state, input, 2); + inputHeight = THCTensor_(size)(state, input, 3); + inputWidth = THCTensor_(size)(state, input, 4); + } + + input = THCTensor_(newContiguous)(state, input); + indices = THCIndexTensor_(newContiguous)(state, indices); + gradOutput = THCTensor_(newContiguous)(state, gradOutput); + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + // Collapse batch and feature dimensions + THCDeviceTensor<real, 4> cudaGradInput; + THCDeviceTensor<real, 4> cudaGradOutput; + THCDeviceTensor<THCIndex_t, 4> cudaIndices; + + if (THCTensor_(nDimension)(state, input) == 4) + { + cudaGradInput = toDeviceTensor<real, 4>(state, gradInput); + cudaGradOutput = toDeviceTensor<real, 4>(state, gradOutput); + cudaIndices = toDeviceTensor<THCIndex_t, 4>(state, indices); + } + else + { + cudaGradInput = + toDeviceTensor<real, 5>(state, gradInput).downcastOuter<4>(); + cudaGradOutput = + toDeviceTensor<real, 5>(state, gradOutput).downcastOuter<4>(); + cudaIndices = + toDeviceTensor<THCIndex_t, 5>(state, indices).downcastOuter<4>(); + } + + int totalZ = inputTime * inputSlices * batchSize; + int offsetZ = 0; + dim3 block(32, 8); + + while (totalZ > 0) { + dim3 grid(THCCeilDiv(inputWidth, static_cast<int>(block.x)), + THCCeilDiv(inputHeight, static_cast<int>(block.y)), + totalZ > 65535 ? 
65535 : totalZ); + + cuda_VolumetricMaxUnpooling_updateGradInput<<<grid, block, + 0, THCState_getCurrentStream(state)>>>( + cudaGradOutput, + cudaIndices, + cudaGradInput, + dT, dH, dW, + padT, padH, padW, offsetZ); + THCudaCheck(cudaGetLastError()); + totalZ -= 65535; + offsetZ += 65535; + } + + // cleanup + THCTensor_(free)(state, input); + THCTensor_(free)(state, gradOutput); + THCIndexTensor_(free)(state, indices); +} + +#endif diff --git a/lib/THCUNN/generic/VolumetricReplicationPadding.cu b/lib/THCUNN/generic/VolumetricReplicationPadding.cu new file mode 100644 index 0000000..60bf36c --- /dev/null +++ b/lib/THCUNN/generic/VolumetricReplicationPadding.cu @@ -0,0 +1,120 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/VolumetricReplicationPadding.cu" +#else + +void THNN_(VolumetricReplicationPadding_updateOutput)( + THCState *state, + THCTensor *input, + THCTensor *output, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) { + THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimd = 1; + int dimh = 2; + int dimw = 3; + int numBatch = 1; + + int numInputDims = THCTensor_(nDimension)(state, input); + THArgCheck(numInputDims == 4 || numInputDims == 5, 2, + "input must be 4 or 5-dimensional"); + + if (numInputDims == 5) { + numBatch = THCTensor_(size)(state, input, 0); + planeDim++; + dimd++; + dimh++; + dimw++; + } + + int numPlanes = THCTensor_(size)(state, input, planeDim); + int inputD = THCTensor_(size)(state, input, dimd); + int inputH = THCTensor_(size)(state, input, dimh); + int inputW = THCTensor_(size)(state, input, dimw); + int outputD = inputD + pfront + pback; + int outputH = inputH + ptop + pbottom; + int outputW = inputW + pleft + pright; + + THCDeviceTensor<real, 5> devInput; + THCDeviceTensor<real, 5> devOutput; + + if (numInputDims == 4) { + THCTensor_(resize4d)(state, output, numPlanes, outputD, outputH, 
outputW); + + devInput = toDeviceTensor<real, 4>(state, input).upcastOuter<5>(); + devOutput = toDeviceTensor<real, 4>(state, output).upcastOuter<5>(); + } else { + THCTensor_(resize5d)(state, output, numBatch, numPlanes, outputD, outputH, + outputW); + + devInput = toDeviceTensor<real, 5>(state, input); + devOutput = toDeviceTensor<real, 5>(state, output); + } + + int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3) * + devOutput.getSize(4); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devOutput.getSize(1), + devOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); + + VolumetricReplicationPadding_updateOutput<real><<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>( + devInput, devOutput, pfront, pback, ptop, pbottom, pleft, pright); +} + +void THNN_(VolumetricReplicationPadding_updateGradInput)( + THCState *state, + THCTensor *input, + THCTensor *gradOutput, + THCTensor *gradInput, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) { + THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2, + "input tensor must fit into 32-bit index math"); + THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, gradOutput), + 3, "output gradient tensor must fit into 32-bit index math"); + + int planeDim = 0; + int dimd = 1; + int dimh = 2; + int dimw = 3; + + int numInputDims = THCTensor_(nDimension)(state, input); + if (numInputDims == 5) { + planeDim++; + dimd++; + dimh++; + dimw++; + } + + THCTensor_(resizeAs)(state, gradInput, input); + THCTensor_(zero)(state, gradInput); + + THCDeviceTensor<real, 5> devGradInput; + THCDeviceTensor<real, 5> devGradOutput; + + if (numInputDims == 4) { + devGradInput = toDeviceTensor<real, 4>(state, gradInput).upcastOuter<5>(); + devGradOutput = + toDeviceTensor<real, 4>(state, gradOutput).upcastOuter<5>(); + } else { + devGradInput = toDeviceTensor<real, 5>(state, gradInput); + devGradOutput = toDeviceTensor<real, 5>(state, 
gradOutput); + } + + int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3) * + devGradOutput.getSize(4); + dim3 gridSize(THCCeilDiv(outputPlaneSize, 256), + devGradOutput.getSize(1), + devGradOutput.getSize(0)); + dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize); + + VolumetricReplicationPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>( + devGradInput, devGradOutput, pfront, pback, ptop, pbottom, pleft, pright); +} + +#endif diff --git a/lib/THCUNN/im2col.h b/lib/THCUNN/im2col.h index a81dbd2..ba57263 100644 --- a/lib/THCUNN/im2col.h +++ b/lib/THCUNN/im2col.h @@ -2,6 +2,7 @@ #define THCUNN_IM2COL_H #include "common.h" +#include "THCNumerics.cuh" // Kernel for fast unfold+copy // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu) @@ -29,7 +30,7 @@ __global__ void im2col_kernel(const int n, const Dtype* data_im, int h = h_in + i * dilation_h; int w = w_in + j * dilation_w; *data_col = (h >= 0 && w >= 0 && h < height && w < width) ? 
- data_im[i * dilation_h * width + j * dilation_w] : 0; + data_im[i * dilation_h * width + j * dilation_w] : ScalarConvert<int, Dtype>::to(0); data_col += height_col * width_col; } } @@ -59,7 +60,7 @@ void im2col(cudaStream_t stream, const Dtype* data_im, const int channels, THCudaCheck(cudaGetLastError()); } -template <typename Dtype> +template <typename Dtype, typename Acctype> __global__ void col2im_kernel(const int n, const Dtype* data_col, const int height, const int width, const int channels, const int kernel_h, const int kernel_w, @@ -69,7 +70,7 @@ __global__ void col2im_kernel(const int n, const Dtype* data_col, const int height_col, const int width_col, Dtype* data_im) { CUDA_KERNEL_LOOP(index, n) { - Dtype val = 0; + Acctype val = Acctype(0); const int w_im = index % width + pad_w; const int h_im = (index / width) % height + pad_h; const int c_im = index / (width * height); @@ -96,11 +97,11 @@ __global__ void col2im_kernel(const int n, const Dtype* data_col, } } } - data_im[index] = val; + data_im[index] = ScalarConvert<Acctype, Dtype>::to(val); } } -template <typename Dtype> +template <typename Dtype, typename Acctype> void col2im(cudaStream_t stream, const Dtype* data_col, const int channels, const int height, const int width, const int patch_h, const int patch_w, const int pad_h, @@ -113,7 +114,7 @@ void col2im(cudaStream_t stream, const Dtype* data_col, const int channels, int num_kernels = channels * height * width; // To avoid involving atomic operations, we will launch one kernel per // bottom dimension, and then in the kernel add up the top dimensions. 
- col2im_kernel <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>> ( + col2im_kernel<Dtype, Acctype> <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>> ( num_kernels, data_col, height, width, channels, patch_h, patch_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, diff --git a/lib/THCUNN/vol2col.h b/lib/THCUNN/vol2col.h index add88e1..15b110e 100644 --- a/lib/THCUNN/vol2col.h +++ b/lib/THCUNN/vol2col.h @@ -2,6 +2,7 @@ #define THCUNN_VOL2COL_H #include "common.h" +#include "THCNumerics.cuh" // Kernel for fast unfold+copy on volumes template <typename Dtype> @@ -33,7 +34,7 @@ CUDA_KERNEL_LOOP(index, n) { int h = h_in + j * dilation_h; int w = w_in + k * dilation_w; *data_col = (t >= 0 && h >= 0 && w >= 0 && t < depth && h < height && w < width) ? - data_vol[i * dilation_t * height * width + j * dilation_h * width + k * dilation_w] : 0; + data_vol[i * dilation_t * height * width + j * dilation_h * width + k * dilation_w] : ScalarConvert<int, Dtype>::to(0); data_col += depth_col * height_col * width_col; } } @@ -65,7 +66,7 @@ void vol2col(cudaStream_t stream, const Dtype* data_vol, const int channels, THCudaCheck(cudaGetLastError()); } -template <typename Dtype> +template <typename Dtype, typename Acctype> __global__ void vol2im_kernel(const int n, const Dtype* data_col, const int depth, const int height, const int width, const int channels, const int kernel_t, const int kernel_h, const int kernel_w, @@ -75,7 +76,7 @@ __global__ void vol2im_kernel(const int n, const Dtype* data_col, const int depth_col, const int height_col, const int width_col, Dtype* data_vol) { CUDA_KERNEL_LOOP(index, n) { - Dtype val = 0; + Acctype val = Acctype(0); const int w_im = index % width + pad_w; const int h_im = (index / width) % height + pad_h; const int t_im = (index / width / height) % depth + pad_t; @@ -112,11 +113,11 @@ __global__ void vol2im_kernel(const int n, const Dtype* data_col, } } } - data_vol[index] = val; + data_vol[index] = 
ScalarConvert<Acctype, Dtype>::to(val); } } -template <typename Dtype> +template <typename Dtype, typename Acctype> void col2vol(cudaStream_t stream, const Dtype* data_col, const int channels, const int depth, const int height, const int width, const int patch_t, const int patch_h, const int patch_w, @@ -130,7 +131,7 @@ void col2vol(cudaStream_t stream, const Dtype* data_col, const int channels, int num_kernels = channels * depth * height * width; // To avoid involving atomic operations, we will launch one kernel per // bottom dimension, and then in the kernel add up the top dimensions. - vol2im_kernel <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>> ( + vol2im_kernel<Dtype, Acctype> <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>> ( num_kernels, data_col, depth, height, width, channels, patch_t, patch_h, patch_w, pad_t, pad_h, pad_w, stride_t, stride_h, stride_w, dilation_t, dilation_h, dilation_w, @@ -1,4 +1,5 @@ local cunntest = torch.TestSuite() +local ffi = require 'ffi' local precision_forward = 1e-4 local precision_backward = 1e-2 local nloop = 1 @@ -6,134 +7,183 @@ local times = {} --e.g.: th -lcunn -e "nn.testcuda{'Sigmoid_forward'}" -local function pointwise_forward(proto_module, name, max_error) - local size = math.random(1,100) +local typenames = { + 'torch.CudaTensor', + 'torch.CudaDoubleTensor', +} - local tm = {} - local title = string.format(name..'.forward %d -> %d', size, size) - times[title] = tm +local t2cpu = { + ['torch.CudaTensor'] = 'torch.FloatTensor', + ['torch.CudaDoubleTensor'] = 'torch.DoubleTensor', - local input = torch.randn(size) - if name == 'Sqrt' then input:abs() end - local sconv = proto_module - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) +} + +local function checkHalf() + if cutorch.hasHalf then + table.insert(typenames, 'torch.CudaHalfTensor') + t2cpu['torch.CudaHalfTensor'] = 'torch.FloatTensor' end - tm.cpu = a:time().real 
+end - input = input:cuda() - local gconv = proto_module:clone():cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) +-- workarounds for non-existant functions +function torch.CudaHalfTensor:mean() + return self:cuda():mean() +end + +function torch.CudaDoubleTensor:mean() + return self:cuda():mean() +end + +local function half_max_error(maxabs) + -- arbitrarily double the precision limit + return 2 * ((maxabs and (2^(math.floor(math.log(maxabs) / math.log(2)))) * (2^(-10))) or 0) +end + +-- half has additional error on top of double/float +local function precision_forward_type(precision_f, tensor_type, maxabs) + if (tensor_type == 'torch.CudaHalfTensor') then + return 1e-2 + precision_f + half_max_error(maxabs) + else + return precision_f end - cutorch.synchronize() - tm.gpu = a:time().real +end - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), max_error, 'error on state (forward) ') +local function precision_backward_type(precision_b, tensor_type, maxabs) + if (tensor_type == 'torch.CudaHalfTensor') then + return 1e-1 + precision_b + half_max_error(maxabs) + else + return precision_b + end +end + +local function precision_backward_conv_weightbias(precision_b, tensor_type, maxabs) + if (tensor_type == 'torch.CudaHalfTensor') then + -- cudnn uses 8 here + return 2 + precision_b + half_max_error(maxabs) + else + return precision_b + end +end + +local function pointwise_forward(proto_module, name, max_error) + local size = math.random(1,100) + + for k, typename in ipairs(typenames) do + local input = torch.randn(size):type(typename) + local ctype = t2cpu[typename] + local input = input:type(ctype) + if name == 'Sqrt' then input:abs() end + local sconv = proto_module:type(ctype) + local groundtruth = sconv:forward(input) + + input = input:type(typename) + local gconv = proto_module:clone():type(typename) + local rescuda = gconv:forward(input) + + local error = 
rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(max_error, typename), + string.format('error on state (forward) with %s', typename)) + end end local function pointwise_backward(proto_module, name, max_error) local size = math.random(1,100) - local tm = {} - local title = string.format(name..'.backward %d -> %d', size, size) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(size):type(typename) + local gradOutput = torch.randn(size):type(typename) - local input = torch.randn(size) - if name == 'Sqrt' then input:abs() end - local gradOutput = torch.randn(size) - local sconv = proto_module - sconv:forward(input) - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do - groundgrad = sconv:backward(input, gradOutput) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + if name == 'Sqrt' then input:abs() end + local sconv = proto_module:type(ctype) + sconv:forward(input) + local groundgrad = sconv:backward(input, gradOutput) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = proto_module:clone():cuda() - gconv:forward(input) - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do - rescuda = gconv:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = proto_module:clone():type(typename) + gconv:forward(input) + local rescuda = gconv:backward(input, gradOutput) - local error = rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - mytester:assertlt(error:abs():max(), max_error, 'error on state (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(max_error, typename), + string.format('error on state (backward) with %s', typename)) + end end 
local function pointwise_backward_inplace(proto_module, name) local size = math.random(1,100) - local tm = {} - local title = string.format(name..'.backward_inplace %d -> %d', size, size) - times[title] = tm - - local input = torch.randn(size) - if name == 'Sqrt' then input:abs() end - local gradOutput = torch.randn(size) - local sconv = proto_module - local groundgrad = sconv:backward(input, gradOutput) - mytester:assertTensorEq(groundgrad:float(), - gradOutput:float(), - 0.000001, "inplace not respected") - - local input = torch.randn(size):cuda() - if name == 'Sqrt' then input:abs() end - local gradOutput = torch.randn(size):cuda() - local sconv = proto_module:clone():cuda() - local groundgrad = sconv:backward(input, gradOutput) - mytester:assertTensorEq(groundgrad:float(), - gradOutput:float(), - 0.000001, "cuda inplace not respected") + for k, typename in ipairs(typenames) do + local input = torch.randn(size):type(typename) + local ctype = t2cpu[typename] + input = input:type(ctype) + if name == 'Sqrt' then input:abs() end + local gradOutput = torch.randn(size) + gradOutput = gradOutput:type(ctype) + local sconv = proto_module:type(ctype) + local groundgrad = sconv:backward(input, gradOutput) + mytester:assertTensorEq(groundgrad:double(), + gradOutput:double(), + 0.000001, + string.format("inplace not respected for %s", ctype)) + + input = torch.randn(size) + input = input:type(typename) + if name == 'Sqrt' then input:abs() end + gradOutput = torch.randn(size) + gradOutput = gradOutput:type(typename) + local sconv = proto_module:clone():type(typename) + local groundgrad = sconv:backward(input, gradOutput) + mytester:assertTensorEq(groundgrad:double(), + gradOutput:double(), + 0.000001, + string.format("cuda inplace not respected for %s", typename)) + end end local function pointwise_transposed(proto_module, name, max_error) max_error = max_error or 1e-7 - local tm = {} - local title = name .. 
'.transposed' - times[title] = tm - local input = torch.Tensor(11, 19):uniform(-1, 1) - if name == 'Sqrt' then - input:uniform(0.1, 1) - end - local inputCUDA = input:clone():cuda() + for k, typename in ipairs(typenames) do + local ctype = t2cpu[typename] + local input = torch.Tensor(11, 19):uniform(-1, 1):type(typename) + input = input:type(ctype) + local proto_module = proto_module:type(ctype) + if name == 'Sqrt' then + input:uniform(0.1, 1) + end + local inputCUDA = input:clone():type(typename) - local cuda_module = proto_module:clone():cuda() + local cuda_module = proto_module:clone():type(typename) - -- transpose the inputs and DON'T make contiguous - input = input:transpose(1, 2) - inputCUDA = inputCUDA:transpose(1, 2) + -- transpose the inputs and DON'T make contiguous + input = input:transpose(1, 2) + inputCUDA = inputCUDA:transpose(1, 2) - local output = proto_module:forward(input) - local outputCUDA = cuda_module:forward(inputCUDA) + local output = proto_module:forward(input) + local outputCUDA = cuda_module:forward(inputCUDA) - local error = outputCUDA:float() - output - mytester:assertlt(error:abs():max(), max_error, 'error on state (forward) ') + local error = outputCUDA:double() - output:double() + mytester:assertlt(error:abs():max(), precision_forward_type(max_error, typename), + string.format('error on state (forward) for %s', typename)) - local gradOutput = torch.Tensor(11, 19):uniform(-1, 1) - local gradOutputCUDA = gradOutput:clone():cuda() + local gradOutput = torch.Tensor(11, 19):uniform(-1, 1):type(ctype) + local gradOutputCUDA = gradOutput:clone():type(typename) - gradOutput = gradOutput:transpose(1, 2) - gradOutputCUDA = gradOutputCUDA:transpose(1, 2) + gradOutput = gradOutput:transpose(1, 2) + gradOutputCUDA = gradOutputCUDA:transpose(1, 2) - local gradInput = proto_module:backward(input, gradOutput) - local gradInputCUDA = cuda_module:backward(inputCUDA, gradOutputCUDA) + local gradInput = proto_module:backward(input, gradOutput) + local 
gradInputCUDA = cuda_module:backward(inputCUDA, gradOutputCUDA) - local error = gradInputCUDA:float() - gradInput - mytester:assertlt(error:abs():max(), max_error, 'error on state (backward) ') + local error = gradInputCUDA:double() - gradInput:double() + mytester:assertlt(error:abs():max(), precision_backward_type(max_error, typename), + string.format('error on state (backward) for %s', typename)) + end end function cunntest.Tanh_forward() @@ -253,29 +303,32 @@ end function cunntest.Sqrt_zero() local size = math.random(1, 100) - -- Test zero inputs; we will avoid a div-by-zero by setting to zero - local module_gpu = nn.Sqrt():cuda() - local input_gpu = torch.CudaTensor(size, size):zero() - module_gpu:forward(input_gpu) + for k, typename in ipairs(typenames) do + -- Test zero inputs; we will avoid a div-by-zero by setting to zero + local module_gpu = nn.Sqrt():type(typename) + local input_gpu = torch.CudaTensor(size, size):zero():type(typename) + module_gpu:forward(input_gpu) - local gradOutput_gpu = torch.CudaTensor(size, size):fill(1) - local gradInput_gpu = module_gpu:backward(input_gpu, gradOutput_gpu) + local gradOutput_gpu = torch.CudaTensor(size, size):fill(1):type(typename) + local gradInput_gpu = module_gpu:backward(input_gpu, gradOutput_gpu) - mytester:assertTensorEq(gradInput_gpu:float(), - torch.FloatTensor(size, size):zero(), - 0.000001, "error in sqrt backward singularity") + mytester:assertTensorEq(gradInput_gpu:double(), + torch.DoubleTensor(size, size):zero(), + 0.000001, "error in sqrt backward singularity") - -- Verify CPU and GPU zero behavior equivalency - local module_cpu = nn.Sqrt() - local input_cpu = input_gpu:float() - module_cpu:forward(input_cpu) + -- Verify CPU and GPU zero behavior equivalency + local ctype = t2cpu[typename] + local module_cpu = nn.Sqrt():type(ctype) + local input_cpu = input_gpu:type(ctype) + module_cpu:forward(input_cpu) - local gradOutput_cpu = gradOutput_gpu:float() - local gradInput_cpu = 
module_cpu:backward(input_cpu, gradOutput_cpu) + local gradOutput_cpu = gradOutput_gpu:type(ctype) + local gradInput_cpu = module_cpu:backward(input_cpu, gradOutput_cpu) - mytester:assertTensorEq(gradInput_gpu:float(), - gradInput_cpu:float(), - 0.000001, "Sqrt_zero CPU and GPU not equivalent") + mytester:assertTensorEq(gradInput_gpu:double(), + gradInput_cpu:double(), + 0.000001, "Sqrt_zero CPU and GPU not equivalent") + end end function cunntest.Sqrt_transposed() @@ -295,15 +348,18 @@ function cunntest.Square_transposed() end function cunntest.SoftShrink_forward() - pointwise_forward(nn.SoftShrink(math.random()), 'SoftShrink', precision_forward) + local r = ffi.C.THC_half2float(ffi.C.THC_float2half(math.random())) + pointwise_forward(nn.SoftShrink(r), 'SoftShrink', precision_forward) end function cunntest.SoftShrink_backward() - pointwise_backward(nn.SoftShrink(math.random()), 'SoftShrink', precision_backward) + local r = ffi.C.THC_half2float(ffi.C.THC_float2half(math.random())) + pointwise_backward(nn.SoftShrink(r), 'SoftShrink', precision_backward) end function cunntest.SoftShrink_transposed() - pointwise_transposed(nn.SoftShrink(math.random()), 'SoftShrink', precision_backward) + local r = ffi.C.THC_half2float(ffi.C.THC_float2half(math.random())) + pointwise_transposed(nn.SoftShrink(r), 'SoftShrink', precision_backward) end function cunntest.ELU_forward() @@ -363,67 +419,48 @@ function cunntest.LogSoftMax_forward_batch() local size = math.random(1,256) local bs = math.random(32,256) - local tm = {} - local title = string.format('LogSoftMax forward batch %d x %d -> %d x %d', bs, size, bs, size) - times[title] = tm - - local input = torch.randn(bs, size) - local sconv = nn.LogSoftMax() - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + for k, typename in ipairs(typenames) do + local input = torch.randn(bs, size):type(typename) + local ctype = 
t2cpu[typename] + input = input:type(ctype) + local sconv = nn.LogSoftMax():type(ctype) + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = nn.LogSoftMax():cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.LogSoftMax():type(typename) + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward*10, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward*10, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.LogSoftMax_backward_batch() local size = math.random(1,256) local bs = math.random(32,256) - local tm = {} - local title = string.format('LogSoftMax.backward batch %d x %d -> %d x %d', bs, size, bs, size) - times[title] = tm - - local input = torch.randn(bs, size) - local gradOutput = torch.randn(bs, size) - local sconv = nn.LogSoftMax() - sconv:forward(input) - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do - groundgrad = sconv:backward(input, gradOutput) - end - tm.cpu = a:time().real + for k, typename in ipairs(typenames) do + local input = torch.randn(bs, size):type(typename) + local gradOutput = torch.randn(bs, size):type(typename) + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.LogSoftMax():type(ctype) + sconv:forward(input) + local groundgrad = sconv:backward(input, gradOutput) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = sconv:clone():cuda() - gconv:forward(input) - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do - rescuda = 
gconv:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = sconv:clone():type(typename) + gconv:forward(input) + local rescuda = gconv:backward(input, gradOutput) - local error = rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.SpatialLogSoftMax_forward() @@ -431,31 +468,22 @@ function cunntest.SpatialLogSoftMax_forward() local ini = math.random(8,32) local inj = math.random(8,32) - local tm = {} - local title = string.format('SpatialLogSoftMax forward %d x %d x %d', size, inj, ini) - times[title] = tm - - local input = torch.randn(size, inj, ini) - local sconv = nn.SpatialLogSoftMax() - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + for k, typename in ipairs(typenames) do + local input = torch.randn(size, inj, ini):type(typename) + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialLogSoftMax():type(ctype) + local groundtruth = sconv:forward(input):type(ctype) - input = input:cuda() - local gconv = nn.SpatialLogSoftMax():cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.SpatialLogSoftMax():type(typename) + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward*25, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + 
mytester:assertlt(error:abs():max(), + precision_forward_type(precision_forward*25, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.SpatialLogSoftMax_backward() @@ -463,36 +491,27 @@ function cunntest.SpatialLogSoftMax_backward() local ini = math.random(8,32) local inj = math.random(8,32) - local tm = {} - local title = string.format('SpatialLogSoftMax.backward %d x %d x %d', size, inj, ini) - times[title] = tm - - local input = torch.randn(size, inj, ini) - local gradOutput = torch.randn(size, inj, ini) - local sconv = nn.SpatialLogSoftMax() - sconv:forward(input) - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do - groundgrad = sconv:backward(input, gradOutput) - end - tm.cpu = a:time().real + for k, typename in ipairs(typenames) do + local input = torch.randn(size, inj, ini):type(typename) + local gradOutput = torch.randn(size, inj, ini):type(typename) + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.SpatialLogSoftMax():type(ctype) + sconv:forward(input) + local groundgrad = sconv:backward(input, gradOutput) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = sconv:clone():cuda() - gconv:forward(input) - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do - rescuda = gconv:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = sconv:clone():type(typename) + gconv:forward(input) + local rescuda = gconv:backward(input, gradOutput) - local error = rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error 
on state (backward) with %s', typename)) + end end function cunntest.SpatialLogSoftMax_forward_batch() @@ -501,31 +520,22 @@ function cunntest.SpatialLogSoftMax_forward_batch() local ini = math.random(8,32) local inj = math.random(8,32) - local tm = {} - local title = string.format('SpatialLogSoftMax forward batch %d x %d x %d x %d', bs, size, inj, ini) - times[title] = tm - - local input = torch.randn(bs, size, inj, ini) - local sconv = nn.SpatialLogSoftMax() - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + for k, typename in ipairs(typenames) do + local input = torch.randn(bs, size, inj, ini):type(typename) + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialLogSoftMax():type(ctype) + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = nn.SpatialLogSoftMax():cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.SpatialLogSoftMax():type(typename) + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward*25, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), + precision_forward_type(precision_forward*25, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.SpatialLogSoftMax_backward_batch() @@ -534,36 +544,27 @@ function cunntest.SpatialLogSoftMax_backward_batch() local ini = math.random(8,32) local inj = math.random(8,32) - local tm = {} - local title = string.format('SpatialLogSoftMax.backward batch %d x %d x %d x %d', bs, size, inj, ini) - times[title] = tm - - local input = torch.randn(bs, size, inj, ini) - local gradOutput = 
torch.randn(bs, size, inj, ini) - local sconv = nn.SpatialLogSoftMax() - sconv:forward(input) - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do - groundgrad = sconv:backward(input, gradOutput) - end - tm.cpu = a:time().real + for k, typename in ipairs(typenames) do + local input = torch.randn(bs, size, inj, ini):type(typename) + local gradOutput = torch.randn(bs, size, inj, ini):type(typename) + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.SpatialLogSoftMax():type(ctype) + sconv:forward(input) + local groundgrad = sconv:backward(input, gradOutput) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = sconv:clone():cuda() - gconv:forward(input) - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do - rescuda = gconv:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = sconv:clone():type(typename) + gconv:forward(input) + local rescuda = gconv:backward(input, gradOutput) - local error = rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end @@ -732,44 +733,35 @@ function cunntest.SparseLinear_forward() local ini = math.random(50,100) local inj = math.random(5,10) - local module = nn.SparseLinear(ini,inj) - local sslin = module - local gslin = module:clone():cuda() - - -- Create a random sparse vector - local input = {} - for i=1,inb do - local nnz = math.random(5, 10) - local inds = torch.randperm(ini)[{{1,nnz}}] - input[i] = torch.Tensor(nnz, 2) - input[i]:select(2,1):copy(inds) - 
input[i]:select(2,2):copy(torch.rand(nnz)) - end + for k, typename in ipairs(typenames) do + if typename ~= "torch.CudaHalfTensor" then + local ctype = t2cpu[typename] + local module = nn.SparseLinear(ini,inj):type(ctype) + local sslin = module + local gslin = module:clone():type(typename) + + -- Create a random sparse vector + local input = {} + for i=1,inb do + local nnz = math.random(5, 10) + local inds = torch.randperm(ini)[{{1,nnz}}] + input[i] = torch.Tensor(nnz, 2):type(ctype) + input[i]:select(2,1):copy(inds) + input[i]:select(2,2):copy(torch.rand(nnz):type(typename):type(ctype)) + end - local tm = {} - local title = string.format('SparseLinear forward %d -> %d', ini, inj) - times[title] = tm + local groundtruth = sslin:forward(input) + sslin:zeroGradParameters() - local groundtruth = sslin:forward(input) - sslin:zeroGradParameters() - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sslin:forward(input) - end - tm.cpu = a:time().real - - for i,v in ipairs(input) do input[i] = input[i]:cuda() end - local rescuda = gslin:forward(input) - gslin:zeroGradParameters() - a:reset() - for i = 1,nloop do - rescuda = gslin:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + for i,v in ipairs(input) do input[i] = input[i]:type(typename) end + local rescuda = gslin:forward(input) + gslin:zeroGradParameters() - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end + end end function cunntest.SparseLinear_backward() @@ -777,201 +769,184 @@ function cunntest.SparseLinear_backward() local ini = math.random(50,100) local inj = math.random(5,10) - local gslin = nn.SparseLinear(ini,inj):cuda() - local sslin = nn.Linear(ini,inj) - gslin.weight = 
sslin.weight:clone():cuda() - gslin.bias = sslin.bias:clone():cuda() - - -- Create a random sparse vector - local input = {} - local nonsparse = torch.zeros(inb, ini) - for i=1,inb do - local nnz = math.random(3, 5) - local inds = torch.randperm(ini)[{{1,nnz}}] - input[i] = torch.Tensor(nnz, 2) - input[i]:select(2,1):copy(inds) - input[i]:select(2,2):copy(torch.rand(nnz)) - nonsparse[i]:scatter(1, input[i]:select(2,1):long(), input[i]:select(2,2)) - end + for k, typename in ipairs(typenames) do + if typename ~= "torch.CudaHalfTensor" then + local ctype = t2cpu[typename] + local gslin = nn.SparseLinear(ini,inj):type(typename) + local sslin = nn.Linear(ini,inj):type(ctype) + gslin.weight = sslin.weight:clone():type(typename) + gslin.bias = sslin.bias:clone():type(typename) + + -- Create a random sparse vector + local input = {} + local nonsparse = torch.zeros(inb, ini):type(ctype) + for i=1,inb do + local nnz = math.random(3, 5) + local inds = torch.randperm(ini)[{{1,nnz}}] + input[i] = torch.Tensor(nnz, 2):type(ctype) + input[i]:select(2,1):copy(inds) + input[i]:select(2,2):copy(torch.rand(nnz):type(typename):type(ctype)) + nonsparse[i]:scatter(1, input[i]:select(2,1):long(), input[i]:select(2,2)) + end - local tm = {} - local title = string.format('SparseLinear backward %d <- %d', ini, inj) - times[title] = tm - - local gradOutput = torch.randn(inb, inj) - sslin:forward(nonsparse) - local groundgrad = sslin:backward(nonsparse, gradOutput) - sslin:zeroGradParameters() - local a = torch.Timer() - for i = 1,nloop do - sslin:backward(nonsparse, gradOutput) - end - tm.cpu = a:time().real - local groundweight = sslin.gradWeight - local groundbias = sslin.gradBias - - for i,v in ipairs(input) do input[i] = input[i]:cuda() end - gradOutput = gradOutput:cuda() - gslin:forward(input) - local rescuda = gslin:backward(input, gradOutput) - gslin:zeroGradParameters() - a:reset() - for i = 1,nloop do - gslin:backward(input, gradOutput) + local gradOutput = torch.randn(inb, 
inj):type(typename):type(ctype) + sslin:forward(nonsparse) + local groundgrad = sslin:backward(nonsparse, gradOutput) + sslin:zeroGradParameters() + local groundweight = sslin.gradWeight + local groundbias = sslin.gradBias + + for i,v in ipairs(input) do input[i] = input[i]:type(typename) end + gradOutput = gradOutput:type(typename) + gslin:forward(input) + local rescuda = gslin:backward(input, gradOutput) + gslin:zeroGradParameters() + local weightcuda = gslin.gradWeight + local biascuda = gslin.gradBias + + local werror = weightcuda:double() - groundweight:double() + local berror = biascuda:double() - groundbias:double() + + mytester:assertlt(werror:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on weight (backward) with %s', typename)) + mytester:assertlt(berror:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on bias (backward) with %s', typename)) + + gslin:updateParameters(.1) + sslin:updateParameters(.1) + werror = gslin.weight:double() - sslin.weight:double() + berror = gslin.bias:double() - sslin.bias:double() + + mytester:assertlt(werror:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on weight (update) with %s', typename)) + mytester:assertlt(berror:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on bias (update) with %s', typename)) + + gslin:zeroGradParameters() + end end - local weightcuda = gslin.gradWeight - local biascuda = gslin.gradBias - cutorch.synchronize() - tm.gpu = a:time().real - - local werror = weightcuda:float() - groundweight - local berror = biascuda:float() - groundbias - - mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ') - mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ') - - gslin:updateParameters(.1) - sslin:updateParameters(.1) - werror = gslin.weight:float() - sslin.weight - berror = 
gslin.bias:float() - sslin.bias - - mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (update) ') - mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (update) ') - - gslin:zeroGradParameters() end local function BatchNormalization_forward(moduleName, inputSize) local planes = inputSize[2] - local tm = {} - local title = moduleName .. '.forward ' .. table.concat(inputSize, 'x') - times[title] = tm - local input = torch.randn(table.unpack(inputSize)) - local sbnorm = nn[moduleName](planes) - local groundtruth = sbnorm:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sbnorm:forward(input) - end - tm.cpu = a:time().real + for k, typename in ipairs(typenames) do + local input = torch.randn(table.unpack(inputSize)):type(typename) - input = input:cuda() - local gbnorm = nn[moduleName](planes):cuda() - gbnorm.weight = sbnorm.weight:cuda() - gbnorm.bias = sbnorm.bias:cuda() - local rescuda = gbnorm:forward(input) + local ctype = t2cpu[typename] + input = input:type(ctype) + local sbnorm = nn[moduleName](planes):type(ctype) + local groundtruth = sbnorm:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gbnorm:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gbnorm = nn[moduleName](planes):type(typename) + gbnorm.weight = sbnorm.weight:type(typename) + gbnorm.bias = sbnorm.bias:type(typename) + local rescuda = gbnorm:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward)') - mytester:assertlt((gbnorm.running_mean:float() - sbnorm.running_mean):abs():max(), - precision_forward, 'error on running_mean (forward)') - mytester:assertlt((gbnorm.running_var:float() - sbnorm.running_var):abs():max(), - precision_forward, 'error on running_var (forward)') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), 
precision_forward_type(precision_forward, typename, rescuda:abs():max()), + string.format('error on state (forward) with %s', typename)) + mytester:assertlt((gbnorm.running_mean:double() - sbnorm.running_mean:double()):abs():max(), + precision_forward_type(precision_forward, typename, gbnorm.running_mean:abs():max()), + string.format('error on running_mean (forward) with %s', typenanme)) + mytester:assertlt((gbnorm.running_var:double() - sbnorm.running_var:double()):abs():max(), + precision_forward_type(precision_forward, typename, gbnorm.running_var:abs():max()), + string.format('error on running_var (forward) with %s', typename)) + end end local function BatchNormalization_forward_inference(moduleName, inputSize) local planes = inputSize[2] - local tm = {} - local title = moduleName .. '.forward (evaluate) ' .. table.concat(inputSize, 'x') - times[title] = tm - local input = torch.randn(table.unpack(inputSize)) - local sbnorm = nn[moduleName](planes) - sbnorm.running_mean:normal(1, 2) - sbnorm.running_var:uniform(1e-3, 2) - sbnorm:evaluate() - local groundtruth = sbnorm:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sbnorm:forward(input) - end - tm.cpu = a:time().real + for k, typename in ipairs(typenames) do + local input = torch.randn(table.unpack(inputSize)):type(typename) - input = input:cuda() - local gbnorm = nn[moduleName](planes):cuda() - gbnorm:evaluate() - gbnorm.weight = sbnorm.weight:cuda() - gbnorm.bias = sbnorm.bias:cuda() - gbnorm.running_mean = sbnorm.running_mean:cuda() - gbnorm.running_var = sbnorm.running_var:cuda() - local rescuda = gbnorm:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gbnorm:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local sbnorm = nn[moduleName](planes):type(ctype) + sbnorm.running_mean:normal(1, 2) + sbnorm.running_var:uniform(1e-3, 2) + sbnorm.running_var = 
sbnorm.running_var:type(typename):type(ctype) + sbnorm.running_mean = sbnorm.running_mean:type(typename):type(ctype) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward evaluate)') + sbnorm:evaluate() + local groundtruth = sbnorm:forward(input) + + input = input:type(typename) + local gbnorm = nn[moduleName](planes):type(typename) + gbnorm:evaluate() + gbnorm.weight = sbnorm.weight:type(typename) + gbnorm.bias = sbnorm.bias:type(typename) + gbnorm.running_mean = sbnorm.running_mean:type(typename) + gbnorm.running_var = sbnorm.running_var:type(typename) + local rescuda = gbnorm:forward(input) + + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename, rescuda:abs():max()), + string.format('error on state (forward evaluate) with %s', typename)) + end end local function BatchNormalization_backward(moduleName, mode, inputSize, backwardFn) assert(mode == 'training' or mode == 'evaluation', 'invalid mode') local planes = inputSize[2] - local tm = {} - local title = moduleName .. '.backward ' .. 
table.concat(inputSize, 'x') - times[title] = tm - local input = torch.randn(table.unpack(inputSize)) - local gradOutput = torch.randn(table.unpack(inputSize)) - local sbnorm = nn[moduleName](planes) - if mode == 'training' then - sbnorm:training() - else - sbnorm:evaluate() - end - sbnorm:forward(input) - sbnorm:zeroGradParameters() - local groundgrad = backwardFn(sbnorm, input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do - sbnorm:zeroGradParameters() - groundgrad = backwardFn(sbnorm, input, gradOutput) - end - local groundweight = sbnorm.gradWeight - local groundbias = sbnorm.gradBias - tm.cpu = a:time().real + for k, typename in ipairs(typenames) do + local input = torch.randn(table.unpack(inputSize)):type(typename) + local gradOutput = torch.randn(table.unpack(inputSize)):type(typename) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gbnorm = nn[moduleName](planes):cuda() - if mode == 'training' then - gbnorm:training() - else - gbnorm:evaluate() - end - gbnorm.weight = sbnorm.weight:cuda() - gbnorm.bias = sbnorm.bias:cuda() - gbnorm:forward(input) - gbnorm:zeroGradParameters() - local rescuda = backwardFn(gbnorm, input, gradOutput) - a:reset() - for i = 1,nloop do + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sbnorm = nn[moduleName](planes):type(ctype) + if mode == 'training' then + sbnorm:training() + else + sbnorm:evaluate() + end + sbnorm:forward(input) + sbnorm:zeroGradParameters() + local groundgrad = backwardFn(sbnorm, input, gradOutput) + local groundweight = sbnorm.gradWeight + local groundbias = sbnorm.gradBias + + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gbnorm = nn[moduleName](planes):type(typename) + if mode == 'training' then + gbnorm:training() + else + gbnorm:evaluate() + end + gbnorm.weight = sbnorm.weight:type(typename) + gbnorm.bias = sbnorm.bias:type(typename) + gbnorm:forward(input) gbnorm:zeroGradParameters() - 
rescuda = backwardFn(gbnorm, input, gradOutput) - end - local weightcuda = gbnorm.gradWeight - local biascuda = gbnorm.gradBias - cutorch.synchronize() - tm.gpu = a:time().real - - local error = rescuda:float() - groundgrad - local werror = weightcuda:float() - groundweight - local berror = biascuda:float() - groundbias - - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') - mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ') - mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ') + local rescuda = backwardFn(gbnorm, input, gradOutput) + local weightcuda = gbnorm.gradWeight + local biascuda = gbnorm.gradBias + + local error = rescuda:double() - groundgrad:double() + local werror = weightcuda:double() - groundweight:double() + local berror = biascuda:double() - groundbias:double() + + local backerror = precision_backward_type(precision_backward, typename, rescuda:abs():max()) + if typename == 'torch.CudaHalfTensor' and (mode == 'training') then + -- this correction is empirical; mean can be off by roughly 4e-4, multiplied by roughly stdval^2. 
+ backerror = backerror + (sbnorm.save_std:max())^2 * 4e-4 + end + mytester:assertlt(error:abs():max(), + backerror, + string.format('error on state (backward) with %s', typename)) + mytester:assertlt(werror:abs():max(), + precision_backward_type(precision_backward, typename, weightcuda:abs():max()), + string.format('error on weight (backward) with %s', typename)) + mytester:assertlt(berror:abs():max(), + precision_backward_type(precision_backward, typename, biascuda:abs():max()), + string.format('error on bias (backward) with %s', typename)) + end end local function testBatchNormalization(name, dim, k) @@ -1027,44 +1002,34 @@ function cunntest.SpatialConvolutionMM_forward_single() local ini = (outi-1)*si+ki-padW*2 local inj = (outj-1)*sj+kj-padH*2 - local tm = {} - local title = string.format('SpatialConvolutionMM.forward %dx%dx%d o %dx%d -> %dx%dx%d [s: %dx%d] [p: %dx%d]', - from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW) - times[title] = tm - local function jacTests(noBias) noBias = noBias or false - local input = torch.randn(from,inj,ini) - local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH) - if noBias then - sconv:noBias() - end - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real - - input = input:cuda() - local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):cuda() - if noBias then - gconv:noBias() - end - gconv.weight = sconv.weight:cuda() - if gconv.bias then - gconv.bias = sconv.bias:cuda() - end - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) + for k, typename in ipairs(typenames) do + local input = torch.randn(from,inj,ini):type(typename) + + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(ctype) + if noBias then + sconv:noBias() + end + local groundtruth = 
sconv:forward(input) + + input = input:type(typename) + local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(typename) + if noBias then + gconv:noBias() + end + gconv.weight = sconv.weight:type(typename) + if gconv.bias then + gconv.bias = sconv.bias:type(typename) + end + local rescuda = gconv:forward(input) + + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) end - cutorch.synchronize() - tm.gpu = a:time().real - - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') end jacTests(false) @@ -1086,44 +1051,34 @@ function cunntest.SpatialConvolutionMM_forward_batch() local ini = (outi-1)*si+ki-padW*2 local inj = (outj-1)*sj+kj-padH*2 - local tm = {} - local title = string.format('SpatialConvolutionMM.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d [s: %dx%d] [p: %dx%d]', - bs, from, inj, ini, kj, ki, bs, to, outj, outi, sj, si, padH, padW) - times[title] = tm - local function jacTests(noBias) noBias = noBias or false - local input = torch.randn(bs,from,inj,ini) - local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH) - if noBias then - sconv:noBias() - end - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) + for k, typename in ipairs(typenames) do + local input = torch.randn(bs,from,inj,ini):type(typename) + + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(ctype) + if noBias then + sconv:noBias() + end + local groundtruth = sconv:forward(input) + + input = input:type(typename) + local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(typename) + if noBias then + gconv:noBias() + end + gconv.weight = 
sconv.weight:type(typename) + if gconv.bias then + gconv.bias = sconv.bias:type(typename) + end + local rescuda = gconv:forward(input) + + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) end - tm.cpu = a:time().real - - input = input:cuda() - local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):cuda() - if noBias then - gconv:noBias() - end - gconv.weight = sconv.weight:cuda() - if gconv.bias then - gconv.bias = sconv.bias:cuda() - end - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real - - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') end @@ -1143,62 +1098,56 @@ function cunntest.SpatialConvolutionMM_backward_single() local ini = (outi-1)*si+ki-padW*2 local inj = (outj-1)*sj+kj-padH*2 - local tm = {} - local title = string.format('SpatialConvolutionMM.backward %dx%dx%d o %dx%d -> %dx%dx%d [s: %dx%d] [p: %dx%d]', - from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW) - times[title] = tm - local function jacTests(noBias) noBias = noBias or false - local input = torch.randn(from,inj,ini) - local gradOutput = torch.randn(to,outj,outi) - local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH) - if noBias then - sconv:noBias() - end - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do - sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - local groundweight = sconv.gradWeight - local groundbias = sconv.gradBias - tm.cpu = a:time().real - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = 
nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):cuda() - if noBias then - gconv:noBias() - end - gconv.weight = sconv.weight:cuda() - if gconv.bias then - gconv.bias = sconv.bias:cuda() - end - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + for k, typename in ipairs(typenames) do + local input = torch.randn(from,inj,ini):type(typename) + local gradOutput = torch.randn(to,outj,outi):type(typename) + + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(ctype) + if noBias then + sconv:noBias() + end + sconv:forward(input) + sconv:zeroGradParameters() + local groundgrad = sconv:backward(input, gradOutput) + local groundweight = sconv.gradWeight + local groundbias = sconv.gradBias + + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(typename) + if noBias then + gconv:noBias() + end + gconv.weight = sconv.weight:type(typename) + if gconv.bias then + gconv.bias = sconv.bias:type(typename) + end + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - local weightcuda = gconv.gradWeight - cutorch.synchronize() - tm.gpu = a:time().real - - local error = rescuda:float() - groundgrad - local werror = weightcuda:float() - groundweight - - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') - mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ') - - if gconv.bias then - local berror = gconv.gradBias:float() - groundbias - mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ') + local rescuda = gconv:backward(input, gradOutput) + local weightcuda = gconv.gradWeight + + local error = rescuda:double() - groundgrad:double() + 
local werror = weightcuda:double() - groundweight:double() + + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + mytester:assertlt(werror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()), + string.format('error on weight (backward) with %s', typename)) + + if gconv.bias then + local berror = gconv.gradBias:double() - groundbias:double() + mytester:assertlt(berror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, gconv.gradBias:abs():max()), + string.format('error on bias (backward) with %s', typename)) + end end end @@ -1221,61 +1170,55 @@ function cunntest.SpatialConvolutionMM_backward_batch() local ini = (outi-1)*si+ki-padW*2 local inj = (outj-1)*sj+kj-padH*2 - local tm = {} - local title = string.format('SpatialConvolutionMM.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d [s: %dx%d] [p: %dx%d]', - bs, from, inj, ini, kj, ki, bs, to, outj, outi, sj, si, padH, padW) - times[title] = tm - local function jacTests(noBias) noBias = noBias or false - local input = torch.randn(bs,from,inj,ini) - local gradOutput = torch.randn(bs,to,outj,outi) - local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH) - if noBias then - sconv:noBias() - end - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do - sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - local groundweight = sconv.gradWeight - local groundbias = sconv.gradBias - tm.cpu = a:time().real - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):cuda() - if noBias then - gconv:noBias() - end - gconv.weight = sconv.weight:cuda() - if gconv.bias then - gconv.bias = sconv.bias:cuda() - end - gconv:forward(input) - 
gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + for k, typename in ipairs(typenames) do + local input = torch.randn(bs,from,inj,ini) + local gradOutput = torch.randn(bs,to,outj,outi) + + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(ctype) + if noBias then + sconv:noBias() + end + sconv:forward(input) + sconv:zeroGradParameters() + local groundgrad = sconv:backward(input, gradOutput) + local groundweight = sconv.gradWeight + local groundbias = sconv.gradBias + + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(typename) + if noBias then + gconv:noBias() + end + gconv.weight = sconv.weight:type(typename) + if gconv.bias then + gconv.bias = sconv.bias:type(typename) + end + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - local weightcuda = gconv.gradWeight - cutorch.synchronize() - tm.gpu = a:time().real - - local error = rescuda:float() - groundgrad - local werror = weightcuda:float() - groundweight - - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') - mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ') - if gconv.bias then - local berror = gconv.gradBias:float() - groundbias - mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ') + local rescuda = gconv:backward(input, gradOutput) + local weightcuda = gconv.gradWeight + + local error = rescuda:double() - groundgrad:double() + local werror = weightcuda:double() - groundweight:double() + + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + 
mytester:assertlt(werror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()), + string.format('error on weight (backward) with %s', typename)) + if gconv.bias then + local berror = gconv.gradBias:double() - groundbias:double() + mytester:assertlt(berror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, gconv.gradBias:abs():max()), + string.format('error on bias (backward) with %s', typename)) + end end end @@ -1297,34 +1240,24 @@ function cunntest.SpatialConvolutionLocal_forward_single() local ini = (outi-1)*si+ki-padW*2 local inj = (outj-1)*sj+kj-padH*2 - local tm = {} - local title = string.format('SpatialConvolutionLocal.forward %dx%dx%d o %dx%d -> %dx%dx%d [s: %dx%d] [p: %dx%d]', - from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(from,inj,ini):type(typename) - local input = torch.randn(from,inj,ini) - local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH) - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(ctype) + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):cuda() - gconv.weight = sconv.weight:cuda() - gconv.bias = sconv.bias:cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(typename) + gconv.weight = sconv.weight:type(typename) + gconv.bias = sconv.bias:type(typename) + local 
rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.SpatialConvolutionLocal_forward_batch() @@ -1342,34 +1275,24 @@ function cunntest.SpatialConvolutionLocal_forward_batch() local ini = (outi-1)*si+ki-padW*2 local inj = (outj-1)*sj+kj-padH*2 - local tm = {} - local title = string.format('SpatialConvolutionLocal.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d [s: %dx%d] [p: %dx%d]', - bs, from, inj, ini, kj, ki, bs, to, outj, outi, sj, si, padH, padW) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(bs,from,inj,ini):type(typename) - local input = torch.randn(bs,from,inj,ini) - local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH) - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(ctype) + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):cuda() - gconv.weight = sconv.weight:cuda() - gconv.bias = sconv.bias:cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(typename) + gconv.weight = sconv.weight:type(typename) + gconv.bias = sconv.bias:type(typename) + local rescuda = gconv:forward(input) 
- local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.SpatialConvolutionLocal_backward_single() @@ -1386,51 +1309,44 @@ function cunntest.SpatialConvolutionLocal_backward_single() local ini = (outi-1)*si+ki-padW*2 local inj = (outj-1)*sj+kj-padH*2 - local tm = {} - local title = string.format('SpatialConvolutionLocal.backward %dx%dx%d o %dx%d -> %dx%dx%d [s: %dx%d] [p: %dx%d]', - from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(from,inj,ini):type(typename) + local gradOutput = torch.randn(to,outj,outi):type(typename) - local input = torch.randn(from,inj,ini) - local gradOutput = torch.randn(to,outj,outi) - local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH) - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(ctype) + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - local groundweight = sconv.gradWeight - local groundbias = sconv.gradBias - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) + local groundweight = sconv.gradWeight + local groundbias = sconv.gradBias - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):cuda() - gconv.weight = sconv.weight:cuda() - gconv.bias = 
sconv.bias:cuda() - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(typename) + gconv.weight = sconv.weight:type(typename) + gconv.bias = sconv.bias:type(typename) + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - local weightcuda = gconv.gradWeight - local biascuda = gconv.gradBias - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) + local weightcuda = gconv.gradWeight + local biascuda = gconv.gradBias - local error = rescuda:float() - groundgrad - local werror = weightcuda:float() - groundweight - local berror = biascuda:float() - groundbias + local error = rescuda:double() - groundgrad:double() + local werror = weightcuda:double() - groundweight:double() + local berror = biascuda:double() - groundbias:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') - mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ') - mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + mytester:assertlt(werror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()), + string.format('error on weight (backward) with %s', typename)) + mytester:assertlt(berror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()), + string.format('error on bias (backward) with %s', typename)) + end end function cunntest.SpatialConvolutionLocal_backward_batch() @@ -1448,51 +1364,44 @@ function 
cunntest.SpatialConvolutionLocal_backward_batch() local ini = (outi-1)*si+ki-padW*2 local inj = (outj-1)*sj+kj-padH*2 - local tm = {} - local title = string.format('SpatialConvolutionLocal.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d [s: %dx%d] [p: %dx%d]', - bs, from, inj, ini, kj, ki, bs, to, outj, outi, sj, si, padH, padW) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(bs,from,inj,ini):type(typename) + local gradOutput = torch.randn(bs,to,outj,outi):type(typename) - local input = torch.randn(bs,from,inj,ini) - local gradOutput = torch.randn(bs,to,outj,outi) - local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH) - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(ctype) + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - local groundweight = sconv.gradWeight - local groundbias = sconv.gradBias - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) + local groundweight = sconv.gradWeight + local groundbias = sconv.gradBias - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):cuda() - gconv.weight = sconv.weight:cuda() - gconv.bias = sconv.bias:cuda() - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(typename) + gconv.weight = sconv.weight:type(typename) + gconv.bias = sconv.bias:type(typename) + 
gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - local weightcuda = gconv.gradWeight - local biascuda = gconv.gradBias - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) + local weightcuda = gconv.gradWeight + local biascuda = gconv.gradBias - local error = rescuda:float() - groundgrad - local werror = weightcuda:float() - groundweight - local berror = biascuda:float() - groundbias + local error = rescuda:double() - groundgrad:double() + local werror = weightcuda:double() - groundweight:double() + local berror = biascuda:double() - groundbias:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') - mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ') - mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + mytester:assertlt(werror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()), + string.format('error on weight (backward) with %s', typename)) + mytester:assertlt(berror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()), + string.format('error on bias (backward) with %s', typename)) + end end function cunntest.SpatialFullConvolution_forward_single() @@ -1511,46 +1420,34 @@ function cunntest.SpatialFullConvolution_forward_single() local ini = math.floor((outi + 2 * padW - ki) / si + 1) local inj = math.floor((outj + 2 * padH - kj) / sj + 1) - local tm = {} - local title = - string.format('SpatialFullConvolution.forward %dx%dx%d o %dx%d ' - .. 
'-> %dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]', - from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW, adjH, adjW) - times[title] = tm - local function jacTests(noBias) noBias = noBias or false - local input = torch.randn(from,inj,ini) - local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH) - if noBias then - sconv:noBias() + for k, typename in ipairs(typenames) do + local input = torch.randn(from,inj,ini):type(typename) + + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(ctype) + if noBias then + sconv:noBias() + end + local groundtruth = sconv:forward(input) + + input = input:type(typename) + local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(typename) + if noBias then + gconv:noBias() + end + gconv.weight = sconv.weight:type(typename) + if gconv.bias then + gconv.bias = sconv.bias:type(typename) + end + local rescuda = gconv:forward(input) + + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) end - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real - - input = input:cuda() - local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):cuda() - if noBias then - gconv:noBias() - end - gconv.weight = sconv.weight:cuda() - if gconv.bias then - gconv.bias = sconv.bias:cuda() - end - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real - - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') end jacTests(false) @@ -1574,44 +1471,34 @@ function 
cunntest.SpatialFullConvolution_forward_batch() local ini = math.floor((outi + 2 * padW - ki) / si + 1) local inj = math.floor((outj + 2 * padH - kj) / sj + 1) - local tm = {} - local title = string.format('SpatialFullConvolution.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]', - bs, from, inj, ini, kj, ki, bs, to, outj, outi, sj, si, padH, padW, adjH, adjW) - times[title] = tm - local function jacTests(noBias) noBias = noBias or false - local input = torch.randn(bs,from,inj,ini) - local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH) - if noBias then - sconv:noBias() - end - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real - - input = input:cuda() - local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):cuda() - if noBias then - gconv:noBias() + for k, typename in ipairs(typenames) do + local input = torch.randn(bs,from,inj,ini):type(typename) + + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(ctype) + if noBias then + sconv:noBias() + end + local groundtruth = sconv:forward(input) + + input = input:type(typename) + local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(typename) + if noBias then + gconv:noBias() + end + gconv.weight = sconv.weight:type(typename) + if gconv.bias then + gconv.bias = sconv.bias:type(typename) + end + local rescuda = gconv:forward(input) + + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) end - gconv.weight = sconv.weight:cuda() - if gconv.bias then - gconv.bias = sconv.bias:cuda() - end - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - 
rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real - - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') end jacTests(false) @@ -1634,62 +1521,54 @@ function cunntest.SpatialFullConvolution_backward_single() local ini = math.floor((outi + 2 * padW - ki) / si + 1) local inj = math.floor((outj + 2 * padH - kj) / sj + 1) - local tm = {} - local title = string.format('SpatialFullConvolution.backward %dx%dx%d o %dx%d -> %dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]', - from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW, adjH, adjW) - times[title] = tm - local function jacTests(noBias) noBias = noBias or false - local input = torch.randn(from,inj,ini) - local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH) - if noBias then - sconv:noBias() - end - local output = sconv:forward(input) - local gradOutput = output:clone():normal() - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + for k, typename in ipairs(typenames) do + local input = torch.randn(from,inj,ini):type(typename) + + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(ctype) + if noBias then + sconv:noBias() + end + local output = sconv:forward(input) + local gradOutput = output:clone():normal() sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - local groundweight = sconv.gradWeight - local groundbias = sconv.gradBias - tm.cpu = a:time().real - - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):cuda() - if noBias then - gconv:noBias() - end - gconv.weight = sconv.weight:cuda() - if gconv.bias then - gconv.bias = sconv.bias:cuda() - end - gconv:forward(input) - 
gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + local groundgrad = sconv:backward(input, gradOutput) + local groundweight = sconv.gradWeight + local groundbias = sconv.gradBias + + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(typename) + if noBias then + gconv:noBias() + end + gconv.weight = sconv.weight:type(typename) + if gconv.bias then + gconv.bias = sconv.bias:type(typename) + end + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - local weightcuda = gconv.gradWeight - cutorch.synchronize() - tm.gpu = a:time().real - - local error = rescuda:float() - groundgrad - local werror = weightcuda:float() - groundweight - - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') - mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ') - - if gconv.bias then - local berror = gconv.gradBias:float() - groundbias - mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ') + local rescuda = gconv:backward(input, gradOutput) + local weightcuda = gconv.gradWeight + + local error = rescuda:double() - groundgrad:double() + local werror = weightcuda:double() - groundweight:double() + + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + mytester:assertlt(werror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()), + string.format('error on weight (backward) with %s', typename)) + + if gconv.bias then + local berror = gconv.gradBias:double() - groundbias:double() + mytester:assertlt(berror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, gconv.gradBias:abs():max()), + 
string.format('error on bias (backward) with %s', typename)) + end end end @@ -1714,63 +1593,54 @@ function cunntest.SpatialFullConvolution_backward_batch() local ini = math.floor((outi + 2 * padW - ki) / si + 1) local inj = math.floor((outj + 2 * padH - kj) / sj + 1) - local tm = {} - local title = string.format('SpatialFullConvolution.backward %dx%dx%dx%d o %dx%d ' - .. '-> %dx%dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]', - bs, from, inj, ini, kj, ki, - bs, to, outj, outi, sj, si, padH, padW, adjH, adjW) - times[title] = tm - local function jacTests(noBias) noBias = noBias or false - local input = torch.randn(bs,from,inj,ini) - local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH) - if noBias then - sconv:noBias() - end - local output = sconv:forward(input) - local gradOutput = output:clone():normal() - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do - sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - local groundweight = sconv.gradWeight - local groundbias = sconv.gradBias - tm.cpu = a:time().real - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):cuda() - if noBias then - gconv:noBias() - end - gconv.weight = sconv.weight:cuda() - if gconv.bias then - gconv.bias = sconv.bias:cuda() - end - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + for k, typename in ipairs(typenames) do + local input = torch.randn(bs,from,inj,ini):type(typename) + + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(ctype) + if noBias then + sconv:noBias() + end + local output = sconv:forward(input) + local gradOutput = output:clone():normal() + sconv:zeroGradParameters() + local 
groundgrad = sconv:backward(input, gradOutput) + local groundweight = sconv.gradWeight + local groundbias = sconv.gradBias + + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(typename) + if noBias then + gconv:noBias() + end + gconv.weight = sconv.weight:type(typename) + if gconv.bias then + gconv.bias = sconv.bias:type(typename) + end + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - local weightcuda = gconv.gradWeight - cutorch.synchronize() - tm.gpu = a:time().real - - local error = rescuda:float() - groundgrad - local werror = weightcuda:float() - groundweight - - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') - mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ') - if gconv.bias then - local berror = gconv.gradBias:float() - groundbias - mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ') + local rescuda = gconv:backward(input, gradOutput) + local weightcuda = gconv.gradWeight + + local error = rescuda:double() - groundgrad:double() + local werror = weightcuda:double() - groundweight:double() + + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + mytester:assertlt(werror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()), + string.format('error on weight (backward) with %s', typename)) + if gconv.bias then + local berror = gconv.gradBias:double() - groundbias:double() + mytester:assertlt(berror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, gconv.gradBias:abs():max()), + string.format('error on bias (backward) with %s', typename)) + end end end @@ -1794,36 +1664,24 @@ function 
cunntest.SpatialDilatedConvolution_forward_single() local ini = (outi - 1) * si - 2 * padW + dilationW * (ki-1) + 1 local inj = (outj - 1) * sj - 2 * padH + dilationH * (kj-1) + 1 - local tm = {} - local title = - string.format('SpatialDilatedConvolution.forward %dx%dx%d o %dx%d ' - .. '-> %dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]', - from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW, dilationH, dilationW) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(from,inj,ini):type(typename) - local input = torch.randn(from,inj,ini) - local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH) - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(ctype) + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):cuda() - gconv.weight = sconv.weight:cuda() - gconv.bias = sconv.bias:cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(typename) + gconv.weight = sconv.weight:type(typename) + gconv.bias = sconv.bias:type(typename) + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with 
%s', typename)) + end end function cunntest.SpatialDilatedConvolution_forward_batch() @@ -1838,39 +1696,29 @@ function cunntest.SpatialDilatedConvolution_forward_batch() local padH = math.random(0,1) local outi = math.random(ki, 64) local outj = math.random(kj, 64) - local dilationW = math.random(0,10) - local dilationH = math.random(0,10) + local dilationW = math.random(1,10) + local dilationH = math.random(1,10) local ini = (outi - 1) * si - 2 * padW + dilationW * (ki-1) + 1 local inj = (outj - 1) * sj - 2 * padH + dilationH * (kj-1) + 1 - local tm = {} - local title = string.format('SpatialDilatedConvolution.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]', - bs, from, inj, ini, kj, ki, bs, to, outj, outi, sj, si, padH, padW, dilationH, dilationW) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(bs,from,inj,ini):type(typename) - local input = torch.randn(bs,from,inj,ini) - local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH) - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(ctype) + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):cuda() - gconv.weight = sconv.weight:cuda() - gconv.bias = sconv.bias:cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(typename) + gconv.weight = sconv.weight:type(typename) + gconv.bias = 
sconv.bias:type(typename) + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.SpatialDilatedConvolution_backward_single() @@ -1884,56 +1732,48 @@ function cunntest.SpatialDilatedConvolution_backward_single() local padH = math.random(0,1) local outi = math.random(ki, 64) local outj = math.random(kj, 64) - local dilationW = math.random(0,10) - local dilationH = math.random(0,10) + local dilationW = math.random(1,10) + local dilationH = math.random(1,10) local ini = (outi - 1) * si - 2 * padW + dilationW * (ki-1) + 1 local inj = (outj - 1) * sj - 2 * padH + dilationH * (kj-1) + 1 - local tm = {} - local title = string.format('SpatialDilatedConvolution.backward %dx%dx%d o %dx%d -> %dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]', - from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW, dilationH, dilationW) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(from,inj,ini):type(typename) - local input = torch.randn(from,inj,ini) - local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH) - local output = sconv:forward(input) - local gradOutput = output:clone():normal() - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(ctype) + local output = sconv:forward(input) + local gradOutput = output:clone():normal() sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - local groundweight = 
sconv.gradWeight - local groundbias = sconv.gradBias - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) + local groundweight = sconv.gradWeight + local groundbias = sconv.gradBias - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):cuda() - gconv.weight = sconv.weight:cuda() - gconv.bias = sconv.bias:cuda() - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(typename) + gconv.weight = sconv.weight:type(typename) + gconv.bias = sconv.bias:type(typename) + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - local weightcuda = gconv.gradWeight - local biascuda = gconv.gradBias - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) + local weightcuda = gconv.gradWeight + local biascuda = gconv.gradBias - local error = rescuda:float() - groundgrad - local werror = weightcuda:float() - groundweight - local berror = biascuda:float() - groundbias + local error = rescuda:double() - groundgrad:double() + local werror = weightcuda:double() - groundweight:double() + local berror = biascuda:double() - groundbias:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') - mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ') - mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + mytester:assertlt(werror:abs():max(), + 
precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()), + string.format('error on weight (backward) with %s', typename)) + mytester:assertlt(berror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()), + string.format('error on bias (backward) with %s', typename)) + end end function cunntest.SpatialDilatedConvolution_backward_batch() @@ -1948,58 +1788,48 @@ function cunntest.SpatialDilatedConvolution_backward_batch() local padH = math.random(0,1) local outi = math.random(ki, 64) local outj = math.random(kj, 64) - local dilationW = math.random(0,10) - local dilationH = math.random(0,10) + local dilationW = math.random(1,10) + local dilationH = math.random(1,10) local ini = (outi - 1) * si - 2 * padW + dilationW * (ki-1) + 1 local inj = (outj - 1) * sj - 2 * padH + dilationH * (kj-1) + 1 - local tm = {} - local title = string.format('SpatialDilatedConvolution.backward %dx%dx%dx%d o %dx%d ' - .. '-> %dx%dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]', - bs, from, inj, ini, kj, ki, - bs, to, outj, outi, sj, si, padH, padW, dilationH, dilationW) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(bs,from,inj,ini):type(typename) - local input = torch.randn(bs,from,inj,ini) - local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH) - local output = sconv:forward(input) - local gradOutput = output:clone():normal() - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(ctype) + local output = sconv:forward(input) + local gradOutput = output:clone():normal() sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - local groundweight = sconv.gradWeight - local 
groundbias = sconv.gradBias - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) + local groundweight = sconv.gradWeight + local groundbias = sconv.gradBias - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):cuda() - gconv.weight = sconv.weight:cuda() - gconv.bias = sconv.bias:cuda() - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(typename) + gconv.weight = sconv.weight:type(typename) + gconv.bias = sconv.bias:type(typename) + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - local weightcuda = gconv.gradWeight - local biascuda = gconv.gradBias - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) + local weightcuda = gconv.gradWeight + local biascuda = gconv.gradBias - local error = rescuda:float() - groundgrad - local werror = weightcuda:float() - groundweight - local berror = biascuda:float() - groundbias + local error = rescuda:double() - groundgrad:double() + local werror = weightcuda:double() - groundweight:double() + local berror = biascuda:double() - groundbias:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') - mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ') - mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + mytester:assertlt(werror:abs():max(), + 
precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()), + string.format('error on weight (backward) with %s', typename)) + mytester:assertlt(berror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()), + string.format('error on bias (backward) with %s', typename)) + end end function cunntest.SpatialSubSampling_forward() @@ -2014,37 +1844,27 @@ function cunntest.SpatialSubSampling_forward() local ini = (outi-1)*si+ki local inj = (outj-1)*sj+kj - local tm = {} - local title = string.format('SpatialSubSampling.forward %dx%dx%d o %dx%d -> %dx%dx%d', - from, inj, ini, kj, ki, to, outj, outi) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(from,inj,ini):type(typename) - local input = torch.randn(from,inj,ini) - local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj) - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(ctype) + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):cuda() - gconv.weight = sconv.weight:cuda() - gconv.bias = sconv.bias:cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(typename) + gconv.weight = sconv.weight:type(typename) + gconv.bias = sconv.bias:type(typename) + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), 
precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end -function cunntest.SpatialSubSampling_forward_batch() +function cunntest.Sampling_forward_batch() local bs = math.random(4,10) local from = math.random(1,64) local to = from @@ -2057,34 +1877,24 @@ function cunntest.SpatialSubSampling_forward_batch() local ini = (outi-1)*si+ki local inj = (outj-1)*sj+kj - local tm = {} - local title = string.format('SpatialSubSampling.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d', - bs, from, inj, ini, kj, ki, bs, to, outj, outi) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(bs,from,inj,ini):type(typename) - local input = torch.randn(bs,from,inj,ini) - local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj) - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(ctype) + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):cuda() - gconv.weight = sconv.weight:cuda() - gconv.bias = sconv.bias:cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(typename) + gconv.weight = sconv.weight:type(typename) + gconv.bias = sconv.bias:type(typename) + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state 
(forward) with %s', typename)) + end end function cunntest.SpatialSubSampling_backward() @@ -2099,51 +1909,50 @@ function cunntest.SpatialSubSampling_backward() local ini = (outi-1)*si+ki local inj = (outj-1)*sj+kj - local tm = {} - local title = string.format('SpatialSubSampling.backward %dx%dx%d o %dx%d -> %dx%dx%d', - from, inj, ini, kj, ki, to, outj, outi) - times[title] = tm + for k, typename in ipairs(typenames) do + -- FIXME: SpatialSubSampling accumulates directly to real, causes + -- precision issues with half + precision_backward_old = precision_backward + if typename == 'torch.CudaHalfTensor' then + precision_backward = 0.4 + end + local input = torch.randn(from,inj,ini):type(typename) + local gradOutput = torch.randn(to,outj,outi):type(typename) - local input = torch.randn(from,inj,ini) - local gradOutput = torch.randn(to,outj,outi) - local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj) - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(ctype) + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - local groundweight = sconv.gradWeight - local groundbias = sconv.gradBias - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) + local groundweight = sconv.gradWeight + local groundbias = sconv.gradBias - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):cuda() - gconv.weight = sconv.weight:cuda() - gconv.bias = sconv.bias:cuda() - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = 
nn.SpatialSubSampling(from,ki,kj,si,sj):type(typename) + gconv.weight = sconv.weight:type(typename) + gconv.bias = sconv.bias:type(typename) + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - local weightcuda = gconv.gradWeight - local biascuda = gconv.gradBias - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) + local weightcuda = gconv.gradWeight + local biascuda = gconv.gradBias - local error = rescuda:float() - groundgrad - local werror = weightcuda:float() - groundweight - local berror = biascuda:float() - groundbias + local error = rescuda:double() - groundgrad:double() + local werror = weightcuda:double() - groundweight:double() + local berror = biascuda:double() - groundbias:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') - mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ') - mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + mytester:assertlt(werror:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on weight (backward) with %s', typename)) + mytester:assertlt(berror:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on bias (backward) with %s', typename)) + + precision_backward = precision_backward_old + end end function cunntest.SpatialSubSampling_backward_batch() @@ -2159,51 +1968,47 @@ function cunntest.SpatialSubSampling_backward_batch() local ini = (outi-1)*si+ki local inj = (outj-1)*sj+kj - local tm = {} - local title = string.format('SpatialSubSampling.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d', - bs, from, inj, ini, kj, ki, bs, to, outj, outi) - times[title] = tm + for k, typename in 
ipairs(typenames) do + local input = torch.randn(bs,from,inj,ini):type(typename) + local gradOutput = torch.randn(bs,to,outj,outi):type(typename) - local input = torch.randn(bs,from,inj,ini) - local gradOutput = torch.randn(bs,to,outj,outi) - local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj) - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(ctype) + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - local groundweight = sconv.gradWeight - local groundbias = sconv.gradBias - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) + local groundweight = sconv.gradWeight + local groundbias = sconv.gradBias - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):cuda() - gconv.weight = sconv.weight:cuda() - gconv.bias = sconv.bias:cuda() - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(typename) + gconv.weight = sconv.weight:type(typename) + gconv.bias = sconv.bias:type(typename) + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - local weightcuda = gconv.gradWeight - local biascuda = gconv.gradBias - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) + local weightcuda = gconv.gradWeight + local biascuda = gconv.gradBias - local error = rescuda:float() - groundgrad - local werror = weightcuda:float() - groundweight - local berror = biascuda:float() - groundbias 
+ local error = rescuda:double() - groundgrad:double() + local werror = weightcuda:double() - groundweight:double() + local berror = biascuda:double() - groundbias:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') - mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ') - mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ') + -- FIXME: SpatialSubSampling accumulates directly to real, causes + -- precision issues with half, so we double the error tolerance + mytester:assertlt(error:abs():max(), + 2*precision_backward_type(precision_backward, typename, rescuda:abs():max()), + string.format('error on state (backward) with %s', typename)) + mytester:assertlt(werror:abs():max(), + 2*precision_backward_type(precision_backward, typename, weightcuda:abs():max()), + string.format('error on weight (backward) with %s', typename)) + mytester:assertlt(berror:abs():max(), + 2*precision_backward_type(precision_backward, typename, biascuda:abs():max()), + string.format('error on bias (backward) with %s', typename)) + end end function cunntest.SpatialMaxPooling_forward() @@ -2221,36 +2026,26 @@ function cunntest.SpatialMaxPooling_forward() local inj = (outj-1)*sj+kj - padj*2 local ceil_mode = math.random(0,1) == 1 - local tm = {} - local title = string.format('SpatialMaxPooling.forward %dx%dx%d o %dx%d -> %dx%dx%d', - from, inj, ini, kj, ki, to, outj, outi) - times[title] = tm - - local input = torch.randn(from,inj,ini) - local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj) - if ceil_mode then sconv:ceil() end - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + for k, typename in ipairs(typenames) do + local input = torch.randn(from,inj,ini):type(typename) + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = 
nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(ctype) + if ceil_mode then sconv:ceil() end + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):cuda() - if ceil_mode then gconv:ceil() end - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(typename) + if ceil_mode then gconv:ceil() end + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') - local error_ind = gconv.indices:float() - sconv.indices - mytester:asserteq(error_ind:max(), 0, 'error on indices (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + local error_ind = gconv.indices:long() - sconv.indices + mytester:asserteq(error_ind:max(), 0, + string.format('error on indices (forward) with %s', typename)) + end end function cunntest.SpatialMaxPooling_forward_batch() @@ -2269,34 +2064,23 @@ function cunntest.SpatialMaxPooling_forward_batch() local inj = (outj-1)*sj+kj - padj*2 local ceil_mode = math.random(0,1) == 1 - local tm = {} - local title = string.format('SpatialMaxPooling.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d', - bs, from, inj, ini, kj, ki, bs, to, outj, outi) - times[title] = tm - - local input = torch.randn(bs,from,inj,ini) - local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj) - if ceil_mode then sconv:ceil() end - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + for k, typename in ipairs(typenames) do + local input = 
torch.randn(bs,from,inj,ini):type(typename) + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(ctype) + if ceil_mode then sconv:ceil() end + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):cuda() - if ceil_mode then gconv:ceil() end - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(typename) + if ceil_mode then gconv:ceil() end + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.SpatialMaxUnpooling_forward_batch() @@ -2316,40 +2100,29 @@ function cunntest.SpatialMaxUnpooling_forward_batch() local ini = fun((outi + padi*2 - ki)/si) +1 local inj = fun((outj + padj*2 - kj)/sj) +1 - local tm = {} - local title = string.format('SpatialMaxUnpooling.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d', - bs, from, inj, ini, kj, ki, bs, to, outj, outi) - times[title] = tm + for k, typename in ipairs(typenames) do + local ctype = t2cpu[typename] + local pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(ctype) + if ceil_mode then pooler:ceil() end + local sunpool = nn.SpatialMaxUnpooling(pooler):type(ctype) - local pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj) - if ceil_mode then pooler:ceil() end - local sunpool = nn.SpatialMaxUnpooling(pooler) - - local original = torch.randn(bs,from,outj,outi) - local input = pooler:forward(original) - local groundtruth = 
sunpool:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sunpool:forward(input) - end - tm.cpu = a:time().real + local original = torch.randn(bs,from,outj,outi):type(typename) + original = original:type(ctype) + local input = pooler:forward(original) + local groundtruth = sunpool:forward(input) - original = original:cuda() - pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):cuda() - if ceil_mode then pooler:ceil() end - local gunpool = nn.SpatialMaxUnpooling(pooler):cuda() + original = original:type(typename) + pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(typename) + if ceil_mode then pooler:ceil() end + local gunpool = nn.SpatialMaxUnpooling(pooler):type(typename) - input = pooler:forward(original) - local rescuda = gunpool:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gunpool:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = pooler:forward(original) + local rescuda = gunpool:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.SpatialMaxPooling_backward() @@ -2365,45 +2138,34 @@ function cunntest.SpatialMaxPooling_backward() local padj = math.random(0,kj/2-1) local ini = (outi-1)*si+ki - padi*2 local inj = (outj-1)*sj+kj - padj*2 - local ceil_mode = math.random(0,1) == 1 + local ceil_mode = true--math.random(0,1) == 1 - local tm = {} - local title = string.format('SpatialMaxPooling.backward %dx%dx%d o %dx%d -> %dx%dx%d', - from, inj, ini, kj, ki, to, outj, outi) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(from,inj,ini):type(typename) + local gradOutput = torch.randn(to,outj,outi):type(typename) + local 
ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) - local input = torch.randn(from,inj,ini) - local gradOutput = torch.randn(to,outj,outi) - local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj) - if ceil_mode then sconv:ceil() end - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(ctype) + if ceil_mode then sconv:ceil() end + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):cuda() - if ceil_mode then gconv:ceil() end - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(typename) + if ceil_mode then gconv:ceil() end + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) - local error = rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.SpatialMaxPooling_backward_batch() @@ -2422,43 +2184,31 @@ function cunntest.SpatialMaxPooling_backward_batch() local inj = (outj-1)*sj+kj - padj*2 local ceil_mode = math.random(0,1) == 1 - local tm = {} - local 
title = string.format('SpatialMaxPooling.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d', - bs, from, inj, ini, kj, ki, bs, to, outj, outi) - times[title] = tm - - local input = torch.randn(bs,from,inj,ini) - local gradOutput = torch.randn(bs,to,outj,outi) - local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj) - if ceil_mode then sconv:ceil() end - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + for k, typename in ipairs(typenames) do + local input = torch.randn(bs,from,inj,ini):type(typename) + local gradOutput = torch.randn(bs,to,outj,outi):type(typename) + local ctype = t2cpu[typename] + local input = input:type(ctype) + local gradOutput = gradOutput:type(ctype) + local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(ctype) + if ceil_mode then sconv:ceil() end + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):cuda() - if ceil_mode then gconv:ceil() end - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(typename) + if ceil_mode then gconv:ceil() end + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) - local error = rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + mytester:assertlt(error:abs():max(), 
precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.SpatialMaxUnpooling_backward_batch() @@ -2478,50 +2228,38 @@ function cunntest.SpatialMaxUnpooling_backward_batch() local ini = fun((outi + padi*2 - ki)/si) +1 local inj = fun((outj + padj*2 - kj)/sj) +1 - local tm = {} - local title = string.format('SpatialMaxUnpooling.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d', - bs, from, inj, ini, kj, ki, bs, to, outj, outi) - times[title] = tm - - local pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj) - if ceil_mode then pooler:ceil() end - local sunpool = nn.SpatialMaxUnpooling(pooler) - - local original = torch.randn(bs,from,outj,outi) - local input = pooler:forward(original) - local gradOutput = torch.randn(original:size()) - sunpool:forward(input) - sunpool:zeroGradParameters() - local groundgrad = sunpool:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + for k, typename in ipairs(typenames) do + local ctype = t2cpu[typename] + local pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(ctype) + if ceil_mode then pooler:ceil() end + local sunpool = nn.SpatialMaxUnpooling(pooler):type(ctype) + + local original = torch.randn(bs,from,outj,outi):type(typename) + original = original:type(ctype) + local input = pooler:forward(original) + local gradOutput = torch.randn(original:size()):type(typename) + gradOutput = gradOutput:type(ctype) + sunpool:forward(input) sunpool:zeroGradParameters() - groundgrad = sunpool:backward(input, gradOutput) - end - tm.cpu = a:time().real + local groundgrad = sunpool:backward(input, gradOutput) - pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):cuda() - if ceil_mode then pooler:ceil() end - local gunpool = nn.SpatialMaxUnpooling(pooler):cuda() + pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(typename) + if ceil_mode then pooler:ceil() end + local gunpool = nn.SpatialMaxUnpooling(pooler):type(typename) 
- original = original:cuda() - input = pooler:forward(original) - gunpool:forward(input) + original = original:type(typename) + input = pooler:forward(original) + gunpool:forward(input) - gradOutput = gradOutput:cuda() - gunpool:zeroGradParameters() - local rescuda = gunpool:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + gradOutput = gradOutput:type(typename) gunpool:zeroGradParameters() - rescuda = gunpool:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gunpool:backward(input, gradOutput) - local error = rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.SpatialDilatedMaxPooling_forward() @@ -2541,36 +2279,26 @@ function cunntest.SpatialDilatedMaxPooling_forward() local inj = (outj-1)*sj+(dilationj*(kj-1)+1)-2*padj local ceil_mode = math.random(0,1) == 1 - local tm = {} - local title = string.format('SpatialDilatedMaxPooling.forward %dx%dx%d o %dx%d -> %dx%dx%d', - from, inj, ini, kj, ki, to, outj, outi) - times[title] = tm - - local input = torch.randn(from,inj,ini) - local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj) - if ceil_mode then sconv:ceil() end - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + for k, typename in ipairs(typenames) do + local input = torch.randn(from,inj,ini):type(typename) + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(ctype) + if ceil_mode then sconv:ceil() end + local groundtruth = sconv:forward(input) - input = 
input:cuda() - local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):cuda() - if ceil_mode then gconv:ceil() end - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(typename) + if ceil_mode then gconv:ceil() end + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') - local error_ind = gconv.indices:float() - sconv.indices - mytester:asserteq(error_ind:max(), 0, 'error on indices (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + local error_ind = gconv.indices:long() - sconv.indices + mytester:asserteq(error_ind:max(), 0, + string.format('error on indices (forward) with %s', typename)) + end end function cunntest.SpatialDilatedMaxPooling_forward_batch() @@ -2591,34 +2319,23 @@ function cunntest.SpatialDilatedMaxPooling_forward_batch() local inj = (outj-1)*sj+(dilationj*(kj-1)+1)-2*padj local ceil_mode = math.random(0,1) == 1 - local tm = {} - local title = string.format('SpatialDilatedMaxPooling.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d', - bs, from, inj, ini, kj, ki, bs, to, outj, outi) - times[title] = tm - - local input = torch.randn(bs,from,inj,ini) - local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj) - if ceil_mode then sconv:ceil() end - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + for k, typename in ipairs(typenames) do + local input = 
torch.randn(bs,from,inj,ini):type(typename) + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(ctype) + if ceil_mode then sconv:ceil() end + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):cuda() - if ceil_mode then gconv:ceil() end - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(typename) + if ceil_mode then gconv:ceil() end + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.SpatialDilatedMaxPooling_backward() @@ -2638,43 +2355,31 @@ function cunntest.SpatialDilatedMaxPooling_backward() local inj = (outj-1)*sj+(dilationj*(kj-1)+1)-2*padj local ceil_mode = math.random(0,1) == 1 - local tm = {} - local title = string.format('SpatialDilatedMaxPooling.backward %dx%dx%d o %dx%d -> %dx%dx%d', - from, inj, ini, kj, ki, to, outj, outi) - times[title] = tm - - local input = torch.randn(from,inj,ini) - local gradOutput = torch.randn(to,outj,outi) - local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj) - if ceil_mode then sconv:ceil() end - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + for k, typename in ipairs(typenames) do + local input = 
torch.randn(from,inj,ini):type(typename) + local gradOutput = torch.randn(to,outj,outi):type(typename) + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(ctype) + if ceil_mode then sconv:ceil() end + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):cuda() - if ceil_mode then gconv:ceil() end - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(typename) + if ceil_mode then gconv:ceil() end + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) - local error = rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.SpatialDilatedMaxPooling_backward_batch() @@ -2695,43 +2400,31 @@ function cunntest.SpatialDilatedMaxPooling_backward_batch() local inj = (outj-1)*sj+(dilationj*(kj-1)+1)-2*padj local ceil_mode = math.random(0,1) == 1 - local tm = {} - local title = string.format('SpatialDilatedMaxPooling.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d', - bs, from, inj, ini, kj, 
ki, bs, to, outj, outi) - times[title] = tm - - local input = torch.randn(bs,from,inj,ini) - local gradOutput = torch.randn(bs,to,outj,outi) - local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj) - if ceil_mode then sconv:ceil() end - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + for k, typename in ipairs(typenames) do + local input = torch.randn(bs,from,inj,ini):type(typename) + local gradOutput = torch.randn(bs,to,outj,outi):type(typename) + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(ctype) + if ceil_mode then sconv:ceil() end + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):cuda() - if ceil_mode then gconv:ceil() end - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(typename) + if ceil_mode then gconv:ceil() end + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) - local error = rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + mytester:assertlt(error:abs():max(), 
precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.SpatialFractionalMaxPooling_forward() @@ -2752,66 +2445,55 @@ function cunntest.SpatialFractionalMaxPooling_forward() local ratioW = outW / inW local ratioH = outH / inH - local tm = {} - local title = - string.format('SpatialFractionalMaxPooling.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d', - batch, plane, inH, inW, poolSizeH, poolSizeW, batch, plane, outH, outW) - times[title] = tm - - local input = nil - if batch == 1 then - input = torch.Tensor(plane, inH, inW):uniform() - else - input = torch.Tensor(batch, plane, inH, inW):uniform() - end + for k, typename in ipairs(typenames) do + local input = nil + if batch == 1 then + input = torch.Tensor(plane, inH, inW):uniform():type(typename) + else + input = torch.Tensor(batch, plane, inH, inW):uniform():type(typename) + end - local module = nil - if useRatio then - module = - nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, ratioW, ratioH) - else - module = - nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH) - end + local ctype = t2cpu[typename] + input = input:type(ctype) + local module = nil + if useRatio then + module = + nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, ratioW, ratioH):type(ctype) + else + module = + nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH):type(ctype) + end - module:fixPoolingRegions() + module:fixPoolingRegions() - local groundtruth = module:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = module:forward(input) - end - tm.cpu = a:time().real + local groundtruth = module:forward(input) - input = input:cuda() + input = input:type(typename) - local gmodule = nil - if useRatio then - gmodule = - nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, ratioW, ratioH) - else - gmodule = - nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH) - end + local gmodule = nil + if 
useRatio then + gmodule = + nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, ratioW, ratioH) + else + gmodule = + nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH) + end - gmodule = gmodule:fixPoolingRegions():cuda() + gmodule = gmodule:fixPoolingRegions():type(typename) - -- For comparison purposes, make sure we are using the same random pooling regions - -- as the CPU - gmodule.randomSamples = module.randomSamples:cuda() + -- For comparison purposes, make sure we are using the same random pooling regions + -- as the CPU + gmodule.randomSamples = module.randomSamples:type(typename) - local rescuda = gmodule:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gmodule:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gmodule:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') - local error_ind = gmodule.indices:float() - module.indices - mytester:asserteq(error_ind:abs():max(), 0, 'error on indices (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + local error_ind = gmodule.indices:long() - module.indices + mytester:asserteq(error_ind:abs():max(), 0, + string.format('error on indices (forward) with %s', typename)) + end end function cunntest.SpatialFractionalMaxPooling_backward() @@ -2828,78 +2510,71 @@ function cunntest.SpatialFractionalMaxPooling_backward() local inW = math.random(minInW, minInW + 6) local inH = math.random(minInH, minInH + 6) - local tm = {} - local title = - string.format('SpatialFractionalMaxPooling.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d', - batch, plane, inH, inW, poolSizeH, poolSizeW, batch, plane, outH, outW) - times[title] = tm - - local input = nil - local gradOutput = nil - if batch == 1 then - input = 
torch.Tensor(plane, inH, inW):uniform() - gradOutput = torch.Tensor(plane, outH, outW):uniform() - else - input = torch.Tensor(batch, plane, inH, inW):uniform() - gradOutput = torch.Tensor(batch, plane, outH, outW):uniform() - end + for k, typename in ipairs(typenames) do + local input = nil + local gradOutput = nil + if batch == 1 then + input = torch.Tensor(plane, inH, inW):uniform():type(typename) + gradOutput = torch.Tensor(plane, outH, outW):uniform():type(typename) + else + input = torch.Tensor(batch, plane, inH, inW):uniform():type(typename) + gradOutput = torch.Tensor(batch, plane, outH, outW):uniform():type(typename) + end - local module = - nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH) - :fixPoolingRegions() + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local module = + nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH) + :fixPoolingRegions():type(ctype) - module:forward(input) - module:zeroGradParameters() - local groundgrad = module:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + -- convert type of randomSamples and ensure we don't resample + module:initSampleBuffer_(input) + module:fixPoolingRegions() + module.randomSamples = module.randomSamples:type(typename):type(ctype) + module:forward(input) module:zeroGradParameters() - groundgrad = module:backward(input, gradOutput) - end - tm.cpu = a:time().real - - input = input:cuda() - gradOutput = gradOutput:cuda() - - local gmodule = - nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH) - :fixPoolingRegions():cuda() - -- For comparison purposes, make sure we are using the same random pooling regions - -- as the CPU - gmodule.randomSamples = module.randomSamples:cuda() - - gmodule:forward(input) - gmodule:zeroGradParameters() - local rescuda = gmodule:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + local groundgrad = module:backward(input, gradOutput) + + input = 
input:type(typename) + gradOutput = gradOutput:type(typename) + + local gmodule = + nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH) + :fixPoolingRegions():type(typename) + -- For comparison purposes, make sure we are using the same random pooling regions + -- as the CPU + gmodule.randomSamples = module.randomSamples:type(typename) + + gmodule:forward(input) gmodule:zeroGradParameters() - rescuda = gmodule:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gmodule:backward(input, gradOutput) - local error = rescuda:float() - groundgrad - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + local error = rescuda:double() - groundgrad:double() + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.SpatialAveragePooling_includepad() - local net = nn.SpatialAveragePooling(2, 2, 1, 1, 1, 1):cuda() - local net_no_include_pad = net:clone() - net_no_include_pad:setCountExcludePad() - local net_include_pad = net:clone() - net_include_pad:setCountIncludePad() + for k, typename in ipairs(typenames) do + local net = nn.SpatialAveragePooling(2, 2, 1, 1, 1, 1):type(typename) + local net_no_include_pad = net:clone() + net_no_include_pad:setCountExcludePad() + local net_include_pad = net:clone() + net_include_pad:setCountIncludePad() - local input = torch.FloatTensor(1, 1, 1, 1):cuda() - input[1][1][1][1] = 3 - local out_noinclude = net_no_include_pad:forward(input) - local out_include = net_include_pad:forward(input) + local input = torch.FloatTensor(1, 1, 1, 1):type(typename) + input[1][1][1][1] = 3 + local out_noinclude = net_no_include_pad:forward(input) + local out_include = net_include_pad:forward(input) - local noinc_out = out_noinclude[1][1][1][1] - local inc_out = out_include[1][1][1][1] - mytester:assertne(noinc_out, inc_out) - 
mytester:asserteq(3, noinc_out) - mytester:asserteq(3/4, inc_out) + local noinc_out = out_noinclude[1][1][1][1] + local inc_out = out_include[1][1][1][1] + mytester:assertne(noinc_out, inc_out) + mytester:asserteq(3, noinc_out) + mytester:asserteq(3/4, inc_out) + end end function cunntest.SpatialAveragePooling_forward() @@ -2918,36 +2593,26 @@ function cunntest.SpatialAveragePooling_forward() local ceil_mode = math.random(0,1) == 1 local count_exclude_pad = math.random(0,1) == 1 - local tm = {} - local title = string.format('SpatialAveragePooling.forward %dx%dx%d o %dx%d -> %dx%dx%d', - from, inj, ini, kj, ki, to, outj, outi) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(from,inj,ini):type(typename) - local input = torch.randn(from,inj,ini) - local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj) - if ceil_mode then sconv:ceil() end - if count_exclude_pad then sconv:setCountExcludePad() end - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(ctype) + if ceil_mode then sconv:ceil() end + if count_exclude_pad then sconv:setCountExcludePad() end + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):cuda() - if ceil_mode then gconv:ceil() end - if count_exclude_pad then gconv:setCountExcludePad() end - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(typename) + if ceil_mode then gconv:ceil() end + if count_exclude_pad then gconv:setCountExcludePad() end + local rescuda = gconv:forward(input) - local error = 
rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.SpatialAveragePooling_forward_batch() @@ -2967,36 +2632,26 @@ function cunntest.SpatialAveragePooling_forward_batch() local ceil_mode = math.random(0,1) == 1 local count_exclude_pad = math.random(0,1) == 1 - local tm = {} - local title = string.format('SpatialAveragePooling.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d', - bs, from, inj, ini, kj, ki, bs, to, outj, outi) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(bs,from,inj,ini):type(typename) + local ctype = t2cpu[typename] - local input = torch.randn(bs,from,inj,ini) - local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj) - if ceil_mode then sconv:ceil() end - if count_exclude_pad then sconv:setCountExcludePad() end - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + input = input:type(ctype) + local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(ctype) + if ceil_mode then sconv:ceil() end + if count_exclude_pad then sconv:setCountExcludePad() end + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):cuda() - if ceil_mode then gconv:ceil() end - if count_exclude_pad then gconv:setCountExcludePad() end - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(typename) + if ceil_mode then gconv:ceil() end + if 
count_exclude_pad then gconv:setCountExcludePad() end + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.SpatialAveragePooling_backward() @@ -3015,45 +2670,34 @@ function cunntest.SpatialAveragePooling_backward() local ceil_mode = math.random(0,1) == 1 local count_exclude_pad = math.random(0,1) == 1 - local tm = {} - local title = string.format('SpatialAveragePooling.backward %dx%dx%d o %dx%d -> %dx%dx%d', - from, inj, ini, kj, ki, to, outj, outi) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(from,inj,ini):type(typename) + local gradOutput = torch.randn(to,outj,outi):type(typename) - local input = torch.randn(from,inj,ini) - local gradOutput = torch.randn(to,outj,outi) - local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj) - if ceil_mode then sconv:ceil() end - if count_exclude_pad then sconv:setCountExcludePad() end - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(ctype) + if ceil_mode then sconv:ceil() end + if count_exclude_pad then sconv:setCountExcludePad() end + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):cuda() - if ceil_mode then 
gconv:ceil() end - if count_exclude_pad then gconv:setCountExcludePad() end - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(typename) + if ceil_mode then gconv:ceil() end + if count_exclude_pad then gconv:setCountExcludePad() end + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) - local error = rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.SpatialAveragePooling_backward_batch() @@ -3073,45 +2717,34 @@ function cunntest.SpatialAveragePooling_backward_batch() local ceil_mode = math.random(0,1) == 1 local count_exclude_pad = math.random(0,1) == 1 - local tm = {} - local title = string.format('SpatialAveragePooling.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d', - bs, from, inj, ini, kj, ki, bs, to, outj, outi) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(bs,from,inj,ini):type(typename) + local gradOutput = torch.randn(bs,to,outj,outi):type(typename) - local input = torch.randn(bs,from,inj,ini) - local gradOutput = torch.randn(bs,to,outj,outi) - local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj) - if ceil_mode then sconv:ceil() end - if count_exclude_pad then sconv:setCountExcludePad() end - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 
1,nloop do + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(ctype) + if ceil_mode then sconv:ceil() end + if count_exclude_pad then sconv:setCountExcludePad() end + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):cuda() - if ceil_mode then gconv:ceil() end - if count_exclude_pad then gconv:setCountExcludePad() end - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(typename) + if ceil_mode then gconv:ceil() end + if count_exclude_pad then gconv:setCountExcludePad() end + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) - local error = rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.SpatialAdaptiveMaxPooling_forward() @@ -3122,34 +2755,24 @@ function cunntest.SpatialAdaptiveMaxPooling_forward() local ini = math.random(10,256) local inj = math.random(10,256) - local tm = {} - local title = string.format('SpatialAdaptiveMaxPooling.forward %dx%dx%d -> %dx%dx%d', - from, inj, ini, to, outj, outi) - times[title] = tm + for k, 
typename in ipairs(typenames) do + local input = torch.randn(from,inj,ini):type(typename) + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(ctype) + local groundtruth = sconv:forward(input):type(ctype) - local input = torch.randn(from,inj,ini) - local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj) - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + input = input:type(typename) + local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(typename) + local rescuda = gconv:forward(input) - input = input:cuda() - local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + local error_ind = gconv.indices:long() - sconv.indices + mytester:asserteq(error_ind:max(), 0, + string.format('error on indices (forward) with %s', typename)) end - cutorch.synchronize() - tm.gpu = a:time().real - - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') - local error_ind = gconv.indices:float() - sconv.indices - mytester:asserteq(error_ind:max(), 0, 'error on indices (forward) ') end function cunntest.SpatialAdaptiveMaxPooling_forward_noncontig() @@ -3160,35 +2783,24 @@ function cunntest.SpatialAdaptiveMaxPooling_forward_noncontig() local ini = math.random(10,256) local inj = math.random(10,256) - local tm = {} - local title = string.format('SpatialAdaptiveMaxPooling.forward %s %dx%dx%d -> %dx%dx%d', - 'non-contiguous',from, inj, ini, to, outj, outi) - times[title] = tm + for k, typename in ipairs(typenames) do + local 
input0 = torch.randn(from,ini,inj):type(typename) + local ctype = t2cpu[typename] + local input = input0:type(ctype):transpose(2,3) + local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(ctype) + local groundtruth = sconv:forward(input) - local input0 = torch.randn(from,ini,inj) - local input = input0:transpose(2,3) - local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj) - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + input = input0:type(typename):transpose(2,3) + local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(typename) + local rescuda = gconv:forward(input) - input = input0:cuda():transpose(2,3) - local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + local error_ind = gconv.indices:long() - sconv.indices + mytester:asserteq(error_ind:max(), 0, + string.format('error on indices (forward) with %s', typename)) end - cutorch.synchronize() - tm.gpu = a:time().real - - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') - local error_ind = gconv.indices:float() - sconv.indices - mytester:asserteq(error_ind:max(), 0, 'error on indices (forward) ') end function cunntest.SpatialAdaptiveMaxPooling_forward_batch() @@ -3200,32 +2812,21 @@ function cunntest.SpatialAdaptiveMaxPooling_forward_batch() local ini = math.random(10,256) local inj = math.random(10,256) - local tm = {} - local title = string.format('SpatialAdaptiveMaxPooling.forward %dx%dx%dx%d -> %dx%dx%dx%d', - bs, from, inj, ini, bs, to, outj, outi) - times[title] = tm + for k, typename in 
ipairs(typenames) do + local input = torch.randn(bs,from,inj,ini):type(typename) + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(ctype) + local groundtruth = sconv:forward(input) - local input = torch.randn(bs,from,inj,ini) - local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj) - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + input = input:type(typename) + local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(typename) + local rescuda = gconv:forward(input) - input = input:cuda() - local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) end - cutorch.synchronize() - tm.gpu = a:time().real - - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') end function cunntest.SpatialAdaptiveMaxPooling_backward() @@ -3236,41 +2837,29 @@ function cunntest.SpatialAdaptiveMaxPooling_backward() local ini = math.random(10,256) local inj = math.random(10,256) - local tm = {} - local title = string.format('SpatialAdaptiveMaxPooling.backward %dx%dx%d -> %dx%dx%d', - from, inj, ini, to, outj, outi) - times[title] = tm - - local input = torch.randn(from,inj,ini) - local gradOutput = torch.randn(to,outj,outi) - local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj) - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + for k, typename in ipairs(typenames) do + local input = 
torch.randn(from,inj,ini):type(typename) + local gradOutput = torch.randn(to,outj,outi):type(typename) + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(ctype) + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):cuda() - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(typename) + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) - local error = rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.SpatialAdaptiveMaxPooling_backward_noncontig() @@ -3281,42 +2870,29 @@ function cunntest.SpatialAdaptiveMaxPooling_backward_noncontig() local ini = math.random(10,256) local inj = math.random(10,256) - local tm = {} - local title = string.format('SpatialAdaptiveMaxPooling.backward %s %dx%dx%d -> %dx%dx%d', - 'non-contiguous', from, inj, ini, to, outj, outi) - times[title] = tm - - local input0 = torch.randn(from,ini,inj) - local input = input0:transpose(2,3) - local gradOutput = torch.randn(to,outj,outi) - local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj) - 
sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + for k, typename in ipairs(typenames) do + local input0 = torch.randn(from,ini,inj):type(typename) + local gradOutput = torch.randn(to,outj,outi):type(typename) + local ctype = t2cpu[typename] + local input = input0:type(ctype):transpose(2,3) + gradOutput = gradOutput:type(ctype) + local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(ctype) + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) - input = input0:cuda():transpose(2,3) - gradOutput = gradOutput:cuda() - local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):cuda() - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input0:type(typename):transpose(2,3) + gradOutput = gradOutput:type(typename) + local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(typename) + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) - local error = rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.SpatialAdaptiveMaxPooling_backward_batch() @@ -3328,41 +2904,29 @@ function cunntest.SpatialAdaptiveMaxPooling_backward_batch() local ini = math.random(10,256) local inj = math.random(10,256) - local tm = {} - local title = string.format('SpatialAdaptiveMaxPooling.backward %dx%dx%dx%d -> %dx%dx%dx%d', 
- bs, from, inj, ini, bs, to, outj, outi) - times[title] = tm - - local input = torch.randn(bs,from,inj,ini) - local gradOutput = torch.randn(bs,to,outj,outi) - local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj) - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + for k, typename in ipairs(typenames) do + local input = torch.randn(bs,from,inj,ini):type(typename) + local gradOutput = torch.randn(bs,to,outj,outi):type(typename) + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(ctype) + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):cuda() - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(typename) + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) - local error = rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.SpatialLPPooling_forward() @@ -3495,164 +3059,154 @@ end function cunntest.BCECriterion_forward() local size = math.random(1,100) - 
local input = torch.Tensor(size):uniform() - local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input)) - - local tm = {} - local title = string.format('BCECriterion.forward, Size: %d', size) - times[title] = tm - - local crit = nn.BCECriterion() - local rescpu = crit:forward(input, target) - local a = torch.Timer() - for i = 1,nloop do - rescpu = crit:forward(input, target) - end - tm.cpu = a:time().real - - input = input:cuda() - target = target:cuda() - local g_crit = nn.BCECriterion():cuda() - local rescuda = g_crit:forward(input, target) - a:reset() - for i = 1,nloop do - rescuda = g_crit:forward(input, target) - end - cutorch.synchronize() - tm.gpu = a:time().real - local errorVal = rescuda - rescpu - mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ') - - -- test vs lua implementation - buffer = input.new() - local restruth = BCECriterion_forward_truth(buffer, input, target, nil, true) - for i = 1,nloop do - local restruth = BCECriterion_forward_truth(buffer, input, target, nil, true) + + for k, typename in ipairs(typenames) do + local input = torch.Tensor(size):uniform():type(typename) + local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input)) + + local ctype = t2cpu[typename] + input = input:type(ctype) + target = target:type(ctype) + local crit = nn.BCECriterion():type(ctype) + local rescpu = crit:forward(input, target) + + input = input:type(typename) + target = target:type(typename) + local g_crit = nn.BCECriterion():type(typename) + local rescuda = g_crit:forward(input, target) + local errorVal = rescuda - rescpu + mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + + -- test vs lua implementation + input = input:type(ctype) + target = target:type(ctype) + buffer = input.new() + local restruth = BCECriterion_forward_truth(buffer, input, target, nil, true) + errorVal = rescpu - restruth + 
mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + errorVal = rescuda - restruth + mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) end - errorVal = rescpu - restruth - mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ') - errorVal = rescuda - restruth - mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ') end - function cunntest.BCECriterionWeights_forward() local size = math.random(1,100) - local input = torch.Tensor(size):uniform() - local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input)) - local weights = torch.Tensor(size):uniform() - - local tm = {} - local title = string.format('BCECriterionWeights.forward, Size: %d', size) - times[title] = tm - - local crit = nn.BCECriterion(weights) - local rescpu = crit:forward(input, target) - local a = torch.Timer() - for i = 1,nloop do - rescpu = crit:forward(input, target) - end - tm.cpu = a:time().real - - input = input:cuda() - target = target:cuda() - weights = weights:cuda() - local g_crit = nn.BCECriterion(weights):cuda() - local rescuda = g_crit:forward(input, target) - a:reset() - for i = 1,nloop do - rescuda = g_crit:forward(input, target) + for k, typename in ipairs(typenames) do + local input = torch.Tensor(size):uniform():type(typename) + local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input)) + local weights = torch.Tensor(size):uniform():type(typename) + + local ctype = t2cpu[typename] + input = input:type(ctype) + target = target:type(ctype) + weights = weights:type(ctype) + local crit = nn.BCECriterion(weights):type(ctype) + local rescpu = crit:forward(input, target) + + input = input:type(typename) + target = target:type(typename) + weights = weights:type(typename) + local g_crit = nn.BCECriterion(weights):type(typename) + local 
rescuda = g_crit:forward(input, target) + + local errorVal = rescuda - rescpu + mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + + -- test vs lua implementation + -- FIXME: half does not support dot without CUDA 8.0, so can't compare to lua implementation. + if typename ~= 'torch.CudaHalfTensor' then + buffer = input.new() + restruth = BCECriterion_forward_truth(buffer, input, target, weights, true) + errorVal = rescpu - restruth + mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + errorVal = rescuda - restruth + mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end - cutorch.synchronize() - tm.gpu = a:time().real - local errorVal = rescuda - rescpu - mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ') - - -- test vs lua implementation - buffer = input.new() - local restruth = BCECriterion_forward_truth(buffer, input, target, weights, true) - for i = 1,nloop do - local restruth = BCECriterion_forward_truth(buffer, input, target, weights, true) - end - errorVal = rescpu - restruth - mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ') - errorVal = rescuda - restruth - mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ') end - function cunntest.MarginCriterion_forward() local size = math.random(1,100) - local input = (torch.rand(size)-0.5) * 2 -- data spread from -1 to 1 - local target = (torch.round(torch.rand(size))*2)-1 -- generate random labels -1, 1 - - local tm = {} - local title = string.format('MarginCriterion.forward, Size: %d', size) - times[title] = tm - - local crit = nn.MarginCriterion() - local groundtruth= crit:forward(input, target) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = 
crit:forward(input, target) - end - tm.cpu = a:time().real - - input = input:cuda() - target = target:cuda() - local g_crit = nn.MarginCriterion():cuda() - local rescuda = g_crit:forward(input, target) - a:reset() - for i = 1,nloop do - rescuda = g_crit:forward(input, target) + + for k, typename in ipairs(typenames) do + local input = ((torch.rand(size)-0.5) * 2):type(typename) -- data spread from -1 to 1 + local target = ((torch.round(torch.rand(size))*2)-1):type(typename) -- generate random labels -1, 1 + + local ctype = t2cpu[typename] + input = input:type(ctype) + target = input:type(ctype) + local crit = nn.MarginCriterion():type(ctype) + local groundtruth= crit:forward(input, target) + + input = input:type(typename) + target = target:type(typename) + local g_crit = nn.MarginCriterion():type(typename) + local rescuda = g_crit:forward(input, target) + local errorVal = rescuda - groundtruth + mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) end - cutorch.synchronize() - tm.gpu = a:time().real - local errorVal = rescuda - groundtruth - mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ') end function cunntest.MultiLabelMarginCriterion_forward() local size = math.random(1,100) - local input = (torch.rand(size)-0.5) * 2 -- data spread from -1 to 1 - local target = torch.round(torch.rand(size)*(size-1)):add(1) -- generate random labels > 0 - local zero = math.random(0,size) -- turn some labels into 0 targets - if zero > 0 then - target:sub(size-zero+1,size):zero() - end - local crit = nn.MultiLabelMarginCriterion() - local groundtruth= crit:forward(input, target) - input = input:cuda() - target = target:cuda() - local g_crit = nn.MultiLabelMarginCriterion():cuda() - local rescuda = g_crit:forward(input, target) - local errorVal = rescuda - groundtruth - mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ') + for k, typename in 
ipairs(typenames) do + local input = ((torch.rand(size)-0.5) * 2):type(typename) -- data spread from -1 to 1 + local target = torch.round(torch.rand(size)*(size-1)):add(1) -- generate random labels > 0 + local zero = math.random(0,size) -- turn some labels into 0 targets + if zero > 0 then + target:sub(size-zero+1,size):zero() + end + + local ctype = t2cpu[typename] + input = input:type(ctype) + local crit = nn.MultiLabelMarginCriterion():type(ctype) + local groundtruth= crit:forward(input, target) + input = input:type(typename) + target = target:type(typename) + local g_crit = nn.MultiLabelMarginCriterion():type(typename) + local rescuda = g_crit:forward(input, target) + local errorVal = rescuda - groundtruth + mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.MultiLabelMarginCriterion_backward() local size = math.random(1,100) - local input = (torch.rand(size)-0.5) * 2 -- data spread from -1 to 1 - local target = torch.round(torch.rand(size)*(size-1)):add(1) -- generate random labels > 0 - local zero = math.random(0,size) -- turn some labels into 0 targets - if zero > 0 then - target:sub(size-zero+1,size):zero() - end - local crit = nn.MultiLabelMarginCriterion() - local pred = crit:forward(input, target) - local groundgrad = crit:backward(input, target) + for k, typename in ipairs(typenames) do + local input = ((torch.rand(size)-0.5) * 2):type(typename) -- data spread from -1 to 1 + local target = torch.round(torch.rand(size)*(size-1)):add(1) -- generate random labels > 0 + local zero = math.random(0,size) -- turn some labels into 0 targets + if zero > 0 then + target:sub(size-zero+1,size):zero() + end - input = input:cuda() - target = target:cuda() - local g_crit = nn.MultiLabelMarginCriterion():cuda() - g_crit:forward(input, target) - local rescuda = g_crit:backward(input, target) + local ctype = t2cpu[typename] + input = input:type(ctype) 
+ local crit = nn.MultiLabelMarginCriterion():type(ctype) + local pred = crit:forward(input, target) + local groundgrad = crit:backward(input, target) - local error = rescuda:float() - groundgrad + input = input:type(typename) + target = target:type(typename) + local g_crit = nn.MultiLabelMarginCriterion():type(typename) + g_crit:forward(input, target) + local rescuda = g_crit:backward(input, target) - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + local error = rescuda:double() - groundgrad:double() + + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.SpatialCrossMapLRN_forward_batch() @@ -3664,31 +3218,22 @@ function cunntest.SpatialCrossMapLRN_forward_batch() local beta = math.random(0,100)/100 local k = math.random(1,3) - local tm = {} - local title = string.format('SpatialCrossMapLRN.forward') - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.rand(bs, nbfeatures, inputSize, inputSize):type(typename) - local input = torch.rand(bs, nbfeatures, inputSize, inputSize) - local sconv = nn.SpatialCrossMapLRN(size, alpha, beta, k) - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialCrossMapLRN(size, alpha, beta, k):type(ctype) + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = nn.SpatialCrossMapLRN(size, alpha, beta, k):cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.SpatialCrossMapLRN(size, alpha, beta, k):type(typename) + local rescuda = gconv:forward(input) - local error = 
rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.SpatialCrossMapLRN_backward_batch() @@ -3700,242 +3245,208 @@ function cunntest.SpatialCrossMapLRN_backward_batch() local beta = math.random(0,100)/100 local k = math.random(1,3) - local tm = {} - local title = string.format('SpatialCrossMapLRN.backward') - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.rand(bs, nbfeatures, inputSize, inputSize):type(typename) + local gradOutput = torch.rand(input:size()):type(typename) - local input = torch.rand(bs, nbfeatures, inputSize, inputSize) - local gradOutput = torch.rand(input:size()) - local sconv = nn.SpatialCrossMapLRN(size, alpha, beta, k) - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.SpatialCrossMapLRN(size, alpha, beta, k):type(ctype) + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.SpatialCrossMapLRN(size, alpha, beta, k):cuda() - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local gconv = nn.SpatialCrossMapLRN(size, alpha, beta, k):type(ctype) + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - 
cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) - local error = rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.MarginCriterion_backward() local size = math.random(1,100) - local tm = {} - local title = string.format('MarginCriterion.backward, Size %d', size) - times[title] = tm - - local input = (torch.rand(size)-0.5) * 2 -- data spread from -1 to 1 - local target = (torch.round(torch.rand(size))*2)-1 -- generate random labels -1, 1 + for k, typename in ipairs(typenames) do + local input = ((torch.rand(size)-0.5) * 2):type(typename) -- data spread from -1 to 1 + local target = ((torch.round(torch.rand(size))*2)-1):type(typename) -- generate random labels -1, 1 - local crit = nn.MarginCriterion() - crit:forward(input, target) - local groundgrad = crit:backward(input, target) - local a = torch.Timer() - for i = 1,nloop do - groundgrad = crit:backward(input, target) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + target = target:type(ctype) + local crit = nn.MarginCriterion():type(ctype) + crit:forward(input, target) + local groundgrad = crit:backward(input, target) - input = input:cuda() - target = target:cuda() - local g_crit = nn.MarginCriterion():cuda() - g_crit:forward(input, target) - local rescuda = g_crit:backward(input, target) - a:reset() - for i = 1,nloop do - rescuda = g_crit:backward(input, target) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + target = target:type(typename) + local g_crit = nn.MarginCriterion():type(typename) + g_crit:forward(input, target) + local rescuda = g_crit:backward(input, target) - local error = 
rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.BCECriterion_backward() local size = math.random(1,100) - local tm = {} - local title = string.format('BCECriterion.backward, Size %d', size) - times[title] = tm - - local input = torch.Tensor(size):uniform() - local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input)) + for k, typename in ipairs(typenames) do + local input = torch.Tensor(size):uniform():type(typename) + local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input)) - local crit = nn.BCECriterion() - crit:forward(input, target) - local groundgrad = crit:backward(input, target) - local a = torch.Timer() - for i = 1,nloop do - groundgrad = crit:backward(input, target) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + target = target:type(ctype) + local crit = nn.BCECriterion():type(ctype) + crit:forward(input, target) + local groundgrad = crit:backward(input, target) - input = input:cuda() - target = target:cuda() - local g_crit = nn.BCECriterion():cuda() - g_crit:forward(input, target) - local rescuda = g_crit:backward(input, target) - a:reset() - for i = 1,nloop do - rescuda = g_crit:backward(input, target) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + target = target:type(typename) + local g_crit = nn.BCECriterion():type(typename) + g_crit:forward(input, target) + local rescuda = g_crit:backward(input, target) - local error = rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + mytester:assertlt(error:abs():max(), 
precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.BCECriterionWeights_backward() local size = math.random(1,100) - local tm = {} - local title = string.format('BCECriterionWeights.backward, Size %d', size) - times[title] = tm - - local input = torch.Tensor(size):uniform() - local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input)) - local weights = torch.Tensor(size):uniform() - - local crit = nn.BCECriterion(weights) - crit:forward(input, target) - local groundgrad = crit:backward(input, target) - local a = torch.Timer() - for i = 1,nloop do - groundgrad = crit:backward(input, target) - end - tm.cpu = a:time().real - - input = input:cuda() - target = target:cuda() - weights = weights:cuda() - local g_crit = nn.BCECriterion(weights):cuda() - g_crit:forward(input, target) - local rescuda = g_crit:backward(input, target) - a:reset() - for i = 1,nloop do - rescuda = g_crit:backward(input, target) + for k, typename in ipairs(typenames) do + local input = torch.Tensor(size):uniform():type(typename) + local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input)) + local weights = torch.Tensor(size):uniform():type(typename) + + local ctype = t2cpu[typename] + input = input:type(ctype) + target = target:type(ctype) + weights = weights:type(ctype) + local crit = nn.BCECriterion(weights):type(ctype) + crit:forward(input, target) + local groundgrad = crit:backward(input, target) + + input = input:type(typename) + target = target:type(typename) + weights = weights:type(typename) + local g_crit = nn.BCECriterion(weights):type(typename) + g_crit:forward(input, target) + local rescuda = g_crit:backward(input, target) + + local error = rescuda:double() - groundgrad:double() + + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) end - 
cutorch.synchronize() - tm.gpu = a:time().real - - local error = rescuda:float() - groundgrad - - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') end function cunntest.mse() for sizeAverage = 0, 1 do - local size = math.random(3000,5000) - local input = torch.randn(size,1,1) - local target = torch.randn(size) - local mod = nn.MSECriterion(sizeAverage == 1) - - local tm = {} - local title = string.format('MSECriterion sizeAverage %d, %d ', sizeAverage, size) - times[title] = tm - - local a = torch.Timer() - local fout = mod:forward(input,target) - local fgin = mod:backward(input,target):clone() - tm.cpu = a:time().real - - local cinput = input:cuda() - local ctarget = target:cuda() - local cmod = nn.MSECriterion(sizeAverage == 1):cuda() - a:reset() - local cout = cmod:forward(cinput,ctarget) - local cgin = cmod:backward(cinput,ctarget) - cutorch.synchronize() - tm.gpu = a:time().real - - mytester:assertlt(math.abs(fout-cout), 0.02, 'error on output') - local gerr = cgin:float() - fgin - mytester:assertlt(gerr:abs():max(), precision_forward, 'error on gradInput') + for k, typename in ipairs(typenames) do + local size = math.random(3000,5000) + local input = torch.randn(size,1,1):type(typename) + local target = torch.randn(size):type(typename) + + local ctype = t2cpu[typename] + input = input:type(ctype) + target = target:type(ctype) + local mod = nn.MSECriterion(sizeAverage == 1):type(ctype) + + local fout = mod:forward(input,target) + local fgin = mod:backward(input,target):clone() + + local cinput = input:type(typename) + local ctarget = target:type(typename) + local cmod = nn.MSECriterion(sizeAverage == 1):type(typename) + local cout = cmod:forward(cinput,ctarget) + local cgin = cmod:backward(cinput,ctarget) + + if (typename == 'torch.CudaHalfTensor') then + fout = ffi.C.THC_half2float(ffi.C.THC_float2half(fout)) + end + mytester:assertlt(math.abs(fout-cout), precision_forward_type(0.02, typename), + string.format('error on 
output with %s', typename)) + local gerr = cgin:double() - fgin:double() + mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on gradInput with %s', typename)) + end end end function cunntest.SmoothL1() for sizeAverage = 0, 1 do local size = math.random(3000,5000) - local input = torch.randn(size,1,1) - local target = torch.randn(size) - local mod = nn.SmoothL1Criterion(sizeAverage == 1) - - local tm = {} - local title = string.format('SmoothL1Criterion sizeAverage %d, %d ', sizeAverage, size) - times[title] = tm - - local a = torch.Timer() - local fout = mod:forward(input,target) - local fgin = mod:backward(input,target):clone() - tm.cpu = a:time().real - local cinput = input:cuda() - local ctarget = target:cuda() - local cmod = nn.SmoothL1Criterion(sizeAverage == 1):cuda() - a:reset() - local cout = cmod:forward(cinput,ctarget) - local cgin = cmod:backward(cinput,ctarget) - cutorch.synchronize() - tm.gpu = a:time().real - - mytester:assertlt(math.abs(fout-cout), 0.01, 'error on output') - local gerr = cgin:float() - fgin - mytester:assertlt(gerr:abs():max(), precision_forward, 'error on gradInput') + for k, typename in ipairs(typenames) do + local input = torch.randn(size,1,1):type(typename) + local target = torch.randn(size):type(typename) + + local ctype = t2cpu[typename] + input = input:type(ctype) + target = target:type(ctype) + local mod = nn.SmoothL1Criterion(sizeAverage == 1):type(ctype) + + local fout = mod:forward(input,target) + local fgin = mod:backward(input,target):clone() + + local cinput = input:type(typename) + local ctarget = target:type(typename) + local cmod = nn.SmoothL1Criterion(sizeAverage == 1):type(typename) + local cout = cmod:forward(cinput,ctarget) + local cgin = cmod:backward(cinput,ctarget) + + if (typename == 'torch.CudaHalfTensor') then + fout = ffi.C.THC_half2float(ffi.C.THC_float2half(fout)) + end + mytester:assertlt(math.abs(fout-cout), 0.01, string.format('error on 
output with %s', typename)) + local gerr = cgin:double() - fgin:double() + mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on gradInput with %s', typename)) + end end end function cunntest.SoftMarginCriterion() for sizeAverage = 0, 1 do - local size = math.random(3000,5000) - local input = torch.randn(size,1,1) - local target = torch.randn(size) - local mod = nn.SoftMarginCriterion(sizeAverage == 1) - - local tm = {} - local title = string.format('SoftMarginCriterion sizeAverage %d, %d ', sizeAverage, size) - times[title] = tm - - local a = torch.Timer() - local fout = mod:forward(input,target) - local fgin = mod:backward(input,target):clone() - tm.cpu = a:time().real - - local cinput = input:cuda() - local ctarget = target:cuda() - local cmod = nn.SoftMarginCriterion(sizeAverage == 1):cuda() - a:reset() - local cout = cmod:forward(cinput,ctarget) - local cgin = cmod:backward(cinput,ctarget) - cutorch.synchronize() - tm.gpu = a:time().real - - mytester:assertlt(math.abs(fout-cout), 0.01, 'error on output') - local gerr = cgin:float() - fgin - mytester:assertlt(gerr:abs():max(), precision_forward, 'error on gradInput') + for k, typename in ipairs(typenames) do + local size = math.random(3000,5000) + local input = torch.randn(size,1,1):type(typename) + local target = torch.randn(size):type(typename) + + local ctype = t2cpu[typename] + input = input:type(ctype) + target = target:type(ctype) + local mod = nn.SoftMarginCriterion(sizeAverage == 1):type(ctype) + + local fout = mod:forward(input,target) + local fgin = mod:backward(input,target):clone() + + local cinput = input:type(typename) + local ctarget = target:type(typename) + local cmod = nn.SoftMarginCriterion(sizeAverage == 1):type(typename) + local cout = cmod:forward(cinput,ctarget) + local cgin = cmod:backward(cinput,ctarget) + + mytester:assertlt(math.abs(fout-cout), 0.01, 'error on output') + local gerr = cgin:double() - fgin:double() + 
mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on gradInput with %s', typename)) + end end end @@ -3943,31 +3454,31 @@ end function cunntest.distkldiv() for sizeAverage = 0, 1 do local size = math.random(3000,5000) - local input = torch.randn(size) -- TODO, make it back to (size, 1, 1), see https://github.com/torch/cunn/issues/245#issuecomment-209260954 - local target = torch.randn(size) - local mod = nn.DistKLDivCriterion(sizeAverage == 1) - local tm = {} - local title = string.format('DistKLDivCriterion sizeAverage %d, %d ',sizeAverage,size) - times[title] = tm - - local a = torch.Timer() - local fout = mod:forward(input,target) - local fgin = mod:backward(input,target):clone() - tm.cpu = a:time().real - - local cinput = input:cuda() - local ctarget = target:cuda() - local cmod = nn.DistKLDivCriterion(sizeAverage == 1):cuda() - a:reset() - local cout = cmod:forward(cinput,ctarget) - local cgin = cmod:backward(cinput,ctarget) - cutorch.synchronize() - tm.gpu = a:time().real - - mytester:assertlt(math.abs(fout-cout), precision_forward, 'error on output') - local gerr = cgin:float() - fgin - mytester:assertlt(gerr:abs():max(), precision_backward, 'error on gradInput') + for k, typename in ipairs(typenames) do + local input = torch.randn(size):type(typename) -- TODO, make it back to (size, 1, 1), see https://github.com/torch/cunn/issues/245#issuecomment-209260954 + local target = torch.randn(size):type(typename) + + local ctype = t2cpu[typename] + input = input:type(ctype) + target = target:type(ctype) + local mod = nn.DistKLDivCriterion(sizeAverage == 1):type(ctype) + + local fout = mod:forward(input,target) + local fgin = mod:backward(input,target):clone() + + local cinput = input:type(typename) + local ctarget = target:type(typename) + local cmod = nn.DistKLDivCriterion(sizeAverage == 1):type(typename) + local cout = cmod:forward(cinput,ctarget) + local cgin = cmod:backward(cinput,ctarget) + + 
mytester:assertlt(math.abs(fout-cout), precision_forward_type(precision_forward, typename), + string.format('error on output with %s', typename)) + local gerr = cgin:double() - fgin:double() + mytester:assertlt(gerr:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on gradInput with %s', typename)) + end end end @@ -3979,34 +3490,24 @@ function cunntest.TemporalConvolution_forward() local outi = math.random(1,256) -- nOutputFrame local ini = (outi-1)*si+ki -- nInputFrame - local tm = {} - local title = string.format('TemporalConvolution.forward %dx%d o %d -> %dx%d [s: %d]', - from, ini, ki, to, outi, si) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(ini,from):type(typename) - local input = torch.randn(ini,from) - local sconv = nn.TemporalConvolution(from,to,ki,si) - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.TemporalConvolution(from,to,ki,si):type(ctype) + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = nn.TemporalConvolution(from,to,ki,si):cuda() - gconv.weight = sconv.weight:cuda() - gconv.bias = sconv.bias:cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.TemporalConvolution(from,to,ki,si):type(typename) + gconv.weight = sconv.weight:type(typename) + gconv.bias = sconv.bias:type(typename) + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, 
typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.TemporalConvolution_forward_batch() @@ -4018,34 +3519,24 @@ function cunntest.TemporalConvolution_forward_batch() local outi = math.random(1,256) local ini = (outi-1)*si+ki - local tm = {} - local title = string.format('TemporalConvolution.forward %dx%dx%d o %d -> %dx%dx%d [s: %d]', - bs, from, ini, ki, bs, to, outi, si) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(bs,ini,from):type(typename) - local input = torch.randn(bs,ini,from) - local sconv = nn.TemporalConvolution(from,to,ki,si) - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.TemporalConvolution(from,to,ki,si):type(ctype) + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = nn.TemporalConvolution(from,to,ki,si):cuda() - gconv.weight = sconv.weight:cuda() - gconv.bias = sconv.bias:cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.TemporalConvolution(from,to,ki,si):type(typename) + gconv.weight = sconv.weight:type(typename) + gconv.bias = sconv.bias:type(typename) + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.TemporalConvolution_backward() @@ -4056,52 +3547,44 @@ function cunntest.TemporalConvolution_backward() local outi = 
math.random(1,256) local ini = (outi-1)*si+ki - local tm = {} - local title = string.format('TemporalConvolution.backward %dx%d o %d -> %dx%d', - from, ini, ki, to, outi) - - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(ini,from):type(typename) + local gradOutput = torch.randn(outi,to):type(typename) - local input = torch.randn(ini,from) - local gradOutput = torch.randn(outi,to) - local sconv = nn.TemporalConvolution(from,to,ki,si) - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.TemporalConvolution(from,to,ki,si):type(ctype) + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - local groundweight = sconv.gradWeight - local groundbias = sconv.gradBias - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) + local groundweight = sconv.gradWeight + local groundbias = sconv.gradBias - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.TemporalConvolution(from,to,ki,si):cuda() - gconv.weight = sconv.weight:cuda() - gconv.bias = sconv.bias:cuda() - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.TemporalConvolution(from,to,ki,si):type(typename) + gconv.weight = sconv.weight:type(typename) + gconv.bias = sconv.bias:type(typename) + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - local weightcuda = gconv.gradWeight - local biascuda = gconv.gradBias - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) + local weightcuda = gconv.gradWeight + 
local biascuda = gconv.gradBias - local error = rescuda:float() - groundgrad - local werror = weightcuda:float() - groundweight - local berror = biascuda:float() - groundbias + local error = rescuda:double() - groundgrad:double() + local werror = weightcuda:double() - groundweight:double() + local berror = biascuda:double() - groundbias:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') - mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ') - mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + mytester:assertlt(werror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()), + string.format('error on weight (backward) with %s', typename)) + mytester:assertlt(berror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()), + string.format('error on bias (backward) with %s', typename)) + end end function cunntest.TemporalConvolution_backward_batch() @@ -4113,51 +3596,44 @@ function cunntest.TemporalConvolution_backward_batch() local outi = math.random(1,256) local ini = (outi-1)*si+ki - local tm = {} - local title = string.format('TemporalConvolution.backward %dx%dx%d o %d -> %dx%dx%d', - bs, from, ini, ki, bs, to, outi) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(bs,ini,from):type(typename) + local gradOutput = torch.randn(bs,outi,to):type(typename) - local input = torch.randn(bs,ini,from) - local gradOutput = torch.randn(bs,outi,to) - local sconv = nn.TemporalConvolution(from,to,ki,si) - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + local ctype = 
t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.TemporalConvolution(from,to,ki,si):type(ctype) + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - local groundweight = sconv.gradWeight - local groundbias = sconv.gradBias - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) + local groundweight = sconv.gradWeight + local groundbias = sconv.gradBias - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.TemporalConvolution(from,to,ki,si):cuda() - gconv.weight = sconv.weight:cuda() - gconv.bias = sconv.bias:cuda() - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.TemporalConvolution(from,to,ki,si):type(typename) + gconv.weight = sconv.weight:type(typename) + gconv.bias = sconv.bias:type(typename) + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - local weightcuda = gconv.gradWeight - local biascuda = gconv.gradBias - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) + local weightcuda = gconv.gradWeight + local biascuda = gconv.gradBias - local error = rescuda:float() - groundgrad - local werror = weightcuda:float() - groundweight - local berror = biascuda:float() - groundbias + local error = rescuda:double() - groundgrad:double() + local werror = weightcuda:double() - groundweight:double() + local berror = biascuda:double() - groundbias:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') - mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ') - mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ') + 
mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + mytester:assertlt(werror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()), + string.format('error on weight (backward) with %s', typename)) + mytester:assertlt(berror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()), + string.format('error on bias (backward) with %s', typename)) + end end function cunntest.Dropout() @@ -4211,66 +3687,46 @@ end function cunntest.SoftPlus_forward() local size = math.random(1,100) - local tm = {} - local title = string.format('SoftPlus forward %d -> %d', size, size) - times[title] = tm - - local input = torch.randn(size) - local sconv = nn.SoftPlus() - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + for k, typename in ipairs(typenames) do + local input = torch.randn(size):type(typename) + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SoftPlus():type(ctype) + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = nn.SoftPlus():cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.SoftPlus():type(typename) + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward,typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.SoftPlus_backward() local size = 
math.random(1,100) - local tm = {} - local title = string.format('SoftPlus.backward %d -> %d', size, size) - times[title] = tm - - local input = torch.randn(size) - local gradOutput = torch.randn(size) - local sconv = nn.SoftPlus() - sconv:forward(input) - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do - groundgrad = sconv:backward(input, gradOutput) - end - tm.cpu = a:time().real - - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = sconv:clone():cuda() - gconv:forward(input) - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do - rescuda = gconv:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + for k, typename in ipairs(typenames) do + local input = torch.randn(size):type(typename) + local gradOutput = torch.randn(size):type(typename) + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.SoftPlus():type(ctype) + sconv:forward(input) + local groundgrad = sconv:backward(input, gradOutput) - local error = rescuda:float() - groundgrad + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = sconv:clone():type(typename) + gconv:forward(input) + local rescuda = gconv:backward(input, gradOutput) - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + local error = rescuda:double() - groundgrad:double() + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.SpatialUpSamplingNearest_forward() @@ -4279,32 +3735,22 @@ function cunntest.SpatialUpSamplingNearest_forward() local w = torch.random(3, 15) local scale = torch.random(2,5) - local tm = {} - local title = string.format('SpatialUpSamplingNearest.forward %dx%dx%d -> %dx%dx%d', - f, h, w, f, h*scale, w*scale) - times[title] = tm 
+ for k, typename in ipairs(typenames) do + local input = torch.randn(f, h, w):type(typename) - local input = torch.randn(f, h, w) - local sconv = nn.SpatialUpSamplingNearest(scale) - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialUpSamplingNearest(scale):type(ctype) + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = sconv:clone():cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = sconv:clone():type(typename) + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.SpatialUpSamplingNearest_forward_batch() @@ -4314,33 +3760,22 @@ function cunntest.SpatialUpSamplingNearest_forward_batch() local w = torch.random(3, 15) local scale = torch.random(2,5) - local tm = {} - local title = string.format('SpatialUpSamplingNearest.forward %dx%dx%dx%d -> %dx%dx%dx%d', - nbatch, f, h, w, nbatch, f, h*scale, w*scale) - times[title] = tm - - local input = torch.randn(nbatch, f, h, w) - local sconv = nn.SpatialUpSamplingNearest(scale) - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + for k, typename in ipairs(typenames) do + local input = torch.randn(nbatch, f, h, w):type(typename) - input = input:cuda() - local gconv = sconv:clone():cuda() - 
local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialUpSamplingNearest(scale):type(ctype) + local groundtruth = sconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + input = input:type(typename) + local gconv = sconv:clone():type(typename) + local rescuda = gconv:forward(input) + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.SpatialUpSamplingNearest_backward() @@ -4349,41 +3784,30 @@ function cunntest.SpatialUpSamplingNearest_backward() local w = torch.random(3, 15) local scale = torch.random(2,5) - local tm = {} - local title = string.format('SpatialUpSamplingNearest.backward %dx%dx%d -> %dx%dx%d', - f, h, w, f, h*scale, w*scale) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(f, h, w):type(typename) + local gradOutput = torch.randn(f, h*scale, w*scale):type(typename) - local input = torch.randn(f, h, w) - local gradOutput = torch.randn(f, h*scale, w*scale) - local sconv = nn.SpatialUpSamplingNearest(scale) - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.SpatialUpSamplingNearest(scale):type(ctype) + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) - input = input:cuda() - gradOutput = 
gradOutput:cuda() - local gconv = sconv:clone():cuda() - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = sconv:clone():type(typename) + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) - local error = rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.SpatialUpSamplingNearest_backward_batch() @@ -4393,41 +3817,30 @@ function cunntest.SpatialUpSamplingNearest_backward_batch() local w = torch.random(3, 15) local scale = torch.random(2,5) - local tm = {} - local title = string.format('SpatialUpSamplingNearest.backward %dx%dx%dx%d -> %dx%dx%dx%d', - nbatch, f, h, w, nbatch, f, h*scale, w*scale) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(nbatch, f, h, w):type(typename) + local gradOutput = torch.randn(nbatch, f, h*scale, w*scale):type(typename) - local input = torch.randn(nbatch, f, h, w) - local gradOutput = torch.randn(nbatch, f, h*scale, w*scale) - local sconv = nn.SpatialUpSamplingNearest(scale) - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.SpatialUpSamplingNearest(scale):type(ctype) + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, 
gradOutput) - end - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = sconv:clone():cuda() - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = sconv:clone():type(typename) + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) - local error = rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.SpatialUpSamplingBilinear_forward() @@ -4436,34 +3849,22 @@ function cunntest.SpatialUpSamplingBilinear_forward() local w = torch.random(3, 15) local scale = torch.random(2,5) - local tm = {} - local title = - string.format('SpatialUpSamplingBilinear.forward %dx%dx%d -> %dx%dx%d', - f, h, w, f, h*scale, w*scale) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(f, h, w):type(typename) - local input = torch.randn(f, h, w) - local sconv = nn.SpatialUpSamplingBilinear(scale) - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialUpSamplingBilinear(scale):type(ctype) + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = sconv:clone():cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - 
rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = sconv:clone():type(typename) + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, - 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.SpatialUpSamplingBilinear_forward_batch() @@ -4473,35 +3874,22 @@ function cunntest.SpatialUpSamplingBilinear_forward_batch() local w = torch.random(3, 15) local scale = torch.random(2,5) - local tm = {} - local title = - string.format('SpatialUpSamplingBilinear.forward %dx%dx%dx%d -> %dx%dx%dx%d', - nbatch, f, h, w, nbatch, f, h*scale, w*scale) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(nbatch, f, h, w):type(typename) - local input = torch.randn(nbatch, f, h, w) - local sconv = nn.SpatialUpSamplingBilinear(scale) - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real - - input = input:cuda() - local gconv = sconv:clone():cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.SpatialUpSamplingBilinear(scale):type(ctype) + local groundtruth = sconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, - 'error on state (forward) ') + input = input:type(typename) + local gconv = sconv:clone():type(typename) + local rescuda = gconv:forward(input) + local error = rescuda:double() - groundtruth:double() + 
mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.SpatialUpSamplingBilinear_backward() @@ -4510,43 +3898,30 @@ function cunntest.SpatialUpSamplingBilinear_backward() local w = torch.random(3, 15) local scale = torch.random(2,5) - local tm = {} - local title = - string.format('SpatialUpSamplingBilinear.backward %dx%dx%d -> %dx%dx%d', - f, h, w, f, h*scale, w*scale) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(f, h, w):type(typename) + local gradOutput = torch.randn(f, h*scale, w*scale):type(typename) - local input = torch.randn(f, h, w) - local gradOutput = torch.randn(f, h*scale, w*scale) - local sconv = nn.SpatialUpSamplingBilinear(scale) - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.SpatialUpSamplingBilinear(scale):type(ctype) + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = sconv:clone():cuda() - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = sconv:clone():type(typename) + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) - local error = rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - 
mytester:assertlt(error:abs():max(), precision_backward, - 'error on state (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.SpatialUpSamplingBilinear_backward_batch() @@ -4556,146 +3931,152 @@ function cunntest.SpatialUpSamplingBilinear_backward_batch() local w = torch.random(3, 15) local scale = torch.random(2,5) - local input = torch.randn(nbatch, f, h, w) - local gradOutput = torch.randn(nbatch, f, h*scale, w*scale) - local sconv = nn.SpatialUpSamplingBilinear(scale) - local output = sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) + for k, typename in ipairs(typenames) do + local input = torch.randn(nbatch, f, h, w):type(typename) + local gradOutput = torch.randn(nbatch, f, h*scale, w*scale):type(typename) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = sconv:clone():cuda() - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.SpatialUpSamplingBilinear(scale):type(ctype) + local output = sconv:forward(input) + sconv:zeroGradParameters() + local groundgrad = sconv:backward(input, gradOutput) + + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = sconv:clone():type(typename) + gconv:forward(input) + gconv:zeroGradParameters() + local rescuda = gconv:backward(input, gradOutput) - local err = rescuda:float() - groundgrad + local err = rescuda:double() - groundgrad:double() - mytester:assertlt(err:abs():max(), precision_backward, - 'error on state (backward) ') + mytester:assertlt(err:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function 
cunntest.l1cost() local size = math.random(300,500) - local input = torch.randn(size) - local mod = nn.L1Cost() - local tm = {} - local title = string.format('L1Cost %d ',size) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(size):type(typename) - local a = torch.Timer() - local fout = mod:forward(input) - local fgin = mod:backward(input):clone() - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local mod = nn.L1Cost():type(ctype) - local cinput = input:cuda() - local cmod = nn.L1Cost():cuda() - a:reset() - local cout = cmod:forward(cinput) - local cgin = cmod:backward(cinput) - cutorch.synchronize() - tm.gpu = a:time().real + local fout = mod:forward(input) + local fgin = mod:backward(input):clone() + + local cinput = input:type(typename) + local cmod = nn.L1Cost():type(typename) + local cout = cmod:forward(cinput) + local cgin = cmod:backward(cinput) - mytester:assertlt(math.abs(fout-cout), precision_forward, 'error on output') - local gerr = cgin:float() - fgin - mytester:assertlt(gerr:abs():max(), precision_forward, 'error on gradInput') + if (typename == 'torch.CudaHalfTensor') then + fout = ffi.C.THC_half2float(ffi.C.THC_float2half(fout)) + end + mytester:assertlt(math.abs(fout-cout), precision_forward_type(precision_forward, typename), + string.format('error on output with %s', typename)) + local gerr = cgin:double() - fgin:double() + mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on gradInput with %s', typename)) + end end function cunntest.ClassNLLCriterionSingleTarget() local size = math.random(3000,5000) - local input = torch.randn(size) - local target = 1 - local mod = nn.ClassNLLCriterion() - local tm = {} - local title = string.format('ClassNLLCriterionSingleTarget %d ',size) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(size):type(typename) + local target = 1 - 
local a = torch.Timer() - local fout = mod:forward(input, target) - local fgin = mod:backward(input, target):clone() - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local mod = nn.ClassNLLCriterion():type(ctype) - local cinput = input:cuda() - local ctarget = torch.CudaTensor(1):fill(target) - local cmod = nn.ClassNLLCriterion():cuda() - a:reset() - local cout = cmod:forward(cinput,ctarget) - local cgin = cmod:backward(cinput,ctarget) - cutorch.synchronize() - tm.gpu = a:time().real + local fout = mod:forward(input, target) + local fgin = mod:backward(input, target):clone() + + local cinput = input:type(typename) + local ctarget = torch.CudaTensor(1):fill(target) + local cmod = nn.ClassNLLCriterion():type(typename) + local cout = cmod:forward(cinput,ctarget) + local cgin = cmod:backward(cinput,ctarget) - mytester:assertlt( - math.abs(fout-cout), precision_forward, 'error on output') - local gerr = cgin:float() - fgin - mytester:assertlt(gerr:abs():max(), precision_forward, 'error on gradInput') + mytester:assertlt( + math.abs(fout-cout), precision_forward_type(precision_forward, typename), + string.format('error on output with %s', typename)) + local gerr = cgin:double() - fgin:double() + mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on gradInput with %s', typename)) + end end function cunntest.ClassNLLCriterionSingleTargetWeights() local size = math.random(3000,5000) - local input = torch.randn(size) - local target = 1 - local weights = torch.rand(size) - local mod = nn.ClassNLLCriterion(weights) - local tm = {} - local title = string.format('ClassNLLCriterionSingleTargetWeights %d ',size) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(size):type(typename) + local target = 1 + local weights = torch.rand(size):type(typename) - local a = torch.Timer() - local fout = mod:forward(input, target) - local fgin = 
mod:backward(input, target):clone() - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + weights = weights:type(ctype) + local mod = nn.ClassNLLCriterion(weights):type(ctype) - local cinput = input:cuda() - local cweights = weights:cuda() - local ctarget = torch.CudaTensor(1):fill(target) - local cmod = nn.ClassNLLCriterion(cweights):cuda() - a:reset() - local cout = cmod:forward(cinput,ctarget) - local cgin = cmod:backward(cinput,ctarget) - cutorch.synchronize() - tm.gpu = a:time().real + local fout = mod:forward(input, target) + local fgin = mod:backward(input, target):clone() - mytester:assertlt( - math.abs(fout-cout), precision_forward, 'error on output') - local gerr = cgin:float() - fgin - mytester:assertlt(gerr:abs():max(), precision_forward, 'error on gradInput') + local cinput = input:type(typename) + local cweights = weights:type(typename) + local ctarget = torch.CudaTensor(1):fill(target) + local cmod = nn.ClassNLLCriterion(cweights):type(typename) + local cout = cmod:forward(cinput,ctarget) + local cgin = cmod:backward(cinput,ctarget) + + mytester:assertlt( + math.abs(fout-cout), precision_forward_type(precision_forward, typename), + string.format('error on output with %s', typename)) + local gerr = cgin:double() - fgin:double() + mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on gradInput with %s', typename)) + end end function cunntest.ClassNLLCriterionMultipleTarget() local size = math.random(3000,5000) - local input = torch.randn(size, size) - local target = torch.randperm(size) - local mod = nn.ClassNLLCriterion() - local tm = {} - local title = string.format('ClassNLLCriterionMultiTarget %d ',size) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(size, size):type(typename) + local target = torch.randperm(size) - local a = torch.Timer() - local fout = mod:forward(input, target) - local fgin = 
mod:backward(input, target):clone() - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local mod = nn.ClassNLLCriterion():type(ctype) - local cinput = input:cuda() - local ctarget = target:cuda() + local fout = mod:forward(input, target) + local fgin = mod:backward(input, target):clone() - local cmod = nn.ClassNLLCriterion():cuda() - a:reset() - local cout = cmod:forward(cinput,ctarget) - local cgin = cmod:backward(cinput,ctarget) - cutorch.synchronize() - tm.gpu = a:time().real + local cinput = input:type(typename) + local ctarget = target:cuda() + + local cmod = nn.ClassNLLCriterion():type(typename) + local cout = cmod:forward(cinput,ctarget) + local cgin = cmod:backward(cinput,ctarget) - mytester:assertlt( - math.abs(fout-cout), precision_forward, 'error on output') + mytester:assertlt( + math.abs(fout-cout), precision_forward_type(precision_forward, typename), + string.format('error on output with %s', typename)) - local gerr = cgin:float() - fgin - mytester:assertlt(gerr:abs():max(), precision_forward, 'error on gradInput') + local gerr = cgin:double() - fgin:double() + mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on gradInput with %s', typename)) + end end function cunntest.SpatialClassNLLCriterion() @@ -4703,103 +4084,98 @@ function cunntest.SpatialClassNLLCriterion() local h = math.random(300, 500) local w = math.random(300, 800) local classes = math.random(10,30) - local input = torch.randn(batchSize, classes, h, w) - local target = torch.Tensor(batchSize, h, w) - target:apply(function() return math.random(1, classes) end) - local mod = nn.SpatialClassNLLCriterion() - local tm = {} - local title = string.format('SpatialClassNLLCriterion %dx%dx%dx%d ', - batchSize, classes, h, w) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(batchSize, classes, h, w):type(typename) + local target = torch.Tensor(batchSize, h, w) + 
target:apply(function() return math.random(1, classes) end) + local ctype = t2cpu[typename] + input = input:type(ctype) + local mod = nn.SpatialClassNLLCriterion():type(ctype) + local fout = mod:forward(input, target) + local fgin = mod:backward(input, target):clone() - local a = torch.Timer() - local fout = mod:forward(input, target) - local fgin = mod:backward(input, target):clone() - tm.cpu = a:time().real + local cinput = input:type(typename) + local ctarget = target:type(typename) - local cinput = input:cuda() - local ctarget = target:cuda() - - local cmod = nn.SpatialClassNLLCriterion():cuda() - a:reset() - local cout = cmod:forward(cinput,ctarget) - local cgin = cmod:backward(cinput,ctarget) - cutorch.synchronize() - tm.gpu = a:time().real + local cmod = nn.SpatialClassNLLCriterion():type(typename) + local cout = cmod:forward(cinput,ctarget) + local cgin = cmod:backward(cinput,ctarget) + cutorch.synchronize() - mytester:assertlt( - math.abs(fout-cout), precision_forward, 'error on output') + mytester:assertlt( + math.abs(fout-cout), precision_forward_type(precision_forward, typename), + string.format('error on output with %s', typename)) - local gerr = cgin:float() - fgin - mytester:assertlt(gerr:abs():max(), precision_forward, 'error on gradInput') + local gerr = cgin:double() - fgin:double() + mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on gradInput with %s', typename)) + end end function cunntest.ClassNLLCriterionMultipleTargetWeights() local size = math.random(3000,5000) - local input = torch.randn(size, size) - local target = torch.randperm(size) - local weights = torch.rand(size) - local mod = nn.ClassNLLCriterion(weights) - local tm = {} - local title = string.format('ClassNLLCriterionMultiTargetWeights %d ',size) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(size, size):type(typename) + local target = torch.randperm(size) + local weights = 
torch.rand(size):type(typename) - local a = torch.Timer() - local fout = mod:forward(input, target) - local fgin = mod:backward(input, target):clone() - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + weights = weights:type(ctype) + local mod = nn.ClassNLLCriterion(weights):type(ctype) - local cinput = input:cuda() - local ctarget = target:cuda() - local cweights = weights:cuda() + local fout = mod:forward(input, target) + local fgin = mod:backward(input, target):clone() - local cmod = nn.ClassNLLCriterion(cweights):cuda() - a:reset() - local cout = cmod:forward(cinput,ctarget) - local cgin = cmod:backward(cinput,ctarget) - cutorch.synchronize() - tm.gpu = a:time().real + local cinput = input:type(typename) + local ctarget = target:cuda() + local cweights = weights:type(typename) - mytester:assertlt( - math.abs(fout-cout), precision_forward, 'error on output') + local cmod = nn.ClassNLLCriterion(cweights):type(typename) + local cout = cmod:forward(cinput,ctarget) + local cgin = cmod:backward(cinput,ctarget) + + mytester:assertlt( + math.abs(fout-cout), precision_forward_type(precision_forward, typename), + string.format('error on output with %s', typename)) - local gerr = cgin:float() - fgin - mytester:assertlt(gerr:abs():max(), precision_forward, 'error on gradInput') + local gerr = cgin:double() - fgin:double() + mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on gradInput with %s', typename)) + end end function cunntest.TemporalMaxPooling() - local input = torch.rand(16, 18, 3) local settings = {{2, 2}, {3, 3}, {4, 2}, {2, 4}, {3, 5}} for i, setting in ipairs(settings) do - local mod = nn.TemporalMaxPooling(setting[1], setting[2]) - - local tm = {} - local title = 'TemporalMaxPooling '..setting[1]..' 
'..setting[2] - times[title] = tm - - local a = torch.Timer() - local fout = mod:forward(input) - local fgout = torch.rand(fout:size()) - local fgin = mod:backward(input, fgout):clone() - tm.cpu = a:time().real - - local cinput = input:cuda() - local cgout = fgout:cuda() - local cmod = nn.TemporalMaxPooling(setting[1], setting[2]):cuda() - a:reset() - local cout = cmod:forward(cinput) - local cgin = cmod:backward(cinput, cgout) - cutorch.synchronize() - tm.gpu = a:time().real - - local outerror = cout:float() - fout - mytester:assertlt(outerror:abs():max(), precision_forward, 'error on output') - - local ginerror = cgin:float() - fgin - mytester:assertlt(ginerror:abs():max(), precision_backward, 'error on gradInput') + for k, typename in ipairs(typenames) do + local input = torch.rand(16, 18, 3):type(typename) + + local ctype = t2cpu[typename] + input = input:type(ctype) + local mod = nn.TemporalMaxPooling(setting[1], setting[2]):type(ctype) + + local fout = mod:forward(input) + local fgout = torch.rand(fout:size()):type(typename):type(ctype) + local fgin = mod:backward(input, fgout):clone() + + local cinput = input:type(typename) + local cgout = fgout:type(typename) + local cmod = nn.TemporalMaxPooling(setting[1], setting[2]):type(typename) + local cout = cmod:forward(cinput) + local cgin = cmod:backward(cinput, cgout) + + local outerror = cout:double() - fout:double() + mytester:assertlt(outerror:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on output with %s', typename)) + + local ginerror = cgin:double() - fgin:double() + mytester:assertlt(ginerror:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on gradInput with %s', typename)) + end end end @@ -4819,35 +4195,26 @@ function cunntest.VolumetricConvolution_forward_single() local inj = (outj-1)*sj+kj local ink = (outk-1)*sk+kk - local tm = {} - local title = string.format('VolumetricConvolution.forward %dx%dx%dx%d o %dx%dx%d -> 
%dx%dx%dx%d', - from, ink, inj, ini, kk, kj, ki, to, outk, outj, outi) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(from,ini,inj,ink):type(typename) - local input = torch.randn(from,ini,inj,ink) - local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj) - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):type(ctype) + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):cuda() - gconv.weight = sconv.weight:cuda() - gconv.bias = sconv.bias:cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):type(typename) + gconv.weight = sconv.weight:type(typename) + gconv.bias = sconv.bias:type(typename) + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') - mytester:assert(groundtruth:isSize(rescuda:size()), 'size mismatch on state (forward)') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + mytester:assert(groundtruth:isSize(rescuda:size()), + string.format('size mismatch on state (forward) with %s', typename)) + end end function cunntest.VolumetricConvolution_forward_batch() @@ -4867,35 +4234,26 @@ function cunntest.VolumetricConvolution_forward_batch() local inj = (outj-1)*sj+kj local ink = (outk-1)*sk+kk - local tm = {} - local 
title = string.format('VolumetricConvolution.forward %dx%dx%dx%dx%d o %dx%dx%d -> %dx%dx%dx%dx%d', - bs, from, ink, inj, ini, kk, kj, ki, bs, to, outk, outj, outi) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(bs,from,ini,inj, ink):type(typename) - local input = torch.randn(bs,from,ini,inj, ink) - local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sj,sk) - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input, sconv) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sj,sk):type(ctype) + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sj,sk):cuda() - gconv.weight = sconv.weight:cuda() - gconv.bias = sconv.bias:cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sj,sk):type(typename) + gconv.weight = sconv.weight:type(typename) + gconv.bias = sconv.bias:type(typename) + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') - mytester:assert(groundtruth:isSize(rescuda:size()), 'size mismatch on state (forward)') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + mytester:assert(groundtruth:isSize(rescuda:size()), + string.format('size mismatch on state (forward) with %s', typename)) + end end function cunntest.VolumetricConvolution_backward_single() @@ -4914,50 +4272,44 @@ function 
cunntest.VolumetricConvolution_backward_single() local inj = (outj-1)*sj+kj local ink = (outk-1)*sk+kk - local tm = {} - local title = string.format('VolumetricConvolution.backward %dx%dx%dx%d o %dx%dx%d -> %dx%dx%dx%d', - from, ink, inj, ini, kk, kj, ki, to, outk, outj, outi) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(from, ini, inj, ink):type(typename) + local gradOutput = torch.randn(to, outi, outj, outk):type(typename) - local input = torch.randn(from, ini, inj, ink) - local gradOutput = torch.randn(to, outi, outj, outk) - local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj) - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):type(ctype) + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - local groundweight = sconv.gradWeight - local groundbias = sconv.gradBias - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) + local groundweight = sconv.gradWeight + local groundbias = sconv.gradBias - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):cuda() - gconv.weight = sconv.weight:cuda() - gconv.bias = sconv.bias:cuda() - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):type(typename) + gconv.weight = sconv.weight:type(typename) + gconv.bias = sconv.bias:type(typename) + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) + local 
rescuda = gconv:backward(input, gradOutput) + local weightcuda = gconv.gradWeight + local biascuda = gconv.gradBias + local error = rescuda:double() - groundgrad:double() + local werror = weightcuda:double() - groundweight:double() + local berror = biascuda:double() - groundbias:double() + mytester:assert(groundgrad:isSize(rescuda:size()), + string.format('size mismatch on state (forward) with %s', typename)) + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + mytester:assertlt(werror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()), + string.format('error on weight (backward) with %s', typename)) + mytester:assertlt(berror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()), + string.format('error on bias (backward) with %s', typename)) end - local weightcuda = gconv.gradWeight - local biascuda = gconv.gradBias - cutorch.synchronize() - tm.gpu = a:time().real - local error = rescuda:float() - groundgrad - local werror = weightcuda:float() - groundweight - local berror = biascuda:float() - groundbias - mytester:assert(groundgrad:isSize(rescuda:size()), 'size mismatch on state (forward)') - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') - mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ') - mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ') end function cunntest.VolumetricConvolution_backward_batch() @@ -4977,50 +4329,44 @@ function cunntest.VolumetricConvolution_backward_batch() local inj = (outj-1)*sj+kj local ink = (outk-1)*sk+kk - local tm = {} - local title = string.format('VolumetricConvolution.backward %dx%dx%dx%dx%d o %dx%dx%d -> %dx%dx%dx%dx%d', - bs, from, ink, inj, ini, kk, kj, ki, bs, to, outk, outj, outi) - times[title] = tm + for 
k, typename in ipairs(typenames) do + local input = torch.randn(bs, from, ini, inj, ink):type(typename) + local gradOutput = torch.randn(bs, to, outi, outj, outk):type(typename) - local input = torch.randn(bs, from, ini, inj, ink) - local gradOutput = torch.randn(bs, to, outi, outj, outk) - local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj) - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):type(ctype) + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - local groundweight = sconv.gradWeight - local groundbias = sconv.gradBias - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) + local groundweight = sconv.gradWeight + local groundbias = sconv.gradBias - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):cuda() - gconv.weight = sconv.weight:cuda() - gconv.bias = sconv.bias:cuda() - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):type(typename) + gconv.weight = sconv.weight:type(typename) + gconv.bias = sconv.bias:type(typename) + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) + local rescuda = gconv:backward(input, gradOutput) + local weightcuda = gconv.gradWeight + local biascuda = gconv.gradBias + local error = rescuda:double() - groundgrad:double() + local werror = weightcuda:double() - groundweight:double() + local berror = biascuda:double() - groundbias:double() 
+ mytester:assert(groundgrad:isSize(rescuda:size()), + string.format('size mismatch on state (forward) with %s', typename)) + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + mytester:assertlt(werror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()), + string.format('error on weight (backward) with %s', typename)) + mytester:assertlt(berror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()), + string.format('error on bias (backward) with %s', typename)) end - local weightcuda = gconv.gradWeight - local biascuda = gconv.gradBias - cutorch.synchronize() - tm.gpu = a:time().real - local error = rescuda:float() - groundgrad - local werror = weightcuda:float() - groundweight - local berror = biascuda:float() - groundbias - mytester:assert(groundgrad:isSize(rescuda:size()), 'size mismatch on state (forward)') - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') - mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ') - mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ') end function cunntest.VolumetricMaxPooling_forward() @@ -5041,32 +4387,22 @@ function cunntest.VolumetricMaxPooling_forward() local oH = math.floor((iH - kH + 2*padH) / dH + 1) local oW = math.floor((iW - kW + 2*padW) / dW + 1) - local tm = {} - local title = string.format('VolumetricMaxPooling.forward %dx%dx%dx%d o %dx%dx%d (%dx%dx%d)-> %dx%dx%dx%d', - iF, iT, iH, iW, kT, kH, kW, dT, dH, dW, iF, oT, oH, oW) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1):type(typename) - local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1) - local layer = nn.VolumetricMaxPooling(kT, kW, kH, dT, dW, dH, padT, padW, 
padH):float() - local output = layer:forward(input) - local timer = torch.Timer() - for i = 1,nloop do - output = layer:forward(input) - end - tm.cpu = timer:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local layer = nn.VolumetricMaxPooling(kT, kW, kH, dT, dW, dH, padT, padW, padH):type(ctype) + local output = layer:forward(input) - local inputCUDA = input:cuda() - local layerCUDA = layer:clone():cuda() - local outputCUDA = layerCUDA:forward(inputCUDA) - timer:reset() - for i = 1,nloop do - outputCUDA = layerCUDA:forward(inputCUDA) - end - cutorch.synchronize() - tm.gpu = timer:time().real + local inputCUDA = input:type(typename) + local layerCUDA = layer:clone():type(typename) + local outputCUDA = layerCUDA:forward(inputCUDA) - local error = outputCUDA:float() - output - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + local error = outputCUDA:double() - output:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.VolumetricMaxPooling_backward() @@ -5087,38 +4423,27 @@ function cunntest.VolumetricMaxPooling_backward() local oH = math.floor((iH - kH + 2*padH) / dH + 1) local oW = math.floor((iW - kW + 2*padW) / dW + 1) - local tm = {} - local title = string.format('VolumetricMaxPooling.backward %dx%dx%dx%d o %dx%dx%d (%dx%dx%d) -> %dx%dx%dx%d', - iF, iT, iH, iW, kT, kH, kW, dT, dH, dW, iF, oT, oH, oW) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1):type(typename) - local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1) - local layer = nn.VolumetricMaxPooling(kT, kW, kH, dT, dW, dH, padT, padW, padH):float() - local output = layer:forward(input) - local gradOutput = output:clone():uniform(-1, 1) + local ctype = t2cpu[typename] + input = input:type(ctype) + local layer = 
nn.VolumetricMaxPooling(kT, kW, kH, dT, dW, dH, padT, padW, padH):type(ctype) + local output = layer:forward(input) + local gradOutput = output:clone():uniform(-1, 1) - local gradInput = layer:backward(input, gradOutput) - local timer = torch.Timer() - for i = 1,nloop do - gradInput = layer:backward(input, gradOutput) - end - tm.cpu = timer:time().real + local gradInput = layer:backward(input, gradOutput) - local inputCUDA = input:cuda() - local layerCUDA = layer:clone():cuda() - local outputCUDA = layerCUDA:forward(inputCUDA) - local gradOutputCUDA = gradOutput:cuda() - local gradInputCUDA = layerCUDA:backward(inputCUDA, gradOutputCUDA) + local inputCUDA = input:type(typename) + local layerCUDA = layer:clone():type(typename) + local outputCUDA = layerCUDA:forward(inputCUDA) + local gradOutputCUDA = gradOutput:type(typename) + local gradInputCUDA = layerCUDA:backward(inputCUDA, gradOutputCUDA) - timer:reset() - for i = 1,nloop do - gradInputCUDA = layerCUDA:backward(inputCUDA, gradOutputCUDA) + local error = gradInputCUDA:double() - gradInput:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (backward) with %s', typename)) end - cutorch.synchronize() - tm.gpu = timer:time().real - - local error = gradInputCUDA:float() - gradInput - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (backward) ') end function cunntest.VolumetricDilatedMaxPooling_forward_batch() @@ -5145,34 +4470,24 @@ function cunntest.VolumetricDilatedMaxPooling_forward_batch() local inj = (outj-1)*sj+(dilationj*(kj-1)+1)-2*padj local ceil_mode = math.random(0,1) == 1 - local tm = {} - local title = string.format('VolumetricDilatedMaxPooling.forward %dx%dx%dx%dx%d o %dx%dx%d -> %d%dx%dx%dx%d', - bs, from, int, inj, ini, kt, kj, ki, bs, to, outt, outj, outi) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.randn(bs,from,int,inj,ini):type(typename) - local input = 
torch.randn(bs,from,int,inj,ini) - local sconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj) - if ceil_mode then sconv:ceil() end - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj):type(ctype) + if ceil_mode then sconv:ceil() end + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj):cuda() - if ceil_mode then gconv:ceil() end - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj):type(typename) + if ceil_mode then gconv:ceil() end + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.VolumetricDilatedMaxPooling_backward_batch() @@ -5199,43 +4514,32 @@ function cunntest.VolumetricDilatedMaxPooling_backward_batch() local inj = (outj-1)*sj+(dilationj*(kj-1)+1)-2*padj local ceil_mode = math.random(0,1) == 1 - local tm = {} - local title = string.format('VolumetricDilatedMaxPooling.forward %dx%dx%dx%dx%d o %dx%dx%d -> %d%dx%dx%dx%d', - bs, from, int, inj, ini, kt, kj, ki, bs, to, outt, outj, outi) - times[title] = tm + for k, typename 
in ipairs(typenames) do + local input = torch.randn(bs,from,int,inj,ini):type(typename) + local gradOutput = torch.randn(bs,to,outt,outj,outi):type(typename) - local input = torch.randn(bs,from,int,inj,ini) - local gradOutput = torch.randn(bs,to,outt,outj,outi) - local sconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj) - if ceil_mode then sconv:ceil() end - sconv:forward(input) - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj):type(ctype) + if ceil_mode then sconv:ceil() end + sconv:forward(input) sconv:zeroGradParameters() - groundgrad = sconv:backward(input, gradOutput) - end - tm.cpu = a:time().real + local groundgrad = sconv:backward(input, gradOutput) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj):cuda() - if ceil_mode then gconv:ceil() end - gconv:forward(input) - gconv:zeroGradParameters() - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj):type(typename) + if ceil_mode then gconv:ceil() end + gconv:forward(input) gconv:zeroGradParameters() - rescuda = gconv:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gconv:backward(input, gradOutput) - local error = rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + 
mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.VolumetricMaxUnpooling_forward_batch() @@ -5252,42 +4556,31 @@ function cunntest.VolumetricMaxUnpooling_forward_batch() local padt = math.random(0,kt/2-1) local padi = math.random(0,ki/2-1) local padj = math.random(0,kj/2-1) - local it = ((outt + padt*2 - kt)/st) +1 - local ii = ((outi + padi*2 - ki)/si) +1 - local ij = ((outj + padj*2 - kj)/sj) +1 + local it = math.max(((outt + padt*2 - kt)/st) +1, kt) + local ii = math.max(((outi + padi*2 - ki)/si) +1, ki) + local ij = math.max(((outj + padj*2 - kj)/sj) +1, kj) - local tm = {} - local title = string.format('VolumetricMaxUnpooling.forward %dx%dx%dx%dx%d o %dx%dx%d -> %d%dx%dx%dx%d', - bs, from, it, ij, ii, kt, kj, ki, bs, to, outt, outj, outi) - times[title] = tm + for k, typename in ipairs(typenames) do + local ctype = t2cpu[typename] - local pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj) - local sunpool = nn.VolumetricMaxUnpooling(pooler) + local pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj):type(ctype) + local sunpool = nn.VolumetricMaxUnpooling(pooler):type(ctype) - local original = torch.randn(bs,from,it,ij,ii) - local input = pooler:forward(original) - local groundtruth = sunpool:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sunpool:forward(input) - end - tm.cpu = a:time().real + local original = torch.randn(bs,from,it,ij,ii):type(typename):type(ctype) + local input = pooler:forward(original) + local groundtruth = sunpool:forward(input) - original = original:cuda() - pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj):cuda() - local gunpool = nn.VolumetricMaxUnpooling(pooler):cuda() + original = original:type(typename) + pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj):type(typename) + local gunpool = 
nn.VolumetricMaxUnpooling(pooler):type(typename) - input = pooler:forward(original) - local rescuda = gunpool:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gunpool:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = pooler:forward(original) + local rescuda = gunpool:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.VolumetricMaxUnpooling_backward_batch() @@ -5304,52 +4597,39 @@ function cunntest.VolumetricMaxUnpooling_backward_batch() local padt = math.random(0,kt/2-1) local padi = math.random(0,ki/2-1) local padj = math.random(0,kj/2-1) - local it = ((outt + padt*2 - kt)/st) +1 - local ii = ((outi + padi*2 - ki)/si) +1 - local ij = ((outj + padj*2 - kj)/sj) +1 + local it = math.max(((outt + padt*2 - kt)/st) +1, kt) + local ii = math.max(((outi + padi*2 - ki)/si) +1, ki) + local ij = math.max(((outj + padj*2 - kj)/sj) +1, kj) - local tm = {} - local title = string.format('VolumetricMaxUnpooling.backward %dx%dx%dx%dx%d o %dx%dx%d -> %d%dx%dx%dx%d', - bs, from, it, ij, ii, kt, kj, ki, bs, to, outt, outj, outi) - times[title] = tm + for k, typename in ipairs(typenames) do + local ctype = t2cpu[typename] - local pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj) - local sunpool = nn.VolumetricMaxUnpooling(pooler) + local pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj):type(ctype) + local sunpool = nn.VolumetricMaxUnpooling(pooler):type(ctype) - local original = torch.randn(bs,from,it,ij,ii) - local input = pooler:forward(original) - local gradOutput = torch.randn(original:size()) - sunpool:forward(input) - sunpool:zeroGradParameters() - local groundgrad = 
sunpool:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do + local original = torch.randn(bs,from,it,ij,ii):type(typename):type(ctype) + local input = pooler:forward(original) + local gradOutput = torch.randn(original:size()):type(typename):type(ctype) + sunpool:forward(input) sunpool:zeroGradParameters() - groundgrad = sunpool:backward(input, gradOutput) - end - tm.cpu = a:time().real + local groundgrad = sunpool:backward(input, gradOutput) - pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj):cuda() - local gunpool = nn.VolumetricMaxUnpooling(pooler):cuda() + pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj):type(typename) + local gunpool = nn.VolumetricMaxUnpooling(pooler):type(typename) - original = original:cuda() - input = pooler:forward(original) - gunpool:forward(input) + original = original:type(typename) + input = pooler:forward(original) + gunpool:forward(input) - gradOutput = gradOutput:cuda() - gunpool:zeroGradParameters() - local rescuda = gunpool:backward(input, gradOutput) - a:reset() - for i = 1,nloop do + gradOutput = gradOutput:type(typename) gunpool:zeroGradParameters() - rescuda = gunpool:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gunpool:backward(input, gradOutput) - local error = rescuda:float() - groundgrad + local error = rescuda:double() - groundgrad:double() - mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ') + mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.VolumetricAveragePooling_forward() @@ -5367,32 +4647,22 @@ function cunntest.VolumetricAveragePooling_forward() local iH = (oH - 1) * dH + kH local iW = (oW - 1) * dW + kW - local tm = {} - local title = string.format('VolumetricAveragePooling.forward %dx%dx%dx%d o %dx%dx%d (%dx%dx%d) -> %dx%dx%dx%d', - 
iF, iT, iH, iW, kT, kH, kW, dT, dH, dW, iF, oT, oH, oW) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1):type(typename) - local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1) - local layer = nn.VolumetricAveragePooling(kT, kW, kH, dT, dW, dH):float() - local output = layer:forward(input) - local timer = torch.Timer() - for i = 1,nloop do - output = layer:forward(input) - end - tm.cpu = timer:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local layer = nn.VolumetricAveragePooling(kT, kW, kH, dT, dW, dH):type(ctype) + local output = layer:forward(input) - local inputCUDA = input:cuda() - local layerCUDA = layer:clone():cuda() - local outputCUDA = layerCUDA:forward(inputCUDA) - timer:reset() - for i = 1,nloop do - outputCUDA = layerCUDA:forward(inputCUDA) - end - cutorch.synchronize() - tm.gpu = timer:time().real + local inputCUDA = input:type(typename) + local layerCUDA = layer:clone():type(typename) + local outputCUDA = layerCUDA:forward(inputCUDA) - local error = outputCUDA:float() - output - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') + local error = outputCUDA:double() - output:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.VolumetricAveragePooling_backward() @@ -5410,36 +4680,25 @@ function cunntest.VolumetricAveragePooling_backward() local iH = (oH - 1) * dH + kH local iW = (oW - 1) * dW + kW - local tm = {} - local title = string.format('VolumetricAveragePooling.backward %dx%dx%dx%d o %dx%dx%d (%dx%dx%d) -> %dx%dx%dx%d', - iF, iT, iH, iW, kT, kH, kW, dT, dH, dW, iF, oT, oH, oW) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1):type(typename) - local input = torch.Tensor(iF, iT, 
iH, iW):float():uniform(-1, 1) - local layer = nn.VolumetricAveragePooling(kT, kW, kH, dT, dW, dH):float() - local output = layer:forward(input) - local gradOutput = output:clone():uniform(-1, 1) + local ctype = t2cpu[typename] + input = input:type(ctype) + local layer = nn.VolumetricAveragePooling(kT, kW, kH, dT, dW, dH):type(ctype) + local output = layer:forward(input) + local gradOutput = output:clone():uniform(-1, 1) - local gradInput = layer:backward(input, gradOutput) - local timer = torch.Timer() - for i = 1,nloop do - gradInput = layer:backward(input, gradOutput) - end - tm.cpu = timer:time().real + local gradInput = layer:backward(input, gradOutput) - local inputCUDA = input:cuda() local layerCUDA = layer:clone():cuda() - local outputCUDA = layerCUDA:forward(inputCUDA) local gradOutputCUDA = gradOutput:cuda() - local gradInputCUDA = layerCUDA:backward(inputCUDA, gradOutputCUDA) + local inputCUDA = input:type(typename) local layerCUDA = layer:clone():type(typename) + local outputCUDA = layerCUDA:forward(inputCUDA) local gradOutputCUDA = gradOutput:type(typename) + local gradInputCUDA = layerCUDA:backward(inputCUDA, gradOutputCUDA) - timer:reset() - for i = 1,nloop do - gradInputCUDA = layerCUDA:backward(inputCUDA, gradOutputCUDA) + local error = gradInputCUDA:double() - gradInput:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (backward) with %s', typename)) end - cutorch.synchronize() - tm.gpu = timer:time().real - - local error = gradInputCUDA:float() - gradInput - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (backward) ') end function cunntest.CMul_forward_batch() @@ -5526,32 +4785,22 @@ function cunntest.PReLU_forward() local nOutputPlane = 8 local w = math.random(1,100) local h = math.random(1,100) + local input = torch.randn(nOutputPlane,h,w) - local tm = {} - local title = string.format('PReLU forward %d x %d', w, h) - times[title] = tm + 
for k, typename in ipairs(typenames) do + local ctype = t2cpu[typename] + local input = input:type(ctype) + local sconv = nn.PReLU(nOutputPlane):type(ctype) + local groundtruth = sconv:forward(input) - local input = torch.randn(nOutputPlane,h,w) - local sconv = nn.PReLU(nOutputPlane) - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real - - input = input:cuda() - local gconv = sconv:cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gconv = sconv:type(typename) + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state with %s', typename)) + end end function cunntest.PReLU_backward() @@ -5559,39 +4808,33 @@ function cunntest.PReLU_backward() local w = math.random(1,10) local h = math.random(1,10) - local tm = {} - local title = string.format('PReLU backward %d x %d', w, h) - times[title] = tm - - local input = torch.randn(nOutputPlane, h, w) - local gradOutput = torch.randn(#input) - local sconv = nn.PReLU(nOutputPlane) - local gconv = sconv:clone():cuda() - - sconv:forward(input) - local groundgrad = sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do - groundgrad = sconv:backward(input, gradOutput) + for k, typename in ipairs(typenames) do + local input = torch.randn(nOutputPlane, h, w):type(typename) + local gradOutput = torch.randn(#input):type(typename) + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.PReLU(nOutputPlane):type(ctype) + 
local gconv = sconv:clone():type(typename) + + sconv:forward(input) + sconv:zeroGradParameters() + local groundgrad = sconv:backward(input, gradOutput) + + input = input:type(typename) + gradOutput = gradOutput:type(typename) + gconv:forward(input) + gconv:zeroGradParameters() + local rescuda = gconv:backward(input, gradOutput) + + local err = rescuda:double() - groundgrad:double() + local weightGradError = gconv.gradWeight:double() - sconv.gradWeight:double() + + mytester:assertlt(err:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state %s', typename)) + mytester:assertlt(weightGradError:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on weight %s', typename)) end - tm.cpu = a:time().real - - input = input:cuda() - gradOutput = gradOutput:cuda() - gconv:forward(input) - local rescuda = gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do - rescuda = gconv:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real - - local err = rescuda:float() - groundgrad - local weightGradError = gconv.gradWeight:float() - sconv.gradWeight - - mytester:assertlt(err:abs():max(), precision_backward, 'error on state') - mytester:assertlt(weightGradError:abs():max(), precision_backward, 'error on weight') end @@ -5600,40 +4843,29 @@ function cunntest.RReLU_forward() local w = math.random(1,100) local h = math.random(1,100) - for _,train in ipairs({true,false}) do - for _,inplace in ipairs({false,true}) do - local tm = {} - local title = string.format('RReLU forward %d x %d (inplace: %s, train: %s)', - w, h, tostring(inplace), tostring(train)) - times[title] = tm - - local input = torch.randn(nOutputPlane, h, w) - 0.5 - local sconv = nn.RReLU(1/8, 1/3, inplace) - if not train then - sconv:evaluate() - end - local groundtruth = sconv:forward(input:clone()) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input:clone()) - end - 
tm.cpu = a:time().real - - input = input:cuda() - local gconv = sconv:cuda() - local rescuda = gconv:forward(input:clone()) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input:clone()) - end - cutorch.synchronize() - tm.gpu = a:time().real - - if not train then - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state') + for k, typename in ipairs(typenames) do + for _,train in ipairs({true,false}) do + for _,inplace in ipairs({false,true}) do + local input = torch.randn(nOutputPlane, h, w):type(typename) - 0.5 + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.RReLU(1/8, 1/3, inplace):type(ctype) + if not train then + sconv:evaluate() + end + local groundtruth = sconv:forward(input:clone()) + + input = input:type(typename) + local gconv = sconv:type(typename) + local rescuda = gconv:forward(input:clone()) + + if not train then + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state %s', typename)) + end end - end + end end end @@ -5642,50 +4874,40 @@ function cunntest.RReLU_backward() local w = math.random(1,10) local h = math.random(1,10) - for _,train in ipairs({true,false}) do - for _,inplace in ipairs({false,true}) do - local tm = {} - local title = string.format('RReLU backward %d x %d (inplace: %s, train: %s)', - w, h, tostring(inplace), tostring(train)) - times[title] = tm - - local input = torch.randn(nOutputPlane, h, w) - local gradOutput = torch.randn(#input) - 0.5 - local sconv = nn.RReLU(1/8, 1/3, inplace) - if not train then - sconv:evaluate() - end + for k, typename in ipairs(typenames) do + for _,train in ipairs({true,false}) do + for _,inplace in ipairs({false,true}) do + local ctype = t2cpu[typename] + local input = torch.randn(nOutputPlane, h, w):type(typename) + local gradOutput = torch.randn(#input):type(typename) - 0.5 + 
input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local sconv = nn.RReLU(1/8, 1/3, inplace):type(ctype) + if not train then + sconv:evaluate() + end - sconv:forward(input:clone()) - local groundgrad = sconv:backward(input, gradOutput:clone()) - local a = torch.Timer() - for i = 1,nloop do - groundgrad = sconv:backward(input, gradOutput:clone()) - end - tm.cpu = a:time().real + sconv:forward(input:clone()) + local groundgrad = sconv:backward(input, gradOutput:clone()) - local gconv = sconv:clone():cuda() - input = input:cuda() - gradOutput = gradOutput:cuda() - gconv:forward(input:clone()) - local rescuda = gconv:backward(input, gradOutput:clone()) - a:reset() - for i = 1,nloop do - rescuda = gconv:backward(input, gradOutput:clone()) - end - cutorch.synchronize() - tm.gpu = a:time().real + local gconv = sconv:clone():type(typename) + input = input:type(typename) + gradOutput = gradOutput:type(typename) + gconv:forward(input:clone()) + local rescuda = gconv:backward(input, gradOutput:clone()) - if not train then - local err = rescuda:float() - groundgrad - mytester:assertlt(err:abs():max(), precision_backward, 'error on state') - end + if not train then + local err = rescuda:double() - groundgrad:double() + mytester:assertlt(err:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state', typename)) + end - input = -torch.rand(1000):cuda() - gconv:forward(input) -- fill internal noise tensor - local g = gconv:backward(input, torch.ones(1000):cuda()) - local err = math.abs(g[input:le(0)]:mean()-(gconv.lower+gconv.upper)/2) - mytester:assertlt(err, 0.05, 'mean deviation of gradient for negative inputs') + input = -torch.rand(1000):type(typename) + gconv:forward(input) -- fill internal noise tensor + local g = gconv:backward(input, torch.ones(1000):type(typename)) + local err = math.abs(g[input:le(0)]:mean()-(gconv.lower+gconv.upper)/2) + mytester:assertlt(err, 0.05, 'mean deviation of gradient for negative 
inputs') + end end end end @@ -5705,79 +4927,91 @@ function cunntest.VolumetricFullConvolution_pair_test() local inChan = math.random(1,32) local outChan = math.random(1,32) - local module = nn.VolumetricFullConvolution(inChan, outChan, kT, kH, kW, - dT, dH, dW, pT, pH, pW); - module.weight:fill(1); - module.bias:fill(0.1); - - local bs = math.random(8,32) - local inD = math.random(8,32) - local inH = math.random(8,32) - local inW = math.random(8,32) - local outD = (inD - 1) * dT - 2 * pT + kT - local outH = (inH - 1) * dH - 2 * pH + kH - local outW = (inW - 1) * dW - 2 * pW + kW - local input = torch.Tensor(bs, inChan, inD, inH, inW):fill(1); - local gradOut = torch.randn(bs, outChan, outD, outH, outW) - - local outcpu = module:forward(input) - local gradcpu = module:backward(input, gradOut) - module:cuda() - local outgpu = module:forward(input:cuda()) - local gradgpu = module:backward(input:cuda(), gradOut:cuda()) - - local error = outgpu:float() - outcpu - mytester:assertlt(error:abs():max(), precision_forward, - 'error on state (forward) ') - - local error = gradgpu:float() - gradcpu - mytester:assertlt(error:abs():max(), precision_backward, - 'error on state (backward) ') + for k, typename in ipairs(typenames) do + local ctype = t2cpu[typename] + local module = nn.VolumetricFullConvolution(inChan, outChan, kT, kH, kW, + dT, dH, dW, pT, pH, pW):type(ctype); + module.weight:fill(1); + module.bias:fill(0.1); + module.weight = module.weight:type(typename):type(ctype) + module.bias = module.bias:type(typename):type(ctype) + + local bs = math.random(8,32) + local inD = math.random(8,32) + local inH = math.random(8,32) + local inW = math.random(8,32) + local outD = (inD - 1) * dT - 2 * pT + kT + local outH = (inH - 1) * dH - 2 * pH + kH + local outW = (inW - 1) * dW - 2 * pW + kW + local input = torch.Tensor(bs, inChan, inD, inH, inW):fill(1):type(typename):type(ctype) + local gradOut = torch.randn(bs, outChan, outD, outH, outW):type(typename):type(ctype) + + local 
outcpu = module:forward(input) + local gradcpu = module:backward(input, gradOut) + module:type(typename) + local outgpu = module:forward(input:type(typename)) + local gradgpu = module:backward(input:type(typename), gradOut:type(typename)) + + local error = outgpu:type(typename) - outcpu:type(typename) + mytester:assertlt(error:abs():max(), + precision_forward_type(precision_forward, typename, outgpu:abs():max()), + string.format('error on state (forward) with %s', typename)) + + local error = gradgpu:type(typename) - gradcpu:type(typename) + mytester:assertlt(error:abs():max(), + precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.VolumetricFullConvolution() - local module = nn.VolumetricFullConvolution(3, 1, 3, 3, 3, 3, 3, 3); - module.weight:fill(1); - module.bias:fill(0.1); - module:cuda(); - - local input = torch.Tensor(1, 3, 2, 2, 2):zero(); - for c = 1,3 do - input[1][c][1][1][1] = 1 - end - local output = module:forward(input:cuda()) - for t = 1,6 do - for h = 1,6 do - for w = 1,6 do - if t <= 3 and h <= 3 and w <= 3 then - mytester:assertlt(output[1][1][t][h][w] - 3.1, precision_forward, 'error on forward ') - else - mytester:assertlt(output[1][1][t][h][w] - 0.1, precision_forward, 'error on forward ') + for k, typename in ipairs(typenames) do + local ctype = t2cpu[typename] + local module = nn.VolumetricFullConvolution(3, 1, 3, 3, 3, 3, 3, 3):type(ctype); + module.weight:fill(1); + module.bias:fill(0.1); + module:type(typename); + + local input = torch.Tensor(1, 3, 2, 2, 2):zero(); + for c = 1,3 do + input[1][c][1][1][1] = 1 + end + local output = module:forward(input:type(typename)) + for t = 1,6 do + for h = 1,6 do + for w = 1,6 do + if t <= 3 and h <= 3 and w <= 3 then + mytester:assertlt(output[1][1][t][h][w] - 3.1, precision_forward_type(precision_forward, typename), + string.format('error on forward with %s', typename)) + else + 
mytester:assertlt(output[1][1][t][h][w] - 0.1, precision_forward_type(precision_forward, typename), + string.format('error on forward with %s', typename)) + end end end end - end - module:zeroGradParameters() - local gradOut = torch.Tensor(1, 1, 6, 6, 6):fill(0.1); - local gradIn = module:backward(input:cuda(), gradOut:cuda()) - for t = 1,2 do - for h = 1,2 do - for w = 1,2 do - mytester:assertlt(gradIn[1][1][t][h][w] - 2.7, precision_backward, - 'error on backward input gradients ') + module:zeroGradParameters() + local gradOut = torch.Tensor(1, 1, 6, 6, 6):fill(0.1); + local gradIn = module:backward(input:type(typename), gradOut:type(typename)) + for t = 1,2 do + for h = 1,2 do + for w = 1,2 do + mytester:assertlt(gradIn[1][1][t][h][w] - 2.7, precision_backward_type(precision_backward, typename), + string.format('error on backward input gradients with %s', typename)) + end end end - end - mytester:assertlt(module.gradBias[1] - 21.6, precision_backward, - 'error on backward gradBias ') - for c = 1,3 do - for t = 1,3 do - for h = 1,3 do - for w = 1,3 do - mytester:assertlt(module.gradWeight[c][1][t][h][w] - 0.1, precision_backward, - 'error on backward weight gradients ') + mytester:assertlt(module.gradBias[1] - 21.6, precision_backward_type(precision_backward, typename), + string.format('error on backward gradBias with %s', typename)) + for c = 1,3 do + for t = 1,3 do + for h = 1,3 do + for w = 1,3 do + mytester:assertlt(module.gradWeight[c][1][t][h][w] - 0.1, precision_backward_type(precision_backward, typename), + string.format('error on backward weight gradients with %s', typename)) + end end end end @@ -5806,35 +5040,46 @@ function cunntest.VolumetricDilatedConvolution() local inj = (outj - 1) * sj - 2 * padH + dilationH * (kj-1) + 1 local ink = (outk - 1) * sk - 2 * padT + dilationT * (kk-1) + 1 - local input = torch.randn(from,ink,inj,ini) - local sconv = nn.VolumetricDilatedConvolution(from,to,kk,ki,kj,sk,si,sj,padT,padW,padH,dilationT,dilationW,dilationH) 
- local output = sconv:forward(input) - local gradOutput = output:clone():normal() - sconv:zeroGradParameters() - local groundgrad = sconv:backward(input, gradOutput) - local groundweight = sconv.gradWeight - local groundbias = sconv.gradBias + for k, typename in ipairs(typenames) do + local input = torch.randn(from,ink,inj,ini):type(typename) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gconv = nn.VolumetricDilatedConvolution(from,to,kk,ki,kj,sk,si,sj,padT,padW,padH,dilationT,dilationW,dilationH):cuda() - gconv.weight = sconv.weight:cuda() - gconv.bias = sconv.bias:cuda() - local rescuda = gconv:forward(input) - gconv:zeroGradParameters() - local gradcuda = gconv:backward(input, gradOutput) - local weightcuda = gconv.gradWeight - local biascuda = gconv.gradBias + local ctype = t2cpu[typename] + input = input:type(ctype) + local sconv = nn.VolumetricDilatedConvolution(from,to,kk,ki,kj,sk,si,sj,padT,padW,padH,dilationT,dilationW,dilationH):type(ctype) + local output = sconv:forward(input) + local gradOutput = output:clone():normal() + sconv:zeroGradParameters() + local groundgrad = sconv:backward(input, gradOutput) + local groundweight = sconv.gradWeight + local groundbias = sconv.gradBias - local error = rescuda:float() - output - local gerror = gradcuda:float() - groundgrad - local werror = weightcuda:float() - groundweight - local berror = biascuda:float() - groundbias + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gconv = nn.VolumetricDilatedConvolution(from,to,kk,ki,kj,sk,si,sj,padT,padW,padH,dilationT,dilationW,dilationH):type(typename) + gconv.weight = sconv.weight:type(typename) + gconv.bias = sconv.bias:type(typename) + local rescuda = gconv:forward(input) + gconv:zeroGradParameters() + local gradcuda = gconv:backward(input, gradOutput) + local weightcuda = gconv.gradWeight + local biascuda = gconv.gradBias - mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ') - 
mytester:assertlt(gerror:abs():max(), precision_backward, 'error on state (backward) ') - mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ') - mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ') + local error = rescuda:double() - output:double() + local gerror = gradcuda:double() - groundgrad:double() + local werror = weightcuda:double() - groundweight:double() + local berror = biascuda:double() - groundbias:double() + + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + mytester:assertlt(gerror:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + mytester:assertlt(werror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()), + string.format('error on weight (backward) with %s', typename)) + mytester:assertlt(berror:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()), + string.format('error on bias (backward) with %s', typename)) + end end function cunntest.LookupTable_forward() @@ -5842,31 +5087,21 @@ function cunntest.LookupTable_forward() local nDim = 100 local nInput = 1000 - local tm = {} - local title = string.format('LookupTable forward %d x %d', nVocab, nDim) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.LongTensor(nInput):random(nVocab) - local input = torch.LongTensor(nInput):random(nVocab) - local sconv = nn.LookupTable(nVocab, nDim) - local groundtruth = sconv:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = sconv:forward(input) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + local sconv = nn.LookupTable(nVocab, nDim):type(ctype) + local groundtruth = sconv:forward(input) - input = input:cuda() - local gconv = 
sconv:cuda() - local rescuda = gconv:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gconv:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:cuda() + local gconv = sconv:type(typename) + local rescuda = gconv:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), precision_forward, 'error on state') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename), + string.format('error on state with %s', typename)) + end end function cunntest.LookupTable_backward() @@ -5886,73 +5121,69 @@ function cunntest.LookupTable_backward() s[k] = v[torch.random(#v)] end - local input, gradOutput - if s.batch then - input = torch.LongTensor(s.nInput, 5):random(s.nVocab) - gradOutput = torch.randn(s.nInput, 5, s.nDim) - else - input = torch.LongTensor(s.nInput):random(s.nVocab) - gradOutput = torch.randn(s.nInput, s.nDim) - end - - local sconv = nn.LookupTable(s.nVocab, s.nDim, s.paddingValue) - local gconv = sconv:clone():cuda() - if s.scaleGradByFreq then - sconv = sconv:scaleGradByFreq() - gconv = gconv:scaleGradByFreq() - end + for k, typename in ipairs(typenames) do + local ctype = t2cpu[typename] + local input, gradOutput + if s.batch then + input = torch.LongTensor(s.nInput, 5):random(s.nVocab) + gradOutput = torch.randn(s.nInput, 5, s.nDim):type(typename):type(ctype) + else + input = torch.LongTensor(s.nInput):random(s.nVocab) + gradOutput = torch.randn(s.nInput, s.nDim):type(typename):type(ctype) + end - sconv:forward(input) - sconv:backward(input, gradOutput) + local sconv = nn.LookupTable(s.nVocab, s.nDim, s.paddingValue):type(ctype) + local gconv = sconv:clone():type(typename) + if s.scaleGradByFreq then + sconv = sconv:scaleGradByFreq() + gconv = gconv:scaleGradByFreq() + end - input = input:cuda() - gradOutput = gradOutput:cuda() - gconv:forward(input) - gconv:backward(input, gradOutput) + 
sconv:forward(input) + sconv:backward(input, gradOutput) - local weightGradError = gconv.gradWeight:float() - sconv.gradWeight - mytester:assertlt(weightGradError:abs():max(), precision_backward, - 'error on weight for size ' .. tostring(s.nInput) .. - ' nVocab: ' .. tostring(s.nVocab) .. - ' nDim ' .. tostring(s.nDim) .. - ' scaleGradByFreq: ' .. tostring(s.scaleGradByFreq) .. - ' batch: ' .. tostring(s.batch) .. - ' paddingValue: ' .. tostring(s.paddingValue)) + input = input:cuda() + gradOutput = gradOutput:type(typename) + gconv:forward(input) + gconv:backward(input, gradOutput) + + local weightGradError = gconv.gradWeight:double() - sconv.gradWeight:double() + mytester:assertlt(weightGradError:abs():max(), + precision_backward_conv_weightbias(precision_backward, typename, gconv.gradWeight:abs():max()), + 'error on weight for size ' .. tostring(s.nInput) .. + ' nVocab: ' .. tostring(s.nVocab) .. + ' nDim ' .. tostring(s.nDim) .. + ' scaleGradByFreq: ' .. tostring(s.scaleGradByFreq) .. + ' batch: ' .. tostring(s.batch) .. + ' paddingValue: ' .. tostring(s.paddingValue) .. + ' type:' .. 
typename) + end end local nVocab = 10000 local nDim = 128 local nInput = 1000 - local tm = {} - local title = string.format('LookupTable backward %d x %d', nVocab, nDim, nInput) - times[title] = tm - local input = torch.LongTensor(nInput):random(nVocab) - local gradOutput = torch.randn(nInput, nDim) - local sconv = nn.LookupTable(nVocab, nDim) - local gconv = sconv:clone():cuda() + for k, typename in ipairs(typenames) do + local input = torch.LongTensor(nInput):random(nVocab) - sconv:forward(input) - sconv:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do - sconv:backward(input, gradOutput) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + local gradOutput = torch.randn(nInput, nDim):type(ctype) + local sconv = nn.LookupTable(nVocab, nDim):type(ctype) + local gconv = sconv:clone():type(typename) - input = input:cuda() - gradOutput = gradOutput:cuda() - gconv:forward(input) - gconv:backward(input, gradOutput) - a:reset() - for i = 1,nloop do - gconv:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + sconv:forward(input) + sconv:backward(input, gradOutput) + + input = input:cuda() + gradOutput = gradOutput:type(typename) + gconv:forward(input) + gconv:backward(input, gradOutput) - local weightGradError = gconv.gradWeight:float() - sconv.gradWeight - mytester:assertlt(weightGradError:abs():max(), precision_backward, 'error on weight') + local weightGradError = gconv.gradWeight:double() - sconv.gradWeight:double() + mytester:assertlt(weightGradError:abs():max(), precision_backward_type(precision_backward, typename), + string.format('error on weight with %s', typename)) + end end function cunntest.getParameters() @@ -5980,36 +5211,23 @@ function cunntest.SpatialReflectionPadding_forward() local padT = math.random(-3,3) local padB = math.random(-3,3) - local tm = {} - local title = - string.format( - 'SpatialReflectionPadding.forward %dx%dx%dx%d -> %dx%dx%dx%d', - batch, plane, sizeY, sizeX, - 
batch, plane, sizeY + padT + padB, sizeX + padL + padR) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.rand(batch, plane, sizeY, sizeX):type(typename) - local input = torch.rand(batch, plane, sizeY, sizeX) - local module = nn.SpatialReflectionPadding(padL, padR, padT, padB) - local groundtruth = module:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = module:forward(input) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local module = nn.SpatialReflectionPadding(padL, padR, padT, padB):type(ctype) + local groundtruth = module:forward(input) - input = input:cuda() - local gmodule = nn.SpatialReflectionPadding(padL, padR, padT, padB):cuda() - local rescuda = gmodule:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gmodule:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gmodule = nn.SpatialReflectionPadding(padL, padR, padT, padB):type(typename) + local rescuda = gmodule:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), - precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), + precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.SpatialReflectionPadding_backward() @@ -6022,46 +5240,32 @@ function cunntest.SpatialReflectionPadding_backward() local padT = math.random(-3,3) local padB = math.random(-3,3) - local tm = {} - local title = - string.format( - 'SpatialReflectionPadding.backward %dx%dx%dx%d -> %dx%dx%dx%d', - batch, plane, sizeY, sizeX, - batch, plane, sizeY + padT + padB, sizeX + padL + padR) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.rand(batch, plane, sizeY, sizeX):type(typename) + local gradOutput = torch.rand( + 
batch, plane, sizeY + padT + padB, sizeX + padL + padR + ):type(typename) - local input = torch.rand(batch, plane, sizeY, sizeX) - local gradOutput = torch.rand( - batch, plane, sizeY + padT + padB, sizeX + padL + padR - ) - local module = nn.SpatialReflectionPadding(padL, padR, padT, padB) - module:forward(input) - module:zeroGradParameters() - local groundgrad = module:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do - module:zeroGradParameters() - groundgrad = module:backward(input, gradOutput) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local module = nn.SpatialReflectionPadding(padL, padR, padT, padB):type(ctype) + module:forward(input) + module:zeroGradParameters() + local groundgrad = module:backward(input, gradOutput) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gmodule = nn.SpatialReflectionPadding(padL, padR, padT, padB):cuda() - gmodule:forward(input) - gmodule:zeroGradParameters() - local rescuda = gmodule:backward(input, gradOutput) - a:reset() - for i = 1,nloop do - gmodule:zeroGradParameters() - rescuda = gmodule:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gmodule = nn.SpatialReflectionPadding(padL, padR, padT, padB):type(typename) + gmodule:forward(input) + gmodule:zeroGradParameters() + local rescuda = gmodule:backward(input, gradOutput) - local error = rescuda:float() - groundgrad - mytester:assertlt(error:abs():max(), - precision_backward, 'error on state (backward) ') + local error = rescuda:double() - groundgrad:double() + mytester:assertlt(error:abs():max(), + precision_backward_type(precision_backward, type), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.SpatialReplicationPadding_forward() @@ -6074,36 +5278,23 @@ function 
cunntest.SpatialReplicationPadding_forward() local padT = math.random(-3,3) local padB = math.random(-3,3) - local tm = {} - local title = - string.format( - 'SpatialReplicationPadding.forward %dx%dx%dx%d -> %dx%dx%dx%d', - batch, plane, sizeY, sizeX, - batch, plane, sizeY + padT + padB, sizeX + padL + padR) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.rand(batch, plane, sizeY, sizeX):type(typename) - local input = torch.rand(batch, plane, sizeY, sizeX) - local module = nn.SpatialReplicationPadding(padL, padR, padT, padB) - local groundtruth = module:forward(input) - local a = torch.Timer() - for i = 1,nloop do - groundtruth = module:forward(input) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local module = nn.SpatialReplicationPadding(padL, padR, padT, padB):type(ctype) + local groundtruth = module:forward(input) - input = input:cuda() - local gmodule = nn.SpatialReplicationPadding(padL, padR, padT, padB):cuda() - local rescuda = gmodule:forward(input) - a:reset() - for i = 1,nloop do - rescuda = gmodule:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gmodule = nn.SpatialReplicationPadding(padL, padR, padT, padB):type(typename) + local rescuda = gmodule:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), - precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + mytester:assertlt(error:abs():max(), + precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.SpatialReplicationPadding_backward() @@ -6116,46 +5307,32 @@ function cunntest.SpatialReplicationPadding_backward() local padT = math.random(-3,3) local padB = math.random(-3,3) - local tm = {} - local title = - string.format( - 'SpatialReplicationPadding.backward %dx%dx%dx%d -> 
%dx%dx%dx%d', - batch, plane, sizeY, sizeX, - batch, plane, sizeY + padT + padB, sizeX + padL + padR) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.rand(batch, plane, sizeY, sizeX):type(typename) + local gradOutput = torch.rand( + batch, plane, sizeY + padT + padB, sizeX + padL + padR + ):type(typename) - local input = torch.rand(batch, plane, sizeY, sizeX) - local gradOutput = torch.rand( - batch, plane, sizeY + padT + padB, sizeX + padL + padR - ) - local module = nn.SpatialReplicationPadding(padL, padR, padT, padB) - module:forward(input) - module:zeroGradParameters() - local groundgrad = module:backward(input, gradOutput) - local a = torch.Timer() - for i = 1,nloop do - module:zeroGradParameters() - groundgrad = module:backward(input, gradOutput) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local module = nn.SpatialReplicationPadding(padL, padR, padT, padB):type(ctype) + module:forward(input) + module:zeroGradParameters() + local groundgrad = module:backward(input, gradOutput) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gmodule = nn.SpatialReplicationPadding(padL, padR, padT, padB):cuda() - gmodule:forward(input) - gmodule:zeroGradParameters() - local rescuda = gmodule:backward(input, gradOutput) - a:reset() - for i = 1,nloop do - gmodule:zeroGradParameters() - rescuda = gmodule:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gmodule = nn.SpatialReplicationPadding(padL, padR, padT, padB):type(typename) + gmodule:forward(input) + gmodule:zeroGradParameters() + local rescuda = gmodule:backward(input, gradOutput) - local error = rescuda:float() - groundgrad - mytester:assertlt(error:abs():max(), - precision_backward, 'error on state (backward) ') + local error = rescuda:double() - groundgrad:double() + 
mytester:assertlt(error:abs():max(), + precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.VolumetricReplicationPadding_forward() @@ -6171,40 +5348,25 @@ function cunntest.VolumetricReplicationPadding_forward() local pfront = math.random(-3,3) local pback = math.random(-3,3) - local tm = {} - local title = - string.format( - 'VolumetricReplicationPadding.forward %dx%dx%dx%dx%d -> ' .. - '%dx%dx%dx%dx%d', - batch, plane, sizeZ, sizeY, sizeX, - batch, plane, sizeZ + pfront + pback, sizeY + ptop + pbottom, - sizeX + pleft + pright) - times[title] = tm + for k, typename in ipairs(typenames) do + local input = torch.rand(batch, plane, sizeZ, sizeY, sizeX):type(typename) - local input = torch.rand(batch, plane, sizeZ, sizeY, sizeX) - local module = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom, - pfront, pback) - local groundtruth = module:forward(input) - local a = torch.Timer() - for i = 1, nloop do - groundtruth = module:forward(input) - end - tm.cpu = a:time().real + local ctype = t2cpu[typename] + input = input:type(ctype) + local module = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom, + pfront, pback):type(ctype) + local groundtruth = module:forward(input) - input = input:cuda() - local gmodule = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom, - pfront, pback):cuda() - local rescuda = gmodule:forward(input) - a:reset() - for i = 1, nloop do - rescuda = gmodule:forward(input) - end - cutorch.synchronize() - tm.gpu = a:time().real + input = input:type(typename) + local gmodule = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom, + pfront, pback):type(typename) + local rescuda = gmodule:forward(input) - local error = rescuda:float() - groundtruth - mytester:assertlt(error:abs():max(), - precision_forward, 'error on state (forward) ') + local error = rescuda:double() - groundtruth:double() + 
mytester:assertlt(error:abs():max(), + precision_forward_type(precision_forward, typename), + string.format('error on state (forward) with %s', typename)) + end end function cunntest.VolumetricReplicationPadding_backward() @@ -6220,51 +5382,35 @@ function cunntest.VolumetricReplicationPadding_backward() local pfront = math.random(-3,3) local pback = math.random(-3,3) - local tm = {} - local title = - string.format( - 'VolumetricReplicationPadding.backward %dx%dx%dx%dx%d -> ' .. - '%dx%dx%dx%dx%d', - batch, plane, sizeZ, sizeY, sizeX, - batch, plane, sizeZ + pfront + pback, sizeY + ptop + pbottom, - sizeX + pleft + pright) - times[title] = tm - - local input = torch.rand(batch, plane, sizeZ, sizeY, sizeX) - local gradOutput = torch.rand( - batch, plane, sizeZ + pfront + pback, sizeY + ptop + pbottom, - sizeX + pleft + pright - ) - local module = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom, - pfront, pback) - module:forward(input) - module:zeroGradParameters() - local groundgrad = module:backward(input, gradOutput) - local a = torch.Timer() - for i = 1, nloop do + for k, typename in ipairs(typenames) do + local input = torch.rand(batch, plane, sizeZ, sizeY, sizeX):type(typename) + local gradOutput = torch.rand( + batch, plane, sizeZ + pfront + pback, sizeY + ptop + pbottom, + sizeX + pleft + pright + ):type(typename) + + local ctype = t2cpu[typename] + input = input:type(ctype) + gradOutput = gradOutput:type(ctype) + local module = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom, + pfront, pback):type(ctype) + module:forward(input) module:zeroGradParameters() - groundgrad = module:backward(input, gradOutput) - end - tm.cpu = a:time().real + local groundgrad = module:backward(input, gradOutput) - input = input:cuda() - gradOutput = gradOutput:cuda() - local gmodule = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom, - pfront, pback):cuda() - gmodule:forward(input) - gmodule:zeroGradParameters() - local rescuda = 
gmodule:backward(input, gradOutput) - a:reset() - for i = 1, nloop do + input = input:type(typename) + gradOutput = gradOutput:type(typename) + local gmodule = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom, + pfront, pback):type(typename) + gmodule:forward(input) gmodule:zeroGradParameters() - rescuda = gmodule:backward(input, gradOutput) - end - cutorch.synchronize() - tm.gpu = a:time().real + local rescuda = gmodule:backward(input, gradOutput) - local error = rescuda:float() - groundgrad - mytester:assertlt(error:abs():max(), - precision_backward, 'error on state (backward) ') + local error = rescuda:double() - groundgrad:double() + mytester:assertlt(error:abs():max(), + precision_backward_type(precision_backward, typename), + string.format('error on state (backward) with %s', typename)) + end end function cunntest.GPU() @@ -6508,6 +5654,7 @@ function nn.testcuda(tests, print_timing, n_loop, seed) nloop = n_loop or nloop local oldtype = torch.getdefaulttensortype() torch.setdefaulttensortype('torch.FloatTensor') + checkHalf() initSeed(seed) mytester = torch.Tester() mytester:add(cunntest) |