
github.com/torch/cunn.git
author     Soumith Chintala <soumith@gmail.com>   2016-11-09 02:18:01 +0300
committer  GitHub <noreply@github.com>            2016-11-09 02:18:01 +0300
commit     8d5027feb6aac1ce2ee4e6a346ddcb1b7e95d6fa (patch)
tree       f0a89adfb00e7a49031ca32a9badcf016bc599cb
parent     aa256bc7def4336dd9e6fa43b66bf00758b4922e (diff)
parent     27479c372040b8cab4e53e9338e8ce840bdb67dd (diff)
Merge pull request #346 from gchanan/pointwiseGeneric
Add generic support for pointwise modules
-rw-r--r--  CMakeLists.txt  5
-rw-r--r--  THCUNN.lua  131
-rw-r--r--  lib/THCUNN/Abs.cu  24
-rw-r--r--  lib/THCUNN/AbsCriterion.cu  61
-rw-r--r--  lib/THCUNN/BCECriterion.cu  142
-rw-r--r--  lib/THCUNN/BatchNormalization.cu  230
-rw-r--r--  lib/THCUNN/CMakeLists.txt  1
-rw-r--r--  lib/THCUNN/ClassNLLCriterion.cu  214
-rw-r--r--  lib/THCUNN/DistKLDivCriterion.cu  70
-rw-r--r--  lib/THCUNN/ELU.cu  66
-rw-r--r--  lib/THCUNN/HardTanh.cu  77
-rw-r--r--  lib/THCUNN/L1Cost.cu  49
-rw-r--r--  lib/THCUNN/LeakyReLU.cu  78
-rw-r--r--  lib/THCUNN/LogSigmoid.cu  33
-rw-r--r--  lib/THCUNN/LogSoftMax.cu  376
-rw-r--r--  lib/THCUNN/LookupTable.cu  193
-rw-r--r--  lib/THCUNN/MSECriterion.cu  77
-rw-r--r--  lib/THCUNN/MarginCriterion.cu  65
-rw-r--r--  lib/THCUNN/MultiLabelMarginCriterion.cu  185
-rw-r--r--  lib/THCUNN/MultiMarginCriterion.cu  213
-rw-r--r--  lib/THCUNN/PReLU.cu  201
-rw-r--r--  lib/THCUNN/RReLU.cu  147
-rw-r--r--  lib/THCUNN/SharedMem.cuh  43
-rw-r--r--  lib/THCUNN/Sigmoid.cu  28
-rw-r--r--  lib/THCUNN/SmoothL1Criterion.cu  87
-rw-r--r--  lib/THCUNN/SoftMarginCriterion.cu  76
-rw-r--r--  lib/THCUNN/SoftMax.cu  193
-rw-r--r--  lib/THCUNN/SoftPlus.cu  44
-rw-r--r--  lib/THCUNN/SoftShrink.cu  43
-rw-r--r--  lib/THCUNN/SparseLinear.cu  277
-rw-r--r--  lib/THCUNN/SpatialAdaptiveMaxPooling.cu  217
-rw-r--r--  lib/THCUNN/SpatialAveragePooling.cu  161
-rw-r--r--  lib/THCUNN/SpatialClassNLLCriterion.cu  172
-rw-r--r--  lib/THCUNN/SpatialConvolutionLocal.cu  321
-rw-r--r--  lib/THCUNN/SpatialConvolutionMM.cu  352
-rw-r--r--  lib/THCUNN/SpatialCrossMapLRN.cu  138
-rw-r--r--  lib/THCUNN/SpatialDilatedConvolution.cu  338
-rw-r--r--  lib/THCUNN/SpatialDilatedMaxPooling.cu  160
-rw-r--r--  lib/THCUNN/SpatialFractionalMaxPooling.cu  210
-rw-r--r--  lib/THCUNN/SpatialFullConvolution.cu  337
-rw-r--r--  lib/THCUNN/SpatialMaxPooling.cu  18
-rw-r--r--  lib/THCUNN/SpatialMaxUnpooling.cu  82
-rw-r--r--  lib/THCUNN/SpatialReflectionPadding.cu  124
-rw-r--r--  lib/THCUNN/SpatialReplicationPadding.cu  123
-rw-r--r--  lib/THCUNN/SpatialSubSampling.cu  271
-rw-r--r--  lib/THCUNN/SpatialUpSamplingBilinear.cu  124
-rw-r--r--  lib/THCUNN/SpatialUpSamplingNearest.cu  114
-rw-r--r--  lib/THCUNN/Sqrt.cu  30
-rw-r--r--  lib/THCUNN/Square.cu  26
-rw-r--r--  lib/THCUNN/THCHalfAutoNumerics.cuh  248
-rw-r--r--  lib/THCUNN/THCUNN.h  1091
-rw-r--r--  lib/THCUNN/Tanh.cu  24
-rw-r--r--  lib/THCUNN/TemporalConvolution.cu  346
-rw-r--r--  lib/THCUNN/TemporalMaxPooling.cu  189
-rw-r--r--  lib/THCUNN/Threshold.cu  88
-rw-r--r--  lib/THCUNN/VolumetricAveragePooling.cu  287
-rw-r--r--  lib/THCUNN/VolumetricConvolution.cu  405
-rw-r--r--  lib/THCUNN/VolumetricDilatedConvolution.cu  367
-rw-r--r--  lib/THCUNN/VolumetricDilatedMaxPooling.cu  321
-rw-r--r--  lib/THCUNN/VolumetricFullConvolution.cu  356
-rw-r--r--  lib/THCUNN/VolumetricMaxPooling.cu  26
-rw-r--r--  lib/THCUNN/VolumetricMaxUnpooling.cu  210
-rw-r--r--  lib/THCUNN/VolumetricReplicationPadding.cu  127
-rw-r--r--  lib/THCUNN/common.h  12
-rw-r--r--  lib/THCUNN/generic/Abs.cu  28
-rw-r--r--  lib/THCUNN/generic/AbsCriterion.cu  59
-rw-r--r--  lib/THCUNN/generic/BCECriterion.cu  99
-rw-r--r--  lib/THCUNN/generic/BatchNormalization.cu  97
-rw-r--r--  lib/THCUNN/generic/ClassNLLCriterion.cu  156
-rw-r--r--  lib/THCUNN/generic/DistKLDivCriterion.cu  67
-rw-r--r--  lib/THCUNN/generic/ELU.cu  53
-rw-r--r--  lib/THCUNN/generic/HardTanh.cu  54
-rw-r--r--  lib/THCUNN/generic/L1Cost.cu  42
-rw-r--r--  lib/THCUNN/generic/LeakyReLU.cu  54
-rw-r--r--  lib/THCUNN/generic/LogSigmoid.cu  30
-rw-r--r--  lib/THCUNN/generic/LogSoftMax.cu  238
-rw-r--r--  lib/THCUNN/generic/LookupTable.cu  157
-rw-r--r--  lib/THCUNN/generic/MSECriterion.cu  75
-rw-r--r--  lib/THCUNN/generic/MarginCriterion.cu  64
-rw-r--r--  lib/THCUNN/generic/MultiLabelMarginCriterion.cu  111
-rw-r--r--  lib/THCUNN/generic/MultiMarginCriterion.cu  181
-rw-r--r--  lib/THCUNN/generic/PReLU.cu  154
-rw-r--r--  lib/THCUNN/generic/RReLU.cu  108
-rw-r--r--  lib/THCUNN/generic/Sigmoid.cu  29
-rw-r--r--  lib/THCUNN/generic/SmoothL1Criterion.cu  79
-rw-r--r--  lib/THCUNN/generic/SoftMarginCriterion.cu  60
-rw-r--r--  lib/THCUNN/generic/SoftMax.cu  153
-rw-r--r--  lib/THCUNN/generic/SoftPlus.cu  33
-rw-r--r--  lib/THCUNN/generic/SoftShrink.cu  32
-rw-r--r--  lib/THCUNN/generic/SparseLinear.cu  269
-rw-r--r--  lib/THCUNN/generic/SpatialAdaptiveMaxPooling.cu  188
-rw-r--r--  lib/THCUNN/generic/SpatialAveragePooling.cu  164
-rw-r--r--  lib/THCUNN/generic/SpatialClassNLLCriterion.cu  126
-rw-r--r--  lib/THCUNN/generic/SpatialConvolutionLocal.cu  324
-rw-r--r--  lib/THCUNN/generic/SpatialConvolutionMM.cu  427
-rw-r--r--  lib/THCUNN/generic/SpatialCrossMapLRN.cu  114
-rw-r--r--  lib/THCUNN/generic/SpatialDilatedConvolution.cu  402
-rw-r--r--  lib/THCUNN/generic/SpatialDilatedMaxPooling.cu  155
-rw-r--r--  lib/THCUNN/generic/SpatialFractionalMaxPooling.cu  155
-rw-r--r--  lib/THCUNN/generic/SpatialFullConvolution.cu  381
-rw-r--r--  lib/THCUNN/generic/SpatialMaxPooling.cu  40
-rw-r--r--  lib/THCUNN/generic/SpatialMaxUnpooling.cu  93
-rw-r--r--  lib/THCUNN/generic/SpatialReflectionPadding.cu  110
-rw-r--r--  lib/THCUNN/generic/SpatialReplicationPadding.cu  111
-rw-r--r--  lib/THCUNN/generic/SpatialSubSampling.cu  230
-rw-r--r--  lib/THCUNN/generic/SpatialUpSamplingBilinear.cu  72
-rw-r--r--  lib/THCUNN/generic/SpatialUpSamplingNearest.cu  115
-rw-r--r--  lib/THCUNN/generic/Sqrt.cu  30
-rw-r--r--  lib/THCUNN/generic/Square.cu  28
-rw-r--r--  lib/THCUNN/generic/THCUNN.h  1171
-rw-r--r--  lib/THCUNN/generic/Tanh.cu  29
-rw-r--r--  lib/THCUNN/generic/TemporalConvolution.cu  348
-rw-r--r--  lib/THCUNN/generic/TemporalMaxPooling.cu  151
-rw-r--r--  lib/THCUNN/generic/Threshold.cu  64
-rw-r--r--  lib/THCUNN/generic/VolumetricAveragePooling.cu  237
-rw-r--r--  lib/THCUNN/generic/VolumetricConvolution.cu  416
-rw-r--r--  lib/THCUNN/generic/VolumetricDilatedConvolution.cu  411
-rw-r--r--  lib/THCUNN/generic/VolumetricDilatedMaxPooling.cu  290
-rw-r--r--  lib/THCUNN/generic/VolumetricFullConvolution.cu  399
-rw-r--r--  lib/THCUNN/generic/VolumetricMaxPooling.cu  36
-rw-r--r--  lib/THCUNN/generic/VolumetricMaxUnpooling.cu  192
-rw-r--r--  lib/THCUNN/generic/VolumetricReplicationPadding.cu  120
-rw-r--r--  lib/THCUNN/im2col.h  13
-rw-r--r--  lib/THCUNN/vol2col.h  13
-rw-r--r--  test.lua  6497
125 files changed, 14077 insertions, 12901 deletions
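
The pattern repeated across the .cu files below is the core of this change: each float-only functor and THNN_Cuda* entry point becomes a functor templated on the element type (and, where sums are involved, an accumulation type), and each file now ends with #include "generic/<Name>.cu" followed by #include "THCGenerateFloatTypes.h", which re-includes the generic body once per supported type (float, double, and half when cutorch.hasHalf). A minimal, self-contained sketch of that instantiate-by-reinclusion idea follows; the macro and function names are simplified stand-ins, not the actual THC machinery.

    // pointwise_generic_sketch.cu -- illustrative stand-in for the pattern used
    // throughout this commit: one templated functor, instantiated once per
    // floating-point type. The real code does this by re-including generic/*.cu
    // via THCGenerateFloatTypes.h; here a macro plays that role.
    #include <cstdio>

    template <typename T>
    struct absupdateOutput_functor
    {
      __host__ __device__ void operator()(T *output, const T *input) const
      {
        *output = *input < T(0) ? -*input : *input;
      }
    };

    // Stand-in for the "generic" source that gets pulled in once per type.
    #define DEFINE_ABS_UPDATE_OUTPUT(real, Real)                             \
      void THNN_##Real##Abs_updateOutput(real *out, const real *in, int n)   \
      {                                                                      \
        absupdateOutput_functor<real> f;                                     \
        for (int i = 0; i < n; ++i) f(out + i, in + i);                      \
      }

    DEFINE_ABS_UPDATE_OUTPUT(float, Cuda)        // -> THNN_CudaAbs_updateOutput
    DEFINE_ABS_UPDATE_OUTPUT(double, CudaDouble) // -> THNN_CudaDoubleAbs_updateOutput
    #undef DEFINE_ABS_UPDATE_OUTPUT

    int main()
    {
      float in[3] = {-1.5f, 0.f, 2.f}, out[3];
      THNN_CudaAbs_updateOutput(out, in, 3);
      printf("%g %g %g\n", out[0], out[1], out[2]);   // prints: 1.5 0 2
      return 0;
    }
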
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3ce811f..501a2de 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,6 +14,11 @@ FILE(WRITE THCUNN_h.lua "return [[")
FILE(APPEND THCUNN_h.lua ${THCUNN_headers})
FILE(APPEND THCUNN_h.lua "]]")
+FILE(STRINGS lib/THCUNN/generic/THCUNN.h THCUNN_generic_headers NEWLINE_CONSUME)
+FILE(WRITE THCUNN_generic_h.lua "return [[")
+FILE(APPEND THCUNN_generic_h.lua ${THCUNN_generic_headers})
+FILE(APPEND THCUNN_generic_h.lua "]]")
+
FILE(GLOB luasrc *.lua)
ADD_SUBDIRECTORY(lib)
diff --git a/THCUNN.lua b/THCUNN.lua
index 771cd0c..490cd5c 100644
--- a/THCUNN.lua
+++ b/THCUNN.lua
@@ -12,46 +12,145 @@ function THCUNN.getState()
return THCState_ptr(cutorch.getState());
end
-local THCUNN_h = require 'cunn.THCUNN_h'
+local THCUNN_generic_h = require 'cunn.THCUNN_generic_h'
-- strip all lines starting with #
-- to remove preprocessor directives originally present
-- in THNN.h
-THCUNN_h = THCUNN_h:gsub("\n#[^\n]*", "")
-THCUNN_h = THCUNN_h:gsub("^#[^\n]*\n", "")
+THCUNN_generic_h = THCUNN_generic_h:gsub("\n#[^\n]*", "")
+THCUNN_generic_h = THCUNN_generic_h:gsub("^#[^\n]*\n", "")
-local preprocessed = string.gsub(THCUNN_h, 'TH_API ', '')
+local preprocessed_generic = string.gsub(THCUNN_generic_h, 'TH_API void THNN_%(([%a%d_]+)%)', 'void THNN_TYPE%1')
local replacements =
{
{
['THTensor'] = 'THCudaTensor',
- ['THIndexTensor'] = 'THCudaLongTensor',
+ ['THCIndexTensor'] = 'THCudaLongTensor',
['THIndex_t'] = 'long',
['THInteger_t'] = 'float'
}
}
-for i=1,#replacements do
- local r = replacements[i]
- local s = preprocessed
- for k,v in pairs(r) do
- s = string.gsub(s, k, v)
+local cct2lt = {
+ ['THCudaFloatTensor'] = 'torch.CudaTensor',
+ ['THCudaDoubleTensor'] = 'torch.CudaDoubleTensor',
+}
+
+local replacements_generic =
+{
+ {
+ ['THCTensor'] = 'THCudaTensor',
+ ['THCIndexTensor'] = 'THCudaLongTensor',
+ ['TYPE'] = 'Cuda',
+ ['real'] = 'float'
+ },
+ {
+ ['THCTensor'] = 'THCudaDoubleTensor',
+ ['THCIndexTensor'] = 'THCudaLongTensor',
+ ['TYPE'] = 'CudaDouble',
+ ['real'] = 'double',
+ }
+}
+
+if cutorch.hasHalf then
+ ffi.cdef("half THC_float2half(float a);")
+ ffi.cdef("float THC_half2float(half a);")
+ cct2lt['THCudaHalfTensor'] = 'torch.CudaHalfTensor'
+ local half_replacement = {
+ ['THCTensor'] = 'THCudaHalfTensor',
+ ['THCIndexTensor'] = 'THCudaLongTensor',
+ ['TYPE'] = 'CudaHalf',
+ ['real'] = 'half'
+ }
+ table.insert(replacements_generic, half_replacement)
+end
+
+for i=1,#replacements_generic do
+ local r = replacements_generic[i]
+ local s = preprocessed_generic
+ for k,v in pairs(r) do
+ s = string.gsub(s, k, v)
+ end
+ ffi.cdef(s)
+end
+
+local function extract_function_names_generic(s)
+ local t = {}
+ for n in string.gmatch(s, 'TH_API void THNN_%(([%a%d_]+)%)') do
+ t[#t+1] = n
end
- ffi.cdef(s)
+ return t
end
-local function extract_function_names(s)
+local function find_positions(s, p)
+ local begin = 0
+ local positions = {}
+ while true do
+ local start, stop = string.find(s, p, begin)
+ if (start == nil) then break end
+ positions[#positions+1] = start
+ begin = stop + 1
+ end
+ return positions
+end
+
+local function extract_function_names_and_real_args(s)
local t = {}
- for n in string.gmatch(s, 'TH_API void THNN_Cuda([%a%d_]+)') do
- t[#t+1] = n
+ for n in string.gmatch(s, 'TH_API ([^;]+)') do
+ local func_name = string.match(n, 'void THNN_%(([%a%d_]+)%)')
+ local param_positions = find_positions(n, ',')
+ local positions = {}
+ for x,y in ipairs(find_positions(n, 'real')) do
+ local found = false
+ for cn,cp in ipairs(param_positions) do
+ if cp > y then
+ positions[#positions+1] = cn
+ found = true
+ break
+ end
+ end
+ -- it is the last param
+ if not found then positions[#positions+1] = #param_positions + 1 end
+ end
+
+ t[func_name] = positions
end
return t
end
+local real_args = extract_function_names_and_real_args(THCUNN_generic_h)
+
-- build function table
-local function_names = extract_function_names(THCUNN_h)
+local function_names_generic = extract_function_names_generic(THCUNN_generic_h)
-THNN.kernels['torch.CudaTensor'] = THNN.bind(THCUNN.C, function_names, 'Cuda', THCUNN.getState)
+THNN.kernels['torch.CudaTensor'] = THNN.bind(THCUNN.C, function_names_generic, 'Cuda', THCUNN.getState)
torch.getmetatable('torch.CudaTensor').THNN = THNN.kernels['torch.CudaTensor']
+THNN.kernels['torch.CudaDoubleTensor'] = THNN.bind(THCUNN.C, function_names_generic, 'CudaDouble', THCUNN.getState)
+torch.getmetatable('torch.CudaDoubleTensor').THNN = THNN.kernels['torch.CudaDoubleTensor']
+
+-- in order to call 'half' functions from lua, convert real arguments from
+-- to half since there is no other defined conversion
+local transform_reals_to_half = function(func_name, real_args, ...)
+ t = {}
+ -- this select logic is necessary to deal with nil arguments
+ for i = 1, select('#', ...) do
+ t[i] = select(i, ...)
+ end
+ for k,v in ipairs(real_args[func_name]) do
+ -- first argument (THCState) is added implicitly by bind
+ t[v-1] = ffi.C.THC_float2half(t[v-1])
+ end
+ return t
+end
+
+local raw_half_functions = THNN.bind(THCUNN.C, function_names_generic, 'CudaHalf', THCUNN.getState)
+for k,v in pairs(raw_half_functions) do
+ -- select required in case there are trailing nils
+ raw_half_functions[k] = function(...) v(unpack(transform_reals_to_half(k, real_args, ...), 1, select("#",...)))
+end
+end
+THNN.kernels['torch.CudaHalfTensor'] = raw_half_functions
+torch.getmetatable('torch.CudaHalfTensor').THNN = THNN.kernels['torch.CudaHalfTensor']
+
return THCUNN
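
On the Lua side, the per-type ffi declarations are no longer hand-written: the generic header is scanned for TH_API void THNN_(name) prototypes, and one ffi.cdef is emitted per entry of replacements_generic (plus a CudaHalf entry when cutorch.hasHalf), with half-typed 'real' arguments routed through THC_float2half before the call. As a hedged illustration of what the TYPE/real/THCTensor substitutions produce, here is the Abs_updateOutput prototype expanded for the float and double backends; the argument list is taken from the pre-change declaration visible in the Abs.cu hunk below, the generic spelling is schematic, and the opaque typedefs are stand-ins so the snippet is self-contained.

    /* Opaque stand-ins for the real struct definitions. */
    typedef struct THCState THCState;
    typedef struct THCudaTensor THCudaTensor;
    typedef struct THCudaDoubleTensor THCudaDoubleTensor;

    /* Schematic generic prototype (lib/THCUNN/generic/THCUNN.h):
     *   TH_API void THNN_(Abs_updateOutput)(THCState *state,
     *                                       THCTensor *input,
     *                                       THCTensor *output);
     * After the gsubs above, the declarations handed to ffi.cdef become: */
    void THNN_CudaAbs_updateOutput(THCState *state,
                                   THCudaTensor *input,
                                   THCudaTensor *output);
    void THNN_CudaDoubleAbs_updateOutput(THCState *state,
                                         THCudaDoubleTensor *input,
                                         THCudaDoubleTensor *output);
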
diff --git a/lib/THCUNN/Abs.cu b/lib/THCUNN/Abs.cu
index 81b3297..0abefd9 100644
--- a/lib/THCUNN/Abs.cu
+++ b/lib/THCUNN/Abs.cu
@@ -1,32 +1,24 @@
#include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+template <typename T>
struct absupdateOutput_functor
{
- __device__ void operator()(float* output, const float* input) const
+ __device__ void operator()(T* output, const T* input) const
{
*output = abs(*input);
}
};
-void THNN_CudaAbs_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output)
-{
- THCUNN_assertSameGPU(state, 2, input, output);
- THCudaTensor_resizeAs(state, output, input);
- THC_pointwiseApply2(state, output, input, absupdateOutput_functor());
-}
-
+template <typename T>
struct absupdateGradInput_functor
{
- __device__ void operator()(float* gradInput, const float* input, const float* gradOutput) const
+ __device__ void operator()(T* gradInput, const T* input, const T* gradOutput) const
{
*gradInput = *input < 0 ? - *gradOutput : *gradOutput;
}
};
-void THNN_CudaAbs_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput)
-{
- THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
- THCudaTensor_resizeAs(state, gradInput, input);
- THC_pointwiseApply3(state, gradInput, input, gradOutput, absupdateGradInput_functor());
-}
+#include "generic/Abs.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/AbsCriterion.cu b/lib/THCUNN/AbsCriterion.cu
index fd19cae..057c971 100644
--- a/lib/THCUNN/AbsCriterion.cu
+++ b/lib/THCUNN/AbsCriterion.cu
@@ -1,5 +1,7 @@
#include "THCUNN.h"
#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
#include <thrust/fill.h>
#include <thrust/functional.h>
@@ -7,69 +9,30 @@
#include <thrust/reduce.h>
#include <thrust/inner_product.h>
+template <typename Dtype, typename Acctype>
struct abs_functor
{
- __host__ __device__ float operator()(const float& x, const float& y) const
+ __host__ __device__ Acctype operator()(const Dtype& x, const Dtype& y) const
{
- float z = x-y;
- return z >= 0 ? z : -z;
+ Dtype z = x-y;
+ return ScalarConvert<Dtype, Acctype>::to(z >= 0 ? z : -z);
}
};
-void THNN_CudaAbsCriterion_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *output, bool sizeAverage)
-{
- THCUNN_assertSameGPU(state, 2, input, target);
-
- long size = THCudaTensor_nElement(state, input);
-
- input = THCudaTensor_newContiguous(state, input);
- target = THCudaTensor_newContiguous(state, target);
-
- thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
- thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
- float sum = thrust::inner_product(input_data, input_data+size, target_data, (float) 0, thrust::plus<float>(), abs_functor());
-
- if (sizeAverage)
- sum /= size;
-
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, target);
-
- THCudaTensor_set1d(state, output, 0, sum);
-}
-
+template <typename Dtype>
struct abs_updateGradInput_functor
{
- const float norm;
+ const Dtype norm;
- abs_updateGradInput_functor(float norm_)
+ abs_updateGradInput_functor(Dtype norm_)
: norm(norm_)
{}
- __host__ __device__ float operator()(const float& x, const float& y) const
+ __host__ __device__ Dtype operator()(const Dtype& x, const Dtype& y) const
{
return (x - y) >= 0 ? norm : -norm;
}
};
-void THNN_CudaAbsCriterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage)
-{
- THCUNN_assertSameGPU(state, 3, input, target, gradInput);
-
- long size = THCudaTensor_nElement(state, input);
- float norm = (sizeAverage ? 1./size : 1.);
-
- input = THCudaTensor_newContiguous(state, input);
- target = THCudaTensor_newContiguous(state, target);
-
- THCudaTensor_resizeAs(state, gradInput, input);
-
- thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
- thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
- thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput));
-
- thrust::transform(input_data, input_data+size, target_data, gradInput_data, abs_updateGradInput_functor(norm));
-
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, target);
-}
+#include "generic/AbsCriterion.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/BCECriterion.cu b/lib/THCUNN/BCECriterion.cu
index c8edcfe..3653fc8 100644
--- a/lib/THCUNN/BCECriterion.cu
+++ b/lib/THCUNN/BCECriterion.cu
@@ -1,5 +1,7 @@
#include "THCUNN.h"
#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
#include <thrust/functional.h>
#include <thrust/device_ptr.h>
@@ -7,148 +9,80 @@
#include <thrust/transform.h>
#include <thrust/transform_reduce.h>
-const float eps = 1e-12f;
+template <typename T>
+inline __device__ T eps();
+template <>
+inline __device__ float eps() { return 1e-12f; }
+
+template <>
+inline __device__ double eps() { return 1e-12; }
+
+template <typename Dtype, typename Acctype>
struct bce_functor
{
template <class Tuple>
__host__ __device__
- float operator()(Tuple x)
+ Acctype operator()(Tuple x)
{
- float o = thrust::get<0>(x);
- float t = thrust::get<1>(x);
- return - (t * logf(o + eps) + (1.f - t) * logf(1.f - o + eps));
+ Dtype o = thrust::get<0>(x);
+ Dtype t = thrust::get<1>(x);
+ return - (t * THCNumerics<Acctype>::log(o + eps<Acctype>()) + (Acctype(1)- t) * THCNumerics<Acctype>::log(Acctype(1) - o + eps<Acctype>()));
}
};
+template <typename Dtype, typename Acctype>
struct bce_functor_weights
{
template <class Tuple>
__host__ __device__
- float operator()(Tuple x)
+ Acctype operator()(Tuple x)
{
- float o = thrust::get<0>(x);
- float t = thrust::get<1>(x);
- float w = thrust::get<2>(x);
- return - w * (t * logf(o + eps) + (1.f - t) * logf(1.f - o + eps));
+ Dtype o = thrust::get<0>(x);
+ Dtype t = thrust::get<1>(x);
+ Dtype w = thrust::get<2>(x);
+ return - w * (t * THCNumerics<Acctype>::log(o + eps<Acctype>()) + (Acctype(1) - t) * THCNumerics<Acctype>::log(Acctype(1) - o + eps<Acctype>()));
}
};
-void THNN_CudaBCECriterion_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *output, bool sizeAverage, THCudaTensor *weights)
-{
- THCUNN_assertSameGPU(state, 3, input, target, weights);
-
- long size = THCudaTensor_nElement(state, input);
-
- input = THCudaTensor_newContiguous(state, input);
- target = THCudaTensor_newContiguous(state, target);
-
- thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
- thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
-
- float sum;
- if (weights) {
- weights = THCudaTensor_newContiguous(state, weights);
- thrust::device_ptr<float> weights_data(THCudaTensor_data(state, weights));
- sum = thrust::transform_reduce(
- thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data, weights_data)),
- thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size, weights_data+size)),
- bce_functor_weights(),
- (float) 0.f,
- thrust::plus<float>()
- );
- THCudaTensor_free(state, weights);
- } else {
- sum = thrust::transform_reduce(
- thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data)),
- thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size)),
- bce_functor(),
- (float) 0.f,
- thrust::plus<float>()
- );
- }
-
- if (sizeAverage)
- sum /= size;
-
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, target);
-
- THCudaTensor_set1d(state, output, 0, sum);
-}
-
+template <typename Dtype, typename Acctype>
struct bce_updateGradInput_functor
{
- const float norm;
+ const Dtype norm;
- bce_updateGradInput_functor(float norm_)
+ bce_updateGradInput_functor(Dtype norm_)
: norm(norm_)
{}
template <class Tuple>
__host__ __device__
- float operator()(Tuple x)
+ Dtype operator()(Tuple x)
{
- float o = thrust::get<0>(x);
- float t = thrust::get<1>(x);
- return - (t - o) / ((1 - o + eps) * (o + eps)) * norm;
+ Dtype o = thrust::get<0>(x);
+ Dtype t = thrust::get<1>(x);
+ return ScalarConvert<Acctype,Dtype>::to(- (t - o) / ((Acctype(1) - o + eps<Acctype>()) * (o + eps<Acctype>())) * norm);
}
};
+template <typename Dtype, typename Acctype>
struct bce_updateGradInput_functor_weights
{
- const float norm;
+ const Dtype norm;
- bce_updateGradInput_functor_weights(float norm_)
+ bce_updateGradInput_functor_weights(Dtype norm_)
: norm(norm_)
{}
template <class Tuple>
__host__ __device__
- float operator()(Tuple x)
+ Dtype operator()(Tuple x)
{
- float o = thrust::get<0>(x);
- float t = thrust::get<1>(x);
- float w = thrust::get<2>(x);
- return - (t - o) / ((1 - o + eps) * (o + eps)) * norm * w;
+ Dtype o = thrust::get<0>(x);
+ Dtype t = thrust::get<1>(x);
+ Dtype w = thrust::get<2>(x);
+ return ScalarConvert<Acctype, Dtype>::to(- (t - o) / ((Acctype(1) - o + eps<Acctype>()) * (o + eps<Acctype>())) * norm * w);
}
};
-void THNN_CudaBCECriterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage, THCudaTensor *weights)
-{
- THCUNN_assertSameGPU(state, 4, input, target, gradInput, weights);
-
- long size = THCudaTensor_nElement(state, input);
- float norm = (sizeAverage ? 1./size : 1.);
-
- input = THCudaTensor_newContiguous(state, input);
- target = THCudaTensor_newContiguous(state, target);
-
- THCudaTensor_resizeAs(state, gradInput, input);
-
- thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
- thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
- thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput));
-
- if (weights) {
- weights = THCudaTensor_newContiguous(state, weights);
- thrust::device_ptr<float> weights_data(THCudaTensor_data(state, weights));
- thrust::transform(
- thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data, weights_data)),
- thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size, weights_data+size)),
- gradInput_data,
- bce_updateGradInput_functor_weights(norm)
- );
- THCudaTensor_free(state, weights);
- } else {
- thrust::transform(
- thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data)),
- thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size)),
- gradInput_data,
- bce_updateGradInput_functor(norm)
- );
- }
-
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, target);
-}
+#include "generic/BCECriterion.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/BatchNormalization.cu b/lib/THCUNN/BatchNormalization.cu
index ca2cd8c..125e3ff 100644
--- a/lib/THCUNN/BatchNormalization.cu
+++ b/lib/THCUNN/BatchNormalization.cu
@@ -1,12 +1,12 @@
#include "THCUNN.h"
#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
#include "THCDeviceTensor.cuh"
#include "THCDeviceTensorUtils.cuh"
const int WARP_SIZE = 32;
-typedef THCDeviceTensor<float, 3> DeviceTensor3;
-typedef THCDeviceTensor<float, 1> DeviceTensor1;
// The maximum number of threads in a block
const int MAX_BLOCK_SIZE = 512;
@@ -27,11 +27,13 @@ __device__ __forceinline__ int getMSB(int val) {
return 31 - __clz(val);
}
+template <typename Dtype, typename Acctype>
struct Float2 {
- float v1, v2;
+ Acctype v1, v2;
__device__ Float2() {}
- __device__ Float2(float v1, float v2) : v1(v1), v2(v2) {}
- __device__ Float2(float v) : v1(v), v2(v) {}
+ __device__ Float2(Dtype v1, Dtype v2) : v1(ScalarConvert<Dtype, Acctype>::to(v1)), v2(ScalarConvert<Dtype, Acctype>::to(v2)) {}
+ __device__ Float2(Dtype v) : v1(ScalarConvert<Dtype, Acctype>::to(v)), v2(ScalarConvert<Dtype, Acctype>::to(v)) {}
+ __device__ Float2(int v) : v1(ScalarConvert<int, Acctype>::to(v)), v2(ScalarConvert<int, Acctype>::to(v)) {}
__device__ Float2& operator+=(const Float2& a) {
v1 += a.v1;
v2 += a.v2;
@@ -39,45 +41,49 @@ struct Float2 {
}
};
+template <typename Dtype, typename Acctype, typename DeviceTensor3>
struct SumOp {
__device__ SumOp(const DeviceTensor3 t) : tensor(t) {}
- __device__ __forceinline__ float operator()(int batch, int plane, int n) {
- return tensor[batch][plane][n];
+ __device__ __forceinline__ Acctype operator()(int batch, int plane, int n) {
+ return ScalarConvert<Dtype, Acctype>::to(tensor[batch][plane][n]);
}
const DeviceTensor3 tensor;
};
+template <typename Dtype, typename Acctype, typename DeviceTensor3>
struct VarOp {
- __device__ VarOp(float m, const DeviceTensor3 t) : mean(m), tensor(t) {}
- __device__ __forceinline__ float operator()(int batch, int plane, int n) {
- float val = tensor[batch][plane][n];
+ __device__ VarOp(Acctype m, const DeviceTensor3 t) : mean(m), tensor(t) {}
+ __device__ __forceinline__ Acctype operator()(int batch, int plane, int n) {
+ Dtype val = tensor[batch][plane][n];
return (val - mean) * (val - mean);
}
- const float mean;
+ const Acctype mean;
const DeviceTensor3 tensor;
};
+template <typename Dtype, typename Acctype, typename DeviceTensor3>
struct GradOp {
- __device__ GradOp(float m, const DeviceTensor3 i, const DeviceTensor3 g)
+ __device__ GradOp(Acctype m, const DeviceTensor3 i, const DeviceTensor3 g)
: mean(m), input(i), gradOutput(g) {}
- __device__ __forceinline__ Float2 operator()(int batch, int plane, int n) {
- float g = gradOutput[batch][plane][n];
- float c = input[batch][plane][n] - mean;
- return Float2(g, g * c);
+ __device__ __forceinline__ Float2<Dtype, Acctype> operator()(int batch, int plane, int n) {
+ Dtype g = gradOutput[batch][plane][n];
+ Dtype c = ScalarConvert<Acctype, Dtype>::to(input[batch][plane][n] - mean);
+ return Float2<Dtype, Acctype>(g, g * c);
}
- const float mean;
+ const Acctype mean;
const DeviceTensor3 input;
const DeviceTensor3 gradOutput;
};
// Sum across all threads within a warp
-static __device__ __forceinline__ float warpSum(float val) {
+template <typename T>
+static __device__ __forceinline__ T warpSum(T val) {
#if __CUDA_ARCH__ >= 300
for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
val += __shfl_xor(val, 1 << i, WARP_SIZE);
}
#else
- __shared__ float values[MAX_BLOCK_SIZE];
+ __shared__ T values[MAX_BLOCK_SIZE];
values[threadIdx.x] = val;
__threadfence_block();
const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
@@ -88,14 +94,15 @@ static __device__ __forceinline__ float warpSum(float val) {
return val;
}
-static __device__ __forceinline__ Float2 warpSum(Float2 value) {
+template <typename Dtype, typename Acctype>
+static __device__ __forceinline__ Float2<Dtype, Acctype> warpSum(Float2<Dtype, Acctype> value) {
value.v1 = warpSum(value.v1);
value.v2 = warpSum(value.v2);
return value;
}
// Sum across (batch, x/y/z) applying Op() pointwise
-template<typename T, typename Op>
+template<typename T, typename Op, typename DeviceTensor3>
__device__ T reduce(Op op, DeviceTensor3 tensor, int plane) {
T sum = (T)0;
for (int batch = 0; batch < tensor.getSize(0); ++batch) {
@@ -130,32 +137,7 @@ __device__ T reduce(Op op, DeviceTensor3 tensor, int plane) {
return shared[0];
}
-template <int Dim>
-static THCDeviceTensor<float, Dim> devicetensor(THCState *state, THCudaTensor *t) {
- if (!t) {
- return THCDeviceTensor<float, Dim>();
- }
-
- int inDim = THCudaTensor_nDimension(state, t);
- if (inDim == Dim) {
- return toDeviceTensor<float, Dim>(state, t);
- }
-
- // View in which the last dimensions are collapsed or expanded as needed
- THAssert(THCudaTensor_isContiguous(state, t));
- int size[Dim];
- for (int i = 0; i < Dim || i < inDim; ++i) {
- if (i < Dim && i < inDim) {
- size[i] = t->size[i];
- } else if (i < Dim) {
- size[i] = 1;
- } else {
- size[Dim - 1] *= t->size[i];
- }
- }
- return THCDeviceTensor<float, Dim>(THCudaTensor_data(state, t), size);
-}
-
+template <typename Dtype, typename Acctype, typename DeviceTensor1, typename DeviceTensor3>
__global__ void BatchNormalizationUpdateOutputInference_kernel(
const DeviceTensor3 input,
DeviceTensor3 output,
@@ -163,31 +145,32 @@ __global__ void BatchNormalizationUpdateOutputInference_kernel(
DeviceTensor1 runningVar,
const DeviceTensor1 weight,
const DeviceTensor1 bias,
- float epsilon) {
+ Acctype epsilon) {
int plane = blockIdx.x;
- float invstd = 1.0f / sqrt(runningVar[plane].ldg() + epsilon);
- float mean = runningMean[plane].ldg();
- float gamma = weight.numElements() > 0 ? weight[plane].ldg() : 1.0f;
- float beta = bias.numElements() > 0 ? bias[plane].ldg() : 0.0f;
+ Acctype invstd = Acctype(1) / sqrt(runningVar[plane].ldg() + epsilon);
+ Acctype mean = ScalarConvert<Dtype, Acctype>::to(runningMean[plane].ldg());
+ Acctype gamma = weight.numElements() > 0 ? ScalarConvert<Dtype, Acctype>::to(weight[plane].ldg()) : Acctype(1);
+ Acctype beta = bias.numElements() > 0 ? ScalarConvert<Dtype, Acctype>::to(bias[plane].ldg()) : Acctype(0);
// Write normalized and update the output
for (int batch = 0; batch < input.getSize(0); batch++) {
for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) {
- float inp = input[batch][plane][x].ldg();
- output[batch][plane][x] = gamma * (inp - mean) * invstd + beta;
+ Dtype inp = input[batch][plane][x].ldg();
+ output[batch][plane][x] = ScalarConvert<Acctype, Dtype>::to(gamma * (inp - mean) * invstd + beta);
}
}
}
+template <typename Dtype, typename Acctype, typename DeviceTensor1, typename DeviceTensor3>
__global__ void BatchNormalizationUpdateOutput_kernel(
const DeviceTensor3 input,
DeviceTensor3 output,
const DeviceTensor1 weight,
const DeviceTensor1 bias,
- const float epsilon,
- const float momentum,
+ const Acctype epsilon,
+ const Acctype momentum,
DeviceTensor1 runningMean,
DeviceTensor1 runningVar,
DeviceTensor1 saveMean,
@@ -196,73 +179,39 @@ __global__ void BatchNormalizationUpdateOutput_kernel(
int plane = blockIdx.x;
int N = input.getSize(0) * input.getSize(2);
- float norm = 1.0f / N;
+ Acctype norm = Acctype(1) / N;
// Compute the mean and variance across (batch, x/y/z)
- float mean = reduce<float>(SumOp(input), input, plane) * norm;
+ Acctype mean = reduce<Acctype>(SumOp<Dtype, Acctype, DeviceTensor3>(input), input, plane) * norm;
__syncthreads();
- float varN = reduce<float>(VarOp(mean, input), input, plane);
- float invStd = 0.0f;
- if (varN != 0.0f || epsilon != 0.0f) {
+ Acctype varN = reduce<Acctype>(VarOp<Dtype, Acctype, DeviceTensor3>(mean, input), input, plane);
+ Acctype invStd = 0;
+ if (varN != Acctype(0) || epsilon != Acctype(0)) {
invStd = 1 / sqrt(varN * norm + epsilon);
}
// Save the mean, variance, and moving averages
if (threadIdx.x == 0) {
// Momentum based writeback
- float unbiasedVar = varN / (N - 1);
- saveMean[plane] = mean;
- saveStd[plane] = invStd;
- runningMean[plane] = (1 - momentum) * runningMean[plane] + momentum * mean;
- runningVar[plane] = (1 - momentum) * runningVar[plane] + momentum * unbiasedVar;
+ Acctype unbiasedVar = varN / (N - 1);
+ saveMean[plane] = ScalarConvert<Acctype, Dtype>::to(mean);
+ saveStd[plane] = ScalarConvert<Acctype, Dtype>::to(invStd);
+ runningMean[plane] = ScalarConvert<Acctype, Dtype>::to((1 - momentum) * runningMean[plane] + momentum * mean);
+ runningVar[plane] = ScalarConvert<Acctype, Dtype>::to((1 - momentum) * runningVar[plane] + momentum * unbiasedVar);
}
// Write normalized and update the output
- float gamma = weight.numElements() > 0 ? weight[plane] : 1.0f;
- float beta = bias.numElements() > 0 ? bias[plane] : 0.0f;
+ Acctype gamma = weight.numElements() > 0 ? ScalarConvert<Dtype, Acctype>::to(weight[plane]) : ScalarConvert<int, Acctype>::to(1);
+ Acctype beta = bias.numElements() > 0 ? ScalarConvert<Dtype, Acctype>::to(bias[plane]) : ScalarConvert<int, Acctype>::to(0);
for (int batch = 0; batch < input.getSize(0); ++batch) {
for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) {
- float inp = input[batch][plane][x].ldg();
- output[batch][plane][x] = gamma * (inp - mean) * invStd + beta;
+ Dtype inp = input[batch][plane][x].ldg();
+ output[batch][plane][x] = ScalarConvert<Acctype, Dtype>::to(gamma * (inp - mean) * invStd + beta);
}
}
}
-void THNN_CudaBatchNormalization_updateOutput(
- THCState *state, THCudaTensor *input_, THCudaTensor *output_,
- THCudaTensor *weight_, THCudaTensor *bias_, THCudaTensor *runningMean_,
- THCudaTensor *runningVar_, THCudaTensor *saveMean_, THCudaTensor *saveStd_,
- bool train, double momentum, double eps) {
-
- THCUNN_assertSameGPU(state, 8, input_, output_, weight_, bias_, runningMean_,
- runningVar_, saveMean_, saveStd_);
- DeviceTensor3 input = devicetensor<3>(state, input_);
- DeviceTensor3 output = devicetensor<3>(state, output_);
- DeviceTensor1 weight = devicetensor<1>(state, weight_);
- DeviceTensor1 bias = devicetensor<1>(state, bias_);
- DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_);
- DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_);
- DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_);
- DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_);
-
- cudaStream_t s = THCState_getCurrentStream(state);
- cudaDeviceProp *prop = THCState_getCurrentDeviceProperties(state);
-
- if (!train) {
- dim3 blocks(input.getSize(1));
- dim3 threads(getNumThreads(input.getSize(2)));
- BatchNormalizationUpdateOutputInference_kernel<<<blocks, threads, 0, s>>>(
- input, output, runningMean, runningVar, weight, bias, eps);
- } else {
- dim3 blocks(input.getSize(1));
- dim3 threads(getNumThreads(input.getSize(2)));
- BatchNormalizationUpdateOutput_kernel<<<blocks, threads, 0, s>>>(
- input, output, weight, bias, eps, momentum, runningMean, runningVar,
- saveMean, saveStd);
- }
- THCudaCheck(cudaGetLastError());
-}
-
+template <typename Dtype, typename Acctype, typename DeviceTensor1, typename DeviceTensor3>
__global__ void BatchNormalizationBackward_kernel(
const DeviceTensor3 input,
const DeviceTensor3 gradOutput,
@@ -275,45 +224,46 @@ __global__ void BatchNormalizationBackward_kernel(
const DeviceTensor1 saveMean,
const DeviceTensor1 saveStd,
bool train,
- float scale,
+ Acctype scale,
double eps) {
int plane = blockIdx.x;
int N = gradOutput.getSize(0) * gradOutput.getSize(2);
- float mean, stdVal;
+ Acctype mean, stdVal;
if (train) {
- mean = saveMean[plane];
- stdVal = saveStd[plane];
+ mean = ScalarConvert<Dtype, Acctype>::to(saveMean[plane]);
+ stdVal = ScalarConvert<Dtype, Acctype>::to(saveStd[plane]);
} else {
- mean = runningMean[plane];
+ mean = ScalarConvert<Dtype, Acctype>::to(runningMean[plane]);
stdVal = 1 / sqrt(runningVar[plane] + eps);
}
- float weightVal = weight.numElements() > 0 ? weight[plane] : 1.0f;
- float norm = 1.0f / N;
+ Acctype weightVal = weight.numElements() > 0 ? ScalarConvert<Dtype, Acctype>::to(weight[plane]) : Acctype(1);
+ Acctype norm = Acctype(1) / N;
// Compute two values across (batch, x/y/z) in one pass:
// 1. Sum(gradOutput)
// 2. DotProduct(input - mean, gradOutput)
- Float2 res = reduce<Float2>(GradOp(mean, input, gradOutput), gradOutput, plane);
- float gradOutputSum = res.v1;
- float dotP = res.v2;
+ GradOp<Dtype, Acctype, DeviceTensor3> g(mean, input, gradOutput);
+ Float2<Dtype, Acctype> res = reduce<Float2<Dtype, Acctype>, GradOp<Dtype, Acctype, DeviceTensor3>, DeviceTensor3>(g, gradOutput, plane);
+ Acctype gradOutputSum = res.v1;
+ Acctype dotP = res.v2;
- float gradMean = gradOutputSum * norm;
- float projScale = dotP * norm * stdVal * stdVal;
- float gradScale = stdVal * weightVal;
+ Acctype gradMean = gradOutputSum * norm;
+ Acctype projScale = dotP * norm * stdVal * stdVal;
+ Acctype gradScale = stdVal * weightVal;
if (gradInput.numElements() > 0) {
for (int batch = 0; batch < gradOutput.getSize(0); ++batch) {
for (int x = threadIdx.x; x < gradOutput.getSize(2); x += blockDim.x) {
- float gradOut = gradOutput[batch][plane][x];
+ Dtype gradOut = gradOutput[batch][plane][x];
if (train) {
- float inp = input[batch][plane][x];
- float proj = (inp - mean) * projScale;
- gradInput[batch][plane][x] = (gradOut - proj - gradMean) * gradScale;
+ Dtype inp = input[batch][plane][x];
+ Acctype proj = (inp - mean) * projScale;
+ gradInput[batch][plane][x] = ScalarConvert<Acctype, Dtype>::to((gradOut - proj - gradMean) * gradScale);
} else {
- gradInput[batch][plane][x] = gradOut * gradScale;
+ gradInput[batch][plane][x] = ScalarConvert<Acctype, Dtype>::to(gradOut * gradScale);
}
}
}
@@ -321,42 +271,16 @@ __global__ void BatchNormalizationBackward_kernel(
if (gradWeight.numElements() > 0) {
if (threadIdx.x == 0) {
- gradWeight[plane] += scale * dotP * stdVal;
+ gradWeight[plane] += ScalarConvert<Acctype, Dtype>::to(scale * dotP * stdVal);
}
}
if (gradBias.numElements() > 0) {
if (threadIdx.x == 0) {
- gradBias[plane] += scale * gradOutputSum;
+ gradBias[plane] += ScalarConvert<Acctype, Dtype>::to(scale * gradOutputSum);
}
}
}
-void THNN_CudaBatchNormalization_backward(
- THCState *state, THCudaTensor *input_, THCudaTensor *gradOutput_,
- THCudaTensor *gradInput_, THCudaTensor *gradWeight_, THCudaTensor *gradBias_,
- THCudaTensor *weight_, THCudaTensor *runningMean_, THCudaTensor *runningVar_,
- THCudaTensor *saveMean_, THCudaTensor *saveStd_, bool train, float scale, double eps) {
-
- THCUNN_assertSameGPU(state, 10, input_, gradOutput_, gradInput_, gradWeight_,
- gradBias_, weight_, runningMean_, runningVar_, saveMean_, saveStd_);
- DeviceTensor3 input = devicetensor<3>(state, input_);
- DeviceTensor3 gradOutput = devicetensor<3>(state, gradOutput_);
- DeviceTensor3 gradInput = devicetensor<3>(state, gradInput_);
- DeviceTensor1 gradWeight = devicetensor<1>(state, gradWeight_);
- DeviceTensor1 gradBias = devicetensor<1>(state, gradBias_);
- DeviceTensor1 weight = devicetensor<1>(state, weight_);
- DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_);
- DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_);
- DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_);
- DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_);
-
- cudaStream_t s = THCState_getCurrentStream(state);
-
- dim3 blocks(gradOutput.getSize(1));
- dim3 threads(getNumThreads(gradOutput.getSize(2)));
- BatchNormalizationBackward_kernel<<<blocks, threads, 0, s>>>(
- input, gradOutput, gradInput, gradWeight, gradBias, weight, runningMean, runningVar,
- saveMean, saveStd, train, scale, eps);
- THCudaCheck(cudaGetLastError());
-}
+#include "generic/BatchNormalization.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/CMakeLists.txt b/lib/THCUNN/CMakeLists.txt
index 8492503..5dd8712 100644
--- a/lib/THCUNN/CMakeLists.txt
+++ b/lib/THCUNN/CMakeLists.txt
@@ -31,6 +31,7 @@ ENDIF()
FILE(GLOB src-cuda *.cu)
+CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
CUDA_ADD_LIBRARY(THCUNN MODULE ${src-cuda})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/lib/THCUNN/ClassNLLCriterion.cu b/lib/THCUNN/ClassNLLCriterion.cu
index 2b3ed8b..58684f4 100644
--- a/lib/THCUNN/ClassNLLCriterion.cu
+++ b/lib/THCUNN/ClassNLLCriterion.cu
@@ -1,16 +1,19 @@
#include "THCUNN.h"
#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
#include <stdio.h>
#include <assert.h>
static const int NTHREADS = 32;
-__global__ void cunn_ClassNLLCriterion_updateOutput_kernel1(float *output,
- float *total_weight,
- float *input,
- long *target,
- float *weights,
+template <typename Dtype>
+__global__ void cunn_ClassNLLCriterion_updateOutput_kernel1(Dtype *output,
+ Dtype *total_weight,
+ Dtype *input,
+ THCIndex_t *target,
+ Dtype *weights,
int size_average,
int n_classes) {
assert(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0);
@@ -20,7 +23,7 @@ __global__ void cunn_ClassNLLCriterion_updateOutput_kernel1(float *output,
int t = (int)*target - TH_INDEX_BASE;
assert(t >= 0 && t < n_classes);
- float cur_weight = weights ? weights[t] : 1.0f;
+ Dtype cur_weight = weights ? weights[t] : ScalarConvert<int, Dtype>::to(1);
*output = -cur_weight * input[t];
*total_weight = cur_weight;
if (size_average && *total_weight > 0) {
@@ -28,25 +31,26 @@ __global__ void cunn_ClassNLLCriterion_updateOutput_kernel1(float *output,
}
}
-__global__ void cunn_ClassNLLCriterion_updateOutput_kernel(float *output,
- float *total_weight,
- float *input,
- long *target,
- float *weights,
+template <typename Dtype, typename Acctype>
+__global__ void cunn_ClassNLLCriterion_updateOutput_kernel(Dtype *output,
+ Dtype *total_weight,
+ Dtype *input,
+ THCIndex_t *target,
+ Dtype *weights,
int size_average,
int nframe,
int ndim,
int n_classes) {
- __shared__ float shInputs[NTHREADS], acc_weight[NTHREADS];
+ __shared__ Acctype shInputs[NTHREADS], acc_weight[NTHREADS];
int i, t;
- float cur_weight;
+ Dtype cur_weight;
- shInputs[threadIdx.x] = 0.0f;
- acc_weight[threadIdx.x] = 0.0f;
+ shInputs[threadIdx.x] = ScalarConvert<int, Acctype>::to(0);
+ acc_weight[threadIdx.x] = ScalarConvert<int, Acctype>::to(0);
for (i = threadIdx.x; i < nframe; i += NTHREADS) {
t = target[i] - TH_INDEX_BASE;
assert(t >= 0 && t < n_classes);
- cur_weight = weights ? weights[t] : 1.0f;
+ cur_weight = weights ? weights[t] : ScalarConvert<int, Dtype>::to(1);
shInputs[threadIdx.x] -= input[i * ndim + t] * cur_weight;
acc_weight[threadIdx.x] += cur_weight;
}
@@ -56,39 +60,47 @@ __global__ void cunn_ClassNLLCriterion_updateOutput_kernel(float *output,
// updateOutput_kernel
if (threadIdx.x == 0) {
- *output = *total_weight = 0;
+ *output = *total_weight = ScalarConvert<int, Dtype>::to(0);
+ Acctype outputAcc = 0;
+ Acctype total_weightAcc = 0;
for (i = 0; i < NTHREADS; ++i){
- *output += shInputs[i];
- *total_weight += acc_weight[i];
+ // FIXME should we do somethigng here
+ outputAcc += shInputs[i];
+ total_weightAcc += acc_weight[i];
}
+ *total_weight = ScalarConvert<Acctype, Dtype>::to(total_weightAcc);
+ *output = ScalarConvert<Acctype, Dtype>::to(outputAcc);
if (size_average && *total_weight > 0) {
- *output /= *total_weight;
+ *output = ScalarConvert<Acctype, Dtype>::to(outputAcc / total_weightAcc);
}
+
}
}
+template <typename Dtype>
__global__ void cunn_ClassNLLCriterion_updateGradInput_kernel1(
- float* gradInput,
- float* weights,
- long* target,
- float* total_weight,
+ Dtype* gradInput,
+ Dtype* weights,
+ THCIndex_t* target,
+ Dtype* total_weight,
int size_average,
int n_classes)
{
if (*total_weight <= 0) {
return;
}
- float norm = size_average ? (1.0f / *total_weight) : 1.0f;
+ Dtype norm = size_average ? (ScalarConvert<int, Dtype>::to(1) / *total_weight) : ScalarConvert<int, Dtype>::to(1);
int t = (int)*target - TH_INDEX_BASE;
assert(t >= 0 && t < n_classes);
- gradInput[t] = -(weights ? weights[t] : 1.0f) * norm;
+ gradInput[t] = -(weights ? weights[t] : ScalarConvert<int, Dtype>::to(1)) * norm;
}
+template <typename Dtype>
__global__ void cunn_ClassNLLCriterion_updateGradInput_kernel(
- float *gradInput,
- long *target,
- float *weights,
- float *total_weight,
+ Dtype *gradInput,
+ THCIndex_t *target,
+ Dtype *weights,
+ Dtype *total_weight,
int size_average,
int nframe,
int ndim,
@@ -98,148 +110,14 @@ __global__ void cunn_ClassNLLCriterion_updateGradInput_kernel(
return;
}
int i, t;
- float norm = size_average ? (1.0f / *total_weight) : 1.0f;
+ Dtype norm = size_average ? (ScalarConvert<int, Dtype>::to(1) / *total_weight) : ScalarConvert<int, Dtype>::to(1);
for (i = threadIdx.x; i < nframe; i += NTHREADS) {
t = (int)target[i] - TH_INDEX_BASE;
assert(t >= 0 && t < n_classes);
- gradInput[i * ndim + t] = -(weights ? weights[t] : 1.0f) * norm;
- }
-}
-
-void THNN_CudaClassNLLCriterion_updateOutput(THCState *state, THCudaTensor *input, THCudaLongTensor *target, THCudaTensor *output, bool sizeAverage, THCudaTensor *weights, THCudaTensor *total_weight) {
- if (THCudaLongTensor_nDimension(state, target) > 1) {
- THError("multi-target not supported");
- }
-
- int n_dims = THCudaTensor_nDimension(state, input);
- int n_classes = THCudaTensor_size(state, input, n_dims - 1);
-
- if (weights) {
- THCUNN_assertSameGPU(
- state, 5, input, target, weights, output, total_weight
- );
- } else {
- THCUNN_assertSameGPU(
- state, 4, input, target, output, total_weight
- );
- }
-
- if (THCudaTensor_nDimension(state, input) > 2) {
- THArgCheck(0, 2, "vector or matrix expected");
- }
- if (weights && THCudaTensor_nElement(state, weights) != n_classes) {
- THError("weight tensor should be defined either for all or no classes");
- }
-
- input = THCudaTensor_newContiguous(state, input);
- weights = weights ? THCudaTensor_newContiguous(state, weights) : NULL;
- target = THCudaLongTensor_newContiguous(state, target);
-
- float *input_data = THCudaTensor_data(state, input);
- float *weights_data = weights ? THCudaTensor_data(state, weights) : NULL;
- long *target_data = THCudaLongTensor_data(state, target);
- float *output_data = THCudaTensor_data(state, output);
- float *total_weight_data = THCudaTensor_data(state, total_weight);
-
- if (THCudaTensor_nDimension(state, input) == 1) {
- cunn_ClassNLLCriterion_updateOutput_kernel1
- <<<1, 1, 0, THCState_getCurrentStream(state)>>>(
- output_data,
- total_weight_data,
- input_data,
- target_data,
- weights_data,
- sizeAverage,
- n_classes
- );
-
- } else if (THCudaTensor_nDimension(state, input) == 2) {
- cunn_ClassNLLCriterion_updateOutput_kernel
- <<<1, NTHREADS, 0, THCState_getCurrentStream(state)>>>(
- output_data,
- total_weight_data,
- input_data,
- target_data,
- weights_data,
- sizeAverage,
- THCudaTensor_size(state, input, 0),
- THCudaTensor_size(state, input, 1),
- n_classes
- );
- }
- THCudaCheck(cudaGetLastError());
-
- if (weights) {
- THCudaTensor_free(state, weights);
+ gradInput[i * ndim + t] = -(weights ? weights[t] : ScalarConvert<int, Dtype>::to(1)) * norm;
}
- THCudaLongTensor_free(state, target);
- THCudaTensor_free(state, input);
}
-void THNN_CudaClassNLLCriterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaLongTensor *target, THCudaTensor *gradInput, bool sizeAverage, THCudaTensor *weights, THCudaTensor *total_weight) {
- if (THCudaLongTensor_nDimension(state, target) > 1) {
- THError("multi-target not supported");
- }
-
- int n_dims = THCudaTensor_nDimension(state, input);
- int n_classes = THCudaTensor_size(state, input, n_dims - 1);
-
- THArgCheck(THCudaTensor_isContiguous(state, gradInput), 4, "gradInput must be contiguous");
-
- if (weights) {
- THCUNN_assertSameGPU(
- state, 5, weights, input, target, gradInput, total_weight
- );
- }
- else {
- THCUNN_assertSameGPU(
- state, 4, input, target, gradInput, total_weight
- );
- }
-
- if (THCudaTensor_nDimension(state, input) > 2) {
- THArgCheck(0, 2, "vector or matrix expected");
- }
- if (weights && THCudaTensor_nElement(state, weights) != n_classes) {
- THError("weight tensor should be defined either for all or no classes");
- }
-
- weights = weights ? THCudaTensor_newContiguous(state, weights) : NULL;
- target = THCudaLongTensor_newContiguous(state, target);
-
- float *weights_data = weights ? THCudaTensor_data(state, weights) : NULL;
- float *gradInput_data = THCudaTensor_data(state, gradInput);
- long *target_data = THCudaLongTensor_data(state, target);
- float *total_weight_data = THCudaTensor_data(state, total_weight);
-
- if (THCudaTensor_nDimension(state, input) == 1) {
- cunn_ClassNLLCriterion_updateGradInput_kernel1
- <<<1, 1, 0, THCState_getCurrentStream(state)>>>(
- gradInput_data,
- weights_data,
- target_data,
- total_weight_data,
- sizeAverage,
- n_classes
- );
- } else {
- cunn_ClassNLLCriterion_updateGradInput_kernel
- <<<1, NTHREADS, 0, THCState_getCurrentStream(state)>>>(
- gradInput_data,
- target_data,
- weights_data,
- total_weight_data,
- sizeAverage,
- THCudaTensor_size(state, input, 0),
- THCudaTensor_size(state, input, 1),
- n_classes
- );
- }
- THCudaCheck(cudaGetLastError());
-
- if (weights) {
- THCudaTensor_free(state, weights);
- }
- THCudaLongTensor_free(state, target);
-}
+#include "generic/ClassNLLCriterion.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/DistKLDivCriterion.cu b/lib/THCUNN/DistKLDivCriterion.cu
index 2984672..aa453f8 100644
--- a/lib/THCUNN/DistKLDivCriterion.cu
+++ b/lib/THCUNN/DistKLDivCriterion.cu
@@ -1,5 +1,7 @@
#include "THCUNN.h"
#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
#include <thrust/fill.h>
#include <thrust/functional.h>
@@ -7,76 +9,30 @@
#include <thrust/reduce.h>
#include <thrust/inner_product.h>
+template <typename Dtype, typename Acctype>
struct kl_functor
{
- __host__ __device__ float operator()(const float& x, const float& y) const
+ __host__ __device__ Acctype operator()(const Dtype& x, const Dtype& y) const
{
- return y > 0 ? y * (log(y) - x) : 0;
+ Acctype yAcc = ScalarConvert<Dtype, Acctype>::to(y);
+ return y > 0 ? yAcc * (THCNumerics<Acctype>::log(yAcc) - x) : Acctype(0);
}
};
-void THNN_CudaDistKLDivCriterion_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *output, bool sizeAverage)
-{
- THCUNN_assertSameGPU(state, 2, input, target);
-
- THArgCheck(THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2,
- "input and target need to have the same number of elements");
-
- float sum;
-
- long size = THCudaTensor_nElement(state, input);
-
- input = THCudaTensor_newContiguous(state, input);
- target = THCudaTensor_newContiguous(state, target);
-
- thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
- thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
- sum = thrust::inner_product(input_data, input_data+size, target_data, (float) 0, thrust::plus<float>(), kl_functor());
-
- if (sizeAverage)
- sum /= size;
-
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, target);
-
- THCudaTensor_set1d(state, output, 0, sum);
-}
-
+template <typename Dtype>
struct kl_updateGradInput_functor
{
- const float norm;
+ const Dtype norm;
- kl_updateGradInput_functor(float norm_)
+ kl_updateGradInput_functor(Dtype norm_)
: norm(norm_)
{}
- __host__ __device__ float operator()(const float& x, const float& y) const
+ __host__ __device__ Dtype operator()(const Dtype& x, const Dtype& y) const
{
- return y > 0 ? norm * (-y) : 0;
+ return y > 0 ? norm * (-y) : ScalarConvert<int, Dtype>::to(0);
}
};
-void THNN_CudaDistKLDivCriterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage)
-{
- THCUNN_assertSameGPU(state, 3, input, target, gradInput);
-
- THArgCheck(THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2,
- "input and target need to have the same number of elements");
-
- long size = THCudaTensor_nElement(state, input);
- float norm = (sizeAverage ? 1./size : 1.);
-
- input = THCudaTensor_newContiguous(state, input);
- target = THCudaTensor_newContiguous(state, target);
-
- THCudaTensor_resizeAs(state, gradInput, input);
-
- thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
- thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
- thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput));
-
- thrust::transform(input_data, input_data+size, target_data, gradInput_data, kl_updateGradInput_functor(norm));
-
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, target);
-}
+#include "generic/DistKLDivCriterion.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/ELU.cu b/lib/THCUNN/ELU.cu
index e4a05bc..ced993c 100644
--- a/lib/THCUNN/ELU.cu
+++ b/lib/THCUNN/ELU.cu
@@ -1,93 +1,67 @@
#include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+template <typename T>
struct ELUupdateOutput_functor
{
- const float alpha_;
+ const T alpha_;
- ELUupdateOutput_functor(float alpha)
+ ELUupdateOutput_functor(T alpha)
: alpha_(alpha)
{}
- __device__ void operator()(float *output, const float *input) const
+ __device__ void operator()(T *output, const T *input) const
{
*output = *input <= 0 ? (exp(*input) - 1) * alpha_ : *input;
}
};
// in-place variant
+template <typename T>
struct ELUupdateOutputIP_functor
{
- const float alpha_;
+ const T alpha_;
- ELUupdateOutputIP_functor(float alpha)
+ ELUupdateOutputIP_functor(T alpha)
: alpha_(alpha)
{}
- __device__ void operator()(float *x) const
+ __device__ void operator()(T *x) const
{
*x = *x <= 0 ? (exp(*x) - 1) * alpha_ : *x;
}
};
-void THNN_CudaELU_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output,
- float alpha, bool inplace)
-{
- THCUNN_assertSameGPU(state, 2, input, output);
-
- if (inplace)
- {
- THC_pointwiseApply1(state, input, ELUupdateOutputIP_functor(alpha));
- THCudaTensor_set(state, output, input);
- }
- else
- {
- THCudaTensor_resizeAs(state, output, input);
- THC_pointwiseApply2(state, output, input, ELUupdateOutput_functor(alpha));
- }
-}
-
+template <typename T>
struct ELUupdateGradInput_functor
{
- const float alpha_;
+ const T alpha_;
- ELUupdateGradInput_functor(float alpha)
+ ELUupdateGradInput_functor(T alpha)
: alpha_(alpha)
{}
- __device__ void operator()(float *gradInput, const float *output, const float *gradOutput) const
+ __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const
{
*gradInput = (*output) <= 0 ? (*gradOutput * (*output + alpha_)) : (*gradOutput);
}
};
+template <typename T>
struct ELUupdateGradInputIP_functor
{
- const float alpha_;
+ const T alpha_;
- ELUupdateGradInputIP_functor(float alpha)
+ ELUupdateGradInputIP_functor(T alpha)
: alpha_(alpha)
{}
- __device__ void operator()(float *gradOutput, const float *output) const
+ __device__ void operator()(T *gradOutput, const T *output) const
{
*gradOutput = (*output) <= 0 ? (*gradOutput * (*output + alpha_)) : (*gradOutput);
}
};
-void THNN_CudaELU_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput,
- THCudaTensor *gradInput, THCudaTensor *output, float alpha, bool inplace)
-{
- THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
-
- if (inplace)
- {
- THC_pointwiseApply2(state, gradOutput, output, ELUupdateGradInputIP_functor(alpha));
- THCudaTensor_set(state, gradInput, gradOutput);
- }
- else
- {
- THCudaTensor_resizeAs(state, gradInput, output);
- THC_pointwiseApply3(state, gradInput, output, gradOutput, ELUupdateGradInput_functor(alpha));
- }
-}
+#include "generic/ELU.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/HardTanh.cu b/lib/THCUNN/HardTanh.cu
index c7df1dd..b400beb 100644
--- a/lib/THCUNN/HardTanh.cu
+++ b/lib/THCUNN/HardTanh.cu
@@ -1,17 +1,19 @@
#include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+template <typename T>
struct hardtanhupdateOutput_functor
{
- const float max_val_;
- const float min_val_;
+ const T max_val_;
+ const T min_val_;
- hardtanhupdateOutput_functor(float min_val, float max_val)
+ hardtanhupdateOutput_functor(T min_val, T max_val)
: min_val_(min_val)
, max_val_(max_val)
{}
- __device__ void operator()(float *output, const float *input) const
+ __device__ void operator()(T *output, const T *input) const
{
if (*input < min_val_)
*output = min_val_;
@@ -21,7 +23,7 @@ struct hardtanhupdateOutput_functor
*output = max_val_;
}
- __device__ void operator()(float *input) const
+ __device__ void operator()(T *input) const
{
if (*input < min_val_)
*input = min_val_;
@@ -30,74 +32,31 @@ struct hardtanhupdateOutput_functor
}
};
-void THNN_CudaHardTanh_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- float min_val,
- float max_val,
- bool inplace)
-{
- THCUNN_assertSameGPU(state, 2, input, output);
- if(inplace)
- {
- THCudaTensor_set(state, output, input);
- THC_pointwiseApply1(state, output, hardtanhupdateOutput_functor(min_val, max_val));
- }
- else
- {
- THCudaTensor_resizeAs(state, output, input);
- THC_pointwiseApply2(state, output, input,
- hardtanhupdateOutput_functor(min_val, max_val));
- }
-}
-
+template <typename T>
struct hardtanhupdateGradInput_functor
{
- const float max_val_;
- const float min_val_;
+ const T max_val_;
+ const T min_val_;
- hardtanhupdateGradInput_functor(float min_val, float max_val)
+ hardtanhupdateGradInput_functor(T min_val, T max_val)
: min_val_(min_val)
, max_val_(max_val)
{}
- __device__ void operator()(float *gradInput, const float *input, const float *gradOutput) const
+ __device__ void operator()(T *gradInput, const T *input, const T *gradOutput) const
{
if (*input < min_val_ || *input > max_val_)
- *gradInput = 0;
+ *gradInput = ScalarConvert<int, T>::to(0);
else
*gradInput = *gradOutput;
}
- __device__ void operator()(float *gradInput, const float *input) const
+ __device__ void operator()(T *gradInput, const T *input) const
{
if (*input <= min_val_ || *input >= max_val_)
- *gradInput = 0;
+ *gradInput = ScalarConvert<int, T>::to(0);
}
};
-void THNN_CudaHardTanh_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- float min_val,
- float max_val,
- bool inplace)
-{
- THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
-
- if (inplace)
- {
- THCudaTensor_set(state, gradInput, gradOutput);
- THC_pointwiseApply2(state, gradInput, input,
- hardtanhupdateGradInput_functor(min_val, max_val));
- }
- else
- {
- THCudaTensor_resizeAs(state, gradInput, input);
- THC_pointwiseApply3(state, gradInput, input, gradOutput,
- hardtanhupdateGradInput_functor(min_val, max_val));
- }
-}
+#include "generic/HardTanh.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/L1Cost.cu b/lib/THCUNN/L1Cost.cu
index 7dd6bb2..eda58c1 100644
--- a/lib/THCUNN/L1Cost.cu
+++ b/lib/THCUNN/L1Cost.cu
@@ -1,57 +1,34 @@
#include "THCUNN.h"
#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
#include <thrust/device_ptr.h>
#include <thrust/reduce.h>
-#include <thrust/transform.h>
+#include <thrust/transform_reduce.h>
+template <typename Dtype, typename Acctype>
struct l1cost_functor
{
- __host__ __device__ float operator()(float x, float y) const
+ __host__ __device__ Acctype operator()(Dtype x) const
{
- return abs(x) + abs(y);
+ return THCNumerics<Acctype>::abs(ScalarConvert<Dtype, Acctype>::to(x));
}
};
-void THNN_CudaL1Cost_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output)
-{
- THCUNN_assertSameGPU(state, 1, input);
- float sum;
- long size = THCudaTensor_nElement(state, input);
- input = THCudaTensor_newContiguous(state, input);
- thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
- sum = thrust::reduce(input_data, input_data+size, (float) 0, l1cost_functor());
-
- THCudaTensor_free(state, input);
-
- THCudaTensor_set1d(state, output, 0, sum);
-}
-
+template <typename Dtype>
struct l1cost_updateGradInput_functor
{
- __host__ __device__ float operator()(float x) const
+ __host__ __device__ Dtype operator()(Dtype x) const
{
if (x > 0)
- return 1;
+ return ScalarConvert<int, Dtype>::to(1);
else if (x < 0)
- return -1;
+ return ScalarConvert<int, Dtype>::to(-1);
else
- return 0;
+ return ScalarConvert<int, Dtype>::to(0);
}
};
-void THNN_CudaL1Cost_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput)
-{
- THCUNN_assertSameGPU(state, 2, input, gradInput);
- long size = THCudaTensor_nElement(state, input);
-
- input = THCudaTensor_newContiguous(state, input);
- THCudaTensor_resizeAs(state, gradInput, input);
-
- thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
- thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput));
-
- thrust::transform(input_data, input_data+size, gradInput_data, l1cost_updateGradInput_functor());
-
- THCudaTensor_free(state, input);
-}
+#include "generic/L1Cost.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/LeakyReLU.cu b/lib/THCUNN/LeakyReLU.cu
index a641821..8c0e6f8 100644
--- a/lib/THCUNN/LeakyReLU.cu
+++ b/lib/THCUNN/LeakyReLU.cu
@@ -1,103 +1,73 @@
#include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+template <typename T>
struct LeakyReLUUpdateOutput
{
- const float negval_;
+ const T negval_;
- LeakyReLUUpdateOutput(float negval)
+ LeakyReLUUpdateOutput(T negval)
: negval_(negval)
{}
- __device__ __forceinline__ void operator()(float *out, float *in)
+ __device__ __forceinline__ void operator()(T *out, T *in)
{
- float x = *in;
+ T x = *in;
*out = (x > 0) ? x : x * negval_;
}
};
// in-place variant
+template <typename T>
struct LeakyReLUUpdateOutputIP
{
- const float negval_;
+ const T negval_;
- LeakyReLUUpdateOutputIP(float negval)
+ LeakyReLUUpdateOutputIP(T negval)
: negval_(negval)
{}
- __device__ __forceinline__ void operator()(float *x)
+ __device__ __forceinline__ void operator()(T *x)
{
*x = (*x > 0) ? *x : negval_ * (*x);
}
};
-void THNN_CudaLeakyReLU_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output,
- double negval, bool inplace)
-{
- THCUNN_assertSameGPU(state, 2, input, output);
-
- if (inplace)
- {
- THC_pointwiseApply1(state, input, LeakyReLUUpdateOutputIP(negval));
- THCudaTensor_set(state, output, input);
- }
- else
- {
- THCudaTensor_resizeAs(state, output, input);
- THC_pointwiseApply2(state, output, input, LeakyReLUUpdateOutput(negval));
- }
-
- THCudaCheck(cudaGetLastError());
-}
-
+template <typename T>
struct LeakyReLUUpdateGradInput
{
- const float negval_;
+ const T negval_;
- LeakyReLUUpdateGradInput(float negval)
+ LeakyReLUUpdateGradInput(T negval)
: negval_(negval)
{}
__device__ __forceinline__ void operator()(
- float* gradInput,
- float* input,
- float* gradOutput) const
+ T* gradInput,
+ T* input,
+ T* gradOutput) const
{
*gradInput = (*input > 0) ? *gradOutput : (*gradOutput) * negval_;
}
};
+template <typename T>
struct LeakyReLUUpdateGradInputIP
{
- const float negval_;
+ const T negval_;
- LeakyReLUUpdateGradInputIP(float negval)
+ LeakyReLUUpdateGradInputIP(T negval)
: negval_(negval)
{}
__device__ __forceinline__ void operator()(
- float* gradOutput,
- float* input) const
+ T* gradOutput,
+ T* input) const
{
*gradOutput = (*input > 0) ? *gradOutput : (*gradOutput) * negval_;
}
};
-void THNN_CudaLeakyReLU_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput,
- THCudaTensor *gradInput, double negval, bool inplace)
-{
- THCUNN_assertSameGPU(state, 3, input, gradInput, gradOutput);
-
- if (inplace)
- {
- THC_pointwiseApply2(state, gradOutput, input, LeakyReLUUpdateGradInputIP(negval));
- THCudaTensor_set(state, gradInput, gradOutput);
- }
- else
- {
- THCudaTensor_resizeAs(state, gradInput, input);
- THC_pointwiseApply3(state, gradInput, input, gradOutput, LeakyReLUUpdateGradInput(negval));
- }
-
- THCudaCheck(cudaGetLastError());
-}
+#include "generic/LeakyReLU.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/LogSigmoid.cu b/lib/THCUNN/LogSigmoid.cu
index 2f56081..f008b63 100644
--- a/lib/THCUNN/LogSigmoid.cu
+++ b/lib/THCUNN/LogSigmoid.cu
@@ -1,35 +1,26 @@
#include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+template <typename T>
struct logSigmoid_updateOutput_functor
{
- __device__ void operator()(float *output, const float *input) const
+ __device__ void operator()(T *output, const T *input) const
{
- float z = exp(-*input);
- *output = -log(1. + z);
+ T z = exp(-*input);
+ *output = ScalarConvert<double, T>::to(-log(1. + z));
}
};
-void THNN_CudaLogSigmoid_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *buffer)
-{
- THCUNN_assertSameGPU(state, 2, input, output);
- THCudaTensor_resizeAs(state, output, input);
- THC_pointwiseApply2(state, output, input, logSigmoid_updateOutput_functor());
-}
-
+template <typename T>
struct logSigmoid_updateGradInput_functor
{
- __device__ void operator()(float *gradInput, const float *input, const float *gradOutput) const
+ __device__ void operator()(T *gradInput, const T *input, const T *gradOutput) const
{
- float z = exp(-*input);
- *gradInput = *gradOutput * z / (1. + z);
+ T z = exp(-*input);
+ *gradInput = ScalarConvert<double, T>::to(*gradOutput * z / (1. + z));
}
};
-void THNN_CudaLogSigmoid_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput,
- THCudaTensor *gradInput , THCudaTensor *buffer)
-{
- THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
- THCudaTensor_resizeAs(state, gradInput, input);
- THC_pointwiseApply3(state, gradInput, input, gradOutput, logSigmoid_updateGradInput_functor());
-}
+#include "generic/LogSigmoid.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/LogSoftMax.cu b/lib/THCUNN/LogSoftMax.cu
index 70cab80..4d7973e 100644
--- a/lib/THCUNN/LogSoftMax.cu
+++ b/lib/THCUNN/LogSoftMax.cu
@@ -1,7 +1,10 @@
#include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "SharedMem.cuh"
-__global__ void cunn_SpatialLogSoftMax_updateOutput_kernel(float *output, float *input, int classSize, int height, int width)
+template <typename T, typename AccumT>
+__global__ void cunn_SpatialLogSoftMax_updateOutput_kernel(T *output, T *input, int classSize, int height, int width)
{
int batchIndex = blockIdx.x;
int index = threadIdx.x;
@@ -18,11 +21,11 @@ __global__ void cunn_SpatialLogSoftMax_updateOutput_kernel(float *output, float
(width*classSize)*y +
(classSize)*x;
- float sum = 0;
+ AccumT sum = 0;
for (int i = 0; i < classSize; i++) {
- sum += __expf(input[inputStartIndex + i]);
+ sum += THCNumerics<T>::exp(input[inputStartIndex + i]);
}
- sum = 1.0f / sum;
+ sum = AccumT(1) / sum;
for (int i = 0; i < classSize; i++) {
// calculate output index in torch layout (B x C x H x W)
@@ -31,13 +34,15 @@ __global__ void cunn_SpatialLogSoftMax_updateOutput_kernel(float *output, float
(height*width)*i +
(width)*y +
x;
- output[outputIndex] = logf(sum * __expf(input[inputStartIndex + i]));
+ output[outputIndex] = ScalarConvert<AccumT, T>::to(
+ THCNumerics<AccumT>::log(sum * THCNumerics<T>::exp(input[inputStartIndex + i])));
}
index += blockDim.x;
}
}
-__global__ void cunn_SpatialLogSoftMax_updateGradInput_kernel(float *gradInput, float *output, float *gradOutput, int classSize, int height, int width)
+template <typename T, typename AccumT>
+__global__ void cunn_SpatialLogSoftMax_updateGradInput_kernel(T *gradInput, T *output, T *gradOutput, int classSize, int height, int width)
{
int batchIndex = blockIdx.x;
int index = threadIdx.x;
@@ -54,7 +59,7 @@ __global__ void cunn_SpatialLogSoftMax_updateGradInput_kernel(float *gradInput,
(width*classSize)*y +
(classSize)*x;
- float sum = 0;
+ AccumT sum = 0;
for (int i = 0; i < classSize; i++) {
sum += gradOutput[outputStartIndex + i];
}
@@ -66,70 +71,76 @@ __global__ void cunn_SpatialLogSoftMax_updateGradInput_kernel(float *gradInput,
(height*width)*i +
(width)*y +
x;
- gradInput[inputIndex] = gradOutput[outputStartIndex + i] - __expf(output[outputStartIndex + i]) * sum;
+ gradInput[inputIndex] = ScalarConvert<AccumT, T>::to(
+ gradOutput[outputStartIndex + i] - THCNumerics<T>::exp(output[outputStartIndex + i]) * sum);
}
index += blockDim.x;
}
}
+template <typename T, typename AccumT>
struct MaxFloat
{
- __device__ __forceinline__ float operator()(float max, float v) const
+ __device__ __forceinline__ AccumT operator()(AccumT max, T v) const
{
- return fmaxf(max, v);
+ return fmaxType(max, v);
}
};
+template<typename T, typename AccumT>
struct SumFloat
{
- __device__ __forceinline__ float operator()(float sum, float v) const
+ __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const
{
return sum + v;
}
};
+template<typename T, typename AccumT>
struct SumExpFloat
{
- __device__ __forceinline__ SumExpFloat(float v)
+ __device__ __forceinline__ SumExpFloat(T v)
: max_k(v)
{}
- __device__ __forceinline__ float operator()(float sum, float v) const
+ __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const
{
- return sum + expf(v - max_k);
+ return sum + THCNumerics<T>::exp(v - max_k);
}
- const float max_k;
+ const T max_k;
};
+template<typename AccumT>
struct NoFinal
{
- __device__ __forceinline__ float operator()(float v) const
+ __device__ __forceinline__ AccumT operator()(AccumT v) const
{
return v;
}
};
+template<typename AccumT>
struct LSMFinal
{
- __device__ __forceinline__ LSMFinal(float m)
+ __device__ __forceinline__ LSMFinal(AccumT m)
: max_k(m)
{}
- __device__ __forceinline__ float operator()(float v) const
+ __device__ __forceinline__ AccumT operator()(AccumT v) const
{
- return max_k + logf(v);
+ return max_k + THCNumerics<AccumT>::log(v);
}
- const float max_k;
+ const AccumT max_k;
};
-template <typename Reduction, typename Finalize>
-__device__ __forceinline__ float
-blockReduce(float* smem, float val,
- const Reduction& r,
- float defaultVal,
- const Finalize& f)
+template <template<typename, typename> class Reduction, template<typename> class Finalize, typename AccumT>
+__device__ __forceinline__ AccumT
+blockReduce(AccumT* smem, AccumT val,
+ const Reduction<AccumT, AccumT>& r,
+ AccumT defaultVal,
+ const Finalize<AccumT>& f)
{
// To avoid RaW races from chaining blockReduce calls together, we
// need a sync here
@@ -139,7 +150,7 @@ blockReduce(float* smem, float val,
__syncthreads();
- float warpVal = defaultVal;
+ AccumT warpVal = defaultVal;
// First warp will perform per-warp reductions for the remaining warps
if ((threadIdx.x / 32) == 0) // only threads in warp1 go into this (if)
@@ -162,7 +173,7 @@ blockReduce(float* smem, float val,
__syncthreads();
// First thread will perform a reduction of the above per-warp reductions
- float blockVal = defaultVal;
+ AccumT blockVal = defaultVal;
if (threadIdx.x == 0)
{
@@ -179,23 +190,23 @@ blockReduce(float* smem, float val,
return smem[0];
}
-template <typename Reduction>
-__device__ __forceinline__ float
-blockReduce(float* smem, float val,
- const Reduction& r,
- float defaultVal)
+template <template<typename, typename> class Reduction, typename AccumT>
+__device__ __forceinline__ AccumT
+blockReduce(AccumT* smem, AccumT val,
+ const Reduction<AccumT, AccumT>& r,
+ AccumT defaultVal)
{
- return blockReduce<Reduction, NoFinal>(smem, val, r, defaultVal, NoFinal());
+ return blockReduce<Reduction, NoFinal, AccumT>(smem, val, r, defaultVal, NoFinal<AccumT>());
}
-template <typename Reduction, int ILP>
-__device__ __forceinline__ float
-ilpReduce(float* data,
+template <template<typename, typename> class Reduction, int ILP, typename T, typename AccumT>
+__device__ __forceinline__ AccumT
+ilpReduce(T* data,
int size,
- const Reduction& r,
- float defaultVal)
+ const Reduction<T, AccumT>& r,
+ AccumT defaultVal)
{
- float threadVal = defaultVal;
+ AccumT threadVal = defaultVal;
int offset = threadIdx.x;
int last = size % (ILP * blockDim.x);
@@ -203,7 +214,7 @@ ilpReduce(float* data,
// Body (unroll by ILP times)
for (; offset < size - last; offset += blockDim.x * ILP)
{
- float tmp[ILP];
+ T tmp[ILP];
#pragma unroll
for (int j = 0; j < ILP; ++j)
@@ -227,28 +238,30 @@ ilpReduce(float* data,
return threadVal;
}
-template <int ILP>
+template <int ILP, typename T, typename AccumT>
__global__ void
-cunn_LogSoftMax_updateOutput_kernel(float *output, float *input, int classes)
+cunn_LogSoftMax_updateOutput_kernel(T *output, T *input, int classes)
{
- extern __shared__ float buffer[];
+ SharedMem<AccumT> smem;
+ AccumT *buffer = smem.getPointer();
// forward pointers to batch[blockIdx.x]
// each block handles a sample in the mini-batch
input += blockIdx.x * classes;
output += blockIdx.x * classes;
// find the max of the batch
- float threadMax =
- ilpReduce<MaxFloat, ILP>(input, classes, MaxFloat(), -FLT_MAX);
+ AccumT threadMax = ilpReduce<MaxFloat, ILP, T, AccumT>(
+ input, classes, MaxFloat<T, AccumT>(), -THCNumerics<AccumT>::max());
// find the max over all batches
- float max_k =
- blockReduce<MaxFloat>(buffer, threadMax, MaxFloat(), -FLT_MAX);
+ AccumT max_k = blockReduce<MaxFloat, AccumT>(
+ buffer, threadMax, MaxFloat<AccumT, AccumT>(), -THCNumerics<AccumT>::max());
+ T max_k_non_accum = ScalarConvert<AccumT, T>::to(max_k);
- float threadExp =
- ilpReduce<SumExpFloat, ILP>(input, classes, SumExpFloat(max_k), 0.0f);
- float logsum_k =
- blockReduce<SumFloat, LSMFinal>(
- buffer, threadExp, SumFloat(), 0.0f, LSMFinal(max_k));
+ AccumT threadExp = ilpReduce<SumExpFloat, ILP, T, AccumT>(
+ input, classes, SumExpFloat<T, AccumT>(max_k_non_accum), AccumT(0));
+ T logsum_k = ScalarConvert<AccumT, T>::to(
+ blockReduce<SumFloat, LSMFinal, AccumT>(
+ buffer, threadExp, SumFloat<AccumT, AccumT>(), AccumT(0), LSMFinal<AccumT>(max_k)));
// Output LSM (hand ILP)
int offset = threadIdx.x;
@@ -256,7 +269,7 @@ cunn_LogSoftMax_updateOutput_kernel(float *output, float *input, int classes)
int last = classes % (ILP * blockDim.x);
for (; offset < classes - last; offset += blockDim.x * ILP)
{
- float tmp[ILP];
+ T tmp[ILP];
#pragma unroll
for (int j = 0; j < ILP; ++j) {
@@ -276,30 +289,32 @@ cunn_LogSoftMax_updateOutput_kernel(float *output, float *input, int classes)
}
}
-template <int ILP>
+template <int ILP, typename T, typename AccumT>
__global__ void
-cunn_LogSoftMax_updateGradInput_kernel(float *gradInput,
- float *output,
- float *gradOutput,
+cunn_LogSoftMax_updateGradInput_kernel(T *gradInput,
+ T *output,
+ T *gradOutput,
int classes)
{
- extern __shared__ float buffer[];
+ SharedMem<AccumT> smem;
+ AccumT *buffer = smem.getPointer();
gradInput += blockIdx.x * classes;
output += blockIdx.x * classes;
gradOutput += blockIdx.x * classes;
- float threadSum =
- ilpReduce<SumFloat, 4>(gradOutput, classes, SumFloat(), 0.0f);
- float sum_k =
- blockReduce<SumFloat>(buffer, threadSum, SumFloat(), 0.0f);
+ AccumT threadSum = ilpReduce<SumFloat, 4, T, AccumT>(
+ gradOutput, classes, SumFloat<T, AccumT>(), AccumT(0));
+ T sum_k = ScalarConvert<AccumT, T>::to(
+ blockReduce<SumFloat, AccumT>(
+ buffer, threadSum, SumFloat<AccumT, AccumT>(), AccumT(0)));
// Update gradInput (hand ILP)
int offset = threadIdx.x;
int last = classes % (ILP * blockDim.x);
for (; offset < classes - last; offset += blockDim.x * ILP)
{
- float tmpGradOutput[ILP];
- float tmpOutput[ILP];
+ T tmpGradOutput[ILP];
+ T tmpOutput[ILP];
#pragma unroll
for (int j = 0; j < ILP; ++j)
@@ -312,237 +327,16 @@ cunn_LogSoftMax_updateGradInput_kernel(float *gradInput,
for (int j = 0; j < ILP; ++j)
{
gradInput[offset + j * blockDim.x] =
- tmpGradOutput[j] - __expf(tmpOutput[j]) * sum_k;
+ tmpGradOutput[j] - THCNumerics<T>::exp(tmpOutput[j]) * sum_k;
}
}
for (; offset < classes; offset += blockDim.x)
{
gradInput[offset] =
- gradOutput[offset] - __expf(output[offset]) * sum_k;
+ gradOutput[offset] - THCNumerics<T>::exp(output[offset]) * sum_k;
}
}
-void THNN_CudaLogSoftMax_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output)
-{
- THCUNN_assertSameGPU(state, 2, input, output);
-
- THCudaTensor_resizeAs(state, output, input);
-
- bool spatial = false;
- int batchSize = 1;
- int classSize = 0;
- int height = 0;
- int width = 0;
-
- int ndims = THCudaTensor_nDimension(state, input);
-
- if (ndims == 1)
- {
- classSize = THCudaTensor_size(state, input, 0);
- input = THCudaTensor_newContiguous(state, input);
- }
- else if (ndims == 2)
- {
- batchSize = THCudaTensor_size(state, input, 0);
- classSize = THCudaTensor_size(state, input, 1);
- input = THCudaTensor_newContiguous(state, input);
- }
- else if (ndims == 3)
- {
- spatial = true;
- classSize = THCudaTensor_size(state, input, 0);
- height = THCudaTensor_size(state, input, 1);
- width = THCudaTensor_size(state, input, 2);
-
- // create contiguous tensor with cuda layout from tensor with torch layout
- // C x H x W -> W x H x C
- THCudaTensor_transpose(state, input, input, 0, 2);
- // W x H x C -> H x W x C
- THCudaTensor_transpose(state, input, input, 0, 1);
- THCudaTensor *transposedInput = THCudaTensor_newContiguous(state, input);
- THCudaTensor_transpose(state, input, input, 0, 1);
- THCudaTensor_transpose(state, input, input, 0, 2);
- input = transposedInput;
- }
- else if (ndims == 4)
- {
- spatial = true;
- batchSize = THCudaTensor_size(state, input, 0);
- classSize = THCudaTensor_size(state, input, 1);
- height = THCudaTensor_size(state, input, 2);
- width = THCudaTensor_size(state, input, 3);
-
- // create contiguous tensor with cuda layout from tensor with torch layout
- // B x C x H x W -> B x W x H x C
- THCudaTensor_transpose(state, input, input, 1, 3);
- // B x W x H x C -> B x H x W x C
- THCudaTensor_transpose(state, input, input, 1, 2);
- THCudaTensor *transposedInput = THCudaTensor_newContiguous(state, input);
- THCudaTensor_transpose(state, input, input, 1, 2);
- THCudaTensor_transpose(state, input, input, 1, 3);
- input = transposedInput;
- }
- else
- {
- THError("1D, 2D, 3D or 4D Tensor expected");
- }
-
- if (!spatial)
- {
- dim3 grid(batchSize);
- dim3 block(1024);
-
- cunn_LogSoftMax_updateOutput_kernel<2>
- <<<grid, block, block.x * sizeof(float), THCState_getCurrentStream(state)>>>(
- THCudaTensor_data(state, output),
- THCudaTensor_data(state, input),
- classSize
- );
- }
- else
- {
- dim3 grid(batchSize);
- dim3 block(1024);
-
- cunn_SpatialLogSoftMax_updateOutput_kernel
- <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
- THCudaTensor_data(state, output),
- THCudaTensor_data(state, input),
- classSize, height, width
- );
- }
-
- cudaError errcode = cudaGetLastError();
- if (errcode != cudaSuccess)
- {
- THError(cudaGetErrorString(errcode));
- }
-
- THCudaTensor_free(state, input);
-}
-
-void THNN_CudaLogSoftMax_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput,
- THCudaTensor *gradInput, THCudaTensor *output)
-{
- THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
-
- THCudaTensor_resizeAs(state, gradInput, output);
-
- bool spatial = false;
- int batchSize = 1;
- int classSize = 0;
- int height = 0;
- int width = 0;
-
- int ndims = THCudaTensor_nDimension(state, input);
-
- if (ndims == 1)
- {
- classSize = THCudaTensor_size(state, gradInput, 0);
- output = THCudaTensor_newContiguous(state, output);
- gradOutput = THCudaTensor_newContiguous(state, gradOutput);
- }
- else if (ndims == 2)
- {
- batchSize = THCudaTensor_size(state, gradInput, 0);
- classSize = THCudaTensor_size(state, gradInput, 1);
- output = THCudaTensor_newContiguous(state, output);
- gradOutput = THCudaTensor_newContiguous(state, gradOutput);
- }
- else if (ndims == 3)
- {
- spatial = true;
- classSize = THCudaTensor_size(state, input, 0);
- height = THCudaTensor_size(state, input, 1);
- width = THCudaTensor_size(state, input, 2);
-
- // create contiguous tensor with cuda layout from tensor with torch layout
- // C x H x W -> W x H x C
- THCudaTensor_transpose(state, output, output, 0, 2);
- // W x H x C -> H x W x C
- THCudaTensor_transpose(state, output, output, 0, 1);
- THCudaTensor *transposedOutput = THCudaTensor_newContiguous(state, output);
- THCudaTensor_transpose(state, output, output, 0, 1);
- THCudaTensor_transpose(state, output, output, 0, 2);
- output = transposedOutput;
-
- // create contiguous tensor with cuda layout from tensor with torch layout
- // C x H x W -> W x H x C
- THCudaTensor_transpose(state, gradOutput, gradOutput, 0, 2);
- // W x H x C -> H x W x C
- THCudaTensor_transpose(state, gradOutput, gradOutput, 0, 1);
- THCudaTensor *transposedGradOutput = THCudaTensor_newContiguous(state, gradOutput);
- THCudaTensor_transpose(state, gradOutput, gradOutput, 0, 1);
- THCudaTensor_transpose(state, gradOutput, gradOutput, 0, 2);
- gradOutput = transposedGradOutput;
- }
- else if (ndims == 4)
- {
- spatial = true;
- batchSize = THCudaTensor_size(state, gradInput, 0);
- classSize = THCudaTensor_size(state, input, 1);
- height = THCudaTensor_size(state, input, 2);
- width = THCudaTensor_size(state, input, 3);
-
- // create contiguous tensor with cuda layout from tensor with torch layout
- // B x C x H x W -> B x W x H x C
- THCudaTensor_transpose(state, output, output, 1, 3);
- // B x W x H x C -> B x H x W x C
- THCudaTensor_transpose(state, output, output, 1, 2);
- THCudaTensor *transposedOutput = THCudaTensor_newContiguous(state, output);
- THCudaTensor_transpose(state, output, output, 1, 2);
- THCudaTensor_transpose(state, output, output, 1, 3);
- output = transposedOutput;
-
- // create contiguous tensor with cuda layout from tensor with torch layout
- // B x C x H x W -> B x W x H x C
- THCudaTensor_transpose(state, gradOutput, gradOutput, 1, 3);
- // B x W x H x C -> B x H x W x C
- THCudaTensor_transpose(state, gradOutput, gradOutput, 1, 2);
- THCudaTensor *transposedGradOutput = THCudaTensor_newContiguous(state, gradOutput);
- THCudaTensor_transpose(state, gradOutput, gradOutput, 1, 2);
- THCudaTensor_transpose(state, gradOutput, gradOutput, 1, 3);
- gradOutput = transposedGradOutput;
- }
- else
- {
- THError("1D, 2D, 3D or 4D Tensor expected");
- }
-
- if (!spatial)
- {
- dim3 grid(batchSize);
- dim3 block(1024);
-
- cunn_LogSoftMax_updateGradInput_kernel<2>
- <<<grid, block, block.x * sizeof(float), THCState_getCurrentStream(state)>>>(
- THCudaTensor_data(state, gradInput),
- THCudaTensor_data(state, output),
- THCudaTensor_data(state, gradOutput),
- classSize
- );
- }
- else
- {
- dim3 grid(batchSize);
- dim3 block(1024);
-
- cunn_SpatialLogSoftMax_updateGradInput_kernel
- <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
- THCudaTensor_data(state, gradInput),
- THCudaTensor_data(state, output),
- THCudaTensor_data(state, gradOutput),
- classSize, height, width
- );
- }
-
- cudaError errcode = cudaGetLastError();
- if (errcode != cudaSuccess)
- {
- THError(cudaGetErrorString(errcode));
- }
-
- THCudaTensor_free(state, gradOutput);
- THCudaTensor_free(state, output);
-}
+#include "generic/LogSoftMax.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/LookupTable.cu b/lib/THCUNN/LookupTable.cu
index 2b2040e..bb91c7e 100644
--- a/lib/THCUNN/LookupTable.cu
+++ b/lib/THCUNN/LookupTable.cu
@@ -9,6 +9,8 @@
#include <thrust/system/cuda/execution_policy.h>
#endif
#include <thrust/unique.h>
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
#ifndef DIVUP
#define DIVUP(x, y) (((x) + (y) - 1) / (y))
@@ -49,8 +51,9 @@ __device__ __forceinline__ bool warpHasCollision(int val)
return __any(dup) != 0;
}
+template <typename Dtype>
__global__ void cunn_LookupTable_accGradParametersKernelByFeature(
- long *input, float *gradOutput, float *gradWeight, float scale, long numel,
+ long *input, Dtype *gradOutput, Dtype *gradWeight, Dtype scale, long numel,
long stride, int paddingValue) {
const int featureDim = blockIdx.x * 4 + threadIdx.x / 32;
@@ -78,8 +81,9 @@ __global__ void cunn_LookupTable_accGradParametersKernelByFeature(
continue;
}
- float update = gradOutput[i*stride + featureDim] * scale;
+ Dtype update = gradOutput[i*stride + featureDim] * scale;
+ // FIXME: should we accumulate as accreal?
// Check for collision
if (warpHasCollision(weightIndex)) {
// Run all lanes sequentially; warp divergence
@@ -95,9 +99,10 @@ __global__ void cunn_LookupTable_accGradParametersKernelByFeature(
}
}
+template <typename Dtype, typename Acctype>
__global__ void cunn_LookupTable_accGradParametersKernel(
- long *input, long *indices, float *gradOutput, float *gradWeight,
- long *count, float defaultScale, long numel, long stride, int paddingValue) {
+ long *input, long *indices, Dtype *gradOutput, Dtype *gradWeight,
+ long *count, Dtype defaultScale, long numel, long stride, int paddingValue) {
int idx = blockIdx.x * 4 + threadIdx.y;
@@ -122,10 +127,10 @@ __global__ void cunn_LookupTable_accGradParametersKernel(
const int startFeature = threadIdx.x + blockIdx.y * blockDim.x * SZ;
const int weightRow = ((int) input[idx] - TH_INDEX_BASE) * stride;
const int gradOutputRow = ((int) indices[idx] - TH_INDEX_BASE) * stride;
- const float scale = count ? defaultScale / count[idx] : defaultScale;
+ const Acctype scale = count ? ScalarConvert<Dtype, Acctype>::to(defaultScale) / count[idx] : ScalarConvert<Dtype, Acctype>::to(defaultScale);
- float gradient[SZ];
- float weight[SZ];
+ Acctype gradient[SZ];
+ Acctype weight[SZ];
#pragma unroll
for (int ii = 0; ii < SZ; ii++)
@@ -133,8 +138,8 @@ __global__ void cunn_LookupTable_accGradParametersKernel(
int featureDim = startFeature + ii * WARP_SIZE;
if (featureDim < stride)
{
- gradient[ii] = gradOutput[gradOutputRow + featureDim];
- weight[ii] = gradWeight[weightRow + featureDim];
+ gradient[ii] = ScalarConvert<Dtype, Acctype>::to(gradOutput[gradOutputRow + featureDim]);
+ weight[ii] = ScalarConvert<Dtype, Acctype>::to(gradWeight[weightRow + featureDim]);
}
}
@@ -150,7 +155,7 @@ __global__ void cunn_LookupTable_accGradParametersKernel(
int featureDim = startFeature + ii * WARP_SIZE;
if (featureDim < stride)
{
- gradWeight[weightRow + featureDim] = weight[ii];
+ gradWeight[weightRow + featureDim] = ScalarConvert<Acctype, Dtype>::to(weight[ii]);
}
}
@@ -159,129 +164,23 @@ __global__ void cunn_LookupTable_accGradParametersKernel(
}
}
-void THNN_CudaLookupTable_accGradParameters(
- THCState *state,
- THIndexTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradWeight,
- THIndexTensor *count,
- THIndexTensor *sorted,
- THIndexTensor *indices,
- bool scaleGradByFreq,
- int paddingValue,
- float scale)
-{
- THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, sorted, indices);
- if (!(THIndexTensor_(isContiguous)(state, input) &&
- THCudaTensor_isContiguous(state, gradOutput) &&
- THCudaTensor_isContiguous(state, gradWeight)))
- {
- THError("Tensors must be contiguous");
- }
-
- int nDim = THIndexTensor_(nDimension)(state, input);
- if (nDim != 1 && nDim != 2)
- THError("input must be a vector or matrix");
-
- long numel = THIndexTensor_(nElement)(state, input);
- long stride = gradWeight->stride[0];
-
- cudaStream_t stream = THCState_getCurrentStream(state);
-
- if (numel <= 768 && !scaleGradByFreq) {
- cunn_LookupTable_accGradParametersKernelByFeature<<<DIVUP(stride,4), 128, 0, stream>>>(
- THIndexTensor_(data)(state, input),
- THCudaTensor_data(state, gradOutput),
- THCudaTensor_data(state, gradWeight),
- scale,
- numel,
- stride,
- paddingValue);
- THCudaCheck(cudaGetLastError());
- return;
- }
-
- THLongStorage *inputSize = THIndexTensor_(newSizeOf)(state, input);
- THIndexTensor_(resize)(state, sorted, inputSize, NULL);
- THIndexTensor_(resize)(state, indices, inputSize, NULL);
- THLongStorage_free(inputSize);
-
- // Sort the inputs into sorted with the corresponding indices
- THIndexTensor_(sort)(state, sorted, indices, input, 0, 0);
-
- long *sorted_data = THIndexTensor_(data)(state, sorted);
- long *indices_data = THIndexTensor_(data)(state, indices);
- long *count_data = NULL;
-
- if (scaleGradByFreq)
- {
- THIndexTensor_(resizeAs)(state, count, input);
- count_data = THIndexTensor_(data)(state, count);
-
- thrust::device_ptr<long> sorted_ptr(sorted_data);
- thrust::device_ptr<long> count_ptr(count_data);
-
- // Compute an increasing sequence per unique item in sorted:
- // sorted: 2 5 5 5 7 7 8 9 9
- // count: 1 1 2 3 1 2 1 1 2
- thrust::inclusive_scan_by_key(
-#if CUDA_VERSION >= 7000
- thrust::cuda::par.on(THCState_getCurrentStream(state)),
-#endif
- sorted_ptr,
- sorted_ptr + numel,
- thrust::make_constant_iterator(1),
- count_ptr
- );
-
- // Take the maximum of each count per unique key in reverse:
- // sorted: 2 5 5 5 7 7 8 9 9
- // count: 1 3 3 3 2 2 1 2 2
- thrust::inclusive_scan_by_key(
-#if CUDA_VERSION >= 7000
- thrust::cuda::par.on(THCState_getCurrentStream(state)),
-#endif
- thrust::make_reverse_iterator(sorted_ptr + numel),
- thrust::make_reverse_iterator(sorted_ptr),
- thrust::make_reverse_iterator(count_ptr + numel),
- thrust::make_reverse_iterator(count_ptr + numel),
- thrust::equal_to<long>(),
- thrust::maximum<long>()
- );
- }
-
- dim3 grid(DIVUP(numel,4), DIVUP(stride,128));
- dim3 block(32, 4);
- cunn_LookupTable_accGradParametersKernel<<<grid, block, 0, stream>>>(
- sorted_data,
- indices_data,
- THCudaTensor_data(state, gradOutput),
- THCudaTensor_data(state, gradWeight),
- count_data,
- scale,
- numel,
- stride,
- paddingValue
- );
- THCudaCheck(cudaGetLastError());
-}
-
/*
* Keep the norm of weight smaller than maxNorm
*/
-template <typename T>
+template <typename Dtype, typename Acctype>
struct pow_v
{
- T normType;
- pow_v(T v) : normType(v) {}
+ Acctype normType;
+ pow_v(Dtype v) : normType(ScalarConvert<Dtype, Acctype>::to(v)) {}
__host__ __device__
- T operator()(const T& x) const {
+ Acctype operator()(const Dtype& x) const {
+ Acctype xA = ScalarConvert<Dtype, Acctype>::to(x);
if (normType == 1)
- return std::abs(x);
+ return std::abs(xA);
else if (normType == 2)
- return x * x;
+ return xA * xA;
else
- return std::pow(std::abs(x), normType);
+ return std::pow(std::abs(xA), normType);
}
};
@@ -296,47 +195,5 @@ struct multiply_s
}
};
-void THNN_CudaLookupTable_renorm(
- THCState *state,
- THIndexTensor *idx,
- THCudaTensor *weight,
- float maxNorm,
- float normType)
-{
- THCUNN_assertSameGPU(state, 2, idx, weight);
- if (!(THIndexTensor_(isContiguous)(state, idx) &&
- THCudaTensor_isContiguous(state, weight)))
- {
- THError("Tensors must be contiguous");
- }
- if (THIndexTensor_(nDimension)(state, idx) != 1)
- THError("idx must be a vector");
- if (normType <= 0)
- THError("non-positive-norm not supported");
-
- long numel = THIndexTensor_(nElement)(state, idx);
- long stride = weight->stride[0];
-
- // get the unique indices
- thrust::device_ptr<float> weight_ptr(THCudaTensor_data(state, weight));
- thrust::device_ptr<long> idx_ptr(THIndexTensor_(data)(state, idx));
- thrust::device_ptr<long> end_ptr = thrust::unique(idx_ptr, idx_ptr+numel);
- numel = end_ptr - idx_ptr;
-
- pow_v<float> unary_pow(normType);
- thrust::plus<float> binary_plus;
- // numel << stride, since idx usually contains sparse row indices
- for (long i = 0; i < numel; i++)
- {
- long k = idx_ptr[i] - TH_INDEX_BASE;
- thrust::device_ptr<float> row_ptr = weight_ptr + k * stride;
- float norm = thrust::transform_reduce(row_ptr, row_ptr + stride,
- unary_pow, 0, binary_plus);
- norm = std::pow(norm, (float) (1.0 / normType));
- if (norm > maxNorm)
- {
- multiply_s<float> unary_mul(maxNorm / (norm + 1e-7));
- thrust::transform(row_ptr, row_ptr + stride, row_ptr, unary_mul);
- }
- }
-}
+#include "generic/LookupTable.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/MSECriterion.cu b/lib/THCUNN/MSECriterion.cu
index 982e19a..26a35a5 100644
--- a/lib/THCUNN/MSECriterion.cu
+++ b/lib/THCUNN/MSECriterion.cu
@@ -1,5 +1,7 @@
#include "THCUNN.h"
#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
#include <thrust/fill.h>
#include <thrust/functional.h>
@@ -10,87 +12,32 @@
#include <thrust/system/cuda/execution_policy.h>
#endif
+template <typename Dtype, typename Acctype>
struct mse_functor
{
mse_functor() {}
- __host__ __device__ float operator()(const float &x, const float &y) const
+ __host__ __device__ Acctype operator()(const Dtype &x, const Dtype &y) const
{
- float z = x-y;
+ Acctype z = ScalarConvert<Dtype, Acctype>::to(x)-y;
return z*z;
}
};
-void THNN_CudaMSECriterion_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *output, bool sizeAverage)
-{
- THCUNN_assertSameGPU(state, 2, input, target);
- THArgCheck(THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2,
- "input and target need to have the same number of elements"
- );
-
- long size = THCudaTensor_nElement(state, input);
-
- input = THCudaTensor_newContiguous(state, input);
- target = THCudaTensor_newContiguous(state, target);
-
- thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
- thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
- float sum = thrust::inner_product(
-#if CUDA_VERSION >= 7000
- thrust::cuda::par.on(THCState_getCurrentStream(state)),
-#endif
- input_data, input_data+size, target_data, (float) 0,
- thrust::plus<float>(), mse_functor());
-
- if (sizeAverage)
- sum /= size;
-
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, target);
-
- THCudaTensor_set1d(state, output, 0, sum);
-}
-
+template <typename Dtype, typename Acctype>
struct mse_updateGradInput_functor
{
- const float norm;
+ const Acctype norm;
- mse_updateGradInput_functor(float norm_)
+ mse_updateGradInput_functor(Acctype norm_)
: norm(norm_)
{}
- __host__ __device__ float operator()(const float &x, const float &y) const
+ __host__ __device__ Dtype operator()(const Dtype &x, const Dtype &y) const
{
- return norm * (x - y);
+ return ScalarConvert<Acctype, Dtype>::to(norm * (ScalarConvert<Dtype, Acctype>::to(x) - y));
}
};
-void THNN_CudaMSECriterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage)
-{
- THCUNN_assertSameGPU(state, 3, input, target, gradInput);
- THArgCheck(THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2,
- "input and target need to have the same number of elements"
- );
-
- long size = THCudaTensor_nElement(state, input);
- float norm = sizeAverage ? 2.f/size : 2.f;
-
- input = THCudaTensor_newContiguous(state, input);
- target = THCudaTensor_newContiguous(state, target);
-
- THCudaTensor_resizeAs(state, gradInput, input);
-
- thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
- thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
- thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput));
-
- thrust::transform(
-#if CUDA_VERSION >= 7000
- thrust::cuda::par.on(THCState_getCurrentStream(state)),
-#endif
- input_data, input_data+size, target_data, gradInput_data,
- mse_updateGradInput_functor(norm));
-
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, target);
-}
+#include "generic/MSECriterion.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/MarginCriterion.cu b/lib/THCUNN/MarginCriterion.cu
index b49e1dc..7ccdbb7 100644
--- a/lib/THCUNN/MarginCriterion.cu
+++ b/lib/THCUNN/MarginCriterion.cu
@@ -1,5 +1,7 @@
#include "THCUNN.h"
#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
#include <thrust/fill.h>
#include <thrust/functional.h>
@@ -7,76 +9,37 @@
#include <thrust/reduce.h>
#include <thrust/inner_product.h>
+template <typename Dtype, typename Acctype>
struct margin_functor
{
- margin_functor(float margin)
+ margin_functor(Acctype margin)
: margin(margin)
{}
- __host__ __device__ float operator()(const float &x, const float &y) const
+ __host__ __device__ Acctype operator()(const Dtype &x, const Dtype &y) const
{
- float z = margin - x * y;
+ Acctype z = margin - ScalarConvert<Dtype, Acctype>::to(x) * y;
return z >= 0 ? z : 0;
}
- const float margin;
+ const Acctype margin;
};
-void THNN_CudaMarginCriterion_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *output, bool sizeAverage, float margin)
-{
- THCUNN_assertSameGPU(state, 2, input, target);
-
- long size = THCudaTensor_nElement(state, input);
-
- input = THCudaTensor_newContiguous(state, input);
- target = THCudaTensor_newContiguous(state, target);
-
- thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
- thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
- float sum = thrust::inner_product(input_data, input_data+size, target_data, (float) 0, thrust::plus<float>(), margin_functor(margin));
-
- if (sizeAverage)
- sum /= size;
-
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, target);
-
- THCudaTensor_set1d(state, output, 0, sum);
-}
-
+template <typename Dtype, typename Acctype>
struct margin_updateGradInput_functor
{
- const float margin, norm;
+ const Acctype margin, norm;
- margin_updateGradInput_functor(float margin_, float norm_)
+ margin_updateGradInput_functor(Acctype margin_, Acctype norm_)
: margin(margin_)
, norm(norm_)
{}
- __host__ __device__ float operator()(const float &x, const float &y) const
+ __host__ __device__ Dtype operator()(const Dtype &x, const Dtype &y) const
{
- return (x * y) < margin ? -norm * y : 0;
+ return ScalarConvert<Acctype, Dtype>::to((ScalarConvert<Dtype, Acctype>::to(x) * y) < margin ? -norm * y : 0);
}
};
-void THNN_CudaMarginCriterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage, float margin)
-{
- THCUNN_assertSameGPU(state, 3, input, target, gradInput);
-
- long size = THCudaTensor_nElement(state, input);
- float norm = sizeAverage ? 1.f/size : 1;
-
- input = THCudaTensor_newContiguous(state, input);
- target = THCudaTensor_newContiguous(state, target);
-
- THCudaTensor_resizeAs(state, gradInput, input);
-
- thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
- thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
- thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput));
-
- thrust::transform(input_data, input_data+size, target_data, gradInput_data, margin_updateGradInput_functor(margin, norm));
-
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, target);
-}
+#include "generic/MarginCriterion.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/MultiLabelMarginCriterion.cu b/lib/THCUNN/MultiLabelMarginCriterion.cu
index 903e064..72fc486 100644
--- a/lib/THCUNN/MultiLabelMarginCriterion.cu
+++ b/lib/THCUNN/MultiLabelMarginCriterion.cu
@@ -1,60 +1,63 @@
#include "THCUNN.h"
#include "common.h"
#include "THCReduceApplyUtils.cuh"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
#include <thrust/functional.h>
#define MULTILABELMARGIN_THREADS 1024
-__global__ void cunn_MultiLabelMarginCriterion_updateOutput_kernel(float *output,
- float *input,
- float *target,
- float *istarget,
+template <typename Dtype, typename Acctype>
+__global__ void cunn_MultiLabelMarginCriterion_updateOutput_kernel(Dtype *output,
+ Dtype *input,
+ THCIndex_t *target,
+ Dtype *istarget,
int nframe,
int dim,
int sizeaverage)
{
// Temporary sums (for mapreduce)
- __shared__ float sums[MULTILABELMARGIN_THREADS];
+ __shared__ Acctype sums[MULTILABELMARGIN_THREADS];
// vectors:
int k = blockIdx.x;
- float *input_k = input + k*dim;
- float *target_k = target + k*dim;
- float *output_k = output + k;
- float *istarget_k = istarget + k*dim;
+ Dtype *input_k = input + k*dim;
+ THCIndex_t *target_k = target + k*dim;
+ Dtype *output_k = output + k;
+ Dtype *istarget_k = istarget + k*dim;
// zero istarget
for (int d = threadIdx.x; d < dim; d += blockDim.x) {
- istarget_k[d] = 0;
+ istarget_k[d] = ScalarConvert<int, Dtype>::to(0);
}
__syncthreads();
// mark targets in istarget
if (threadIdx.x == 0) {
for (int dt = 0; dt < dim; dt++) {
- int target_idx = (int)target_k[dt] - TH_INDEX_BASE;
+ int target_idx = target_k[dt] - TH_INDEX_BASE;
if (target_idx < 0) break;
- istarget_k[target_idx] = 1;
+ istarget_k[target_idx] = ScalarConvert<int, Dtype>::to(1);
}
}
__syncthreads();
// iterate over targets
- float sum = 0;
+ Acctype sum = 0;
for (int dt = 0; dt < dim; dt++) {
// next target:
- int target_idx = (int)target_k[dt] - TH_INDEX_BASE;
+ int target_idx = target_k[dt] - TH_INDEX_BASE;
if (target_idx < 0) break;
// current value for target
- float input_target_k = input_k[target_idx];
+ Dtype input_target_k = input_k[target_idx];
// compare to all inputs (multithreaded):
for (int d = threadIdx.x; d < dim; d += blockDim.x) {
// contribute to loss only if not a target
- if (!istarget_k[d]) {
- float z = 1 - input_target_k + input_k[d];
+ if (!ScalarConvert<Dtype, int>::to(istarget_k[d])) {
+ Dtype z = 1 - input_target_k + input_k[d];
if (z > 0)
sum += z;
}
@@ -62,40 +65,41 @@ __global__ void cunn_MultiLabelMarginCriterion_updateOutput_kernel(float *output
}
// reduce
- float totalSum = reduceBlock(sums, blockDim.x, sum, thrust::plus<float>(), 0.0f);
+ Acctype totalSum = reduceBlock(sums, blockDim.x, sum, thrust::plus<Acctype>(), (Acctype)0);
if (threadIdx.x == 0) {
if (sizeaverage) {
- *output_k = (totalSum / dim) / nframe;
+ *output_k = ScalarConvert<Acctype, Dtype>::to((totalSum / dim) / nframe);
} else {
- *output_k = totalSum / dim;
+ *output_k = ScalarConvert<Acctype, Dtype>::to(totalSum / dim);
}
}
}
-__global__ void cunn_MultiLabelMarginCriterion_updateGradInput_kernel(float *gradInput,
- float *input,
- float *target,
- float *istarget,
+template <typename Dtype, typename Acctype>
+__global__ void cunn_MultiLabelMarginCriterion_updateGradInput_kernel(Dtype *gradInput,
+ Dtype *input,
+ THCIndex_t *target,
+ Dtype *istarget,
int nframe,
int dim,
int sizeaverage)
{
// Temporary sums (for mapreduce)
- __shared__ float sums[MULTILABELMARGIN_THREADS];
+ __shared__ Acctype sums[MULTILABELMARGIN_THREADS];
// vectors:
int k = blockIdx.x;
- float *input_k = input + k*dim;
- float *gradInput_k = gradInput + k*dim;
- float *target_k = target + k*dim;
- float *istarget_k = istarget + k*dim;
+ Dtype *input_k = input + k*dim;
+ Dtype *gradInput_k = gradInput + k*dim;
+ THCIndex_t *target_k = target + k*dim;
+ Dtype *istarget_k = istarget + k*dim;
// gain:
- float g = ( sizeaverage ? 1./((float)(nframe*dim)) : 1./((float)dim) );
+ Dtype g = ScalarConvert<Acctype, Dtype>::to( sizeaverage ? 1./((Acctype)(nframe*dim)) : 1./((Acctype)dim) );
// zero gradients:
for (int d = threadIdx.x; d < dim; d += blockDim.x) {
- gradInput_k[d] = 0;
+ gradInput_k[d] = ScalarConvert<int, Dtype>::to(0);
}
__syncthreads();
@@ -106,14 +110,14 @@ __global__ void cunn_MultiLabelMarginCriterion_updateGradInput_kernel(float *gra
if (target_idx < 0) break;
// current value for target
- float input_target_k = input_k[target_idx];
+ Dtype input_target_k = input_k[target_idx];
// compare to all inputs (multithreaded):
- float sum = 0;
+ Acctype sum = 0;
for (int d = threadIdx.x; d < dim; d += blockDim.x) {
// contribute to loss only if not a target
- if (!istarget_k[d]) {
- float z = 1 - input_target_k + input_k[d];
+ if (!ScalarConvert<Dtype, int>::to(istarget_k[d])) {
+ Dtype z = 1 - input_target_k + input_k[d];
if (z > 0) {
sum -= g;
gradInput_k[d] += g;
@@ -123,118 +127,15 @@ __global__ void cunn_MultiLabelMarginCriterion_updateGradInput_kernel(float *gra
__syncthreads();
// reduce sum
- float totalSum = reduceBlock(sums, blockDim.x, sum, thrust::plus<float>(), 0.0f);
+ Acctype totalSum = reduceBlock(sums, blockDim.x, sum, thrust::plus<Acctype>(), (Acctype)0);
if (threadIdx.x == 0) {
- gradInput_k[target_idx] += totalSum;
+ gradInput_k[target_idx] += ScalarConvert<Acctype, Dtype>::to(totalSum);
}
__syncthreads();
}
}
-void THNN_CudaMultiLabelMarginCriterion_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *output,
- THCudaTensor *istarget,
- bool sizeaverage)
-{
- input = THCudaTensor_newContiguous(state, input);
- target = THCudaTensor_newContiguous(state, target);
- istarget = THCudaTensor_newContiguous(state, istarget);
- THCudaTensor_resizeAs(state, istarget, input);
-
- if(input->nDimension == 1)
- {
- THCudaTensor_resize1d(state, output, 1);
-
- dim3 blocks(1);
- dim3 threads(MULTILABELMARGIN_THREADS);
-
- cunn_MultiLabelMarginCriterion_updateOutput_kernel<<<blocks,threads>>>(
- THCudaTensor_data(state, output),
- THCudaTensor_data(state, input),
- THCudaTensor_data(state, target),
- THCudaTensor_data(state, istarget),
- 1, input->size[0],
- sizeaverage
- );
- THCudaCheck(cudaGetLastError());
- }
- else if(input->nDimension == 2)
- {
- THCudaTensor *output_tmp = THCudaTensor_newWithSize1d(state, input->size[0]);
-
- dim3 blocks(input->size[0]);
- dim3 threads(MULTILABELMARGIN_THREADS);
-
- cunn_MultiLabelMarginCriterion_updateOutput_kernel<<<blocks,threads>>>(
- THCudaTensor_data(state, output_tmp),
- THCudaTensor_data(state, input),
- THCudaTensor_data(state, target),
- THCudaTensor_data(state, istarget),
- input->size[0], input->size[1],
- sizeaverage
- );
- THCudaCheck(cudaGetLastError());
- THCudaTensor_resize1d(state, output, 1);
- THCudaTensor_set1d(state, output, 0, THCudaTensor_sumall(state, output_tmp));
- THCudaTensor_free(state, output_tmp);
- }
- else
- THError("vector or matrix expected");
-
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, target);
- THCudaTensor_free(state, istarget);
-}
-
-void THNN_CudaMultiLabelMarginCriterion_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *gradInput,
- THCudaTensor *istarget,
- bool sizeaverage)
-{
- input = THCudaTensor_newContiguous(state, input);
- target = THCudaTensor_newContiguous(state, target);
- istarget = THCudaTensor_newContiguous(state, istarget);
- THCudaTensor_resizeAs(state, gradInput, input);
-
- if(gradInput->nDimension == 1)
- {
- dim3 blocks(1);
- dim3 threads(MULTILABELMARGIN_THREADS);
-
- cunn_MultiLabelMarginCriterion_updateGradInput_kernel<<<blocks,threads>>>(THCudaTensor_data(state, gradInput),
- THCudaTensor_data(state, input),
- THCudaTensor_data(state, target),
- THCudaTensor_data(state, istarget),
- 1, gradInput->size[0],
- sizeaverage);
-
- }
- else if(gradInput->nDimension == 2)
- {
- dim3 blocks(gradInput->size[0]);
- dim3 threads(MULTILABELMARGIN_THREADS);
-
- cunn_MultiLabelMarginCriterion_updateGradInput_kernel<<<blocks,threads>>>(THCudaTensor_data(state, gradInput),
- THCudaTensor_data(state, input),
- THCudaTensor_data(state, target),
- THCudaTensor_data(state, istarget),
- gradInput->size[0], gradInput->size[1],
- sizeaverage);
- }
- else
- THError("vector or matrix expected");
-
- THCudaCheck(cudaGetLastError());
-
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, target);
- THCudaTensor_free(state, istarget);
-}
+#include "generic/MultiLabelMarginCriterion.cu"
+#include "THCGenerateFloatTypes.h"
#undef MULTILABELMARGIN_THREADS
diff --git a/lib/THCUNN/MultiMarginCriterion.cu b/lib/THCUNN/MultiMarginCriterion.cu
index 31caa75..b5ed75b 100644
--- a/lib/THCUNN/MultiMarginCriterion.cu
+++ b/lib/THCUNN/MultiMarginCriterion.cu
@@ -1,17 +1,19 @@
#include "THCUNN.h"
#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
#define MULTIMARGIN_THREADS 128
-template <int P>
-__global__ void cunn_MultiMarginCriterion_updateOutput_kernel(float *output, float *input, float *target, float *weights, int nframe, int dim, bool sizeAverage, float margin)
+template <int P, typename Dtype, typename Acctype>
+__global__ void cunn_MultiMarginCriterion_updateOutput_kernel(Dtype *output, Dtype *input, THCIndex_t *target, Dtype *weights, int nframe, int dim, bool sizeAverage, Dtype margin)
{
- __shared__ float buffer[MULTIMARGIN_THREADS];
+ __shared__ Acctype buffer[MULTIMARGIN_THREADS];
int k = blockIdx.x;
- float *input_k = input + k*dim;
- float *output_k = output + k;
+ Dtype *input_k = input + k*dim;
+ Dtype *output_k = output + k;
int target_k = ((int)target[k]) - TH_INDEX_BASE;
- float input_target_k = input_k[target_k];
+ Dtype input_target_k = input_k[target_k];
int i_start = threadIdx.x;
int i_end = dim;
@@ -20,12 +22,12 @@ __global__ void cunn_MultiMarginCriterion_updateOutput_kernel(float *output, flo
buffer[threadIdx.x] = 0;
for (int i = i_start; i < i_end; i += i_step)
{
- float z = margin - input_target_k + input_k[i];
+ Dtype z = margin - input_target_k + input_k[i];
if (i == target_k)
continue;
if (z > 0) {
- float h = (P==1) ? z : z*z;
+ Dtype h = (P==1) ? z : z*z;
if(weights)
h *= weights[target_k];
buffer[threadIdx.x] += h;
@@ -36,26 +38,26 @@ __global__ void cunn_MultiMarginCriterion_updateOutput_kernel(float *output, flo
// reduce
if (threadIdx.x == 0)
{
- float sum = 0;
+ Acctype sum = 0;
for (int i=0; i < blockDim.x; i++)
sum += buffer[i];
- *output_k = sum/dim;
+ *output_k = ScalarConvert<Acctype, Dtype>::to(sum/dim);
if(sizeAverage)
*output_k /= nframe;
}
}
-template <int P>
-__global__ void cunn_MultiMarginCriterion_updateGradInput_kernel(float *gradInput, float *input, float *target, float *weights, int nframe, int dim, bool sizeAverage, float margin)
+template <int P, typename Dtype, typename Acctype>
+__global__ void cunn_MultiMarginCriterion_updateGradInput_kernel(Dtype *gradInput, Dtype *input, THCIndex_t *target, Dtype *weights, int nframe, int dim, bool sizeAverage, Dtype margin)
{
- __shared__ float buffer[MULTIMARGIN_THREADS];
+ __shared__ Acctype buffer[MULTIMARGIN_THREADS];
int k = blockIdx.x;
- float *input_k = input + k*dim;
- float *gradInput_k = gradInput + k*dim;
+ Dtype *input_k = input + k*dim;
+ Dtype *gradInput_k = gradInput + k*dim;
int target_k = ((int)target[k]) - TH_INDEX_BASE;
- float input_target_k = input_k[target_k];
- float g = (sizeAverage ? 1./((float)(nframe*dim)) : 1./((float)dim));
+ Dtype input_target_k = input_k[target_k];
+ Acctype g = (sizeAverage ? 1./((Acctype)(nframe*dim)) : 1./((Acctype)dim));
int i_start = threadIdx.x;
int i_end = dim;
@@ -64,20 +66,20 @@ __global__ void cunn_MultiMarginCriterion_updateGradInput_kernel(float *gradInpu
buffer[threadIdx.x] = 0;
for (int i=i_start; i<i_end; i+=i_step)
{
- float z = margin - input_target_k + input_k[i];
+ Dtype z = margin - input_target_k + input_k[i];
if (i == target_k)
continue;
if (z > 0)
{
- float h = (P == 1) ? g : 2*g*z;
+ Dtype h = ScalarConvert<Acctype, Dtype>::to((P == 1) ? g : 2*g*z);
if(weights)
h *= weights[target_k];
buffer[threadIdx.x] -= h;
gradInput_k[i] = h;
}
else
- gradInput_k[i] = 0;
+ gradInput_k[i] = ScalarConvert<int, Dtype>::to(0);
}
__syncthreads();
@@ -85,175 +87,14 @@ __global__ void cunn_MultiMarginCriterion_updateGradInput_kernel(float *gradInpu
// reduce
if (threadIdx.x == 0)
{
- float gradInput_target_k = 0;
+ Acctype gradInput_target_k = 0;
for (int i=0; i<blockDim.x; i++)
gradInput_target_k += buffer[i];
- gradInput_k[target_k] = gradInput_target_k;
+ gradInput_k[target_k] = ScalarConvert<Acctype, Dtype>::to(gradInput_target_k);
}
}
-void THNN_CudaMultiMarginCriterion_updateOutput(THCState *state, THCudaTensor *input,
- THCudaTensor *target, THCudaTensor *output,
- bool sizeAverage, int p, THCudaTensor *weights,
- float margin)
-{
- THCUNN_assertSameGPU(state, 2, input, target);
- input = THCudaTensor_newContiguous(state, input);
- if(weights)
- weights = THCudaTensor_newContiguous(state, weights);
- if (input->nDimension == 1)
- {
- dim3 blocks(1);
- dim3 threads(MULTIMARGIN_THREADS);
- if (p == 1)
- {
- cunn_MultiMarginCriterion_updateOutput_kernel<1> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
- THCudaTensor_data(state, output),
- THCudaTensor_data(state, input),
- THCudaTensor_data(state, target),
- weights ? THCudaTensor_data(state, weights) : NULL,
- 1, input->size[0],
- sizeAverage,
- margin
- );
- }
- else if (p == 2)
- {
- cunn_MultiMarginCriterion_updateOutput_kernel<2> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
- THCudaTensor_data(state, output),
- THCudaTensor_data(state, input),
- THCudaTensor_data(state, target),
- weights ? THCudaTensor_data(state, weights) : NULL,
- 1, input->size[0],
- sizeAverage,
- margin
- );
- }
- THCudaCheck(cudaGetLastError());
- }
- else if (input->nDimension == 2)
- {
- THCudaTensor *output_ = THCudaTensor_newWithSize1d(state, input->size[0]); // tmp outupt buffer
- dim3 blocks(input->size[0]);
- dim3 threads(MULTIMARGIN_THREADS);
- if (p == 1)
- {
- cunn_MultiMarginCriterion_updateOutput_kernel<1> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
- THCudaTensor_data(state, output_),
- THCudaTensor_data(state, input),
- THCudaTensor_data(state, target),
- weights ? THCudaTensor_data(state, weights) : NULL,
- input->size[0], input->size[1],
- sizeAverage,
- margin
- );
- }
- else if (p == 2)
- {
- cunn_MultiMarginCriterion_updateOutput_kernel<2> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
- THCudaTensor_data(state, output_),
- THCudaTensor_data(state, input),
- THCudaTensor_data(state, target),
- weights ? THCudaTensor_data(state, weights) : NULL,
- input->size[0], input->size[1],
- sizeAverage,
- margin
- );
- }
- THCudaCheck(cudaGetLastError());
- float sum = THCudaTensor_sumall(state, output_);
- THCudaTensor_set1d(state, output, 0, sum);
- THCudaTensor_free(state, output_);
- }
- else
- {
- THError("vector or matrix expected");
- }
-
- THCudaTensor_free(state, input);
- if(weights)
- THCudaTensor_free(state, weights);
-}
-
-void THNN_CudaMultiMarginCriterion_updateGradInput(THCState *state, THCudaTensor *input,
- THCudaTensor *target, THCudaTensor *gradInput,
- bool sizeAverage, int p, THCudaTensor *weights,
- float margin)
-{
- THCUNN_assertSameGPU(state, 3, input, gradInput, target);
- input = THCudaTensor_newContiguous(state, input);
- THCudaTensor_resizeAs(state, gradInput, input);
- if(weights)
- weights = THCudaTensor_newContiguous(state, weights);
+#include "generic/MultiMarginCriterion.cu"
+#include "THCGenerateFloatTypes.h"
- if (input->nDimension == 1)
- {
- dim3 blocks(1);
- dim3 threads(MULTIMARGIN_THREADS);
-
- if (p == 1)
- {
- cunn_MultiMarginCriterion_updateGradInput_kernel<1> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
- THCudaTensor_data(state, gradInput),
- THCudaTensor_data(state, input),
- THCudaTensor_data(state, target),
- weights ? THCudaTensor_data(state, weights) : NULL,
- 1, gradInput->size[0],
- sizeAverage,
- margin
- );
- }
- else if (p == 2)
- {
- cunn_MultiMarginCriterion_updateGradInput_kernel<2> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
- THCudaTensor_data(state, gradInput),
- THCudaTensor_data(state, input),
- THCudaTensor_data(state, target),
- weights ? THCudaTensor_data(state, weights) : NULL,
- 1, gradInput->size[0],
- sizeAverage,
- margin
- );
- }
- THCudaCheck(cudaGetLastError());
- }
- else if (input->nDimension == 2)
- {
- dim3 blocks(gradInput->size[0]);
- dim3 threads(MULTIMARGIN_THREADS);
-
- if (p == 1)
- {
- cunn_MultiMarginCriterion_updateGradInput_kernel<1> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
- THCudaTensor_data(state, gradInput),
- THCudaTensor_data(state, input),
- THCudaTensor_data(state, target),
- weights ? THCudaTensor_data(state, weights) : NULL,
- gradInput->size[0], gradInput->size[1],
- sizeAverage,
- margin
- );
- }
- else if (p == 2)
- {
- cunn_MultiMarginCriterion_updateGradInput_kernel<2> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
- THCudaTensor_data(state, gradInput),
- THCudaTensor_data(state, input),
- THCudaTensor_data(state, target),
- weights ? THCudaTensor_data(state, weights) : NULL,
- gradInput->size[0], gradInput->size[1],
- sizeAverage,
- margin
- );
- }
- THCudaCheck(cudaGetLastError());
- }
- else
- {
- THError("vector or matrix expected");
- }
-
- THCudaTensor_free(state, input);
- if(weights)
- THCudaTensor_free(state, weights);
-}
+#undef MULTIMARGIN_THREADS
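
The pattern applied to MultiMarginCriterion above, and repeated across the rest of this merge, is to delete the hand-written float-only THNN_Cuda* entry points and re-include a generic translation unit once per floating-point type via THCGenerateFloatTypes.h. A minimal self-contained C++ sketch of that instantiate-by-reinclusion idea follows; the DEFINE_ABS_KERNEL macro and the function names are hypothetical stand-ins, not the real THC generation machinery.

// sketch.cpp -- illustrative only; the real mechanism re-includes generic/*.cu
// under THCGenerateFloatTypes.h, which defines the scalar type and name suffix.
#include <cstdio>

#define DEFINE_ABS_KERNEL(TYPE, SUFFIX)                                    \
  void abs_updateOutput_##SUFFIX(const TYPE *in, TYPE *out, int n) {       \
    for (int i = 0; i < n; ++i) out[i] = in[i] < TYPE(0) ? -in[i] : in[i]; \
  }

DEFINE_ABS_KERNEL(float, Float)    /* plays the role of THCGenerateFloatType.h  */
DEFINE_ABS_KERNEL(double, Double)  /* plays the role of THCGenerateDoubleType.h */

int main() {
  float in[3] = {-1.f, 0.f, 2.f}, out[3];
  abs_updateOutput_Float(in, out, 3);
  std::printf("%g %g %g\n", out[0], out[1], out[2]);  // 1 0 2
  return 0;
}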
diff --git a/lib/THCUNN/PReLU.cu b/lib/THCUNN/PReLU.cu
index 048c0b4..dfac397 100644
--- a/lib/THCUNN/PReLU.cu
+++ b/lib/THCUNN/PReLU.cu
@@ -1,25 +1,27 @@
#include "THCUNN.h"
-#include "THCReduce.cuh"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
-#include <thrust/functional.h>
+#include "common.h"
+template <typename T>
struct PReLUUpdateOutput
{
- float* weight_;
+ T* weight_;
- PReLUUpdateOutput(float* weight)
+ PReLUUpdateOutput(T* weight)
: weight_(weight)
{}
- __device__ __forceinline__ void operator()(float *out, float *in)
+ __device__ __forceinline__ void operator()(T *out, T *in)
{
- float x = *in;
+ T x = *in;
*out = (x > 0) ? x : weight_[0] * x;
}
};
-__global__ void preluForward(float *output, const float *input, const float *weight, int n, int nElemsPerSample, int mapSize)
+template <typename T>
+__global__ void preluForward(T *output, const T *input, const T *weight, int n, int nElemsPerSample, int mapSize)
{
CUDA_KERNEL_LOOP(i, n)
{
@@ -29,63 +31,27 @@ __global__ void preluForward(float *output, const float *input, const float *wei
}
}
-void THNN_CudaPReLU_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight,
- long nOutputPlane)
-{
- THCudaTensor_resizeAs(state, output, input);
-
- float *w = THCudaTensor_data(state, weight);
-
- if (nOutputPlane == 0)
- {
- THC_pointwiseApply2(state, output, input, PReLUUpdateOutput(w));
- }
- else
- {
- int ndim = THCudaTensor_nDimension(state, input);
- input = THCudaTensor_newContiguous(state, input);
-
- int n = THCudaTensor_nElement(state, input);
- int mapSize = 1;
- if (ndim == 3)
- mapSize = (input->size[1] * input->size[2]);
- else if (ndim == 4)
- mapSize = (input->size[2] * input->size[3]);
- int nElemsPerSample = nOutputPlane * mapSize;
- preluForward<<<GET_BLOCKS(n), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
- THCudaTensor_data(state, output),
- THCudaTensor_data(state, input),
- w,
- n, nElemsPerSample, mapSize
- );
- THCudaCheck(cudaGetLastError());
- THCudaTensor_free(state, input);
- }
-}
-
+template <typename T>
struct PReLUUpdateGradInput
{
- float *weight_;
+ T *weight_;
- PReLUUpdateGradInput(float *weight)
+ PReLUUpdateGradInput(T *weight)
: weight_(weight)
{}
- __device__ __forceinline__ void operator()(float *gradInput, float *gradOutput, float *input)
+ __device__ __forceinline__ void operator()(T *gradInput, T *gradOutput, T *input)
{
*gradInput = *input > 0 ? *gradOutput : *gradOutput * *weight_;
}
};
+template <typename T>
__global__ void preluBackward(
- float *gradInput,
- const float *input,
- const float *weight,
- const float *gradOutput,
+ T *gradInput,
+ const T *input,
+ const T *weight,
+ const T *gradOutput,
int n, int nElemsPerSample, int mapSize)
{
CUDA_KERNEL_LOOP(i, n)
@@ -96,149 +62,44 @@ __global__ void preluBackward(
}
}
-void THNN_CudaPReLU_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *weight,
- long nOutputPlane)
-{
- THCudaTensor_resizeAs(state, gradInput, input);
-
- float *w = THCudaTensor_data(state, weight);
- if (nOutputPlane == 0)
- {
- THC_pointwiseApply3(state, gradInput, gradOutput, input, PReLUUpdateGradInput(w));
- }
- else
- {
- int ndim = THCudaTensor_nDimension(state, input);
- input = THCudaTensor_newContiguous(state, input);
- gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-
- int n = THCudaTensor_nElement(state, input);
- int mapSize = 1;
- if (ndim == 3)
- mapSize = (input->size[1] * input->size[2]);
- else if (ndim == 4)
- mapSize = (input->size[2] * input->size[3]);
- int nElemsPerSample = nOutputPlane * mapSize;
- preluBackward<<<GET_BLOCKS(n), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
- THCudaTensor_data(state, gradInput),
- THCudaTensor_data(state, input),
- w,
- THCudaTensor_data(state, gradOutput),
- n, nElemsPerSample, mapSize
- );
- THCudaCheck(cudaGetLastError());
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, gradOutput);
- }
-}
-
+template <typename T>
struct PReLUAccGradParametersShared
{
- __device__ __forceinline__ void operator()(float *gradInput, float *input, float *gradOutput)
+ __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput)
{
*gradInput = (*input) * (*gradOutput) * (*input <= 0);
}
};
+template <typename T>
struct PReLUAccGradParameters
{
- float scale;
+ T scale;
- PReLUAccGradParameters(float scale)
+ PReLUAccGradParameters(T scale)
: scale(scale)
{}
- __device__ __forceinline__ void operator()(float *gradInput, float *input, float *gradOutput)
+ __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput)
{
*gradInput = (*input) * (*gradOutput) * scale * (*input <= 0);
}
};
+template <typename T>
struct PReLUAccGradParameters1to1
{
- float scale;
+ T scale;
- PReLUAccGradParameters1to1(float scale)
+ PReLUAccGradParameters1to1(T scale)
: scale(scale)
{}
- __device__ __forceinline__ void operator()(float *gradWeight, float *input, float *gradOutput)
+ __device__ __forceinline__ void operator()(T *gradWeight, T *input, T *gradOutput)
{
*gradWeight += (*input) * (*gradOutput) * scale * (*input <= 0);
}
};
-void THNN_CudaPReLU_accGradParameters(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *weight,
- THCudaTensor *gradWeight,
- THCudaTensor *gradWeightBuf,
- THCudaTensor *gradWeightBuf2,
- long nOutputPlane,
- float scale)
-{
- // use grad input for temporary storage, then call updateGradInput again
-
- if (nOutputPlane == 0)
- {
- THC_pointwiseApply3(state, gradInput, input, gradOutput, PReLUAccGradParametersShared());
-
- // introduces a sync point
- float sum = THCudaTensor_sumall(state, gradInput);
- float w = THCudaTensor_get1d(state, gradWeight, 0);
- THCudaTensor_set1d(state, gradWeight, 0, w + sum * scale);
-
- // restore gradInput
- THNN_CudaPReLU_updateGradInput(state, input, gradOutput, gradInput, weight, nOutputPlane);
- }
- else
- {
- int ndim = THCudaTensor_nDimension(state, input);
-
- if (ndim == 1)
- {
- THC_pointwiseApply3(state, gradWeight, input, gradOutput, PReLUAccGradParameters1to1(scale));
- }
- else
- {
- THC_pointwiseApply3(state, gradInput, input, gradOutput, PReLUAccGradParameters(scale));
- THCudaTensor *sumbuf = gradWeightBuf2;
- THCudaTensor_resizeAs(state, gradWeightBuf, gradWeight);
-
- if (ndim == 2)
- {
- THCudaTensor_sum(state, gradWeightBuf, gradInput, 0);
- THCudaTensor_cadd(state, gradWeight, gradWeight, scale, gradWeightBuf);
- }
- else if (ndim == 3)
- {
- THCudaTensor *buffer = THCudaTensor_newContiguous(state, gradInput);
- THCudaTensor_resize2d(state, buffer, nOutputPlane, input->size[1] * input->size[2]);
- THCudaTensor_sum(state, gradWeightBuf, buffer, 1);
- THCudaTensor_cadd(state, gradWeight, gradWeight, scale, gradWeightBuf);
- THCudaTensor_free(state, buffer);
- }
- else if (ndim == 4)
- {
- THCudaTensor *buffer = THCudaTensor_newContiguous(state, gradInput);
- THCudaTensor_resize3d(state, buffer, input->size[0], nOutputPlane, input->size[2] * input->size[3]);
- THCudaTensor_resize2d(state, sumbuf, input->size[0], nOutputPlane);
- THCudaTensor_sum(state, sumbuf, buffer, 2);
- THCudaTensor_sum(state, gradWeightBuf, sumbuf, 0);
- THCudaTensor_cadd(state, gradWeight, gradWeight, scale, gradWeightBuf);
- THCudaTensor_free(state, buffer);
- }
-
- // restore gradInput
- THNN_CudaPReLU_updateGradInput(state, input, gradOutput, gradInput, weight, nOutputPlane);
- }
- }
-}
+#include "generic/PReLU.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/RReLU.cu b/lib/THCUNN/RReLU.cu
index 245598b..55a34ec 100644
--- a/lib/THCUNN/RReLU.cu
+++ b/lib/THCUNN/RReLU.cu
@@ -1,4 +1,6 @@
#include "THCUNN.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
#include "common.h"
#include <curand.h>
#include <curand_kernel.h>
@@ -8,51 +10,72 @@
#define BLOCK_SIZE 256
#define NUM_BLOCKS(n) min((int)THCCeilDiv(n, (long) BLOCK_SIZE), MAX_NUM_BLOCKS)
+template<typename T>
+inline T __device__ curand_uniform_type(curandStateMtgp32 *state);
+
+template <>
+inline half __device__ curand_uniform_type<half>(curandStateMtgp32 *state) {
+ return ScalarConvert<float, half>::to(curand_uniform(state));
+}
+
+template <>
+inline float __device__ curand_uniform_type<float>(curandStateMtgp32 *state) {
+ return curand_uniform(state);
+}
+
+template <>
+inline double __device__ curand_uniform_type<double>(curandStateMtgp32 *state) {
+ return curand_uniform_double(state);
+}
+
+template <typename T>
__global__ void rreluUpdateOutputTrain(int n, curandStateMtgp32 *state,
- float *input, float* noise, float *output, double a, double b)
+ T *input, T* noise, T *output, double a, double b)
{
CUDA_KERNEL_LOOP(i, n)
{
if (input[i] <= 0)
{
- float r = curand_uniform(&state[blockIdx.x]);
- r = r * (b-a) + a;
+ T r = curand_uniform_type<T>(&state[blockIdx.x]);
+ r = ScalarConvert<double, T>::to(r * (b-a) + a);
output[i] = input[i] * r;
noise[i] = r;
}
else
{
output[i] = input[i];
- noise[i] = 1;
+ noise[i] = ScalarConvert<int, T>::to(1);
}
}
}
+template <typename T>
struct RReLUUpdateOutputEval_functor
{
- const float negSlope_;
+ const T negSlope_;
- RReLUUpdateOutputEval_functor(float negSlope)
+ RReLUUpdateOutputEval_functor(T negSlope)
: negSlope_(negSlope)
{}
- __device__ __forceinline__ void operator()(float *out, float *in)
+ __device__ __forceinline__ void operator()(T *out, T *in)
{
- const float x = *in;
- const float r = x <= 0 ? negSlope_ : 1;
+ const T x = *in;
+ const T r = x <= 0 ? negSlope_ : ScalarConvert<int, T>::to(1);
*out = x * r;
}
};
+template <typename T>
struct RReLUUpdateOutputEvalIP_functor
{
- const float negSlope_;
+ const T negSlope_;
- RReLUUpdateOutputEvalIP_functor(float negSlope)
+ RReLUUpdateOutputEvalIP_functor(T negSlope)
: negSlope_(negSlope)
{}
- __device__ __forceinline__ void operator()(float *x)
+ __device__ __forceinline__ void operator()(T *x)
{
if (*x <= 0)
{
@@ -61,74 +84,31 @@ struct RReLUUpdateOutputEvalIP_functor
}
};
-void THNN_CudaRReLU_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output,
- THCudaTensor *noise, double lower, double upper, bool train, bool inplace, void *generator)
-{
- THCUNN_assertSameGPU(state, 3, input, output, noise);
- struct curandStateMtgp32* gen_states = THCRandom_generatorStates(state);
-
- if (train)
- {
- input = THCudaTensor_newContiguous(state, input);
- THCudaTensor_resizeAs(state, noise, input);
- float *input_data = THCudaTensor_data(state, input);
- float *noise_data = THCudaTensor_data(state, noise);
- long n = THCudaTensor_nElement(state, input);
- if (inplace)
- {
- rreluUpdateOutputTrain<<<NUM_BLOCKS(n), BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
- n, gen_states, input_data, noise_data, input_data, lower, upper);
- THCudaTensor_set(state, output, input);
- }
- else
- {
- THCudaTensor_resizeAs(state, output, input);
- float *output_data = THCudaTensor_data(state, output);
- rreluUpdateOutputTrain<<<NUM_BLOCKS(n), BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
- n, gen_states, input_data, noise_data, output_data, lower, upper);
- }
- THCudaCheck(cudaGetLastError());
- THCudaTensor_free(state, input);
- }
- else
- {
- const double negSlope = (lower + upper) / 2;
- if (inplace)
- {
- THC_pointwiseApply1(state, input, RReLUUpdateOutputEvalIP_functor(negSlope));
- THCudaTensor_set(state, output, input);
- }
- else
- {
- THCudaTensor_resizeAs(state, output, input);
- THC_pointwiseApply2(state, output, input, RReLUUpdateOutputEval_functor(negSlope));
- }
- }
-}
-
+template <typename T>
struct RReLUupdateGradInputEval_functor
{
- const float negSlope_;
+ const T negSlope_;
- RReLUupdateGradInputEval_functor(float negSlope)
+ RReLUupdateGradInputEval_functor(T negSlope)
: negSlope_(negSlope)
{}
- __device__ __forceinline__ void operator()(float *gradIn, float *gradOut, float *in)
+ __device__ __forceinline__ void operator()(T *gradIn, T *gradOut, T *in)
{
*gradIn = (*in) <= 0 ? (*gradOut) * negSlope_ : (*gradOut);
}
};
+template <typename T>
struct RReLUupdateGradInputEvalIP_functor
{
- const float negSlope_;
+ const T negSlope_;
- RReLUupdateGradInputEvalIP_functor(float negSlope)
+ RReLUupdateGradInputEvalIP_functor(T negSlope)
: negSlope_(negSlope)
{}
- __device__ __forceinline__ void operator()(float *gradOut, float *in)
+ __device__ __forceinline__ void operator()(T *gradOut, T *in)
{
if (*in <= 0)
{
@@ -137,42 +117,5 @@ struct RReLUupdateGradInputEvalIP_functor
}
};
-void THNN_CudaRReLU_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput,
- THCudaTensor *gradInput, THCudaTensor *noise, double lower, double upper, bool train, bool inplace)
-{
- THCUNN_assertSameGPU(state, 4, input, gradOutput, gradInput, noise);
-
- gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-
- if (train && upper - lower > 1E-6) // e.g. if upper == lower, RReLU behaves like LeakyReLU
- {
- // multiply the gradient by the noise tensor
- if (inplace)
- {
- THCudaTensor_cmul(state, gradOutput, gradOutput, noise);
- THCudaTensor_set(state, gradInput, gradOutput);
- }
- else
- {
- THCudaTensor_resizeAs(state, gradInput, input);
- THCudaTensor_cmul(state, gradInput, gradOutput, noise);
- }
- }
- else
- {
- // use constant factor for negative input values
- const double negSlope = (lower + upper) / 2;
- if (inplace)
- {
- THC_pointwiseApply2(state, gradOutput, input, RReLUupdateGradInputEvalIP_functor(negSlope));
- THCudaTensor_set(state, gradInput, gradOutput);
- }
- else
- {
- THCudaTensor_resizeAs(state, gradInput, input);
- THC_pointwiseApply3(state, gradInput, gradOutput, input, RReLUupdateGradInputEval_functor(negSlope));
- }
- }
-
- THCudaTensor_free(state, gradOutput);
-}
+#include "generic/RReLU.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SharedMem.cuh b/lib/THCUNN/SharedMem.cuh
new file mode 100644
index 0000000..8d83d9f
--- /dev/null
+++ b/lib/THCUNN/SharedMem.cuh
@@ -0,0 +1,43 @@
+// Based on the simpleTemplates CUDA example
+
+#ifndef THCUNN_SHAREDMEM_H
+#define THCUNN_SHAREDMEM_H
+
+template <typename T>
+struct SharedMem {
+ __device__ T *getPointer()
+ {
+ extern __device__ void error(void);
+ error();
+ return NULL;
+ }
+};
+
+template <>
+struct SharedMem<half>
+{
+ __device__ half *getPointer() {
+ extern __shared__ half s_half[];
+ return s_half;
+ }
+};
+
+template <>
+struct SharedMem<float>
+{
+ __device__ float *getPointer() {
+ extern __shared__ float s_float[];
+ return s_float;
+ }
+};
+
+template <>
+struct SharedMem<double>
+{
+ __device__ double *getPointer() {
+ extern __shared__ double s_double[];
+ return s_double;
+ }
+};
+
+#endif
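
SharedMem<T> exists because a kernel template cannot declare extern __shared__ T buf[] directly: every instantiation would redeclare the same dynamic shared-memory array with a different element type, which nvcc rejects, so the array gets a distinct name per specialization instead. A condensed, self-contained CUDA sketch of how a templated kernel uses it (float and double only; the kernel and launch here are illustrative, not part of THCUNN):

#include <cstdio>
#include <cuda_runtime.h>

template <typename T> struct SharedMemSketch;  // no generic definition on purpose

template <> struct SharedMemSketch<float> {
  __device__ float *getPointer() { extern __shared__ float s_float[]; return s_float; }
};
template <> struct SharedMemSketch<double> {
  __device__ double *getPointer() { extern __shared__ double s_double[]; return s_double; }
};

template <typename T>
__global__ void blockSum(const T *in, T *out) {
  SharedMemSketch<T> smem;
  T *buf = smem.getPointer();  // per-type array name avoids the redeclaration clash
  buf[threadIdx.x] = in[threadIdx.x];
  __syncthreads();
  if (threadIdx.x == 0) {
    T s = T(0);
    for (int i = 0; i < blockDim.x; ++i) s += buf[i];
    out[0] = s;
  }
}

int main() {
  const int n = 128;
  float h_in[n], h_out = 0.f, *d_in, *d_out;
  for (int i = 0; i < n; ++i) h_in[i] = 1.f;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);
  blockSum<float><<<1, n, n * sizeof(float)>>>(d_in, d_out);  // dynamic smem = n floats
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("sum = %g\n", h_out);  // prints 128
  cudaFree(d_in); cudaFree(d_out);
  return 0;
}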
diff --git a/lib/THCUNN/Sigmoid.cu b/lib/THCUNN/Sigmoid.cu
index f2a3675..623b84d 100644
--- a/lib/THCUNN/Sigmoid.cu
+++ b/lib/THCUNN/Sigmoid.cu
@@ -1,32 +1,24 @@
#include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+template <typename T>
struct sigmoidupdateOutput_functor
{
- __device__ void operator()(float *output, const float *input) const
+ __device__ void operator()(T *output, const T *input) const
{
- *output = 1./(1.+ exp(-*input));
+ *output = ScalarConvert<double, T>::to(1./(1.+ exp(-*input)));
}
};
-void THNN_CudaSigmoid_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output)
-{
- THCUNN_assertSameGPU(state, 2, input, output);
- THCudaTensor_resizeAs(state, output, input);
- THC_pointwiseApply2(state, output, input, sigmoidupdateOutput_functor());
-}
-
+template <typename T>
struct sigmoidupdateGradInput_functor
{
- __device__ void operator()(float *gradInput, const float *output, const float *gradOutput) const
+ __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const
{
- *gradInput = *gradOutput * (1.-*output) * (*output);
+ *gradInput = ScalarConvert<double, T>::to(*gradOutput * (1.-*output) * (*output));
}
};
-void THNN_CudaSigmoid_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *output)
-{
- THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
- THCudaTensor_resizeAs(state, gradInput, output);
- THC_pointwiseApply3(state, gradInput, output, gradOutput, sigmoidupdateGradInput_functor());
-}
+#include "generic/Sigmoid.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SmoothL1Criterion.cu b/lib/THCUNN/SmoothL1Criterion.cu
index 2b528ef..8e94fbc 100644
--- a/lib/THCUNN/SmoothL1Criterion.cu
+++ b/lib/THCUNN/SmoothL1Criterion.cu
@@ -1,5 +1,7 @@
#include "THCUNN.h"
#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
#include <thrust/fill.h>
#include <thrust/functional.h>
@@ -10,97 +12,38 @@
#include <thrust/system/cuda/execution_policy.h>
#endif
+template <typename Dtype, typename Acctype>
struct smoothl1_functor
{
smoothl1_functor() {}
- __host__ __device__ float operator()(const float &x, const float &y) const
+ __host__ __device__ Acctype operator()(const Dtype &x, const Dtype &y) const
{
- float z = fabsf(x-y);
- return z < 1.f ? 0.5f*z*z : z - 0.5f;
+ Acctype z = ScalarConvert<Dtype, Acctype>::to(THCNumerics<Dtype>::abs(x-y));
+ return z < Acctype(1) ? 0.5f*z*z : z - 0.5f;
}
};
-void THNN_CudaSmoothL1Criterion_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *output, bool sizeAverage)
-{
- THCUNN_assertSameGPU(state, 2, input, target);
- THArgCheck(
- THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2,
- "input and target need to have the same number of elements"
- );
-
- long size = THCudaTensor_nElement(state, input);
-
- input = THCudaTensor_newContiguous(state, input);
- target = THCudaTensor_newContiguous(state, target);
-
- thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
- thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
- float sum = thrust::inner_product(
-#if CUDA_VERSION >= 7000
- thrust::cuda::par.on(THCState_getCurrentStream(state)),
-#endif
- input_data, input_data+size, target_data, (float) 0,
- thrust::plus<float>(), smoothl1_functor()
- );
-
- if (sizeAverage)
- sum /= size;
-
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, target);
-
- THCudaTensor_set1d(state, output, 0, sum);
-}
-
+template <typename Dtype>
struct smoothl1_updateGradInput_functor
{
- const float norm;
+ const Dtype norm;
- smoothl1_updateGradInput_functor(float norm_)
+ smoothl1_updateGradInput_functor(Dtype norm_)
: norm(norm_)
{}
- __host__ __device__ float operator()(const float &x, const float &y) const
+ __host__ __device__ Dtype operator()(const Dtype &x, const Dtype &y) const
{
- float z = x - y;
- if (z < -1.f)
+ Dtype z = x - y;
+ if (z < ScalarConvert<int, Dtype>::to(-1))
return -norm;
- else if (z > 1.f)
+ else if (z > ScalarConvert<int, Dtype>::to(1))
return norm;
else
return norm * z;
}
};
-void THNN_CudaSmoothL1Criterion_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *target, THCudaTensor *gradInput, bool sizeAverage)
-{
- THCUNN_assertSameGPU(state, 3, input, target, gradInput);
- THArgCheck(
- THCudaTensor_nElement(state, input) == THCudaTensor_nElement(state, target), 2,
- "input and target need to have the same number of elements"
- );
-
- long size = THCudaTensor_nElement(state, input);
- float norm = sizeAverage ? 1./size : 1.;
-
- input = THCudaTensor_newContiguous(state, input);
- target = THCudaTensor_newContiguous(state, target);
-
- THCudaTensor_resizeAs(state, gradInput, input);
-
- thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
- thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
- thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput));
-
- thrust::transform(
-#if CUDA_VERSION >= 7000
- thrust::cuda::par.on(THCState_getCurrentStream(state)),
-#endif
- input_data, input_data+size, target_data, gradInput_data,
- smoothl1_updateGradInput_functor(norm)
- );
-
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, target);
-}
+#include "generic/SmoothL1Criterion.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SoftMarginCriterion.cu b/lib/THCUNN/SoftMarginCriterion.cu
index 2a15f4a..95c8e95 100644
--- a/lib/THCUNN/SoftMarginCriterion.cu
+++ b/lib/THCUNN/SoftMarginCriterion.cu
@@ -1,5 +1,7 @@
#include "THCUNN.h"
#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
#include <thrust/fill.h>
#include <thrust/functional.h>
@@ -7,81 +9,29 @@
#include <thrust/reduce.h>
#include <thrust/inner_product.h>
+template <typename Dtype, typename Acctype>
struct softmargin_functor
{
- __host__ __device__ float operator()(const float& x, const float& y) const
+ __host__ __device__ Acctype operator()(const Dtype& x, const Dtype& y) const
{
- return log(1 + exp(-x*y));
+ return log(1 + exp(ScalarConvert<Dtype, Acctype>::to(-x)*y));
}
};
-
-void THNN_CudaSoftMarginCriterion_updateOutput(THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *output,
- int sizeAverage
- )
-{
- THCUNN_assertSameGPU(state, 2, input, target);
- float sum;
-
- long size = THCudaTensor_nElement(state, input);
-
- input = THCudaTensor_newContiguous(state, input);
- target = THCudaTensor_newContiguous(state, target);
-
- thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
- thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
- sum = thrust::inner_product(input_data, input_data+size, target_data, (float) 0, thrust::plus<float>(), softmargin_functor());
-
- if(sizeAverage)
- sum /= size;
-
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, target);
-
- THCudaTensor_set1d(state, output, 0, sum);
-}
-
-
+template <typename Dtype, typename Acctype>
struct softmargin_updateGradInput_functor
{
- const float norm;
+ const Acctype norm;
- softmargin_updateGradInput_functor(float norm_) :
+ softmargin_updateGradInput_functor(Acctype norm_) :
norm(norm_) {}
- __host__ __device__ float operator()(const float& x, const float& y) const
+ __host__ __device__ Dtype operator()(const Dtype& x, const Dtype& y) const
{
- float temp = exp(-x*y);
- return -y*temp*norm/(1.f + temp);
+ Acctype temp = exp(ScalarConvert<Dtype, Acctype>::to(-x)*y);
+ return ScalarConvert<Acctype, Dtype>::to(-y*temp*norm/(ScalarConvert<int, Acctype>::to(1) + temp));
}
};
-void THNN_CudaSoftMarginCriterion_updateGradInput(THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *gradInput,
- int sizeAverage
- )
-{
- THCUNN_assertSameGPU(state, 3, input, target, gradInput);
-
- long size = THCudaTensor_nElement(state, input);
- float norm = (sizeAverage ? 1./size : 1.);
-
- input = THCudaTensor_newContiguous(state, input);
- target = THCudaTensor_newContiguous(state, target);
-
- THCudaTensor_resizeAs(state, gradInput, input);
-
- thrust::device_ptr<float> input_data(THCudaTensor_data(state, input));
- thrust::device_ptr<float> target_data(THCudaTensor_data(state, target));
- thrust::device_ptr<float> gradInput_data(THCudaTensor_data(state, gradInput));
-
- thrust::transform(input_data, input_data+size, target_data, gradInput_data, softmargin_updateGradInput_functor(norm));
-
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, target);
-}
+#include "generic/SoftMarginCriterion.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SoftMax.cu b/lib/THCUNN/SoftMax.cu
index 987325f..5ae6b70 100644
--- a/lib/THCUNN/SoftMax.cu
+++ b/lib/THCUNN/SoftMax.cu
@@ -1,26 +1,29 @@
#include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
#define SOFTMAX_THREADS 128
+template <typename T, typename AccumT>
__global__ void cunn_SoftMax_updateOutput_kernel(
- float *output, float *input, int nframe, int dim, int stride0, int stride1)
+ T *output, T *input, int nframe, int dim, int stride0, int stride1)
{
- __shared__ float buffer[SOFTMAX_THREADS+1];
- float *input_k = input + blockIdx.x*dim*stride0 + blockIdx.y*stride1 + blockIdx.z;
- float *output_k = output + blockIdx.x*dim*stride0 + blockIdx.y*stride1 + blockIdx.z;
+ __shared__ AccumT buffer[SOFTMAX_THREADS+1];
+ T *input_k = input + blockIdx.x*dim*stride0 + blockIdx.y*stride1 + blockIdx.z;
+ T *output_k = output + blockIdx.x*dim*stride0 + blockIdx.y*stride1 + blockIdx.z;
int i_start = threadIdx.x;
int i_end = dim;
int i_step = blockDim.x;
// max?
- buffer[threadIdx.x] = -FLT_MAX;
+ buffer[threadIdx.x] = -THCNumerics<AccumT>::max();
for (int i=i_start; i<i_end; i+=i_step)
{
- float z = input_k[i*stride0];
- if (buffer[threadIdx.x] < z)
- buffer[threadIdx.x] = z;
+ T z = input_k[i*stride0];
+ AccumT zAcc = ScalarConvert<T, AccumT>::to(z);
+ if (buffer[threadIdx.x] < zAcc)
+ buffer[threadIdx.x] = zAcc;
}
@@ -29,7 +32,7 @@ __global__ void cunn_SoftMax_updateOutput_kernel(
// reduce
if (threadIdx.x == 0)
{
- float max_k = -FLT_MAX;
+ AccumT max_k = -THCNumerics<AccumT>::max();
for (int i=0; i<blockDim.x; i++)
{
if (max_k < buffer[i])
@@ -41,11 +44,11 @@ __global__ void cunn_SoftMax_updateOutput_kernel(
__syncthreads();
// sum?
- float max_k = buffer[SOFTMAX_THREADS];
- buffer[threadIdx.x] = 0;
+ T max_k = ScalarConvert<AccumT, T>::to(buffer[SOFTMAX_THREADS]);
+ buffer[threadIdx.x] = ScalarConvert<int, AccumT>::to(0);
for (int i=i_start; i<i_end; i+=i_step) {
- float z = __expf(input_k[i*stride0]-max_k);
- buffer[threadIdx.x] += z;
+ T z = THCNumerics<T>::exp(input_k[i*stride0]-max_k);
+ buffer[threadIdx.x] += ScalarConvert<T, AccumT>::to(z);
output_k[i*stride0] = z;
}
@@ -54,7 +57,7 @@ __global__ void cunn_SoftMax_updateOutput_kernel(
// reduce
if (threadIdx.x == 0)
{
- float sum_k = 0;
+ AccumT sum_k = ScalarConvert<int, AccumT>::to(0);
for (int i=0; i<blockDim.x; i++)
sum_k += buffer[i];
buffer[SOFTMAX_THREADS] = sum_k;
@@ -63,34 +66,35 @@ __global__ void cunn_SoftMax_updateOutput_kernel(
__syncthreads();
// softmax
- float sum_k = buffer[SOFTMAX_THREADS];
+ T sum_k = ScalarConvert<AccumT, T>::to(buffer[SOFTMAX_THREADS]);
for (int i=i_start; i<i_end; i+=i_step)
output_k[i*stride0] = output_k[i*stride0] / sum_k;
}
+template <typename T, typename AccumT>
__global__ void cunn_SoftMax_updateGradInput_kernel(
- float *gradInput, float *output, float *gradOutput, int nframe, int dim, int stride0, int stride1)
+ T *gradInput, T *output, T *gradOutput, int nframe, int dim, int stride0, int stride1)
{
- __shared__ float buffer[SOFTMAX_THREADS];
- float *gradInput_k = gradInput + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z;
- float *output_k = output + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z;
- float *gradOutput_k = gradOutput + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z;
+ __shared__ AccumT buffer[SOFTMAX_THREADS];
+ T *gradInput_k = gradInput + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z;
+ T *output_k = output + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z;
+ T *gradOutput_k = gradOutput + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z;
int i_start = threadIdx.x;
int i_end = dim;
int i_step = blockDim.x;
// sum?
- buffer[threadIdx.x] = 0;
+ buffer[threadIdx.x] = ScalarConvert<int, AccumT>::to(0);
for (int i=i_start; i<i_end; i+=i_step)
- buffer[threadIdx.x] += gradOutput_k[i*stride0] * output_k[i*stride0];
+ buffer[threadIdx.x] += ScalarConvert<T, AccumT>::to(gradOutput_k[i*stride0] * output_k[i*stride0]);
__syncthreads();
// reduce
if (threadIdx.x == 0)
{
- float sum_k = 0;
+ AccumT sum_k = ScalarConvert<int, AccumT>::to(0);
for (int i=0; i<blockDim.x; i++)
sum_k += buffer[i];
buffer[0] = sum_k;
@@ -98,147 +102,12 @@ __global__ void cunn_SoftMax_updateGradInput_kernel(
__syncthreads();
- float sum_k = buffer[0];
+ T sum_k = ScalarConvert<AccumT, T>::to(buffer[0]);
for (int i=i_start; i<i_end; i+=i_step)
gradInput_k[i*stride0] = output_k[i*stride0] * (gradOutput_k[i*stride0] - sum_k);
}
-void THNN_CudaSoftMax_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output)
-{
- THCUNN_assertSameGPU(state, 2, input, output);
-
- input = THCudaTensor_newContiguous(state, input);
- THCudaTensor_resizeAs(state, output, input);
- long batchSize, dim, stride0, stride1 = 1;
- long blocksY = 1, blocksZ = 1;
-
- if (input->nDimension == 1)
- {
- batchSize = 1;
- dim = input->size[0];
- stride0 = 1;
- }
- else if (input->nDimension == 2)
- {
- batchSize = input->size[0];
- dim = input->size[1];
- stride0 = 1;
- }
- else if (input->nDimension == 3)
- {
- batchSize = 1;
- dim = input->size[0];
- blocksY = input->size[1];
- blocksZ = input->size[2];
- stride0 = blocksY * blocksZ;
- stride1 = blocksZ;
- }
- else if (input->nDimension == 4)
- {
- batchSize = input->size[0];
- dim = input->size[1];
- blocksY = input->size[2];
- blocksZ = input->size[3];
- stride0 = blocksY * blocksZ;
- stride1 = blocksZ;
- }
- else
- {
- THError("1D, 2D, 3D or 4D tensor expected");
- }
-
- // when possible use only 2d grid of thread blocks to stay compatible with compute capability 2.X devices.
- if (blocksY * blocksZ < 65536)
- {
- blocksY *= blocksZ;
- blocksZ = 1;
- if (input->nDimension == 3 || input->nDimension == 4) {
- stride0 = blocksY * blocksZ;
- stride1 = blocksZ;
- }
- }
-
- dim3 blocks(batchSize, blocksY, blocksZ);
- dim3 threads(SOFTMAX_THREADS);
- cunn_SoftMax_updateOutput_kernel<<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(
- THCudaTensor_data(state, output),
- THCudaTensor_data(state, input),
- batchSize, dim, stride0, stride1
- );
- THCudaCheck(cudaGetLastError());
-
- THCudaTensor_free(state, input);
-}
-
-void THNN_CudaSoftMax_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *output)
-{
- THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
-
- output = THCudaTensor_newContiguous(state, output);
- gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-
- THCudaTensor_resizeAs(state, gradInput, output);
- long batchSize, dim, stride0, stride1 = 1;
- long blocksY = 1, blocksZ = 1;
-
- if (gradInput->nDimension == 1)
- {
- batchSize = 1;
- dim = gradInput->size[0];
- stride0 = 1;
- }
- else if (gradInput->nDimension == 2)
- {
- batchSize = gradInput->size[0];
- dim = gradInput->size[1];
- stride0 = 1;
- }
- else if (gradInput->nDimension == 3)
- {
- batchSize = 1;
- dim = gradInput->size[0];
- blocksY = gradInput->size[1];
- blocksZ = gradInput->size[2];
- stride0 = blocksY * blocksZ;
- stride1 = blocksZ;
- }
- else if (gradInput->nDimension == 4)
- {
- batchSize = gradInput->size[0];
- dim = gradInput->size[1];
- blocksY = gradInput->size[2];
- blocksZ = gradInput->size[3];
- stride0 = blocksY * blocksZ;
- stride1 = blocksZ;
- }
- else
- {
- THError("1D, 2D, 3D or 4D tensor expected");
- }
-
- // when possible use only 2d grid of thread blocks to stay compatible with compute capability 2.X devices.
- if (blocksY * blocksZ < 65536)
- {
- blocksY *= blocksZ;
- blocksZ = 1;
- if (input->nDimension == 3 || input->nDimension == 4) {
- stride0 = blocksY * blocksZ;
- stride1 = blocksZ;
- }
- }
-
- dim3 blocks(batchSize, blocksY, blocksZ);
- dim3 threads(SOFTMAX_THREADS);
- cunn_SoftMax_updateGradInput_kernel<<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(
- THCudaTensor_data(state, gradInput),
- THCudaTensor_data(state, output),
- THCudaTensor_data(state, gradOutput),
- batchSize, dim, stride0, stride1
- );
- THCudaCheck(cudaGetLastError());
-
- THCudaTensor_free(state, gradOutput);
- THCudaTensor_free(state, output);
-}
+#include "generic/SoftMax.cu"
+#include "THCGenerateFloatTypes.h"
#undef SOFTMAX_THREADS
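
cunn_SoftMax_updateOutput_kernel keeps its shared-memory buffer in AccumT while the tensor data stays in T: pass one finds the slice maximum, pass two accumulates exp(x - max) in the wider type, pass three normalizes. A sequential host sketch of the same three-pass, numerically stable softmax (the block-wide reduction the kernel performs is omitted here):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>
#include <vector>

template <typename T, typename AccumT>
void softmaxRow(const std::vector<T> &in, std::vector<T> &out) {
  AccumT maxv = -std::numeric_limits<AccumT>::max();    // pass 1: max
  for (T v : in) maxv = std::max(maxv, static_cast<AccumT>(v));
  AccumT sum = AccumT(0);                               // pass 2: sum of exp(x - max)
  out.resize(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    T z = static_cast<T>(std::exp(static_cast<AccumT>(in[i]) - maxv));
    sum += static_cast<AccumT>(z);
    out[i] = z;
  }
  for (T &v : out) v = static_cast<T>(v / sum);          // pass 3: normalize
}

int main() {
  std::vector<float> in = {1.f, 2.f, 3.f}, out;
  softmaxRow<float, double>(in, out);
  std::printf("%g %g %g\n", out[0], out[1], out[2]);  // ~0.0900 0.2447 0.6652
  return 0;
}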
diff --git a/lib/THCUNN/SoftPlus.cu b/lib/THCUNN/SoftPlus.cu
index 0d1609a..cb9ecb7 100644
--- a/lib/THCUNN/SoftPlus.cu
+++ b/lib/THCUNN/SoftPlus.cu
@@ -1,52 +1,42 @@
#include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+template <typename T>
struct softPlusupdateOutput_functor
{
- const float threshold;
- const float beta;
+ const T threshold;
+ const T beta;
- softPlusupdateOutput_functor(float threshold_, float beta_)
+ softPlusupdateOutput_functor(T threshold_, T beta_)
: threshold(threshold_)
, beta(beta_)
{}
- __device__ void operator()(float *output, const float *input) const
- {
- float betain = beta * (*input);
+ __device__ void operator()(T *output, const T *input) const {
+ T betain = beta * (*input);
*output = ((betain) > threshold) ? *input : (1/beta) * log1p(exp(betain));
}
};
-void THNN_CudaSoftPlus_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, float beta, float threshold)
-{
- THCUNN_assertSameGPU(state, 2, input, output);
- THCudaTensor_resizeAs(state, output, input);
- THC_pointwiseApply2(state, output, input, softPlusupdateOutput_functor(threshold, beta));
-}
-
+template <typename T>
struct softPlusupdateGradInput_functor
{
- const float threshold;
- const float beta;
+ const T threshold;
+ const T beta;
- softPlusupdateGradInput_functor(float threshold_, float beta_)
+ softPlusupdateGradInput_functor(T threshold_, T beta_)
: threshold(threshold_)
, beta(beta_)
{}
- __device__ void operator()(float *gradInput, const float *output, const float *gradOutput) const
+ __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const
{
- float betaout = beta * (*output);
- float exp_bo = exp(betaout);
+ T betaout = beta * (*output);
+ T exp_bo = exp(betaout);
*gradInput = ((betaout) > threshold) ? *gradOutput : *gradOutput * (exp_bo - 1) / exp_bo;
}
};
-void THNN_CudaSoftPlus_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput,
- THCudaTensor *output, float beta, float threshold)
-{
- THCUNN_assertSameGPU(state, 4, input, output, gradOutput, gradInput);
- THCudaTensor_resizeAs(state, gradInput, output);
- THC_pointwiseApply3(state, gradInput, output, gradOutput, softPlusupdateGradInput_functor(threshold, beta));
-}
+#include "generic/SoftPlus.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SoftShrink.cu b/lib/THCUNN/SoftShrink.cu
index 2a08570..7762449 100644
--- a/lib/THCUNN/SoftShrink.cu
+++ b/lib/THCUNN/SoftShrink.cu
@@ -1,54 +1,43 @@
#include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+template <typename T>
struct SoftShrinkUpdateOutput
{
- const float lambda_;
+ const T lambda_;
- SoftShrinkUpdateOutput(float lambda)
+ SoftShrinkUpdateOutput(T lambda)
: lambda_(lambda)
{}
- __device__ __forceinline__ void operator()(float *out, float *in)
+ __device__ __forceinline__ void operator()(T *out, T *in)
{
- float x = *in;
+ T x = *in;
if (x > lambda_) *out = x - lambda_;
else if (x < -lambda_) *out = x + lambda_;
- else *out = 0;
+ else *out = ScalarConvert<int, T>::to(0);
}
};
-void THNN_CudaSoftShrink_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, double lambda)
-{
- THCUNN_assertSameGPU(state, 2, input, output);
- THCudaTensor_resizeAs(state, output, input);
- THC_pointwiseApply2(state, output, input, SoftShrinkUpdateOutput(lambda));
- THCudaCheck(cudaGetLastError());
-}
-
+template <typename T>
struct SoftShrinkUpdateGradInput
{
- const float lambda_;
+ const T lambda_;
- SoftShrinkUpdateGradInput(float lambda)
+ SoftShrinkUpdateGradInput(T lambda)
: lambda_(lambda)
{}
- __device__ __forceinline__ void operator()(float *gradInput, float *input, float *gradOutput) const
+ __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) const
{
- float x = *input;
+ T x = *input;
if (x > lambda_ || x < -lambda_)
*gradInput = *gradOutput;
else
- *gradInput = 0;
+ *gradInput = ScalarConvert<int, T>::to(0);
}
};
-
-void THNN_CudaSoftShrink_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, double lambda)
-{
- THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
- THCudaTensor_resizeAs(state, gradInput, input);
- THC_pointwiseApply3(state, gradInput, input, gradOutput, SoftShrinkUpdateGradInput(lambda));
- THCudaCheck(cudaGetLastError());
-}
+#include "generic/SoftShrink.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SparseLinear.cu b/lib/THCUNN/SparseLinear.cu
index 577eec5..9435735 100644
--- a/lib/THCUNN/SparseLinear.cu
+++ b/lib/THCUNN/SparseLinear.cu
@@ -1,4 +1,7 @@
#include "THCUNN.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+
#include <cusparse.h>
#include <thrust/device_vector.h>
@@ -13,250 +16,70 @@ static void init_cusparse() {
}
}
-static bool checkInput(THCudaTensor* t)
-{
- return t->nDimension == 2 && t->size[1] == 3;
-}
-
-static bool checkSize2D(THCudaTensor* t, long size0, long size1)
-{
- return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1;
-}
-
-static bool checkSize1D(THCudaTensor* t, long size0)
-{
- return t->nDimension == 1 && t->size[0] == size0;
-}
-
-void THNN_CudaSparseLinear_updateOutput(THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight,
- THCudaTensor *bias)
-{
- THAssert(THCudaTensor_checkGPU(state, 4, input, output, weight, bias));
-
- long h;
- long outDim = THCudaTensor_size(state, weight, 0);
- long inDim = THCudaTensor_size(state, weight, 1);
-
- THArgCheck(checkInput(input), 2, "input size must be nnz x 3");
- THArgCheck(THCudaTensor_nDimension(state, output) == 2, 3, "output must be batchsize x outputsize");
- THArgCheck(checkSize1D(bias, outDim), 5, "bias size wrong");
-
- long batchnum = THCudaTensor_size(state, output, 0);
- long nnz = THCudaTensor_size(state, input, 0);
-
- THCudaTensor *buffer = THCudaTensor_new(state);
- THCudaTensor *sel = THCudaTensor_new(state);
- THCudaTensor *values = THCudaTensor_new(state);
- THCudaIntTensor *rowbuf = THCudaIntTensor_new(state);
- THCudaIntTensor *csrPtrs = THCudaIntTensor_new(state);
- THCudaIntTensor *colInds = THCudaIntTensor_new(state);
-
- THCudaTensor_resize1d(state, values, nnz);
- THCudaIntTensor_resize1d(state, rowbuf, nnz);
- THCudaIntTensor_resize1d(state, colInds, nnz);
- THCudaIntTensor_resize1d(state, csrPtrs, batchnum+1);
-
- // Get data ready for cusparse, need CudaInt buffers
- // We do not need to sort, since rows are already in order
- // If rows might get out of order in future implementations, or if cusparse
- // complains with an illegal memory access, sort like we do in AccGradParameters
- THCudaTensor_select(state, sel, input, 1, 0);
- THCudaIntTensor_copyCudaFloat(state, rowbuf, sel);
- THCudaTensor_select(state, sel, input, 1, 1);
- THCudaIntTensor_copyCudaFloat(state, colInds, sel);
- THCudaTensor_select(state, sel, input, 1, 2);
- THCudaTensor_copyCuda(state, values, sel);
-
- init_cusparse();
- cusparseXcoo2csr(cusparse_handle,
- THCudaIntTensor_data(state, rowbuf), nnz, batchnum,
- THCudaIntTensor_data(state, csrPtrs), CUSPARSE_INDEX_BASE_ONE);
-
- // output = bias
- THCudaTensor_resize2d(state, buffer, outDim, batchnum);
- THCudaTensor_zero(state, buffer);
- for (h=0; h<batchnum; h++) {
- THCudaTensor_select(state, sel, buffer, 1, h);
- THCudaTensor_copy(state, sel, bias);
- }
-
- // output = W * x
- float one = 1;
- cusparseMatDescr_t descr = 0;
- cusparseCreateMatDescr(&descr);
- cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
- cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
- cusparseScsrmm(cusparse_handle,
- CUSPARSE_OPERATION_NON_TRANSPOSE,
- batchnum, outDim, inDim, nnz,
- &one,
- descr,
- THCudaTensor_data(state, values),
- THCudaIntTensor_data(state, csrPtrs),
- THCudaIntTensor_data(state, colInds),
- THCudaTensor_data(state, weight), inDim,
- &one, THCudaTensor_data(state, buffer), batchnum
- );
- THCudaTensor_transpose(state, buffer, NULL, 0, 1);
-
- // We do work in the buffer to keep the output contiguous
- THCudaTensor_copy(state, output, buffer);
-
- cusparseDestroyMatDescr(descr);
- descr = 0;
- THCudaTensor_free(state, buffer);
- THCudaTensor_free(state, sel);
- THCudaTensor_free(state, values);
- THCudaIntTensor_free(state, rowbuf);
- THCudaIntTensor_free(state, colInds);
- THCudaIntTensor_free(state, csrPtrs);
+void THNN_CudaHalfSparseLinear_updateOutput(
+ THCState *state,
+ THCudaHalfTensor *input,
+ THCudaHalfTensor *output,
+ THCudaHalfTensor *weight,
+ THCudaHalfTensor *bias) {
+ THError("THCudaHalfTensor not supported with SparseLinear");
}
-void THNN_CudaSparseLinear_accGradParameters(
+void THNN_CudaHalfSparseLinear_accGradParameters(
THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias,
- THCudaTensor *weight,
- THCudaTensor *bias,
+ THCudaHalfTensor *input,
+ THCudaHalfTensor *gradOutput,
+ THCudaHalfTensor *gradWeight,
+ THCudaHalfTensor *gradBias,
+ THCudaHalfTensor *weight,
+ THCudaHalfTensor *bias,
double weightDecay,
- double scale)
-{
- long outDim = THCudaTensor_size(state, weight, 0);
- long inDim = THCudaTensor_size(state, weight, 1);
-
- THArgCheck(checkInput(input), 2, "input size must be batchsize x nnz x 2");
- THArgCheck(checkSize2D(gradWeight, outDim, inDim), 4, "gradWeight size wrong");
- THArgCheck(checkSize1D(gradBias, outDim), 5, "gradBias size wrong");
- THArgCheck(THCudaTensor_isContiguous(state, gradOutput), 3,
- "gradOutput must be contiguous");
-
- long nnz = THCudaTensor_size(state, input, 0);
- long batchnum = THCudaTensor_size(state, gradOutput, 0);
-
- THCudaTensor *buf = THCudaTensor_new(state);
- THCudaTensor *cols = THCudaTensor_new(state);
- THCudaTensor *sel = THCudaTensor_new(state);
- THCudaLongTensor *inds = THCudaLongTensor_new(state);
- THCudaTensor *values = THCudaTensor_new(state);
- THCudaIntTensor *colbuf = THCudaIntTensor_new(state);
- THCudaIntTensor *colPtrs = THCudaIntTensor_new(state);
- THCudaIntTensor *rowInds = THCudaIntTensor_new(state);
-
- THCudaTensor_select(state, sel, input, 1, 0); // rowInds
- THCudaTensor_select(state, cols, input, 1, 1); // colInds
-  THCudaTensor_cadd(state, buf, sel, batchnum, cols); // colInds * batchnum + rowInds
-  THCudaTensor_sort(state, buf, inds, buf, 0, 0); // Indices are now in inds
- THCudaTensor_indexSelect(state, buf, input, 0, inds);
-
- THCudaTensor_resize1d(state, values, nnz);
- THCudaIntTensor_resize1d(state, colbuf, nnz);
- THCudaIntTensor_resize1d(state, rowInds, nnz);
- THCudaIntTensor_resize1d(state, colPtrs, inDim+1);
-
- // Get data ready for cusparse, need CudaInt buffers
- THCudaTensor_select(state, sel, buf, 1, 0);
- THCudaIntTensor_copyCudaFloat(state, rowInds, sel);
- THCudaTensor_select(state, sel, buf, 1, 1);
- THCudaIntTensor_copyCudaFloat(state, colbuf, sel);
- THCudaTensor_select(state, sel, buf, 1, 2);
- THCudaTensor_copyCuda(state, values, sel);
-
- init_cusparse();
- // Secretly coo2csc
- cusparseXcoo2csr(cusparse_handle,
- THCudaIntTensor_data(state, colbuf), nnz, inDim,
- THCudaIntTensor_data(state, colPtrs), CUSPARSE_INDEX_BASE_ONE);
-
-  // FORTRAN expects contiguous col-major matrices
- THCudaTensor_transpose(state, gradOutput, NULL, 0, 1);
- THCudaTensor_resize2d(state, buf, batchnum, outDim);
- THCudaTensor_copy(state, buf, gradOutput);
- THCudaTensor_transpose(state, gradOutput, NULL, 0, 1); // Restore gradOutput
-
- float one = 1;
- cusparseMatDescr_t descr = 0;
- cusparseCreateMatDescr(&descr);
- cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
- cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
- cusparseScsrmm(cusparse_handle,
- CUSPARSE_OPERATION_NON_TRANSPOSE,
- inDim, outDim, batchnum, nnz,
- &one,
- descr,
- THCudaTensor_data(state, values),
- THCudaIntTensor_data(state, colPtrs),
- THCudaIntTensor_data(state, rowInds),
- THCudaTensor_data(state, buf), batchnum,
- &one, THCudaTensor_data(state, gradWeight), inDim
- );
-
- THCudaTensor_sum(state, buf, gradOutput, 0);
- THCudaTensor_resize1d(state, buf, outDim);
- THCudaTensor_cadd(state, gradBias, gradBias, scale, buf);
-
- if (weightDecay != 0)
- {
- THCudaTensor_cadd(state, gradWeight, gradWeight, weightDecay, weight);
- THCudaTensor_cadd(state, gradBias, gradBias, weightDecay, bias);
- }
-
- THCudaTensor_free(state, buf);
- THCudaTensor_free(state, sel);
- THCudaTensor_free(state, cols);
- THCudaLongTensor_free(state, inds);
- THCudaTensor_free(state, values);
- THCudaIntTensor_free(state, colbuf);
- THCudaIntTensor_free(state, rowInds);
- THCudaIntTensor_free(state, colPtrs);
+ double scale) {
+ THError("THCudaHalfTensor not supported with SparseLinear");
}
-void THNN_CudaSparseLinear_legacyUpdateOutput(
+void THNN_CudaHalfSparseLinear_legacyUpdateOutput(
THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight,
- THCudaTensor *bias) {
- THError("CUDA does not support legacy input format, please use a table of nnz x 2 vectors");
+ THCudaHalfTensor *input,
+ THCudaHalfTensor *output,
+ THCudaHalfTensor *weight,
+ THCudaHalfTensor *bias) {
+ THError("THCudaHalfTensor not supported with SparseLinear");
}
-void THNN_CudaSparseLinear_legacyAccGradParameters(
+
+void THNN_CudaHalfSparseLinear_legacyAccGradParameters(
THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias,
- THCudaTensor *weight,
- THCudaTensor *bias,
+ THCudaHalfTensor *input,
+ THCudaHalfTensor *gradOutput,
+ THCudaHalfTensor *gradWeight,
+ THCudaHalfTensor *gradBias,
+ THCudaHalfTensor *weight,
+ THCudaHalfTensor *bias,
double weightDecay,
double scale) {
- THError("CUDA does not support legacy input format, please use a table of nnz x 2 vectors");
+ THError("THCudaHalfTensor not supported with SparseLinear");
}
-// Dense updates are pretty fast on the GPU
-void THNN_CudaSparseLinear_zeroGradParameters(
+void THNN_CudaHalfSparseLinear_zeroGradParameters(
THCState *state,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias,
- THCudaTensor *lastInput) {
- THCudaTensor_zero(state, gradWeight);
- THCudaTensor_zero(state, gradBias);
+ THCudaHalfTensor *gradWeight,
+ THCudaHalfTensor *gradBias,
+ THCudaHalfTensor *lastInput) {
+ THError("THCudaHalfTensor not supported with SparseLinear");
}
-TH_API void THNN_CudaSparseLinear_updateParameters(
+void THNN_CudaHalfSparseLinear_updateParameters(
THCState *state,
- THCudaTensor *weight,
- THCudaTensor *bias,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias,
- THCudaTensor *lastInput,
+ THCudaHalfTensor *weight,
+ THCudaHalfTensor *bias,
+ THCudaHalfTensor *gradWeight,
+ THCudaHalfTensor *gradBias,
+ THCudaHalfTensor *lastInput,
double learningRate) {
- THCudaTensor_cadd(state, weight, weight, -learningRate, gradWeight);
- THCudaTensor_cadd(state, bias, bias, -learningRate, gradBias);
+ THError("THCudaHalfTensor not supported with SparseLinear");
}
-void THNN_CudaSparseLinear_cudaClearState(THCState *state) {
-}
+#include "generic/SparseLinear.cu"
+#include "THCGenerateFloatType.h"
+#include "generic/SparseLinear.cu"
+#include "THCGenerateDoubleType.h"
diff --git a/lib/THCUNN/SpatialAdaptiveMaxPooling.cu b/lib/THCUNN/SpatialAdaptiveMaxPooling.cu
index 5dd8659..2957953 100644
--- a/lib/THCUNN/SpatialAdaptiveMaxPooling.cu
+++ b/lib/THCUNN/SpatialAdaptiveMaxPooling.cu
@@ -1,5 +1,7 @@
#include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
#define CUDA_MAX_THREADS 1024 // this is safe, in reality 256 is our limit
@@ -8,7 +10,8 @@
* this function adaptively maxpools an input 4D tensor along dimensions 2 and 3
* 4D input, 4D output, 4D argmax x and y
*/
-__global__ void adaptivemaxpool(float *input, float *output, float *indices_x, float *indices_y,
+ template <typename T>
+__global__ void adaptivemaxpool(T *input, T *output, THCIndex_t *indices_x, THCIndex_t *indices_y,
int input_n, int input_h, int input_w,
int output_h, int output_w,
int strideh, int stridew,
@@ -29,7 +32,6 @@ __global__ void adaptivemaxpool(float *input, float *output, float *indices_x, f
int yy_start = blockDim.y*blockIdx.y + threadIdx.y;
int yy_end = output_h;
const int yy_step = blockDim.y*gridDim.y;
-
// select input/output plane
output = output + o*output_w*output_h;
input = input + i*strided;
@@ -50,17 +52,17 @@ __global__ void adaptivemaxpool(float *input, float *output, float *indices_x, f
int kW = x_end-x_start;
// Compute the mean of the input image...
- float *ptr_input = input + y_start*strideh + x_start*stridew;
- float *ptr_output = output + yy*output_w + xx;
- float *ptr_ind_x = indices_x + yy*output_w + xx;
- float *ptr_ind_y = indices_y + yy*output_w + xx;
+ T *ptr_input = input + y_start*strideh + x_start*stridew;
+ T *ptr_output = output + yy*output_w + xx;
+ THCIndex_t *ptr_ind_x = indices_x + yy*output_w + xx;
+ THCIndex_t *ptr_ind_y = indices_y + yy*output_w + xx;
int argmax_x = -1;
int argmax_y = -1;
- float max = -FLT_MAX;
+ T max = THCNumerics<T>::min();
int kx, ky;
for(ky = 0; ky < kH; ky++) {
for(kx = 0; kx < kW; kx++) {
- float val = ptr_input[kx*stridew];
+ T val = ptr_input[kx*stridew];
if (val > max) {
max = val;
argmax_x = kx;
@@ -81,7 +83,8 @@ __global__ void adaptivemaxpool(float *input, float *output, float *indices_x, f
* Description:
* this function computes the gradInput from weight and gradOutput
*/
-__global__ void adaptivemaxgradinput(float *gradInput, float *gradOutput, float *indices_x, float *indices_y,
+ template <typename T>
+__global__ void adaptivemaxgradinput(T *gradInput, T *gradOutput, THCIndex_t *indices_x, THCIndex_t *indices_y,
int input_n, int input_h, int input_w,
int output_h, int output_w)
{
@@ -116,11 +119,11 @@ __global__ void adaptivemaxgradinput(float *gradInput, float *gradOutput, float
int x_start = (int)floor(float(xx) / output_w * input_w);
- float *ptr_gradInput = gradInput + y_start*input_w + x_start;
- float *ptr_gradOutput = gradOutput + yy*output_w + xx;
- float *ptr_ind_x = indices_x + yy*output_w + xx;
- float *ptr_ind_y = indices_y + yy*output_w + xx;
- float z = *ptr_gradOutput;
+ T *ptr_gradInput = gradInput + y_start*input_w + x_start;
+ T *ptr_gradOutput = gradOutput + yy*output_w + xx;
+ THCIndex_t *ptr_ind_x = indices_x + yy*output_w + xx;
+ THCIndex_t *ptr_ind_y = indices_y + yy*output_w + xx;
+ T z = *ptr_gradOutput;
int argmax_x = (*ptr_ind_x) - TH_INDEX_BASE;
int argmax_y = (*ptr_ind_y) - TH_INDEX_BASE;
@@ -135,8 +138,9 @@ __global__ void adaptivemaxgradinput(float *gradInput, float *gradOutput, float
* this function computes the gradInput from weight and gradOutput
* when kH != dH or kW != dW (uses atomic add)
*/
+ template <typename T>
__global__ void atomicadaptivemaxgradinput(
- float *gradInput, float *gradOutput, float *indices_x, float *indices_y,
+ T *gradInput, T *gradOutput, THCIndex_t *indices_x, THCIndex_t *indices_y,
int input_n, int input_h, int input_w, int output_h, int output_w
)
{
@@ -170,11 +174,11 @@ __global__ void atomicadaptivemaxgradinput(
int x_start = (int)floor(float(xx) / output_w * input_w);
- float *ptr_gradInput = gradInput + y_start*input_w + x_start;
- float *ptr_gradOutput = gradOutput + yy*output_w + xx;
- float *ptr_ind_x = indices_x + yy*output_w + xx;
- float *ptr_ind_y = indices_y + yy*output_w + xx;
- float z = *ptr_gradOutput;
+ T *ptr_gradInput = gradInput + y_start*input_w + x_start;
+ T *ptr_gradOutput = gradOutput + yy*output_w + xx;
+ THCIndex_t *ptr_ind_x = indices_x + yy*output_w + xx;
+ THCIndex_t *ptr_ind_y = indices_y + yy*output_w + xx;
+ T z = *ptr_gradOutput;
int argmax_x = (*ptr_ind_x) - TH_INDEX_BASE;
int argmax_y = (*ptr_ind_y) - TH_INDEX_BASE;
@@ -185,174 +189,7 @@ __global__ void atomicadaptivemaxgradinput(
}
}
-void THNN_CudaSpatialAdaptiveMaxPooling_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices, int nOutputCols, int nOutputRows)
-{
- THCUNN_assertSameGPU(state, 3, input, output, indices);
-
- float *indices_data;
- float *output_data;
- float *input_data;
-
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
-
- if (input->nDimension == 3) {
- long nInputCols = input->size[2];
- long nInputRows = input->size[1];
- long nInputPlane = input->size[0];
-
- long istride_d = input->stride[0];
- long istride_h = input->stride[1];
- long istride_w = input->stride[2];
-
- input_data = THCudaTensor_data(state, input);
-
- THCudaTensor_resize3d(state, output, nInputPlane, nOutputRows, nOutputCols);
- THCudaTensor_resize4d(state, indices, 2, nInputPlane, nOutputRows, nOutputCols);
-
- indices_data = THCudaTensor_data(state, indices);
- output_data = THCudaTensor_data(state, output);
-
- // cuda blocks & threads:
- int yblocks = (int)(16L / nInputPlane);
- yblocks = yblocks < 1 ? 1 : yblocks;
- dim3 blocks(nInputPlane,yblocks);
- dim3 threads(32,8);
-
- // run maxpool kernel
- adaptivemaxpool <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (input_data, output_data,
- indices_data+nInputPlane*nOutputCols*nOutputRows, indices_data,
- nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
- istride_h, istride_w, istride_d);
- THCudaCheck(cudaGetLastError());
-
- } else {
- long nInputCols = input->size[3];
- long nInputRows = input->size[2];
- long nInputPlane = input->size[1];
- long nbatch = input->size[0];
-
- long istride_d = input->stride[1];
- long istride_h = input->stride[2];
- long istride_w = input->stride[3];
-
- input = THCudaTensor_newContiguous(state, input);
- input_data = THCudaTensor_data(state, input);
-
- THCudaTensor_resize4d(state, output, nbatch, nInputPlane, nOutputRows, nOutputCols);
- THCudaTensor_resize5d(state, indices, 2, nbatch, nInputPlane, nOutputRows, nOutputCols);
-
- indices_data = THCudaTensor_data(state, indices);
- output_data = THCudaTensor_data(state, output);
-
- // cuda blocks & threads:
- int yblocks = (int)(16L / nInputPlane);
- yblocks = yblocks < 1 ? 1 : yblocks;
- dim3 blocks(nInputPlane*nbatch,yblocks);
- dim3 threads(32,8);
-
- // run maxpool kernel
- adaptivemaxpool <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (input_data, output_data,
- indices_data+nbatch*nInputPlane*nOutputCols*nOutputRows, indices_data,
- nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
- istride_h, istride_w, istride_d);
- THCudaCheck(cudaGetLastError());
- // clean
- THCudaTensor_free(state, input);
- }
-}
-
-void THNN_CudaSpatialAdaptiveMaxPooling_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *indices)
-{
- bool atomic = true; // suboptimal, but without atomic it doesn't pass the tests
-
- THCUNN_assertSameGPU(state, 4, input, indices, gradOutput, gradInput);
-
- float *indices_data;
- float *gradInput_data;
- float *gradOutput_data;
-
- gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-
- if (input->nDimension == 3) {
- long nInputCols = input->size[2];
- long nInputRows = input->size[1];
- long nInputPlane = input->size[0];
- long nOutputCols = gradOutput->size[2];
- long nOutputRows = gradOutput->size[1];
-
- //bool atomic = (nInputCols%nOutputCols != 0) || (nInputRows%nOutputRows != 0);
-
- THCudaTensor_resizeAs(state, gradInput, input);
- THCudaTensor_zero(state, gradInput);
-
- indices_data = THCudaTensor_data(state, indices);
- gradOutput_data = THCudaTensor_data(state, gradOutput);
- gradInput_data = THCudaTensor_data(state, gradInput);
-
- // cuda blocks & threads:
- int yblocks = (int)(16L / nInputPlane);
- yblocks = yblocks < 1 ? 1 : yblocks;
- dim3 blocks(nInputPlane,yblocks);
- dim3 threads(32,8);
-
- if(atomic)
- {
- // run updateGradInput kernel, accumulate gradients atomically
- atomicadaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data,
- indices_data+nInputPlane*nOutputCols*nOutputRows, indices_data,
- nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
- }
- else
- {
- // run updateGradInput kernel
- atomicadaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data,
- indices_data+nInputPlane*nOutputCols*nOutputRows, indices_data,
- nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
- }
- THCudaCheck(cudaGetLastError());
- } else {
- long nInputCols = input->size[3];
- long nInputRows = input->size[2];
- long nInputPlane = input->size[1];
- long nbatch = input->size[0];
- long nOutputCols = gradOutput->size[3];
- long nOutputRows = gradOutput->size[2];
-
- //bool atomic = //(nInputCols%nOutputCols != 0) || (nInputRows%nOutputRows != 0);
-
- THCudaTensor_resizeAs(state, gradInput, input);
- THCudaTensor_zero(state, gradInput);
-
- indices_data = THCudaTensor_data(state, indices);
- gradOutput_data = THCudaTensor_data(state, gradOutput);
- gradInput_data = THCudaTensor_data(state, gradInput);
-
- // cuda blocks & threads:
- int yblocks = (int)(16L / nInputPlane);
- yblocks = yblocks < 1 ? 1 : yblocks;
- dim3 blocks(nInputPlane*nbatch,yblocks);
- dim3 threads(32,8);
-
- if(atomic)
- {
- // run updateGradInput kernel, accumulate gradients atomically
- atomicadaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data,
- indices_data+nbatch*nInputPlane*nOutputCols*nOutputRows, indices_data,
- nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
- }
- else
- {
- // run updateGradInput kernel
- adaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data,
- indices_data+nbatch*nInputPlane*nOutputCols*nOutputRows, indices_data,
- nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
- }
- THCudaCheck(cudaGetLastError());
- }
-
- // clean
- THCudaTensor_free(state,gradOutput);
-
-}
+#include "generic/SpatialAdaptiveMaxPooling.cu"
+#include "THCGenerateFloatTypes.h"
#undef CUDA_MAX_THREADS
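
The deleted float-only entry points above reappear in generic/SpatialAdaptiveMaxPooling.cu, which the two includes at the end of the file compile once per floating-point type. A rough sketch of the mechanism, assuming the usual TH/THC generation headers (the body shown is illustrative of the pattern, not copied from this commit):

// generic/SpatialAdaptiveMaxPooling.cu guards itself so it can be re-included:
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/SpatialAdaptiveMaxPooling.cu"
#else
// ... host functions written against the 'real'/'accreal' typedefs ...
#endif
// THCGenerateFloatTypes.h then re-includes THC_GENERIC_FILE once each for
// half, float and double, so a single source expands to the
// THNN_CudaHalf*, THNN_Cuda* and THNN_CudaDouble* entry points.
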
diff --git a/lib/THCUNN/SpatialAveragePooling.cu b/lib/THCUNN/SpatialAveragePooling.cu
index 65060c9..5f77e06 100644
--- a/lib/THCUNN/SpatialAveragePooling.cu
+++ b/lib/THCUNN/SpatialAveragePooling.cu
@@ -1,7 +1,9 @@
#include "THCUNN.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
#include "common.h"
-template <typename Dtype, bool COUNT_INCLUDE_PAD>
+template <typename Dtype, typename Acctype, bool COUNT_INCLUDE_PAD>
__global__ void AvePoolForward(const int nthreads,
const Dtype* const bottom_data, const int num, const int channels,
const int height, const int width, const int pooled_height,
@@ -22,7 +24,7 @@ __global__ void AvePoolForward(const int nthreads,
wstart = max(wstart, 0);
hend = min(hend, height);
wend = min(wend, width);
- Dtype aveval = 0;
+ Acctype aveval = Acctype(0);
const Dtype* const bottom_slice = bottom_data + (n * channels + c) * height * width;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
@@ -30,87 +32,13 @@ __global__ void AvePoolForward(const int nthreads,
}
}
if(COUNT_INCLUDE_PAD)
- top_data[index] = aveval / pool_size;
+ top_data[index] = ScalarConvert<Acctype, Dtype>::to(aveval / pool_size);
else
- top_data[index] = aveval / ((hend - hstart) * (wend - wstart));
+ top_data[index] = ScalarConvert<Acctype, Dtype>::to(aveval / ((hend - hstart) * (wend - wstart)));
}
}
-
-void THNN_CudaSpatialAveragePooling_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode, bool count_include_pad)
-{
- THCUNN_assertSameGPU(state, 2, input, output);
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
-
- long nInputCols, nInputRows, nInputPlane, batchSize;
- long nOutputCols, nOutputRows;
-
- if (input->nDimension == 3) {
- nInputCols = input->size[2];
- nInputRows = input->size[1];
- nInputPlane = input->size[0];
- batchSize = 1;
- }
- else
- {
- nInputCols = input->size[3];
- nInputRows = input->size[2];
- nInputPlane = input->size[1];
- batchSize = input->size[0];
- }
-
- THArgCheck(nInputCols >= kW - 2*padW && nInputRows >= kH - 2*padH, 2, "input image smaller than kernel size");
- THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");
-
- if(ceil_mode) {
- nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
- nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
- }
- else {
- nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
- nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
- }
- if (padW || padH)
- {
- // ensure that the last pooling starts inside the image
- // needed to avoid problems in ceil mode
- if ((nOutputRows - 1)*dH >= nInputRows + padH)
- --nOutputRows;
- if ((nOutputCols - 1)*dW >= nInputCols + padW)
- --nOutputCols;
- }
-
- input = THCudaTensor_newContiguous(state, input);
- float* input_data = THCudaTensor_data(state, input);
-
- THCudaTensor_resize4d(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols);
-
- float* output_data = THCudaTensor_data(state, output);
-
- int count = THCudaTensor_nElement(state, output);
-
- if(count_include_pad)
- AvePoolForward<float, true>
- <<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>(
- count, input_data,
- batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
- kH, kW, dH, dW, padH, padW, output_data);
- else
- AvePoolForward<float, false>
- <<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>(
- count, input_data,
- batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
- kH, kW, dH, dW, padH, padW, output_data);
- THCudaCheck(cudaGetLastError());
-
- if(input->nDimension == 3)
- THCudaTensor_resize3d(state, output, nInputPlane, nOutputRows, nOutputCols);
-
- THCudaTensor_free(state, input);
-
-}
-
-template <typename Dtype, bool COUNT_INCLUDE_PAD>
+template <typename Dtype, typename Acctype, bool COUNT_INCLUDE_PAD>
__global__ void AvePoolBackward(const int nthreads, const Dtype* const top_diff,
const int num, const int channels, const int height,
const int width, const int pooled_height, const int pooled_width,
@@ -128,7 +56,7 @@ __global__ void AvePoolBackward(const int nthreads, const Dtype* const top_diff,
const int phend = min(h / stride_h + 1, pooled_height);
const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
const int pwend = min(w / stride_w + 1, pooled_width);
- Dtype gradient = 0;
+ Acctype gradient = Acctype(0);
const Dtype* const top_diff_slice =
top_diff + (n * channels + c) * pooled_height * pooled_width;
for (int ph = phstart; ph < phend; ++ph) {
@@ -149,76 +77,9 @@ __global__ void AvePoolBackward(const int nthreads, const Dtype* const top_diff,
gradient += top_diff_slice[ph * pooled_width + pw] / ((hend - hstart) * (wend - wstart));
}
}
- bottom_diff[index] = gradient;
- }
-}
-
-void THNN_CudaSpatialAveragePooling_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode, bool count_include_pad)
-{
- THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
-
- input = THCudaTensor_newContiguous(state, input);
- gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-
- long nInputCols, nInputRows, nInputPlane, batchSize;
- long nOutputCols, nOutputRows;
-
- if (input->nDimension == 3) {
- nInputCols = input->size[2];
- nInputRows = input->size[1];
- nInputPlane = input->size[0];
- batchSize = 1;
- }
- else
- {
- nInputCols = input->size[3];
- nInputRows = input->size[2];
- nInputPlane = input->size[1];
- batchSize = input->size[0];
+ bottom_diff[index] = ScalarConvert<Acctype, Dtype>::to(gradient);
}
-
- if(ceil_mode) {
- nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
- nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
- }
- else {
- nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
- nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
- }
- if (padW || padH)
- {
- // ensure that the last pooling starts inside the image
- // needed to avoid problems in ceil mode
- if ((nOutputRows - 1)*dH >= nInputRows + padH)
- --nOutputRows;
- if ((nOutputCols - 1)*dW >= nInputCols + padW)
- --nOutputCols;
- }
-
- THCudaTensor_resizeAs(state, gradInput, input);
-
- int count = THCudaTensor_nElement(state, input);
-
- if(count_include_pad)
- AvePoolBackward<float, true>
- <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
- (count,
- THCudaTensor_data(state, gradOutput),
- batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
- kH, kW, dH, dW, padH, padW,
- THCudaTensor_data(state, gradInput));
- else
- AvePoolBackward<float, false>
- <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
- (count,
- THCudaTensor_data(state, gradOutput),
- batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
- kH, kW, dH, dW, padH, padW,
- THCudaTensor_data(state, gradInput));
- THCudaCheck(cudaGetLastError());
-
- // clean
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, gradOutput);
}
+#include "generic/SpatialAveragePooling.cu"
+#include "THCGenerateFloatTypes.h"
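
The kernel-side change in this file is the idiom repeated across the patch: accumulate in Acctype, narrow once on store. For half inputs the instantiation pairs Dtype = half with a float accumulator, so long sums keep float precision. A minimal sketch of the idiom with THCHalfAutoNumerics.cuh in scope (illustrative, not part of the commit):

template <typename Dtype, typename Acctype>
__device__ Dtype windowMean(const Dtype *slice, int n) {
  Acctype sum = Acctype(0);
  for (int i = 0; i < n; ++i)
    sum += slice[i];                      // promoted to Acctype, e.g. half -> float
  return ScalarConvert<Acctype, Dtype>::to(sum / n);  // convert only on the final store
}
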
diff --git a/lib/THCUNN/SpatialClassNLLCriterion.cu b/lib/THCUNN/SpatialClassNLLCriterion.cu
index 0469520..4b2f523 100644
--- a/lib/THCUNN/SpatialClassNLLCriterion.cu
+++ b/lib/THCUNN/SpatialClassNLLCriterion.cu
@@ -1,29 +1,30 @@
#include "THCUNN.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
#include "common.h"
-#include <stdio.h>
-#include <assert.h>
-
#include <thrust/functional.h>
+template <typename T, typename AccumT>
__global__ void cunn_SpatialClassNLLCriterion_updateOutput_kernel(
- float *output,
- float *total_weight,
- float *input,
- long *target,
- float *weights,
+ T *output,
+ T *total_weight,
+ T *input,
+ THCIndex_t *target,
+ T *weights,
int size_average,
int batch_size,
int n_classes,
int map_nelem,
int blocks_per_sample)
{
- __shared__ float partial_sums[CUDA_NUM_THREADS];
+ __shared__ AccumT partial_sums[CUDA_NUM_THREADS];
int i, t;
- float cur_weight;
- float input_sum = 0;
- float acc_weight = 0;
+ T cur_weight;
+ AccumT input_sum = 0;
+ AccumT acc_weight = 0;
int sample = blockIdx.x / blocks_per_sample;
int toffset = sample * map_nelem;
@@ -34,30 +35,31 @@ __global__ void cunn_SpatialClassNLLCriterion_updateOutput_kernel(
i += step) {
t = target[toffset + i] - TH_INDEX_BASE;
assert(t >= 0 && t < n_classes);
- cur_weight = weights ? weights[t] : 1.0f;
+ cur_weight = weights ? weights[t] : ScalarConvert<int, T>::to(1);
input_sum -= input[ioffset + i + map_nelem * t] * cur_weight;
acc_weight += cur_weight;
}
__syncthreads();
- input_sum = reduceBlock(partial_sums, blockDim.x, input_sum, thrust::plus<float>(), 0.0f);
- acc_weight = reduceBlock(partial_sums, blockDim.x, acc_weight, thrust::plus<float>(), 0.0f);
+ input_sum = reduceBlock(partial_sums, blockDim.x, input_sum, thrust::plus<AccumT>(), AccumT(0));
+ acc_weight = reduceBlock(partial_sums, blockDim.x, acc_weight, thrust::plus<AccumT>(), AccumT(0));
if (threadIdx.x == 0) {
- atomicAdd(total_weight, acc_weight);
+ atomicAdd(total_weight, ScalarConvert<AccumT, T>::to(acc_weight));
if (size_average && acc_weight > 0)
- atomicAdd(output, input_sum / acc_weight / gridDim.x);
+ atomicAdd(output, ScalarConvert<AccumT, T>::to(input_sum / acc_weight / gridDim.x));
else
- atomicAdd(output, input_sum);
+ atomicAdd(output, ScalarConvert<AccumT, T>::to(input_sum));
}
}
+template<typename T>
__global__ void cunn_SpatialClassNLLCriterion_updateGradInput_kernel(
- float *gradInput,
- long *target,
- float *weights,
- float *total_weight,
+ T *gradInput,
+ THCIndex_t *target,
+ T *weights,
+ T *total_weight,
int size_average,
int batch_size,
int n_classes,
@@ -68,7 +70,7 @@ __global__ void cunn_SpatialClassNLLCriterion_updateGradInput_kernel(
return;
int i, t;
- float norm = size_average ? (1.0f / *total_weight) : 1.0f;
+ T norm = size_average ? (ScalarConvert<int, T>::to(1) / *total_weight) : ScalarConvert<int, T>::to(1);
int sample = blockIdx.x / blocks_per_sample;
int step = blockDim.x * blocks_per_sample;
@@ -79,127 +81,9 @@ __global__ void cunn_SpatialClassNLLCriterion_updateGradInput_kernel(
i += step) {
t = (int)target[toffset + i] - TH_INDEX_BASE;
assert(t >= 0 && t < n_classes);
- gradInput[ioffset + i + map_nelem * t] = -(weights ? weights[t] : 1.0f) * norm;
+ gradInput[ioffset + i + map_nelem * t] = -(weights ? weights[t] : ScalarConvert<int, T>::to(1)) * norm;
}
}
-void THNN_CudaSpatialClassNLLCriterion_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaLongTensor *target,
- THCudaTensor *output,
- bool sizeAverage,
- THCudaTensor *weights,
- THCudaTensor *total_weight)
-{
- THArgCheck(THCudaLongTensor_nDimension(state, target) == 3, 1,
- "only batches of spatial targets supported (3D tensors)");
- THArgCheck(THCudaTensor_nDimension(state, input) == 4, 2,
- "only batches of spatial inputs supported (4D tensors)");
- if (weights && THCudaTensor_nElement(state, weights) != THCudaTensor_size(state, input, 1)) {
- THError("weight tensor should be defined either for all or no classes");
- }
-
- if (weights)
- THCUNN_assertSameGPU(state, 5, input, target, weights, output, total_weight);
- else
- THCUNN_assertSameGPU(state, 4, input, target, output, total_weight);
-
- input = THCudaTensor_newContiguous(state, input);
- weights = weights ? THCudaTensor_newContiguous(state, weights) : NULL;
- target = THCudaLongTensor_newContiguous(state, target);
-
- float *input_data = THCudaTensor_data(state, input);
- float *weights_data = weights ? THCudaTensor_data(state, weights) : NULL;
- long *target_data = THCudaLongTensor_data(state, target);
- float *output_data = THCudaTensor_data(state, output);
- float *total_weight_data = THCudaTensor_data(state, total_weight);
-
- long batch_size = THCudaLongTensor_size(state, target, 0);
- long map_nelem = THCudaLongTensor_nElement(state, target) / batch_size;
- int blocks_per_sample = GET_BLOCKS(map_nelem) / 128;
- blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample;
- int total_blocks = blocks_per_sample * batch_size;
-
- THCudaTensor_fill(state, output, 0);
- THCudaTensor_fill(state, total_weight, 0);
-
- cunn_SpatialClassNLLCriterion_updateOutput_kernel
- <<<total_blocks, CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
- output_data,
- total_weight_data,
- input_data,
- target_data,
- weights_data,
- sizeAverage,
- THCudaTensor_size(state, input, 0),
- THCudaTensor_size(state, input, 1),
- THCudaTensor_size(state, input, 2) * THCudaTensor_size(state, input, 3),
- blocks_per_sample
- );
- THCudaCheck(cudaGetLastError());
-
- if (weights)
- THCudaTensor_free(state, weights);
- THCudaLongTensor_free(state, target);
- THCudaTensor_free(state, input);
-}
-
-void THNN_CudaSpatialClassNLLCriterion_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaLongTensor *target,
- THCudaTensor *gradInput,
- bool sizeAverage,
- THCudaTensor *weights,
- THCudaTensor *total_weight)
-{
- THArgCheck(THCudaLongTensor_nDimension(state, target) == 3, 1,
- "only batches of spatial targets supported (3D tensors)");
- THArgCheck(THCudaTensor_nDimension(state, input) == 4, 2,
- "only batches of spatial inputs supported (4D tensors)");
- THArgCheck(THCudaTensor_isContiguous(state, gradInput), 4,
- "gradInput must be contiguous");
- if (weights && THCudaTensor_nElement(state, weights) != THCudaTensor_size(state, input, 1)) {
- THError("weight tensor should be defined either for all or no classes");
- }
-
- if (weights)
- THCUNN_assertSameGPU(state, 5, weights, input, target, gradInput, total_weight);
- else
- THCUNN_assertSameGPU(state, 4, input, target, gradInput, total_weight);
-
- input = THCudaTensor_newContiguous(state, input);
- weights = weights ? THCudaTensor_newContiguous(state, weights) : NULL;
- target = THCudaLongTensor_newContiguous(state, target);
-
- float *weights_data = weights ? THCudaTensor_data(state, weights) : NULL;
- float *gradInput_data = THCudaTensor_data(state, gradInput);
- long *target_data = THCudaLongTensor_data(state, target);
- float *total_weight_data = THCudaTensor_data(state, total_weight);
-
- long batch_size = THCudaLongTensor_size(state, target, 0);
- long map_nelem = THCudaLongTensor_nElement(state, target) / batch_size;
- int blocks_per_sample = GET_BLOCKS(map_nelem) / 128;
- blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample;
- int total_blocks = blocks_per_sample * batch_size;
-
- cunn_SpatialClassNLLCriterion_updateGradInput_kernel
- <<<total_blocks, CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
- gradInput_data,
- target_data,
- weights_data,
- total_weight_data,
- sizeAverage,
- THCudaTensor_size(state, input, 0),
- THCudaTensor_size(state, input, 1),
- THCudaTensor_size(state, input, 2) *THCudaTensor_size(state, input, 3),
- blocks_per_sample
- );
- THCudaCheck(cudaGetLastError());
-
- if (weights)
- THCudaTensor_free(state, weights);
- THCudaLongTensor_free(state, target);
- THCudaTensor_free(state, input);
-}
+#include "generic/SpatialClassNLLCriterion.cu"
+#include "THCGenerateFloatTypes.h"
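
THCAtomics.cuh joins the includes because the reduction epilogue now runs for every instantiated type: CUDA itself ships atomicAdd only for 32-bit integers and float (double and half arrive on newer architectures), and that header supplies the remaining overloads. The per-block epilogue, restated here in simplified form (the size_average branch is omitted):

if (threadIdx.x == 0) {
  // block-local sums were reduced in AccumT; convert once, then combine
  // across blocks with the type-appropriate atomicAdd from THCAtomics.cuh
  atomicAdd(total_weight, ScalarConvert<AccumT, T>::to(acc_weight));
  atomicAdd(output, ScalarConvert<AccumT, T>::to(input_sum));
}
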
diff --git a/lib/THCUNN/SpatialConvolutionLocal.cu b/lib/THCUNN/SpatialConvolutionLocal.cu
index 7f758ef..e5b1f98 100644
--- a/lib/THCUNN/SpatialConvolutionLocal.cu
+++ b/lib/THCUNN/SpatialConvolutionLocal.cu
@@ -2,321 +2,8 @@
#include "common.h"
#include "im2col.h"
-void THNN_CudaSpatialConvolutionLocal_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight,
- THCudaTensor *bias,
- THCudaTensor *finput,
- THCudaTensor *fgradInput,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- long inputWidth, long inputHeight,
- long outputWidth, long outputHeight)
-{
- THCUNN_assertSameGPU(state, 5, input, output, weight,
- bias, finput);
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
- long nInputPlane = THCudaTensor_size(state,weight,2)/(kW*kH);
- long nOutputPlane = THCudaTensor_size(state,weight,1);
-
- int batch = 1;
- if (input->nDimension == 3) {
- // Force batch
- batch = 0;
- THCudaTensor_resize4d(state, input, 1, nInputPlane, inputHeight, inputWidth);
- }
-
- // Batch size + input planes
- long batchSize = input->size[0];
-
- // Resize output
- THCudaTensor_resize4d(state, output, batchSize, nOutputPlane, outputHeight, outputWidth);
-
- // Augment the input
- THCudaTensor_resize3d(state, finput, batchSize, nInputPlane*kW*kH, outputHeight*outputWidth);
-
- // Helpers
- THCudaTensor *input_n = THCudaTensor_new(state);
- THCudaTensor *finput_n = THCudaTensor_new(state);
- THCudaTensor *output_n = THCudaTensor_new(state);
-
- // For each elt in batch, do:
- for (int elt = 0; elt < batchSize; elt ++) {
- THCudaTensor *finput3d, *output3d;
- THCudaTensor *wslice = THCudaTensor_new(state);
- THCudaTensor *islice = THCudaTensor_new(state);
- THCudaTensor *oslice = THCudaTensor_new(state);
-
- // Matrix multiply per output:
- THCudaTensor_select(state, input_n, input, 0, elt);
- THCudaTensor_select(state, finput_n, finput, 0, elt);
- THCudaTensor_select(state, output_n, output, 0, elt);
-
- // Extract columns:
- im2col(
- THCState_getCurrentStream(state),
- THCudaTensor_data(state, input_n),
- nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
- 1, 1, THCudaTensor_data(state, finput_n)
- );
-
- output3d = THCudaTensor_newWithStorage3d(state, output_n->storage, output_n->storageOffset,
- outputHeight*outputWidth, 1,
- nOutputPlane, outputHeight*outputWidth,
- 1, nOutputPlane*outputHeight*outputWidth);
-
- finput3d = THCudaTensor_newWithStorage3d(state, finput_n->storage, finput_n->storageOffset,
- outputHeight*outputWidth, 1,
- kW*kH*nInputPlane, outputHeight*outputWidth,
- 1, kW*kH*nInputPlane*outputHeight*outputWidth);
-
- THCudaTensor_copy(state, output_n, bias);
-
- for (int i = 0; i < outputHeight; i++) {
- for(int j = 0; j < outputWidth; j++) {
- int sliceidx = i * outputWidth + j;
- THCudaTensor_select(state, wslice, weight, 0, sliceidx);
- THCudaTensor_select(state, islice, finput3d, 0, sliceidx);
- THCudaTensor_select(state, oslice, output3d, 0, sliceidx);
- THCudaTensor_addmm(state, oslice, 1.0, oslice, 1.0, wslice, islice);
- }
- }
-
-
- // weight: oH*oW x nOutputPlane x nInputPlane*kH*kW
- // finput3d: oH*oW x nInputPlane*kH*kW x 1
- // THCudaTensor_baddbmm(state, output3d, 1.0, output3d, 1.0, weight, finput3d);
- // output3d: oH*oW x nOutputPlane x 1
-
- THCudaTensor_free(state, output3d);
- THCudaTensor_free(state, finput3d);
- THCudaTensor_free(state, wslice);
- THCudaTensor_free(state, islice);
- THCudaTensor_free(state, oslice);
- }
-
- // Free
- THCudaTensor_free(state, input_n);
- THCudaTensor_free(state, finput_n);
- THCudaTensor_free(state, output_n);
-
- // Resize output
- if (batch == 0) {
- THCudaTensor_resize3d(state, output, nOutputPlane, outputHeight, outputWidth);
- THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
- }
-}
-
-void THNN_CudaSpatialConvolutionLocal_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *weight,
- THCudaTensor *finput,
- THCudaTensor *fgradInput,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- long inputWidth, long inputHeight,
- long outputWidth, long outputHeight)
-{
- THCUNN_assertSameGPU(state, 5, input, gradOutput, weight,
- fgradInput, gradInput);
-
- long nInputPlane = THCudaTensor_size(state,weight,2)/(kW*kH);
- long nOutputPlane = THCudaTensor_size(state,weight,1);
-
- int batch = 1;
- if (input->nDimension == 3) {
- // Force batch
- batch = 0;
- THCudaTensor_resize4d(state, input, 1, nInputPlane, inputHeight, inputWidth);
- THCudaTensor_resize4d(state, gradOutput, 1, nOutputPlane, outputHeight, outputWidth);
- }
-
- // Batch size + input planes
- long batchSize = input->size[0];
-
- // Resize output
- THCudaTensor_resize4d(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
-
- // Resize temporary columns
- THCudaTensor_resize3d(state, fgradInput, batchSize, nInputPlane*kW*kH, outputHeight*outputWidth);
-
- // Helpers
- THCudaTensor *gradInput_n = THCudaTensor_new(state);
- THCudaTensor *fgradInput_n = THCudaTensor_new(state);
- THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
- THCudaTensor_transpose(state, weight, weight, 1, 2);
-
- // For each elt in batch, do:
- for (int elt = 0; elt < batchSize; elt ++) {
- THCudaTensor *gradOutput3d, *fgradInput3d;
- THCudaTensor *wslice = THCudaTensor_new(state);
- THCudaTensor *gislice = THCudaTensor_new(state);
- THCudaTensor *goslice = THCudaTensor_new(state);
-
- // Matrix multiply per sample:
- THCudaTensor_select(state, gradInput_n, gradInput, 0, elt);
- THCudaTensor_select(state, fgradInput_n, fgradInput, 0, elt);
- THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
- gradOutput3d = THCudaTensor_newWithStorage3d(state, gradOutput_n->storage, gradOutput_n->storageOffset,
- outputHeight*outputWidth, 1,
- nOutputPlane, outputHeight*outputWidth,
- 1, nOutputPlane*outputHeight*outputWidth);
- fgradInput3d = THCudaTensor_newWithStorage3d(state, fgradInput_n->storage, fgradInput_n->storageOffset,
- outputHeight*outputWidth, 1,
- kW*kH*nInputPlane, outputHeight*outputWidth,
- 1, kW*kH*nInputPlane*outputHeight*outputWidth);
-
- for (int i = 0; i < outputHeight; i++) {
- for(int j = 0; j < outputWidth; j++) {
- int sliceidx = i * outputWidth + j;
- THCudaTensor_select(state, wslice, weight, 0, sliceidx);
- THCudaTensor_select(state, gislice, fgradInput3d, 0, sliceidx);
- THCudaTensor_select(state, goslice, gradOutput3d, 0, sliceidx);
- THCudaTensor_addmm(state, gislice, 0.0, gislice, 1.0, wslice, goslice);
- }
- }
-
- // weight: oH*oW x nInputPlane*kH*kW x nOutputPlane
- // gradOutput3d: oH*oW x nOutputPlane x 1
- //THCudaTensor_baddbmm(state, fgradInput3d, 0.0, fgradInput3d, 1.0, weight, gradOutput3d);
- // fgradInput3d: oH*oW x nInputPlane*kH*kW x 1
-
- // Unpack columns back into input:
- col2im(
- THCState_getCurrentStream(state),
- THCudaTensor_data(state, fgradInput_n),
- nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
- 1, 1, THCudaTensor_data(state, gradInput_n)
- );
-
- THCudaTensor_free(state, gradOutput3d);
- THCudaTensor_free(state, fgradInput3d);
- THCudaTensor_free(state, wslice);
- THCudaTensor_free(state, gislice);
- THCudaTensor_free(state, goslice);
- }
-
- // Free
- THCudaTensor_free(state, gradInput_n);
- THCudaTensor_free(state, fgradInput_n);
- THCudaTensor_free(state, gradOutput_n);
-
- // Resize output
- if (batch == 0) {
- THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
- THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
- THCudaTensor_resize3d(state, gradInput, nInputPlane, inputHeight, inputWidth);
- }
-
- THCudaTensor_transpose(state, weight, weight, 1, 2);
-}
-
-void THNN_CudaSpatialConvolutionLocal_accGradParameters(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias,
- THCudaTensor *finput,
- THCudaTensor *fgradInput,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- long inputWidth, long inputHeight,
- long outputWidth, long outputHeight,
- float scale)
-{
- THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight,
- gradBias, finput);
-
- long nInputPlane = THCudaTensor_size(state,gradWeight,2)/(kW*kH);
- long nOutputPlane = THCudaTensor_size(state,gradWeight,1);
-
- int batch = 1;
- if (input->nDimension == 3) {
- // Force batch
- batch = 0;
- THCudaTensor_resize4d(state, input, 1, nInputPlane, inputHeight, inputWidth);
- THCudaTensor_resize4d(state, gradOutput, 1, nOutputPlane, outputHeight, outputWidth);
- }
-
- // Batch size + input planes
- long batchSize = input->size[0];
-
- // Helpers
- THCudaTensor *input_n = THCudaTensor_new(state);
- THCudaTensor *finput_n = THCudaTensor_new(state);
- THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
- // For each elt in batch, do:
- for (int elt = 0; elt < batchSize; elt ++) {
- THCudaTensor *gradOutput3d, *finput3d;
- THCudaTensor *gwslice = THCudaTensor_new(state);
- THCudaTensor *islice = THCudaTensor_new(state);
- THCudaTensor *goslice = THCudaTensor_new(state);
-
- // Matrix multiply per output:
- THCudaTensor_select(state, input_n, input, 0, elt);
- THCudaTensor_select(state, finput_n, finput, 0, elt);
- THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
- gradOutput3d = THCudaTensor_newWithStorage3d(state, gradOutput_n->storage, gradOutput_n->storageOffset,
- outputHeight*outputWidth, 1,
- nOutputPlane, outputHeight*outputWidth,
- 1, nOutputPlane*outputHeight*outputWidth);
- finput3d = THCudaTensor_newWithStorage3d(state, finput_n->storage, finput_n->storageOffset,
- outputHeight*outputWidth, 1,
- 1, kW*kH*nInputPlane*outputHeight*outputWidth,
- kW*kH*nInputPlane, outputHeight*outputWidth);
-
- // Extract columns:
- im2col(
- THCState_getCurrentStream(state),
- THCudaTensor_data(state, input_n),
- nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
- 1, 1, THCudaTensor_data(state, finput_n)
- );
-
- for (int i = 0; i < outputHeight; i++) {
- for(int j = 0; j < outputWidth; j++) {
- int sliceidx = i * outputWidth + j;
- THCudaTensor_select(state, gwslice, gradWeight, 0, sliceidx);
- THCudaTensor_select(state, goslice, gradOutput3d, 0, sliceidx);
- THCudaTensor_select(state, islice, finput3d, 0, sliceidx);
- THCudaTensor_addmm(state, gwslice, 1.0, gwslice, scale, goslice, islice);
- }
- }
- // gradOutput3d: oH*oW x nOutputPlane x 1
- // finput3d: oH*oW x 1 x kW*kH*nInputPlane
- //THCudaTensor_baddbmm(state, gradWeight, 1.0, gradWeight, scale, gradOutput3d, finput3d);
- // gradWeight: oH*oW x nOutputPlane x kW*kH*nInputPlane
-
- THCudaTensor_cadd(state, gradBias, gradBias, scale, gradOutput_n);
-
- THCudaTensor_free(state, gradOutput3d);
- THCudaTensor_free(state, finput3d);
- THCudaTensor_free(state, gwslice);
- THCudaTensor_free(state, goslice);
- THCudaTensor_free(state, islice);
- }
-
- // Free
- THCudaTensor_free(state, input_n);
- THCudaTensor_free(state, finput_n);
- THCudaTensor_free(state, gradOutput_n);
-
- // Resize
- if (batch == 0) {
- THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
- THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
- }
-}
+#include "generic/SpatialConvolutionLocal.cu"
+#include "THCGenerateFloatTypes.h"
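
The shape bookkeeping of this locally-connected layer is only spelled out in scattered comments above, so it is worth restating next to the generic rewrite (names follow the removed code):

// weight   : (oH*oW) x nOutputPlane x (nInputPlane*kW*kH)  -- one filter bank per output pixel
// finput3d : (oH*oW) x (nInputPlane*kW*kH) x 1             -- strided view over the im2col buffer
// output3d : (oH*oW) x nOutputPlane x 1                    -- strided view over output_n
// per output pixel s = i*outputWidth + j the inner loop does
//   output3d[s] += weight[s] * finput3d[s]                 (the addmm on the selected slices)
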
diff --git a/lib/THCUNN/SpatialConvolutionMM.cu b/lib/THCUNN/SpatialConvolutionMM.cu
index 4714e9d..2a88047 100644
--- a/lib/THCUNN/SpatialConvolutionMM.cu
+++ b/lib/THCUNN/SpatialConvolutionMM.cu
@@ -2,352 +2,8 @@
#include "common.h"
#include "im2col.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
-void THNN_CudaSpatialConvolutionMM_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *weight, THCudaTensor *bias, THCudaTensor *columns, THCudaTensor *ones, int kW, int kH, int dW, int dH, int padW, int padH) {
-
- THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones);
- if (bias) {
- THCUNN_assertSameGPU(state, 2, weight, bias);
- }
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
- THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
- THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
- THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
- THArgCheck(weight->nDimension == 2 || weight->nDimension == 4, 4, "weight tensor should be 2D or 4D");
-
- int freeWeight = 0;
-
- // Params:
- int nInputPlane = weight->nDimension == 2 ? weight->size[1]/(kH*kW) : weight->size[1];
- int nOutputPlane = weight->size[0];
-
- if (weight->nDimension == 4) {
- long s1 = weight->size[0];
- long s2 = weight->size[1] * weight->size[2] * weight->size[3];
- weight = THCudaTensor_newWithStorage2d(state, weight->storage, weight->storageOffset, s1, -1, s2, -1);
- freeWeight = 1;
- }
-
- int batch = 1;
- if (input->nDimension == 3) {
- THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane don't match");
- // Force batch
- batch = 0;
- THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
- } else {
- THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane don't match");
- }
-
- long inputWidth = input->size[3];
- long inputHeight = input->size[2];
- long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
- long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
-
- if (outputWidth < 1 || outputHeight < 1)
- THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
- nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
-
- // Batch size + input planes
- long batchSize = input->size[0];
-
- // Resize output
- THCudaTensor_resize4d(state, output, batchSize, nOutputPlane, outputHeight, outputWidth);
-
- // Resize temporary columns
- THCudaTensor_resize2d(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth);
-
- // Define a buffer of ones, for bias accumulation
- // Note: this buffer can be shared with other modules; it only ever gets increased,
- // and always contains ones.
- if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
- // Resize plane and fill with ones...
- THCudaTensor_resize2d(state, ones, outputHeight, outputWidth);
- THCudaTensor_fill(state, ones, 1);
- }
-
- // Helpers
- THCudaTensor *input_n = THCudaTensor_new(state);
- THCudaTensor *output_n = THCudaTensor_new(state);
-
- // For each elt in batch, do:
- for (int elt = 0; elt < batchSize; elt ++) {
- // Matrix multiply per output:
- THCudaTensor_select(state, input_n, input, 0, elt);
- THCudaTensor_select(state, output_n, output, 0, elt);
-
- // Do Bias first:
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m_ = nOutputPlane;
- long n_ = outputHeight * outputWidth;
- long k_ = 1;
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- if (bias) {
- THCudaBlas_Sgemm(
- state,
- 't', 'n',
- n_, m_, k_,
- 1,
- THCudaTensor_data(state, ones), k_,
- THCudaTensor_data(state, bias), k_,
- 0,
- THCudaTensor_data(state, output_n), n_
- );
- } else {
- THCudaTensor_zero(state, output_n);
- }
-
- // Extract columns:
- im2col(
- THCState_getCurrentStream(state),
- THCudaTensor_data(state, input_n),
- nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
- 1, 1, THCudaTensor_data(state, columns)
- );
-
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m = nOutputPlane;
- long n = columns->size[1];
- long k = nInputPlane*kH*kW;
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- THCudaBlas_Sgemm(
- state,
- 'n', 'n',
- n, m, k,
- 1,
- THCudaTensor_data(state, columns), n,
- THCudaTensor_data(state, weight), k,
- 1,
- THCudaTensor_data(state, output_n), n
- );
- }
-
- // Free
- THCudaTensor_free(state, input_n);
- THCudaTensor_free(state, output_n);
- if (freeWeight)
- THCudaTensor_free(state, weight);
-
- // Resize output
- if (batch == 0) {
- THCudaTensor_resize3d(state, output, nOutputPlane, outputHeight, outputWidth);
- THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
- }
-}
-
-void THNN_CudaSpatialConvolutionMM_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *weight, THCudaTensor *gradColumns, THCudaTensor *ones, int kW, int kH, int dW, int dH, int padW, int padH) {
-
- THCUNN_assertSameGPU(state, 5, input, gradOutput, weight,
- gradColumns, gradInput);
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
- THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero");
- THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero");
- THArgCheck(weight->nDimension == 2 || weight->nDimension == 4, 4, "weight tensor should be 2D or 4D");
-
- // Params
- int nInputPlane = weight->nDimension == 2 ? weight->size[1]/(kW*kH) : weight->size[1];
- int nOutputPlane = weight->size[0];
-
- int freeWeight = 0;
- if (weight->nDimension == 4) {
- long s1 = weight->size[0];
- long s2 = weight->size[1] * weight->size[2] * weight->size[3];
- weight = THCudaTensor_newWithStorage2d(state, weight->storage, weight->storageOffset, s1, -1, s2, -1);
- freeWeight = 1;
- }
-
- int batch = 1;
- if (input->nDimension == 3) {
- // Force batch
- batch = 0;
- THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
- THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
- }
-
- long inputWidth = input->size[3];
- long inputHeight = input->size[2];
- long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
- long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
-
- // Batch size + input planes
- long batchSize = input->size[0];
-
- // Resize output
- THCudaTensor_resize4d(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
-
- // Resize temporary columns
- THCudaTensor_resize2d(state, gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth);
-
- // Helpers
- THCudaTensor *gradInput_n = THCudaTensor_new(state);
- THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
- // For each elt in batch, do:
- for (int elt = 0; elt < batchSize; elt ++) {
- // Matrix multiply per sample:
- THCudaTensor_select(state, gradInput_n, gradInput, 0, elt);
- THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m = nInputPlane*kW*kH;
- long n = gradColumns->size[1];
- long k = nOutputPlane;
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- THCudaBlas_Sgemm(
- state,
- 'n', 't',
- n, m, k,
- 1,
- THCudaTensor_data(state, gradOutput_n), n,
- THCudaTensor_data(state, weight), m,
- 0,
- THCudaTensor_data(state, gradColumns), n
- );
-
- // Unpack columns back into input:
- col2im(
- THCState_getCurrentStream(state),
- THCudaTensor_data(state, gradColumns),
- nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
- 1, 1, THCudaTensor_data(state, gradInput_n)
- );
- }
-
- // Free
- THCudaTensor_free(state, gradInput_n);
- THCudaTensor_free(state, gradOutput_n);
- if (freeWeight)
- THCudaTensor_free(state, weight);
-
- // Resize output
- if (batch == 0) {
- THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
- THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
- THCudaTensor_resize3d(state, gradInput, nInputPlane, inputHeight, inputWidth);
- }
-}
-
-void THNN_CudaSpatialConvolutionMM_accGradParameters(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradWeight, THCudaTensor *gradBias, THCudaTensor *columns, THCudaTensor *ones, int kW, int kH, int dW, int dH, int padW, int padH, float scale) {
-
- THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, columns, ones);
- if (gradBias) {
- THCUNN_assertSameGPU(state, 2, gradWeight, gradBias);
- }
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
- THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias");
- THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
- THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
- THArgCheck(gradWeight->nDimension == 2 || gradWeight->nDimension == 4, 4, "gradWeight tensor should be 2D or 4D");
-
- // Params
- int nInputPlane = gradWeight->nDimension == 2 ? gradWeight->size[1]/(kW*kH) : gradWeight->size[1];
- int nOutputPlane = gradWeight->size[0];
-
- int freeWeight = 0;
- if (gradWeight->nDimension == 4) {
- long s1 = gradWeight->size[0];
- long s2 = gradWeight->size[1] * gradWeight->size[2] * gradWeight->size[3];
- gradWeight = THCudaTensor_newWithStorage2d(state, gradWeight->storage, gradWeight->storageOffset, s1, -1, s2, -1);
- freeWeight = 1;
- }
-
- int batch = 1;
- if (input->nDimension == 3) {
- // Force batch
- batch = 0;
- THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
- THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
- }
-
- long inputWidth = input->size[3];
- long inputHeight = input->size[2];
- long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
- long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
-
- // Batch size + input planes
- long batchSize = input->size[0];
-
- // Define a buffer of ones, for bias accumulation
- if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
- // Resize plane and fill with ones...
- THCudaTensor_resize2d(state, ones, outputHeight, outputWidth);
- THCudaTensor_fill(state, ones, 1);
- }
-
- // Resize temporary columns
- THCudaTensor_resize2d(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth);
-
- // Helpers
- THCudaTensor *input_n = THCudaTensor_new(state);
- THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
- // For each elt in batch, do:
- for (int elt = 0; elt < batchSize; elt ++) {
- // Matrix multiply per output:
- THCudaTensor_select(state, input_n, input, 0, elt);
- THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
- // Extract columns:
- im2col(
- THCState_getCurrentStream(state),
- THCudaTensor_data(state, input_n),
- nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
- 1, 1, THCudaTensor_data(state, columns)
- );
-
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m = nOutputPlane;
- long n = nInputPlane*kW*kH;
- long k = columns->size[1];
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- THCudaBlas_Sgemm(
- state,
- 't', 'n',
- n, m, k,
- scale,
- THCudaTensor_data(state, columns), k,
- THCudaTensor_data(state, gradOutput_n), k,
- 1,
- THCudaTensor_data(state, gradWeight), n
- );
-
- // Do Bias:
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m_ = nOutputPlane;
- long k_ = outputHeight * outputWidth;
-
- // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
- if (gradBias) {
- THCudaBlas_Sgemv(
- state,
- 't',
- k_, m_,
- scale,
- THCudaTensor_data(state, gradOutput_n), k_,
- THCudaTensor_data(state, ones), 1,
- 1,
- THCudaTensor_data(state, gradBias), 1
- );
- }
- }
-
- // Free
- THCudaTensor_free(state, input_n);
- THCudaTensor_free(state, gradOutput_n);
- if (freeWeight)
- THCudaTensor_free(state, gradWeight);
-
- // Resize
- if (batch == 0) {
- THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
- THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
- }
-}
+#include "generic/SpatialConvolutionMM.cu"
+#include "THCGenerateFloatTypes.h"
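
The recurring "gemm assumes column-major matrices" note hides a tidy trick that the generic code keeps using: a row-major matrix read as column-major is its transpose, so the row-major product output_n (m x n) = weight (m x k) * columns (k x n) is obtained by swapping the operand order and passing the dimensions as (n, m, k). The forward call above, annotated:

// cuBLAS sees columns^T (n x k) and weight^T (k x m); their product is
// (weight * columns)^T in column-major order, which read back row-major
// is exactly the m x n result we want.
THCudaBlas_Sgemm(state, 'n', 'n',
                 n, m, k,
                 1,
                 THCudaTensor_data(state, columns), n,    // right operand, ld = n
                 THCudaTensor_data(state, weight),  k,    // left operand,  ld = k
                 1,
                 THCudaTensor_data(state, output_n), n);  // result, ld = n
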
diff --git a/lib/THCUNN/SpatialCrossMapLRN.cu b/lib/THCUNN/SpatialCrossMapLRN.cu
index 9c93926..cd37320 100644
--- a/lib/THCUNN/SpatialCrossMapLRN.cu
+++ b/lib/THCUNN/SpatialCrossMapLRN.cu
@@ -1,7 +1,9 @@
#include "THCUNN.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
#include "common.h"
-template <typename Dtype>
+template <typename Dtype, typename Acctype>
__global__ void
#if __CUDA_ARCH__ >= 320
__launch_bounds__(CUDA_NUM_THREADS)
@@ -22,7 +24,7 @@ LRNFillScale(const int nthreads, const Dtype* const in,
int head = 0;
const int pre_pad = (size - 1) / 2;
const int post_pad = size - pre_pad - 1;
- Dtype accum_scale = 0;
+ Acctype accum_scale = Acctype(0);
// fill the scale at [n, :, h, w]
// accumulate values
while (head < post_pad && head < channels) {
@@ -36,7 +38,7 @@ LRNFillScale(const int nthreads, const Dtype* const in,
accum_scale -= in_off[(head - size) * step]
* in_off[(head - size) * step];
}
- scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
+ scale_off[(head - post_pad) * step] = ScalarConvert<Acctype, Dtype>::to(k + accum_scale * alpha_over_size);
++head;
}
// subtract only
@@ -45,20 +47,21 @@ LRNFillScale(const int nthreads, const Dtype* const in,
accum_scale -= in_off[(head - size) * step]
* in_off[(head - size) * step];
}
- scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
+ scale_off[(head - post_pad) * step] = ScalarConvert<Acctype, Dtype>::to(k + accum_scale * alpha_over_size);
++head;
}
}
}
-__global__ void LRNComputeOutput(const int nthreads, const float* in,
- const float* scale, const float negative_beta, float* out) {
+template <typename Dtype>
+__global__ void LRNComputeOutput(const int nthreads, const Dtype* in,
+ const Dtype* scale, const Dtype negative_beta, Dtype* out) {
CUDA_KERNEL_LOOP(index, nthreads) {
out[index] = in[index] * pow(scale[index], negative_beta);
}
}
-template <typename Dtype>
+template <typename Dtype, typename Acctype>
__global__ void LRNComputeDiff(const int nthreads,
const Dtype* const bottom_data, const Dtype* const top_data,
const Dtype* const scale, const Dtype* const top_diff,
@@ -80,7 +83,7 @@ __global__ void LRNComputeDiff(const int nthreads,
int head = 0;
const int pre_pad = size - (size + 1) / 2;
const int post_pad = size - pre_pad - 1;
- Dtype accum_ratio = 0;
+ Acctype accum_ratio = Acctype(0);
// accumulate values
while (head < post_pad && head < channels) {
accum_ratio += top_diff_off[head * step] * top_off[head * step] /
@@ -96,9 +99,9 @@ __global__ void LRNComputeDiff(const int nthreads,
top_off[(head - size) * step] / scale_off[(head - size) * step];
}
bottom_diff_off[(head - post_pad) * step] =
- top_diff_off[(head - post_pad) * step]
+ ScalarConvert<Acctype, Dtype>::to(top_diff_off[(head - post_pad) * step]
* pow(scale_off[(head - post_pad) * step], negative_beta)
- - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio;
+ - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio);
++head;
}
// subtract only
@@ -108,121 +111,14 @@ __global__ void LRNComputeDiff(const int nthreads,
top_off[(head - size) * step] / scale_off[(head - size) * step];
}
bottom_diff_off[(head - post_pad) * step] =
- top_diff_off[(head - post_pad) * step]
+ ScalarConvert<Acctype, Dtype>::to(top_diff_off[(head - post_pad) * step]
* pow(scale_off[(head - post_pad) * step], negative_beta)
- - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio;
+ - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio);
++head;
}
}
}
-extern "C"
-void LRNforward(THCState* state, THCudaTensor* input, THCudaTensor* output,
- THCudaTensor* scale, int local_size, float alpha, float beta, float k)
-{
- THCudaTensor_resizeAs(state, output, input);
- THCudaTensor_resizeAs(state, scale, input);
-
- int batchSize;
- int nInputPlane;
- int imsize_h;
- int imsize_w;
-
- if (input->nDimension == 3) {
- batchSize = 1;
- nInputPlane = input->size[0];
- imsize_h = input->size[1];
- imsize_w = input->size[2];
- }
- else
- {
- batchSize = input->size[0];
- nInputPlane = input->size[1];
- imsize_h = input->size[2];
- imsize_w = input->size[3];
- }
-
- input = THCudaTensor_newContiguous(state, input);
-
- int n_threads = batchSize * imsize_h * imsize_w;
- LRNFillScale<<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
- n_threads, THCudaTensor_data(state, input), batchSize, nInputPlane, imsize_h, imsize_w, local_size,
- alpha / local_size, k, THCudaTensor_data(state, scale));
- n_threads *= nInputPlane;
- THCudaCheck(cudaGetLastError());
- LRNComputeOutput<<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
- n_threads, THCudaTensor_data(state, input), THCudaTensor_data(state, scale), -beta, THCudaTensor_data(state, output));
- THCudaCheck(cudaGetLastError());
-
- THCudaTensor_free(state, input);
-}
-
-
-extern "C"
-void LRNbackward(THCState* state, THCudaTensor* input, THCudaTensor* output,
- THCudaTensor* gradOutput, THCudaTensor* gradInput, THCudaTensor* scale,
- int local_size, float alpha, float beta, float k)
-{
- THCudaTensor_resizeAs(state, gradInput, input);
-
- int batchSize;
- int nInputPlane;
- int imsize_h;
- int imsize_w;
-
- if (input->nDimension == 3) {
- batchSize = 1;
- nInputPlane = input->size[0];
- imsize_h = input->size[1];
- imsize_w = input->size[2];
- }
- else
- {
- batchSize = input->size[0];
- nInputPlane = input->size[1];
- imsize_h = input->size[2];
- imsize_w = input->size[3];
- }
-
- input = THCudaTensor_newContiguous(state, input);
- gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-
- int n_threads = batchSize * imsize_h * imsize_w;
- LRNComputeDiff<<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
- n_threads, THCudaTensor_data(state, input), THCudaTensor_data(state, output),
- THCudaTensor_data(state, scale), THCudaTensor_data(state, gradOutput), batchSize, nInputPlane, imsize_h, imsize_w,
- local_size, -beta, float(2. * alpha * beta / local_size),
- THCudaTensor_data(state, gradInput));
- THCudaCheck(cudaGetLastError());
-
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, gradOutput);
-}
-
-void THNN_CudaSpatialCrossMapLRN_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *scale,
- int size,
- float alpha,
- float beta,
- float k)
-{
- LRNforward(state, input, output, scale, size, alpha, beta, k);
-}
-void THNN_CudaSpatialCrossMapLRN_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *scale,
- THCudaTensor *output,
- int size,
- float alpha,
- float beta,
- float k)
-{
- LRNbackward(state, input, output, gradOutput, gradInput, scale, size, alpha, beta, k);
-}
+#include "generic/SpatialCrossMapLRN.cu"
+#include "THCGenerateFloatTypes.h"
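
For orientation, the two removed launchers implemented the usual cross-map LRN; the kernels kept above compute, per channel c with a window of `size` channels and alpha_over_size = alpha / size:

//   scale[c] = k + alpha_over_size * sum over the window of in[c']^2
//   out[c]   = in[c] * pow(scale[c], -beta)
// the backward kernel reuses scale, with cache_ratio = 2*alpha*beta/size as
// passed to LRNComputeDiff by the removed LRNbackward launcher.
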
diff --git a/lib/THCUNN/SpatialDilatedConvolution.cu b/lib/THCUNN/SpatialDilatedConvolution.cu
index 5b2f0e6..a4a8e38 100644
--- a/lib/THCUNN/SpatialDilatedConvolution.cu
+++ b/lib/THCUNN/SpatialDilatedConvolution.cu
@@ -2,338 +2,8 @@
#include "common.h"
#include "im2col.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
-void THNN_CudaSpatialDilatedConvolution_updateOutput(THCState *state,
- THCudaTensor *input, THCudaTensor *output, THCudaTensor *weight,
- THCudaTensor *bias, THCudaTensor *columns,
- THCudaTensor *ones, int kW, int kH, int dW, int dH,
- int padW, int padH, int dilationW, int dilationH) {
-
- THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones);
- if (bias) {
- THCUNN_assertSameGPU(state, 2, weight, bias);
- }
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
- THArgCheck(weight->nDimension == 4, 4, "weight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
- THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
- THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
- THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
-
- // Params:
- int nInputPlane = weight->size[1];
- int nOutputPlane = weight->size[0];
-
- int batch = 1;
- if (input->nDimension == 3) {
- THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane don't match");
- // Force batch
- batch = 0;
- THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
- } else {
- THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane don't match");
- }
-
- long inputWidth = input->size[3];
- long inputHeight = input->size[2];
- long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
- long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
-
- if (outputWidth < 1 || outputHeight < 1)
- THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
- nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
-
- // Batch size + input planes
- long batchSize = input->size[0];
-
- // Resize output
- THCudaTensor_resize4d(state, output, batchSize, nOutputPlane, outputHeight, outputWidth);
-
- // Resize temporary columns
- THCudaTensor_resize2d(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth);
-
- // Define a buffer of ones, for bias accumulation
- // Note: this buffer can be shared with other modules; it only ever gets increased,
- // and always contains ones.
- if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
- // Resize plane and fill with ones...
- THCudaTensor_resize2d(state, ones, outputHeight, outputWidth);
- THCudaTensor_fill(state, ones, 1);
- }
-
- // Helpers
- THCudaTensor *input_n = THCudaTensor_new(state);
- THCudaTensor *output_n = THCudaTensor_new(state);
-
- // For each elt in batch, do:
- for (int elt = 0; elt < batchSize; elt ++) {
- // Matrix multiply per output:
- THCudaTensor_select(state, input_n, input, 0, elt);
- THCudaTensor_select(state, output_n, output, 0, elt);
-
- // Do Bias first:
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m_ = nOutputPlane;
- long n_ = outputHeight * outputWidth;
- long k_ = 1;
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- if (bias) {
- THCudaBlas_Sgemm(
- state,
- 't', 'n',
- n_, m_, k_,
- 1,
- THCudaTensor_data(state, ones), k_,
- THCudaTensor_data(state, bias), k_,
- 0,
- THCudaTensor_data(state, output_n), n_
- );
- } else {
- THCudaTensor_zero(state, output_n);
- }
-
- // Extract columns:
- im2col(
- THCState_getCurrentStream(state),
- THCudaTensor_data(state, input_n),
- nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
- dilationH, dilationW,
- THCudaTensor_data(state, columns)
- );
-
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m = nOutputPlane;
- long n = columns->size[1];
- long k = nInputPlane*kH*kW;
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- THCudaBlas_Sgemm(
- state,
- 'n', 'n',
- n, m, k,
- 1,
- THCudaTensor_data(state, columns), n,
- THCudaTensor_data(state, weight), k,
- 1,
- THCudaTensor_data(state, output_n), n
- );
- }
-
- // Free
- THCudaTensor_free(state, input_n);
- THCudaTensor_free(state, output_n);
-
- // Resize output
- if (batch == 0) {
- THCudaTensor_resize3d(state, output, nOutputPlane, outputHeight, outputWidth);
- THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
- }
-}
-
-void THNN_CudaSpatialDilatedConvolution_updateGradInput(THCState *state,
- THCudaTensor *input, THCudaTensor *gradOutput,
- THCudaTensor *gradInput, THCudaTensor *weight,
- THCudaTensor *gradColumns,
- int kW, int kH, int dW, int dH, int padW, int padH,
- int dilationW, int dilationH ) {
-
- THCUNN_assertSameGPU(state, 5, input, gradOutput, weight,
- gradColumns, gradInput);
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
- THArgCheck(weight->nDimension == 4, 4, "weight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
- THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero");
- THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero");
-
- // Params
- int nInputPlane = weight->size[1];
- int nOutputPlane = weight->size[0];
-
- int batch = 1;
- if (input->nDimension == 3) {
- // Force batch
- batch = 0;
- THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
- THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
- }
-
- long inputWidth = input->size[3];
- long inputHeight = input->size[2];
- long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
- long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
-
- // Batch size + input planes
- long batchSize = input->size[0];
-
- // Resize output
- THCudaTensor_resize4d(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
-
- // Resize temporary columns
- THCudaTensor_resize2d(state, gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth);
-
- // Helpers
- THCudaTensor *gradInput_n = THCudaTensor_new(state);
- THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
- // For each elt in batch, do:
- for (int elt = 0; elt < batchSize; elt ++) {
- // Matrix multiply per sample:
- THCudaTensor_select(state, gradInput_n, gradInput, 0, elt);
- THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m = nInputPlane*kW*kH;
- long n = gradColumns->size[1];
- long k = nOutputPlane;
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- THCudaBlas_Sgemm(
- state,
- 'n', 't',
- n, m, k,
- 1,
- THCudaTensor_data(state, gradOutput_n), n,
- THCudaTensor_data(state, weight), m,
- 0,
- THCudaTensor_data(state, gradColumns), n
- );
-
- // Unpack columns back into input:
- col2im(
- THCState_getCurrentStream(state),
- THCudaTensor_data(state, gradColumns),
- nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
- dilationH, dilationW,
- THCudaTensor_data(state, gradInput_n)
- );
- }
-
- // Free
- THCudaTensor_free(state, gradInput_n);
- THCudaTensor_free(state, gradOutput_n);
-
- // Resize output
- if (batch == 0) {
- THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
- THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
- THCudaTensor_resize3d(state, gradInput, nInputPlane, inputHeight, inputWidth);
- }
-}
-
-void THNN_CudaSpatialDilatedConvolution_accGradParameters(THCState *state,
- THCudaTensor *input, THCudaTensor *gradOutput,
- THCudaTensor *gradWeight, THCudaTensor *gradBias,
- THCudaTensor *columns, THCudaTensor *ones,
- int kW, int kH, int dW, int dH,
- int padW, int padH, int dilationW, int dilationH, float scale) {
-
- THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, columns, ones);
- if (gradBias) {
- THCUNN_assertSameGPU(state, 2, gradWeight, gradBias);
- }
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
- THArgCheck(gradWeight->nDimension == 4, 4, "gradWeight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
- THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias");
- THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
- THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
-
- // Params
- int nInputPlane = gradWeight->size[1];
- int nOutputPlane = gradWeight->size[0];
-
- int batch = 1;
- if (input->nDimension == 3) {
- // Force batch
- batch = 0;
- THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
- THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
- }
-
- long inputWidth = input->size[3];
- long inputHeight = input->size[2];
- long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
- long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
-
- // Batch size + input planes
- long batchSize = input->size[0];
-
- // Define a buffer of ones, for bias accumulation
- if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
- // Resize plane and fill with ones...
- THCudaTensor_resize2d(state, ones, outputHeight, outputWidth);
- THCudaTensor_fill(state, ones, 1);
- }
-
- // Resize temporary columns
- THCudaTensor_resize2d(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth);
-
- // Helpers
- THCudaTensor *input_n = THCudaTensor_new(state);
- THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
- // For each elt in batch, do:
- for (int elt = 0; elt < batchSize; elt ++) {
- // Matrix multiply per output:
- THCudaTensor_select(state, input_n, input, 0, elt);
- THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
- // Extract columns:
- im2col(
- THCState_getCurrentStream(state),
- THCudaTensor_data(state, input_n),
- nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
- dilationH, dilationW,
- THCudaTensor_data(state, columns)
- );
-
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m = nOutputPlane;
- long n = nInputPlane*kW*kH;
- long k = columns->size[1];
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- THCudaBlas_Sgemm(
- state,
- 't', 'n',
- n, m, k,
- scale,
- THCudaTensor_data(state, columns), k,
- THCudaTensor_data(state, gradOutput_n), k,
- 1,
- THCudaTensor_data(state, gradWeight), n
- );
-
- // Do Bias:
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m_ = nOutputPlane;
- long k_ = outputHeight * outputWidth;
-
- // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
- if (gradBias) {
- THCudaBlas_Sgemv(
- state,
- 't',
- k_, m_,
- scale,
- THCudaTensor_data(state, gradOutput_n), k_,
- THCudaTensor_data(state, ones), 1,
- 1,
- THCudaTensor_data(state, gradBias), 1
- );
- }
- }
-
- // Free
- THCudaTensor_free(state, input_n);
- THCudaTensor_free(state, gradOutput_n);
-
- // Resize
- if (batch == 0) {
- THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
- THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
- }
-}
+#include "generic/SpatialDilatedConvolution.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialDilatedMaxPooling.cu b/lib/THCUNN/SpatialDilatedMaxPooling.cu
index 26ac65d..94fd681 100644
--- a/lib/THCUNN/SpatialDilatedMaxPooling.cu
+++ b/lib/THCUNN/SpatialDilatedMaxPooling.cu
@@ -1,15 +1,17 @@
#include "THCUNN.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
#include "common.h"
// kernels borrowed from Caffe
-template <typename Dtype>
+template <typename Dtype, typename AccType>
__global__ void MaxPoolForward(const int nthreads, const Dtype* bottom_data,
const int num, const int channels, const int height,
const int width, const int pooled_height, const int pooled_width,
const int kernel_h, const int kernel_w, const int stride_h,
const int stride_w, const int pad_h, const int pad_w,
const int dilation_h, const int dilation_w, Dtype* top_data,
- Dtype* top_mask) {
+ long* top_mask) {
CUDA_KERNEL_LOOP(index, nthreads) {
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
@@ -23,26 +25,26 @@ __global__ void MaxPoolForward(const int nthreads, const Dtype* bottom_data,
hstart += dilation_h;
while(wstart < 0)
wstart += dilation_w;
- Dtype maxval = -FLT_MAX;
+ AccType maxval = THCNumerics<AccType>::min();
int maxidx = -1;
bottom_data += (n * channels + c) * height * width;
for (int h = hstart; h < hend; h += dilation_h) {
for (int w = wstart; w < wend; w += dilation_w) {
- if (bottom_data[h * width + w] > maxval) {
+ if (ScalarConvert<Dtype, AccType>::to(bottom_data[h * width + w]) > maxval) {
maxidx = h * width + w;
- maxval = bottom_data[maxidx];
+ maxval = ScalarConvert<Dtype, AccType>::to(bottom_data[maxidx]);
}
}
}
- top_data[index] = maxval;
+ top_data[index] = ScalarConvert<AccType, Dtype>::to(maxval);
top_mask[index] = maxidx + TH_INDEX_BASE;
}
}
-template <typename Dtype>
+template <typename Dtype, typename AccType>
__global__ void MaxPoolBackward(const int nthreads, const Dtype* top_diff,
- const Dtype* top_mask, const int num, const int channels,
+ const long* top_mask, const int num, const int channels,
const int height, const int width, const int pooled_height,
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_h, const int pad_w,
@@ -61,147 +63,21 @@ __global__ void MaxPoolBackward(const int nthreads, const Dtype* top_diff,
int pwstart =
(w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) / stride_w + 1;
int pwend = min((w + pad_w) / stride_w + 1, pooled_width);
-
- Dtype gradient = 0;
+
+ AccType gradient = AccType(0);
int offset = (n * channels + c) * pooled_height * pooled_width;
top_diff += offset;
top_mask += offset;
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
- if (top_mask[ph * pooled_width + pw] - TH_INDEX_BASE == h * width + w) {
- gradient += top_diff[ph * pooled_width + pw];
- }
+ if (top_mask[ph * pooled_width + pw] - TH_INDEX_BASE == h * width + w) {
+ gradient += ScalarConvert<Dtype, AccType>::to(top_diff[ph * pooled_width + pw]);
+ }
}
}
- bottom_diff[index] = gradient;
- }
-}
-
-void THNN_CudaSpatialDilatedMaxPooling_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, bool ceil_mode)
-{
-
- THCUNN_assertSameGPU(state, 3, input, output, indices);
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
-
- long nInputCols, nInputRows, nInputPlane, batchSize;
- long nOutputCols, nOutputRows;
-
- if (input->nDimension == 3) {
- nInputCols = input->size[2];
- nInputRows = input->size[1];
- nInputPlane = input->size[0];
- batchSize = 1;
+ bottom_diff[index] = ScalarConvert<AccType, Dtype>::to(gradient);
}
- else
- {
- nInputCols = input->size[3];
- nInputRows = input->size[2];
- nInputPlane = input->size[1];
- batchSize = input->size[0];
- }
-
- THArgCheck(nInputCols >= kW - padW && nInputRows >= kH - padH, 2, "input image smaller than kernel size");
- THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");
-
- if(ceil_mode) {
- nOutputCols = ceil(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1;
- nOutputRows = ceil(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1;
- }
- else {
- nOutputCols = floor(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1;
- nOutputRows = floor(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1;
- }
-
- if (nOutputCols < 1 || nOutputRows < 1)
- THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
- nInputPlane,nInputRows,nInputCols,nInputPlane,nOutputRows,nOutputCols);
-
- if (padW || padH)
- {
- // ensure that the last pooling starts inside the image
- if ((nOutputRows - 1)*dH >= nInputRows + padH)
- --nOutputRows;
- if ((nOutputCols - 1)*dW >= nInputCols + padW)
- --nOutputCols;
- }
-
- input = THCudaTensor_newContiguous(state, input);
- float* input_data = THCudaTensor_data(state, input);
-
- THCudaTensor_resize4d(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols);
- THCudaTensor_resizeAs(state, indices, output);
-
- float* indices_data = THCudaTensor_data(state, indices);
- float* output_data = THCudaTensor_data(state, output);
-
- int count = THCudaTensor_nElement(state, output);
-
- MaxPoolForward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
- (count, input_data,
- batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
- kH, kW, dH, dW, padH, padW, dilationH, dilationW, output_data, indices_data);
- THCudaCheck(cudaGetLastError());
-
- if(input->nDimension == 3)
- THCudaTensor_resize3d(state, output, nInputPlane, nOutputRows, nOutputCols);
-
- THCudaTensor_free(state, input);
}
-void THNN_CudaSpatialDilatedMaxPooling_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *indices, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, bool ceil_mode)
-{
- THCUNN_assertSameGPU(state, 4, input, gradOutput, indices, gradInput);
-
- input = THCudaTensor_newContiguous(state, input);
- gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-
- long nInputCols, nInputRows, nInputPlane, batchSize;
- long nOutputCols, nOutputRows;
-
- if (input->nDimension == 3) {
- nInputCols = input->size[2];
- nInputRows = input->size[1];
- nInputPlane = input->size[0];
- batchSize = 1;
- }
- else
- {
- nInputCols = input->size[3];
- nInputRows = input->size[2];
- nInputPlane = input->size[1];
- batchSize = input->size[0];
- }
-
- if(ceil_mode) {
- nOutputCols = ceil(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1;
- nOutputRows = ceil(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1;
- }
- else {
- nOutputCols = floor(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1;
- nOutputRows = floor(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1;
- }
-
- if (nOutputCols < 1 || nOutputRows < 1)
- THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
- nInputPlane,nInputRows,nInputCols,nInputPlane,nOutputRows,nOutputCols);
-
- gradOutput = THCudaTensor_newContiguous(state, gradOutput);
- THCudaTensor_resizeAs(state, gradInput, input);
-
- int count = THCudaTensor_nElement(state, input);
-
- MaxPoolBackward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
- (count,
- THCudaTensor_data(state, gradOutput),
- THCudaTensor_data(state, indices),
- batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
- kH, kW, dH, dW, padH, padW, dilationH, dilationW,
- THCudaTensor_data(state, gradInput));
- THCudaCheck(cudaGetLastError());
-
- THCudaTensor_free(state, gradOutput);
-
- // clean
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, gradOutput);
-}
+#include "generic/SpatialDilatedMaxPooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialFractionalMaxPooling.cu b/lib/THCUNN/SpatialFractionalMaxPooling.cu
index 289b1d6..22a2878 100644
--- a/lib/THCUNN/SpatialFractionalMaxPooling.cu
+++ b/lib/THCUNN/SpatialFractionalMaxPooling.cu
@@ -3,15 +3,19 @@
#include "THCDeviceTensor.cuh"
#include "THCDeviceTensorUtils.cuh"
#include "THCDeviceUtils.cuh"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
#include <cfloat>
-__device__ inline float getInterval(float sample,
- int index,
- int inputSize,
- int outputSize,
- int poolSize) {
- float alpha = (float)(inputSize - poolSize) / (float) (outputSize - 1);
+template <typename Dtype, typename Acctype>
+__device__ inline int getInterval(Acctype sample,
+ int index,
+ int inputSize,
+ int outputSize,
+ int poolSize) {
+ Acctype alpha = (Acctype)(inputSize - poolSize) / (Acctype) (outputSize - 1);
if (index == outputSize - 1) {
return inputSize - poolSize;
} else {
@@ -20,12 +24,12 @@ __device__ inline float getInterval(float sample,
}
// We template on poolSizeW to allow the innermost loop to be unrolled
-template <int PoolSizeWStatic>
+template <int PoolSizeWStatic, typename Dtype, typename Acctype>
__global__ void SpatialFractionalMaxPooling_updateOutput(
- THCDeviceTensor<float, 4> input,
- THCDeviceTensor<float, 4> output,
- THCDeviceTensor<float, 4> indices,
- THCDeviceTensor<float, 3> samples,
+ THCDeviceTensor<Dtype, 4> input,
+ THCDeviceTensor<Dtype, 4> output,
+ THCDeviceTensor<THCIndex_t, 4> indices,
+ THCDeviceTensor<Dtype, 3> samples,
int poolSizeW, int poolSizeH) {
// Output (h, w) point that this thread is responsible for
@@ -38,33 +42,39 @@ __global__ void SpatialFractionalMaxPooling_updateOutput(
int outputW = ourOutputPoint % output.getSize(3);
int outputH = ourOutputPoint / output.getSize(3);
- int poolW = getInterval(samples[batch][plane][0], outputW,
+ int poolW = getInterval<Dtype, Acctype>(ScalarConvert<Dtype, Acctype>::to(samples[batch][plane][0]), outputW,
input.getSize(3), output.getSize(3), poolSizeW);
- int poolH = getInterval(samples[batch][plane][1], outputH,
+ int poolH = getInterval<Dtype, Acctype>(ScalarConvert<Dtype, Acctype>::to(samples[batch][plane][1]), outputH,
input.getSize(2), output.getSize(2), poolSizeH);
- float maxVal = -FLT_MAX;
+ Dtype maxVal = THCNumerics<Dtype>::min();
int maxIndex = -1;
for (int h = poolH; h < poolH + poolSizeH; ++h) {
if (PoolSizeWStatic == -1) {
for (int w = poolW; w < poolW + poolSizeW; ++w) {
- float val = input[batch][plane][h][w];
- maxVal = fmaxf(val, maxVal);
- maxIndex = (maxVal == val) ? (h * input.getSize(3) + w) : maxIndex;
+ Dtype val = input[batch][plane][h][w];
+ // for consistency with THNN, favor the first max
+ if (val > maxVal) {
+ maxIndex = h * input.getSize(3) + w;
+ maxVal = val;
+ }
}
} else {
#pragma unroll
for (int i = 0; i < PoolSizeWStatic; ++i) {
int w = i + poolW;
- float val = input[batch][plane][h][w];
- maxVal = fmaxf(val, maxVal);
- maxIndex = (maxVal == val) ? (h * input.getSize(3) + w) : maxIndex;
+ Dtype val = input[batch][plane][h][w];
+ // for consistency with THNN, favor the first max
+ if (val > maxVal) {
+ maxIndex = h * input.getSize(3) + w;
+ maxVal = val;
+ }
}
}
}
- assert(maxVal != -FLT_MAX);
+ assert(maxVal != THCNumerics<Dtype>::min());
assert(maxIndex != -1);
// +1 for Lua index
@@ -73,100 +83,11 @@ __global__ void SpatialFractionalMaxPooling_updateOutput(
}
}
-void THNN_CudaSpatialFractionalMaxPooling_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- int outputW, int outputH,
- int poolSizeW, int poolSizeH,
- THCudaTensor *indices,
- THCudaTensor *randomSamples)
-{
- int planeDim = 0;
- int dimh = 1;
- int dimw = 2;
- long numBatch = 1;
-
- long numInputDims = THCudaTensor_nDimension(state, input);
- THArgCheck(numInputDims == 3 || numInputDims == 4, 2,
- "3D or 4D (batch mode) tensor expected");
-
- if (numInputDims == 4) {
- numBatch = THCudaTensor_size(state, input, 0);
- planeDim++;
- dimh++;
- dimw++;
- }
-
- /* sizes */
- long numPlanes = THCudaTensor_size(state, input, planeDim);
- long inputH = THCudaTensor_size(state, input, dimh);
- long inputW = THCudaTensor_size(state, input, dimw);
-
- THArgCheck(outputH + poolSizeH - 1 < inputH, 6,
- "poolSizeH too large relative to input height");
- THArgCheck(outputW + poolSizeW - 1 < inputW, 5,
- "poolSizeW too large relative to input width");
-
- THCDeviceTensor<float, 4> devInput;
- THCDeviceTensor<float, 4> devOutput;
- THCDeviceTensor<float, 4> devIndices;
- THCDeviceTensor<float, 3> devSamples =
- toDeviceTensor<float, 3>(state, randomSamples);
-
- if (numInputDims == 3) {
- /* resize output */
- THCudaTensor_resize3d(state, output, numPlanes, outputH, outputW);
- /* indices will contain the locations for each output point */
- THCudaTensor_resize3d(state, indices, numPlanes, outputH, outputW);
-
- devInput = toDeviceTensor<float, 3>(state, input).upcastOuter<4>();
- devOutput = toDeviceTensor<float, 3>(state, output).upcastOuter<4>();
- devIndices = toDeviceTensor<float, 3>(state, indices).upcastOuter<4>();
- } else {
- THCudaTensor_resize4d(state, output, numBatch, numPlanes, outputH, outputW);
- /* indices will contain the locations for each output point */
- THCudaTensor_resize4d(state, indices, numBatch, numPlanes, outputH, outputW);
-
- devInput = toDeviceTensor<float, 4>(state, input);
- devOutput = toDeviceTensor<float, 4>(state, output);
- devIndices = toDeviceTensor<float, 4>(state, indices);
- }
-
- // block is limited to 4 warps
- // grid handles overflow per each plane
- int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3);
- dim3 grid(THCCeilDiv(outputPlaneSize, 128),
- devInput.getSize(1),
- devInput.getSize(0));
- dim3 block(outputPlaneSize > 128 ? 128 : outputPlaneSize);
-
-#define SFMP_UPDATE_OUTPUT(POOL_W) \
- SpatialFractionalMaxPooling_updateOutput<POOL_W> \
- <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
- devInput, devOutput, devIndices, devSamples, poolSizeW, poolSizeH);
-
-#define SFMP_UPDATE_OUTPUT_CASE(POOL_W) \
- case POOL_W: SFMP_UPDATE_OUTPUT(POOL_W); break
-
- switch (poolSizeW) {
- SFMP_UPDATE_OUTPUT_CASE(2);
- SFMP_UPDATE_OUTPUT_CASE(3);
- SFMP_UPDATE_OUTPUT_CASE(4);
- SFMP_UPDATE_OUTPUT_CASE(5);
- SFMP_UPDATE_OUTPUT_CASE(6);
- SFMP_UPDATE_OUTPUT_CASE(7);
- default:
- // dynamic pool width
- SFMP_UPDATE_OUTPUT_CASE(-1);
- }
- THCudaCheck(cudaGetLastError());
-}
-
+template <typename Dtype>
__global__ void SpatialFractionalMaxPooling_updateGradInput(
- THCDeviceTensor<float, 4> gradInput,
- THCDeviceTensor<float, 4> gradOutput,
- THCDeviceTensor<float, 4> indices) {
+ THCDeviceTensor<Dtype, 4> gradInput,
+ THCDeviceTensor<Dtype, 4> gradOutput,
+ THCDeviceTensor<THCIndex_t, 4> indices) {
// Output (h, w) point that this thread is responsible for
int ourOutputPoint = threadIdx.x + blockIdx.x * blockDim.x;
int plane = blockIdx.y;
@@ -188,62 +109,5 @@ __global__ void SpatialFractionalMaxPooling_updateGradInput(
}
}
-void THNN_CudaSpatialFractionalMaxPooling_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- int outputW, int outputH,
- int poolSizeW, int poolSizeH,
- THCudaTensor *indices)
-{
- int dimh = 1;
- int dimw = 2;
-
- long numInputDims = THCudaTensor_nDimension(state, input);
- if (numInputDims == 4) {
- dimh++;
- dimw++;
- }
-
- /* sizes */
- long inputH = THCudaTensor_size(state, input, dimh);
- long inputW = THCudaTensor_size(state, input, dimw);
-
- THArgCheck(outputH == THCudaTensor_size(state, gradOutput, dimh), 3,
- "gradOutput height unexpected");
- THArgCheck(outputW == THCudaTensor_size(state, gradOutput, dimw), 3,
- "gradOutput width unexpected");
-
- /* resize */
- THCudaTensor_resizeAs(state, gradInput, input);
- THCudaTensor_zero(state, gradInput);
-
- THCDeviceTensor<float, 4> devGradInput;
- THCDeviceTensor<float, 4> devGradOutput;
- THCDeviceTensor<float, 4> devIndices;
-
- /* backprop */
- if (numInputDims == 3) {
- devGradInput = toDeviceTensor<float, 3>(state, gradInput).upcastOuter<4>();
- devGradOutput = toDeviceTensor<float, 3>(state, gradOutput).upcastOuter<4>();
- devIndices = toDeviceTensor<float, 3>(state, indices).upcastOuter<4>();
- } else {
- devGradInput = toDeviceTensor<float, 4>(state, gradInput);
- devGradOutput = toDeviceTensor<float, 4>(state, gradOutput);
- devIndices = toDeviceTensor<float, 4>(state, indices);
- }
-
- // block is limited to 4 warps
- // grid handles overflow per each plane
- int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3);
- dim3 grid(THCCeilDiv(outputPlaneSize, 128),
- devGradInput.getSize(1),
- devGradInput.getSize(0));
- dim3 block(outputPlaneSize > 128 ? 128 : outputPlaneSize);
-
- SpatialFractionalMaxPooling_updateGradInput
- <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
- devGradInput, devGradOutput, devIndices);
- THCudaCheck(cudaGetLastError());
-}
+#include "generic/SpatialFractionalMaxPooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialFullConvolution.cu b/lib/THCUNN/SpatialFullConvolution.cu
index 4c31485..4e37ecf 100644
--- a/lib/THCUNN/SpatialFullConvolution.cu
+++ b/lib/THCUNN/SpatialFullConvolution.cu
@@ -1,337 +1,8 @@
#include "THCUNN.h"
#include "im2col.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
-void THNN_CudaSpatialFullConvolution_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight,
- THCudaTensor *bias,
- THCudaTensor *columns,
- THCudaTensor *ones,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- int adjW, int adjH)
-{
-
- int nInputPlane = THCudaTensor_size(state, weight, 0);
- int nOutputPlane = THCudaTensor_size(state, weight, 1);
-
- THCUNN_assertSameGPU(state, 6, input, output, weight,
- bias, columns, ones);
-
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
-
- int batch = 1;
- if (input->nDimension == 3) {
- THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane don't match");
- // Force batch
- batch = 0;
- THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
- } else {
- THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane don't match");
- }
-
- long inputWidth = input->size[3];
- long inputHeight = input->size[2];
- long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
- long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
-
- // Batch size + input planes
- long batchSize = input->size[0];
-
- // Resize output
- THCudaTensor_resize4d(state, output, batchSize, nOutputPlane, outputHeight, outputWidth);
-
- // Resize temporary columns
- THCudaTensor_resize2d(state, columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
-
- // Define a buffer of ones, for bias accumulation
- // Note: this buffer can be shared with other modules, it only ever gets increased,
- // and always contains ones.
- if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
- // Resize plane and fill with ones...
- THCudaTensor_resize2d(state, ones, outputHeight, outputWidth);
- THCudaTensor_fill(state, ones, 1);
- }
-
- // Helpers
- THCudaTensor *input_n = THCudaTensor_new(state);
- THCudaTensor *output_n = THCudaTensor_new(state);
-
- // For each elt in batch, do:
- for (int elt = 0; elt < batchSize; elt ++) {
- // Matrix multiply per output:
- THCudaTensor_select(state, input_n, input, 0, elt);
- THCudaTensor_select(state, output_n, output, 0, elt);
-
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m = weight->size[1] * weight->size[2] * weight->size[3];
- long n = columns->size[1];
- long k = weight->size[0];
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- THCudaBlas_Sgemm(
- state,
- 'n', 't',
- n, m, k,
- 1,
- THCudaTensor_data(state, input_n), n,
- THCudaTensor_data(state, weight), m,
- 0,
- THCudaTensor_data(state, columns), n
- );
-
- // Unpack columns back into input:
- col2im(
- THCState_getCurrentStream(state),
- THCudaTensor_data(state, columns),
- nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
- 1, 1, THCudaTensor_data(state, output_n)
- );
-
- // Do Bias after:
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m_ = nOutputPlane;
- long n_ = outputHeight * outputWidth;
- long k_ = 1;
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- if (bias) {
- THCudaBlas_Sgemm(
- state,
- 't', 'n',
- n_, m_, k_,
- 1,
- THCudaTensor_data(state, ones), k_,
- THCudaTensor_data(state, bias), k_,
- 1,
- THCudaTensor_data(state, output_n), n_
- );
- }
-
- }
-
- // Free
- THCudaTensor_free(state, input_n);
- THCudaTensor_free(state, output_n);
-
- // Resize output
- if (batch == 0) {
- THCudaTensor_resize3d(state, output, nOutputPlane, outputHeight, outputWidth);
- THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
- }
-}
-
-void THNN_CudaSpatialFullConvolution_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *weight,
- THCudaTensor *gradColumns,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- int adjW, int adjH)
-{
- int nInputPlane = THCudaTensor_size(state, weight, 0);
- int nOutputPlane = THCudaTensor_size(state, weight, 1);
-
- THCUNN_assertSameGPU(state, 5, input, gradOutput, weight,
- gradColumns, gradInput);
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
-
- int batch = 1;
- if (input->nDimension == 3) {
- // Force batch
- batch = 0;
- THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
- THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
- }
-
- long inputWidth = input->size[3];
- long inputHeight = input->size[2];
- long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
- long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
-
- // Batch size + input planes
- long batchSize = input->size[0];
-
- // Resize output
- THCudaTensor_resize4d(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
-
- // Resize temporary columns
- THCudaTensor_resize2d(state, gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth);
-
- // Helpers
- THCudaTensor *gradInput_n = THCudaTensor_new(state);
- THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
- // For each elt in batch, do:
- for (int elt = 0; elt < batchSize; elt ++) {
- // Matrix multiply per sample:
- THCudaTensor_select(state, gradInput_n, gradInput, 0, elt);
- THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
- // Extract columns:
- im2col(
- THCState_getCurrentStream(state),
- THCudaTensor_data(state, gradOutput_n),
- nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
- 1, 1, THCudaTensor_data(state, gradColumns)
- );
-
-
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m = weight->size[0];
- long n = gradColumns->size[1];
- long k = weight->size[1] * weight->size[2] * weight->size[3];
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- THCudaBlas_Sgemm(
- state,
- 'n', 'n',
- n, m, k,
- 1,
- THCudaTensor_data(state, gradColumns), n,
- THCudaTensor_data(state, weight), k,
- 0,
- THCudaTensor_data(state, gradInput_n), n
- );
- }
-
-
- // Free
- THCudaTensor_free(state, gradInput_n);
- THCudaTensor_free(state, gradOutput_n);
-
- // Resize output
- if (batch == 0) {
- THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
- THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
- THCudaTensor_resize3d(state, gradInput, nInputPlane, inputHeight, inputWidth);
- }
-}
-
-
-void THNN_CudaSpatialFullConvolution_accGradParameters(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias,
- THCudaTensor *columns,
- THCudaTensor *ones,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- int adjW, int adjH,
- float scale)
-{
- int nInputPlane = THCudaTensor_size(state, gradWeight, 0);
- int nOutputPlane = THCudaTensor_size(state, gradWeight, 1);
-
- THCUNN_assertSameGPU(state, 6, input, gradOutput, gradWeight,
- gradBias, columns, ones);
-
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
-
- int batch = 1;
- if (input->nDimension == 3) {
- // Force batch
- batch = 0;
- THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
- THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
- }
-
- long inputWidth = input->size[3];
- long inputHeight = input->size[2];
- long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
- long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
-
- // Batch size + input planes
- long batchSize = input->size[0];
-
- // Define a buffer of ones, for bias accumulation
- if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
- // Resize plane and fill with ones...
- THCudaTensor_resize2d(state, ones, outputHeight, outputWidth);
- THCudaTensor_fill(state, ones, 1);
- }
-
- // Resize temporary columns
- THCudaTensor_resize2d(state, columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
-
- // Helpers
- THCudaTensor *input_n = THCudaTensor_new(state);
- THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
- // For each elt in batch, do:
- for (int elt = 0; elt < batchSize; elt ++) {
- // Matrix multiply per output:
- THCudaTensor_select(state, input_n, input, 0, elt);
- THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
- // Extract columns:
- im2col(
- THCState_getCurrentStream(state),
- THCudaTensor_data(state, gradOutput_n),
- nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
- 1, 1, THCudaTensor_data(state, columns)
- );
-
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long n = columns->size[0]; // nOutputPlane * kh * kw
- long m = input_n->size[0]; // nInputPlane
- long k = columns->size[1]; // inputHeight * inputWidth
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- THCudaBlas_Sgemm(
- state,
- 't', 'n',
- n, m, k,
- scale,
- THCudaTensor_data(state, columns), k,
- THCudaTensor_data(state, input_n), k,
- 1,
- THCudaTensor_data(state, gradWeight), n
- );
-
- // Do Bias:
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m_ = nOutputPlane;
- long k_ = outputHeight * outputWidth;
-
- // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
- if (gradBias) {
- THCudaBlas_Sgemv(
- state,
- 't',
- k_, m_,
- scale,
- THCudaTensor_data(state, gradOutput_n), k_,
- THCudaTensor_data(state, ones), 1,
- 1,
- THCudaTensor_data(state, gradBias), 1
- );
- }
- }
-
- // Free
- THCudaTensor_free(state, input_n);
- THCudaTensor_free(state, gradOutput_n);
-
- // Resize
- if (batch == 0) {
- THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
- THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
- }
-}
+#include "generic/SpatialFullConvolution.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialMaxPooling.cu b/lib/THCUNN/SpatialMaxPooling.cu
index ac6e3fd..90e6fe4 100644
--- a/lib/THCUNN/SpatialMaxPooling.cu
+++ b/lib/THCUNN/SpatialMaxPooling.cu
@@ -1,18 +1,4 @@
#include "THCUNN.h"
-#include "common.h"
-void THNN_CudaSpatialMaxPooling_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode)
-{
- THNN_CudaSpatialDilatedMaxPooling_updateOutput(
- state, input, output, indices,
- kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode);
-
-}
-
-void THNN_CudaSpatialMaxPooling_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *indices, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode)
-{
- THNN_CudaSpatialDilatedMaxPooling_updateGradInput(
- state, input, gradOutput, gradInput, indices,
- kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode);
-
-}
+#include "generic/SpatialMaxPooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialMaxUnpooling.cu b/lib/THCUNN/SpatialMaxUnpooling.cu
index b56bd56..2d8a481 100644
--- a/lib/THCUNN/SpatialMaxUnpooling.cu
+++ b/lib/THCUNN/SpatialMaxUnpooling.cu
@@ -2,7 +2,7 @@
#include "common.h"
template <typename Dtype>
-__global__ void MaxUnpoolForward(const int nthreads, const Dtype* bottom_data, const Dtype* bottom_mask,
+__global__ void MaxUnpoolForward(const int nthreads, const Dtype* bottom_data, const long* bottom_mask,
const int num, const int channels, const int iheight, const int iwidth, const int oheight, const int owidth, Dtype* top_data) {
CUDA_KERNEL_LOOP(index, nthreads) { // index here indexes the input pixels
int c = (index / iwidth / iheight) % channels;
@@ -15,7 +15,7 @@ __global__ void MaxUnpoolForward(const int nthreads, const Dtype* bottom_data, c
}
template <typename Dtype>
-__global__ void MaxUnpoolBackward(const int nthreads, const Dtype* top_diff, const Dtype* bottom_mask,
+__global__ void MaxUnpoolBackward(const int nthreads, const Dtype* top_diff, const long* bottom_mask,
const int num, const int channels, const int iheight, const int iwidth, const int oheight, const int owidth, Dtype* bottom_diff) {
CUDA_KERNEL_LOOP(index, nthreads) {
int c = (index / iwidth / iheight) % channels;
@@ -27,79 +27,5 @@ __global__ void MaxUnpoolBackward(const int nthreads, const Dtype* top_diff, con
}
}
-void THNN_CudaSpatialMaxUnpooling_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices, int owidth, int oheight)
-{
- THCUNN_assertSameGPU(state, 3, input, output, indices);
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
-
- long nInputCols, nInputRows, nInputPlane, batchSize;
-
- if (input->nDimension == 3) {
- nInputCols = input->size[2];
- nInputRows = input->size[1];
- nInputPlane = input->size[0];
- batchSize = 1;
- }
- else
- {
- nInputCols = input->size[3];
- nInputRows = input->size[2];
- nInputPlane = input->size[1];
- batchSize = input->size[0];
- }
-
- input = THCudaTensor_newContiguous(state, input);
- indices = THCudaTensor_newContiguous(state, indices);
- THCudaTensor_resize4d(state, output, batchSize, nInputPlane, oheight, owidth);
- THCudaTensor_zero(state, output);
-
- int count = THCudaTensor_nElement(state, input);
-
- MaxUnpoolForward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
- (count, THCudaTensor_data(state, input), THCudaTensor_data(state, indices),
- batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCudaTensor_data(state, output));
- THCudaCheck(cudaGetLastError());
-
- if(input->nDimension == 3)
- THCudaTensor_resize3d(state, output, nInputPlane, oheight, owidth);
-
- THCudaTensor_free(state, input);
-
-}
-
-void THNN_CudaSpatialMaxUnpooling_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *indices, int owidth, int oheight)
-{
- THCUNN_assertSameGPU(state, 4, input, gradOutput, indices, gradInput);
-
- long nInputCols, nInputRows, nInputPlane, batchSize;
-
- if (input->nDimension == 3) {
- nInputCols = input->size[2];
- nInputRows = input->size[1];
- nInputPlane = input->size[0];
- batchSize = 1;
- }
- else
- {
- nInputCols = input->size[3];
- nInputRows = input->size[2];
- nInputPlane = input->size[1];
- batchSize = input->size[0];
- }
-
- input = THCudaTensor_newContiguous(state, input);
- indices = THCudaTensor_newContiguous(state, indices);
- gradOutput = THCudaTensor_newContiguous(state, gradOutput);
- THCudaTensor_resizeAs(state, gradInput, input);
-
- int count = THCudaTensor_nElement(state, input);
-
- MaxUnpoolBackward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
- (count, THCudaTensor_data(state, gradOutput), THCudaTensor_data(state, indices),
- batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCudaTensor_data(state, gradInput));
- THCudaCheck(cudaGetLastError());
-
- // clean
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, gradOutput);
-}
+#include "generic/SpatialMaxUnpooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialReflectionPadding.cu b/lib/THCUNN/SpatialReflectionPadding.cu
index c3ae14e..75aae1a 100644
--- a/lib/THCUNN/SpatialReflectionPadding.cu
+++ b/lib/THCUNN/SpatialReflectionPadding.cu
@@ -5,9 +5,14 @@
#include "THCDeviceUtils.cuh"
#include "THCReduceApplyUtils.cuh"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
+
+template<typename Dtype>
__global__ void SpatialReflectionPadding_updateOutput(
- THCDeviceTensor<float, 4> input,
- THCDeviceTensor<float, 4> output,
+ THCDeviceTensor<Dtype, 4> input,
+ THCDeviceTensor<Dtype, 4> output,
int padT, int padB, int padL, int padR) {
int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
@@ -36,70 +41,14 @@ __global__ void SpatialReflectionPadding_updateOutput(
+ 2 * padT + input.getSize(2) - 1
- oStartY + iStartY;
- float valueToCopy = input[batch][plane][inputPointY][inputPointX];
+ Dtype valueToCopy = input[batch][plane][inputPointY][inputPointX];
output[batch][plane][outputPointY][outputPointX] = valueToCopy;
}
-void THNN_CudaSpatialReflectionPadding_updateOutput(THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- int padL, int padR,
- int padT, int padB
- ) {
- THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input), 2,
- "input tensor must fit into 32-bit index math");
-
- int planeDim = 0;
- int dimh = 1;
- int dimw = 2;
- int numBatch = 1;
-
- int numInputDims = THCudaTensor_nDimension(state, input);
- THArgCheck(numInputDims == 3 || numInputDims == 4, 2,
- "input must be 3 or 4-dimensional");
-
- if (numInputDims == 4) {
- numBatch = THCudaTensor_size(state, input, 0);
- planeDim++;
- dimh++;
- dimw++;
- }
-
- int numPlanes = THCudaTensor_size(state, input, planeDim);
- int inputH = THCudaTensor_size(state, input, dimh);
- int inputW = THCudaTensor_size(state, input, dimw);
- int outputH = inputH + padT + padB;
- int outputW = inputW + padL + padR;
-
- THCDeviceTensor<float, 4> devInput;
- THCDeviceTensor<float, 4> devOutput;
-
- if (numInputDims == 3) {
- THCudaTensor_resize3d(state, output, numPlanes, outputH, outputW);
-
- devInput = toDeviceTensor<float, 3>(state, input).upcastOuter<4>();
- devOutput = toDeviceTensor<float, 3>(state, output).upcastOuter<4>();
- } else {
- THCudaTensor_resize4d(state, output, numBatch, numPlanes, outputH, outputW);
-
- devInput = toDeviceTensor<float, 4>(state, input);
- devOutput = toDeviceTensor<float, 4>(state, output);
- }
-
- int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3);
- dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
- devOutput.getSize(1),
- devOutput.getSize(0));
- dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
-
- SpatialReflectionPadding_updateOutput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
- devInput, devOutput, padT, padB, padL, padR);
- THCudaCheck(cudaGetLastError());
-}
-
+template <typename Dtype>
__global__ void SpatialReflectionPadding_updateGradInput(
- THCDeviceTensor<float, 4> gradInput,
- THCDeviceTensor<float, 4> gradOutput,
+ THCDeviceTensor<Dtype, 4> gradInput,
+ THCDeviceTensor<Dtype, 4> gradOutput,
int padT, int padB, int padL, int padR) {
int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
@@ -128,54 +77,9 @@ __global__ void SpatialReflectionPadding_updateGradInput(
+ 2 * padT + gradInput.getSize(2) - 1
- oStartY + iStartY;
- float valueToCopy = gradOutput[batch][plane][outputPointY][outputPointX];
+ Dtype valueToCopy = gradOutput[batch][plane][outputPointY][outputPointX];
atomicAdd(&gradInput[batch][plane][inputPointY][inputPointX], valueToCopy);
}
-void THNN_CudaSpatialReflectionPadding_updateGradInput(THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- int padL, int padR,
- int padT, int padB) {
-
- THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input), 2,
- "input tensor must fit into 32-bit index math");
- THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, gradOutput), 3,
- "output gradient tensor must fit into 32-bit index math");
-
- int planeDim = 0;
- int dimh = 1;
- int dimw = 2;
-
- int numInputDims = THCudaTensor_nDimension(state, input);
- if (numInputDims == 4) {
- planeDim++;
- dimh++;
- dimw++;
- }
-
- THCudaTensor_resizeAs(state, gradInput, input);
- THCudaTensor_zero(state, gradInput);
-
- THCDeviceTensor<float, 4> devGradInput;
- THCDeviceTensor<float, 4> devGradOutput;
-
- if (numInputDims == 3) {
- devGradInput = toDeviceTensor<float, 3>(state, gradInput).upcastOuter<4>();
- devGradOutput = toDeviceTensor<float, 3>(state, gradOutput).upcastOuter<4>();
- } else {
- devGradInput = toDeviceTensor<float, 4>(state, gradInput);
- devGradOutput = toDeviceTensor<float, 4>(state, gradOutput);
- }
-
- int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3);
- dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
- devGradOutput.getSize(1),
- devGradOutput.getSize(0));
- dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
-
- SpatialReflectionPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
- devGradInput, devGradOutput, padT, padB, padL, padR);
- THCudaCheck(cudaGetLastError());
-}
+#include "generic/SpatialReflectionPadding.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialReplicationPadding.cu b/lib/THCUNN/SpatialReplicationPadding.cu
index fc09291..f022e57 100644
--- a/lib/THCUNN/SpatialReplicationPadding.cu
+++ b/lib/THCUNN/SpatialReplicationPadding.cu
@@ -5,9 +5,14 @@
#include "THCDeviceUtils.cuh"
#include "THCReduceApplyUtils.cuh"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
+
+template <typename Dtype>
__global__ void SpatialReplicationPadding_updateOutput(
- THCDeviceTensor<float, 4> input,
- THCDeviceTensor<float, 4> output,
+ THCDeviceTensor<Dtype, 4> input,
+ THCDeviceTensor<Dtype, 4> output,
int padT, int padB, int padL, int padR) {
int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
@@ -27,70 +32,14 @@ __global__ void SpatialReplicationPadding_updateOutput(
int inputPointX = min(max(padL, outputPointX), input.getSize(3) + padL - 1) - oStartX + iStartX;
int inputPointY = min(max(padT, outputPointY), input.getSize(2) + padT - 1) - oStartY + iStartY;
- float valueToCopy = input[batch][plane][inputPointY][inputPointX];
+ Dtype valueToCopy = input[batch][plane][inputPointY][inputPointX];
output[batch][plane][outputPointY][outputPointX] = valueToCopy;
}
-void THNN_CudaSpatialReplicationPadding_updateOutput(THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- int padL, int padR,
- int padT, int padB
- ) {
- THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input), 2,
- "input tensor must fit into 32-bit index math");
-
- int planeDim = 0;
- int dimh = 1;
- int dimw = 2;
- int numBatch = 1;
-
- int numInputDims = THCudaTensor_nDimension(state, input);
- THArgCheck(numInputDims == 3 || numInputDims == 4, 2,
- "input must be 3 or 4-dimensional");
-
- if (numInputDims == 4) {
- numBatch = THCudaTensor_size(state, input, 0);
- planeDim++;
- dimh++;
- dimw++;
- }
-
- int numPlanes = THCudaTensor_size(state, input, planeDim);
- int inputH = THCudaTensor_size(state, input, dimh);
- int inputW = THCudaTensor_size(state, input, dimw);
- int outputH = inputH + padT + padB;
- int outputW = inputW + padL + padR;
-
- THCDeviceTensor<float, 4> devInput;
- THCDeviceTensor<float, 4> devOutput;
-
- if (numInputDims == 3) {
- THCudaTensor_resize3d(state, output, numPlanes, outputH, outputW);
-
- devInput = toDeviceTensor<float, 3>(state, input).upcastOuter<4>();
- devOutput = toDeviceTensor<float, 3>(state, output).upcastOuter<4>();
- } else {
- THCudaTensor_resize4d(state, output, numBatch, numPlanes, outputH, outputW);
-
- devInput = toDeviceTensor<float, 4>(state, input);
- devOutput = toDeviceTensor<float, 4>(state, output);
- }
-
- int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3);
- dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
- devOutput.getSize(1),
- devOutput.getSize(0));
- dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
-
- SpatialReplicationPadding_updateOutput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
- devInput, devOutput, padT, padB, padL, padR);
-
-}
-
+template <typename Dtype>
__global__ void SpatialReplicationPadding_updateGradInput(
- THCDeviceTensor<float, 4> gradInput,
- THCDeviceTensor<float, 4> gradOutput,
+ THCDeviceTensor<Dtype, 4> gradInput,
+ THCDeviceTensor<Dtype, 4> gradOutput,
int padT, int padB, int padL, int padR) {
int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
@@ -110,54 +59,10 @@ __global__ void SpatialReplicationPadding_updateGradInput(
int inputPointX = min(max(padL, outputPointX), gradInput.getSize(3) + padL - 1) - oStartX + iStartX;
int inputPointY = min(max(padT, outputPointY), gradInput.getSize(2) + padT - 1) - oStartY + iStartY;
- float valueToCopy = gradOutput[batch][plane][outputPointY][outputPointX];
+ Dtype valueToCopy = gradOutput[batch][plane][outputPointY][outputPointX];
atomicAdd(&gradInput[batch][plane][inputPointY][inputPointX], valueToCopy);
}
-void THNN_CudaSpatialReplicationPadding_updateGradInput(THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- int padL, int padR,
- int padT, int padB) {
-
- THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input), 2,
- "input tensor must fit into 32-bit index math");
- THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, gradOutput), 3,
- "output gradient tensor must fit into 32-bit index math");
-
- int planeDim = 0;
- int dimh = 1;
- int dimw = 2;
-
- int numInputDims = THCudaTensor_nDimension(state, input);
- if (numInputDims == 4) {
- planeDim++;
- dimh++;
- dimw++;
- }
-
- THCudaTensor_resizeAs(state, gradInput, input);
- THCudaTensor_zero(state, gradInput);
- THCDeviceTensor<float, 4> devGradInput;
- THCDeviceTensor<float, 4> devGradOutput;
-
- if (numInputDims == 3) {
- devGradInput = toDeviceTensor<float, 3>(state, gradInput).upcastOuter<4>();
- devGradOutput = toDeviceTensor<float, 3>(state, gradOutput).upcastOuter<4>();
- } else {
- devGradInput = toDeviceTensor<float, 4>(state, gradInput);
- devGradOutput = toDeviceTensor<float, 4>(state, gradOutput);
- }
-
- int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3);
- dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
- devGradOutput.getSize(1),
- devGradOutput.getSize(0));
- dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
-
- SpatialReplicationPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
- devGradInput, devGradOutput, padT, padB, padL, padR);
-
-}
+#include "generic/SpatialReplicationPadding.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialSubSampling.cu b/lib/THCUNN/SpatialSubSampling.cu
index d801862..a38661b 100644
--- a/lib/THCUNN/SpatialSubSampling.cu
+++ b/lib/THCUNN/SpatialSubSampling.cu
@@ -1,5 +1,7 @@
#include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
#define CUDA_MAX_THREADS 1024 // this is safe, in reality 256 is our limit
@@ -8,7 +10,8 @@
* this function subsamples an input 3D tensor along dimensions 1 and 2
* 3D input, 3D output, 1D weight, 1D bias
*/
-__global__ void subsample(float *input, float *output, float *weight, float *bias,
+template <typename Dtype, typename Acctype>
+__global__ void subsample(Dtype *input, Dtype *output, Dtype *weight, Dtype *bias,
int input_n, int input_h, int input_w,
int kH, int kW, int dH, int dW)
{
@@ -37,18 +40,18 @@ __global__ void subsample(float *input, float *output, float *weight, float *bia
input = input + i*input_w*input_h;
// Get the good mask for (k,i) (k out, i in)
- float the_weight = weight[k];
+ Dtype the_weight = weight[k];
// Initialize to the bias
- float the_bias = bias[k];
+ Dtype the_bias = bias[k];
// For all output pixels...
for(yy = yy_start; yy < yy_end; yy+=yy_step) {
for(xx = xx_start; xx < xx_end; xx+=xx_step) {
// Compute the mean of the input image...
- float *ptr_input = input + yy*dH*input_w + xx*dW;
- float *ptr_output = output + yy*output_w + xx;
- float sum = 0;
+ Dtype *ptr_input = input + yy*dH*input_w + xx*dW;
+ Dtype *ptr_output = output + yy*output_w + xx;
+ Acctype sum = 0;
int kx, ky;
for(ky = 0; ky < kH; ky++) {
for(kx = 0; kx < kW; kx++)
@@ -56,7 +59,7 @@ __global__ void subsample(float *input, float *output, float *weight, float *bia
ptr_input += input_w; // next input line
}
// Update output
- *ptr_output = the_weight*sum + the_bias;
+ *ptr_output = ScalarConvert<Acctype, Dtype>::to(the_weight*sum + the_bias);
}
}
}
@@ -65,7 +68,8 @@ __global__ void subsample(float *input, float *output, float *weight, float *bia
* Description:
* this function computes the gradWeight from input and gradOutput
*/
-__global__ void subgradweight(float *input, float *gradOutput, float *gradWeight, float *gradBias,
+template <typename Dtype, typename Acctype>
+__global__ void subgradweight(Dtype *input, Dtype *gradOutput, Dtype *gradWeight, Dtype *gradBias,
int input_n, int input_h, int input_w,
int kH, int kW, int dH, int dW,
float scale)
@@ -98,15 +102,15 @@ __global__ void subgradweight(float *input, float *gradOutput, float *gradWeight
int tid = blockDim.x*threadIdx.y + threadIdx.x;
// create array to hold partial sums
- __shared__ float sums[CUDA_MAX_THREADS];
+ __shared__ Acctype sums[CUDA_MAX_THREADS];
sums[tid] = 0;
// compute partial sums
for(yy = yy_start; yy < yy_end; yy+=yy_step) {
for(xx = xx_start; xx < xx_end; xx+=xx_step) {
- float *ptr_input = input + yy*dH*input_w + xx*dW;
- float *ptr_gradOutput = gradOutput + yy*output_w + xx;
- float z = *ptr_gradOutput;
+ Dtype *ptr_input = input + yy*dH*input_w + xx*dW;
+ Dtype *ptr_gradOutput = gradOutput + yy*output_w + xx;
+ Dtype z = *ptr_gradOutput;
long kx, ky;
for(ky = 0; ky < kH; ky++) {
for(kx = 0; kx < kW; kx++) {
@@ -120,7 +124,11 @@ __global__ void subgradweight(float *input, float *gradOutput, float *gradWeight
// reduce: accumulate all partial sums to produce final gradWeight
if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
- for(int i = 0; i < blockDim.x*blockDim.y; i++) gradWeight[k] += scale*sums[i];
+ Acctype scaledSums = Acctype(0);
+ for(int i = 0; i < blockDim.x*blockDim.y; i++) {
+ scaledSums += scale*sums[i];
+ }
+ gradWeight[k] += ScalarConvert<Acctype, Dtype>::to(scaledSums);
}
__syncthreads();
@@ -133,8 +141,11 @@ __global__ void subgradweight(float *input, float *gradOutput, float *gradWeight
// reduce gradBias
if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
- for (int i=0; i<(blockDim.x*blockDim.y); i++)
- gradBias[k] += scale*sums[i];
+ Acctype scaledSums = Acctype(0);
+ for (int i=0; i<(blockDim.x*blockDim.y); i++) {
+ scaledSums += scale*sums[i];
+ }
+ gradBias[k] += ScalarConvert<Acctype, Dtype>::to(scaledSums);
}
}
@@ -142,7 +153,8 @@ __global__ void subgradweight(float *input, float *gradOutput, float *gradWeight
* Description:
* this function computes the gradInput from weight and gradOutput
*/
-__global__ void subgradinput(float *gradInput, float *gradOutput, float *weight,
+template <typename Dtype>
+__global__ void subgradinput(Dtype *gradInput, Dtype *gradOutput, Dtype *weight,
int input_n, int input_h, int input_w,
int kH, int kW, int dH, int dW)
{
@@ -171,18 +183,20 @@ __global__ void subgradinput(float *gradInput, float *gradOutput, float *weight,
gradInput = gradInput + i*input_w*input_h;
// get weight
- float the_weight = weight[k];
+ Dtype the_weight = weight[k];
// compute gradInput
for(yy = yy_start; yy < yy_end; yy+=yy_step) {
for(xx = xx_start; xx < xx_end; xx+=xx_step) {
- float *ptr_gradInput = gradInput + yy*dH*input_w + xx*dW;
- float *ptr_gradOutput = gradOutput + yy*output_w + xx;
- float z = *ptr_gradOutput * the_weight;
+ Dtype *ptr_gradInput = gradInput + yy*dH*input_w + xx*dW;
+ Dtype *ptr_gradOutput = gradOutput + yy*output_w + xx;
+ Dtype z = *ptr_gradOutput * the_weight;
int kx, ky;
for(ky = 0; ky < kH; ky++) {
- for(kx = 0; kx < kW; kx++)
+ for(kx = 0; kx < kW; kx++) {
+ // FIXME: should this be done at accreal precision?
ptr_gradInput[kx] += z;
+ }
ptr_gradInput += input_w;
}
}
@@ -193,7 +207,8 @@ __global__ void subgradinput(float *gradInput, float *gradOutput, float *weight,
* Description:
* this function computes the gradInput from weight and gradOutput
*/
-__global__ void subgradinputAtomic(float *gradInput, float *gradOutput, float *weight,
+template <typename Dtype>
+__global__ void subgradinputAtomic(Dtype *gradInput, Dtype *gradOutput, Dtype *weight,
int input_n, int input_h, int input_w,
int kH, int kW, int dH, int dW)
{
@@ -222,17 +237,18 @@ __global__ void subgradinputAtomic(float *gradInput, float *gradOutput, float *w
gradInput = gradInput + i*input_w*input_h;
// get weight
- float the_weight = weight[k];
+ Dtype the_weight = weight[k];
// compute gradInput
for(yy = yy_start; yy < yy_end; yy+=yy_step) {
for(xx = xx_start; xx < xx_end; xx+=xx_step) {
- float *ptr_gradInput = gradInput + yy*dH*input_w + xx*dW;
- float *ptr_gradOutput = gradOutput + yy*output_w + xx;
- float z = *ptr_gradOutput * the_weight;
+ Dtype *ptr_gradInput = gradInput + yy*dH*input_w + xx*dW;
+ Dtype *ptr_gradOutput = gradOutput + yy*output_w + xx;
+ Dtype z = *ptr_gradOutput * the_weight;
int kx, ky;
for(ky = 0; ky < kH; ky++) {
for(kx = 0; kx < kW; kx++) {
+ // FIXME: should this be done at accreal precision?
atomicAdd(&(ptr_gradInput[kx]), z);
}
ptr_gradInput += input_w;
@@ -241,205 +257,8 @@ __global__ void subgradinputAtomic(float *gradInput, float *gradOutput, float *w
}
}
-void THNN_CudaSpatialSubSampling_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *weight, THCudaTensor *bias, int kW, int kH, int dW, int dH)
-{
- float *weight_data = THCudaTensor_data(state, weight);
- float *bias_data = THCudaTensor_data(state, bias);
- float *output_data;
- float *input_data;
-
- int nInputPlane = THCudaTensor_size(state, weight, 0);
-
- THCUNN_assertSameGPU(state, 4, input, output, weight, bias);
- THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
-
- if (input->nDimension == 3) {
- long nInputCols = input->size[2];
- long nInputRows = input->size[1];
- long nOutputCols = (nInputCols - kW) / dW + 1;
- long nOutputRows = (nInputRows - kH) / dH + 1;
-
- THArgCheck(input->size[0] == nInputPlane, 2, "invalid number of input planes");
- THArgCheck(nInputCols >= kW && nInputRows >= kH, 2, "input image smaller than kernel size");
-
- input = THCudaTensor_newContiguous(state, input);
- input_data = THCudaTensor_data(state, input);
-
- THCudaTensor_resize3d(state, output, nInputPlane, nOutputRows, nOutputCols);
- output_data = THCudaTensor_data(state, output);
-
- // cuda blocks & threads:
- int yblocks = (int)(16L / nInputPlane);
- yblocks = yblocks < 1 ? 1 : yblocks;
- dim3 blocks(nInputPlane,yblocks);
- dim3 threads(32,8);
-
- // run subsample kernel
- subsample <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
- input_data, output_data, weight_data, bias_data,
- nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
- THCudaCheck(cudaGetLastError());
- } else {
- long nInputCols = input->size[3];
- long nInputRows = input->size[2];
- long nbatch = input->size[0];
- long nOutputCols = (nInputCols - kW) / dW + 1;
- long nOutputRows = (nInputRows - kH) / dH + 1;
-
- THArgCheck(input->size[1] == nInputPlane, 2, "invalid number of input planes");
- THArgCheck(nInputCols >= kW && nInputRows >= kH, 2, "input image smaller than kernel size");
-
- input = THCudaTensor_newContiguous(state, input);
- input_data = THCudaTensor_data(state, input);
-
- THCudaTensor_resize4d(state, output, nbatch, nInputPlane, nOutputRows, nOutputCols);
- output_data = THCudaTensor_data(state, output);
-
- // cuda blocks & threads:
- int yblocks = (int)(16L / nInputPlane);
- yblocks = yblocks < 1 ? 1 : yblocks;
- dim3 blocks(nInputPlane*nbatch,yblocks);
- dim3 threads(32,8);
-
- // run subsample kernel
- subsample <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
- input_data, output_data, weight_data, bias_data,
- nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
- THCudaCheck(cudaGetLastError());
- }
-
- // clean
- THCudaTensor_free(state, input);
-
-}
-
-void THNN_CudaSpatialSubSampling_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *weight, int kW, int kH, int dW, int dH)
-{
- THCUNN_assertSameGPU(state, 4, input, gradOutput, weight, gradInput);
-
- int nInputPlane = THCudaTensor_size(state, weight, 0);
-
- if (input->nDimension == 3) {
- long nInputCols = input->size[2];
- long nInputRows = input->size[1];
-
- float *weight_data = THCudaTensor_data(state, weight);
- float *gradOutput_data = THCudaTensor_data(state, gradOutput);
- float *gradInput_data;
-
- THCudaTensor_resizeAs(state, gradInput, input);
- THCudaTensor_zero(state, gradInput);
- gradInput_data = THCudaTensor_data(state, gradInput);
-
- // cuda blocks & threads:
- int yblocks = (int)(16L / nInputPlane);
- yblocks = yblocks < 1 ? 1 : yblocks;
- dim3 blocks(nInputPlane,yblocks);
- dim3 threads(32,8);
-
- // run updateGradInput kernel
- if (kH <= dH && kW <= dW) {
- subgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
- gradInput_data, gradOutput_data, weight_data,
- nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
- } else {
- subgradinputAtomic <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
- gradInput_data, gradOutput_data, weight_data,
- nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
- }
- THCudaCheck(cudaGetLastError());
- } else {
- long nInputCols = input->size[3];
- long nInputRows = input->size[2];
- long nbatch = input->size[0];
-
- float *weight_data = THCudaTensor_data(state, weight);
- float *gradOutput_data = THCudaTensor_data(state, gradOutput);
- float *gradInput_data;
-
- THCudaTensor_resizeAs(state, gradInput, input);
- THCudaTensor_zero(state, gradInput);
- gradInput_data = THCudaTensor_data(state, gradInput);
-
- // cuda blocks & threads:
- int yblocks = (int)(16L / nInputPlane);
- yblocks = yblocks < 1 ? 1 : yblocks;
- dim3 blocks(nInputPlane*nbatch,yblocks);
- dim3 threads(32,8);
-
- // run updateGradInput kernel
- if (kH <= dH && kW <= dW) {
- subgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
- gradInput_data, gradOutput_data, weight_data,
- nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
- } else {
- subgradinputAtomic <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
- gradInput_data, gradOutput_data, weight_data,
- nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
- }
- THCudaCheck(cudaGetLastError());
- }
-}
-
-void THNN_CudaSpatialSubSampling_accGradParameters(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradWeight, THCudaTensor *gradBias, int kW, int kH, int dW, int dH, float scale)
-{
- THCUNN_assertSameGPU(state, 4, input, gradOutput, gradWeight, gradBias);
-
- int nInputPlane = THCudaTensor_size(state, gradWeight, 0);
-
- if (input->nDimension == 3) {
- long nInputCols = input->size[2];
- long nInputRows = input->size[1];
-
- float *gradWeight_data = THCudaTensor_data(state, gradWeight);
- float *gradBias_data = THCudaTensor_data(state, gradBias);
- float *gradOutput_data = THCudaTensor_data(state, gradOutput);
- float *input_data;
-
- input = THCudaTensor_newContiguous(state, input);
- input_data = THCudaTensor_data(state, input);
-
- // cuda blocks & threads:
- dim3 blocks(nInputPlane);
- dim3 threads(32,8);
-
- // run gradweight kernel
- subgradweight <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
- input_data, gradOutput_data, gradWeight_data, gradBias_data,
- nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale);
- THCudaCheck(cudaGetLastError());
- } else {
- long nInputCols = input->size[3];
- long nInputRows = input->size[2];
- long nbatch = input->size[0];
-
- float *gradWeight_data = THCudaTensor_data(state, gradWeight);
- float *gradBias_data = THCudaTensor_data(state, gradBias);
- float *gradOutput_data = THCudaTensor_data(state, gradOutput);
- float *input_data;
-
- input = THCudaTensor_newContiguous(state, input);
- input_data = THCudaTensor_data(state, input);
-
- // cuda blocks & threads:
- dim3 blocks(nInputPlane);
- dim3 threads(32,8);
-
- // run gradweight kernel
- long sl;
- for (sl=0; sl<nbatch; sl++) {
- subgradweight <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
- input_data + sl*input->stride[0],
- gradOutput_data + sl*gradOutput->stride[0],
- gradWeight_data, gradBias_data,
- nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale);
- }
- THCudaCheck(cudaGetLastError());
- }
- // clean
- THCudaTensor_free(state, input);
-
-}
+#include "generic/SpatialSubSampling.cu"
+#include "THCGenerateFloatTypes.h"
#undef CUDA_MAX_THREADS
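
This is the shape every conversion in this merge follows: the float-only host functions above are deleted from the .cu file and re-expressed once in generic/SpatialSubSampling.cu, which THCGenerateFloatTypes.h then re-includes once per floating-point scalar type. A minimal, self-contained sketch of that multiple-instantiation idiom is below; DEFINE_FILL and the fill_* names are hypothetical stand-ins for the THC macro machinery, not the real headers.

    // generic_instantiation_sketch.cpp -- illustrative only; DEFINE_FILL and
    // fill_Float/fill_Double are hypothetical stand-ins for the THC generic
    // machinery (generic/*.cu re-included by THCGenerateFloatTypes.h).
    #include <cstdio>

    // The body written once, parameterized on the scalar type, plays the role
    // of generic/SpatialSubSampling.cu.
    #define DEFINE_FILL(real, Real)                          \
      void fill_##Real(real *data, long n, real value) {     \
        for (long i = 0; i < n; ++i) data[i] = value;        \
      }

    // THCGenerateFloatTypes.h plays this role: it expands the generic body
    // once per floating type (float, double, and half when half tensors are
    // enabled), producing one set of entry points per type.
    DEFINE_FILL(float,  Float)
    DEFINE_FILL(double, Double)

    int main() {
      float  f[3];
      double d[3];
      fill_Float(f, 3, 1.5f);
      fill_Double(d, 3, 2.5);
      printf("%g %g\n", f[0], d[0]);  // 1.5 2.5
      return 0;
    }
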
diff --git a/lib/THCUNN/SpatialUpSamplingBilinear.cu b/lib/THCUNN/SpatialUpSamplingBilinear.cu
index 27fab85..039f8e7 100644
--- a/lib/THCUNN/SpatialUpSamplingBilinear.cu
+++ b/lib/THCUNN/SpatialUpSamplingBilinear.cu
@@ -5,10 +5,14 @@
#include "THCDeviceTensor.cuh"
#include "THCDeviceTensorUtils.cuh"
#include "THCDeviceUtils.cuh"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
+template<typename Dtype, typename Acctype>
__global__ void caffe_gpu_interp2_kernel(const int n,
- const float rheight, const float rwidth,
- const THCDeviceTensor<float, 4> data1, THCDeviceTensor<float, 4> data2) {
+ const Acctype rheight, const Acctype rwidth,
+ const THCDeviceTensor<Dtype, 4> data1, THCDeviceTensor<Dtype, 4> data2) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
const int batchsize = data1.getSize(0);
const int channels = data1.getSize(1);
@@ -26,72 +30,42 @@ __global__ void caffe_gpu_interp2_kernel(const int n,
const int w1 = w2;
for (int n = 0; n < batchsize ; n++){
for (int c = 0; c < channels; ++c) {
- const float val = data1[n][c][h1][w1];
+ const Dtype val = data1[n][c][h1][w1];
data2[n][c][h2][w2] = val;
}
}
return;
}
//
- const float h1r = rheight * h2;
+ const Acctype h1r = rheight * h2;
const int h1 = h1r;
const int h1p = (h1 < height1 - 1) ? 1 : 0;
- const float h1lambda = h1r - h1;
- const float h0lambda = 1.0f - h1lambda;
+ const Acctype h1lambda = h1r - h1;
+ const Acctype h0lambda = Acctype(1) - h1lambda;
//
- const float w1r = rwidth * w2;
+ const Acctype w1r = rwidth * w2;
const int w1 = w1r;
const int w1p = (w1 < width1 - 1) ? 1 : 0;
- const float w1lambda = w1r - w1;
- const float w0lambda = 1.0f - w1lambda;
+ const Acctype w1lambda = w1r - w1;
+ const Acctype w0lambda = Acctype(1) - w1lambda;
//
for (int n = 0; n < batchsize ; n++){
for (int c = 0; c < channels; ++c) {
- const float val = h0lambda * (w0lambda * data1[n][c][h1][w1]
+ const Acctype val = h0lambda * (w0lambda * data1[n][c][h1][w1]
+ w1lambda * data1[n][c][h1][w1+w1p])
+ h1lambda * (w0lambda * data1[n][c][h1+h1p][w1]
+ w1lambda * data1[n][c][h1+h1p][w1+w1p]);
- data2[n][c][h2][w2] = val;
+ data2[n][c][h2][w2] = ScalarConvert<Acctype, Dtype>::to(val);
}
}
}
}
-void THNN_CudaSpatialUpSamplingBilinear_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- int outputHeight,
- int outputWidth) {
- input = THCudaTensor_newContiguous(state, input);
- output = THCudaTensor_newContiguous(state, output);
- THCUNN_assertSameGPU(state, 2, input, output);
- THCudaTensor_zero(state, output);
- THCDeviceTensor<float, 4> idata = toDeviceTensor<float, 4>(state, input);
- THCDeviceTensor<float, 4> odata = toDeviceTensor<float, 4>(state, output);
- int height1 = idata.getSize(2);
- int width1 = idata.getSize(3);
- int height2 = odata.getSize(2);
- int width2 = odata.getSize(3);
- assert( height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0);
- const float rheight= (height2 > 1) ? (float)(height1 - 1)/(height2 - 1) : 0.f;
- const float rwidth = (width2 > 1) ? (float)(width1 - 1)/(width2 - 1) : 0.f;
- const int num_kernels = height2 * width2;
- const int num_threads =
- THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock;
- cudaStream_t stream = THCState_getCurrentStream(state);
- caffe_gpu_interp2_kernel<<<THCCeilDiv(num_kernels, num_threads), num_threads ,
- 0 , stream>>>(num_kernels, rheight, rwidth, idata, odata);
- THCudaCheck(cudaGetLastError());
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, output);
-}
-
-
// Backward (adjoint) operation 1 <- 2 (accumulates)
+template <typename Dtype, typename Acctype>
__global__ void caffe_gpu_interp2_kernel_backward(const int n,
- const float rheight, const float rwidth,
- THCDeviceTensor<float, 4> data1, const THCDeviceTensor<float, 4> data2){
+ const Acctype rheight, const Acctype rwidth,
+ THCDeviceTensor<Dtype, 4> data1, const THCDeviceTensor<Dtype, 4> data2){
int index = threadIdx.x + blockIdx.x * blockDim.x;
const int batchsize = data1.getSize(0);
const int channels = data1.getSize(1);
@@ -108,69 +82,41 @@ __global__ void caffe_gpu_interp2_kernel_backward(const int n,
const int w1 = w2;
for (int n = 0; n < batchsize ; n++){
for (int c = 0; c < channels; ++c) {
- const float val = data2[n][c][h1][w1];
+ const Dtype val = data2[n][c][h1][w1];
data1[n][c][h2][w2] += val;
}
}
return;
}
//
- const float h1r = rheight * h2;
+ const Acctype h1r = rheight * h2;
const int h1 = h1r;
const int h1p = (h1 < height1 - 1) ? 1 : 0;
- const float h1lambda = h1r - h1;
- const float h0lambda = 1.0f - h1lambda;
+ const Acctype h1lambda = h1r - h1;
+ const Acctype h0lambda = Acctype(1) - h1lambda;
//
- const float w1r = rwidth * w2;
+ const Acctype w1r = rwidth * w2;
const int w1 = w1r;
const int w1p = (w1 < width1 - 1) ? 1 : 0;
- const float w1lambda = w1r - w1;
- const float w0lambda = 1.0f - w1lambda;
+ const Acctype w1lambda = w1r - w1;
+ const Acctype w0lambda = Acctype(1) - w1lambda;
//
for (int n = 0; n < batchsize ; n++){
for (int c = 0; c < channels; ++c) {
- const float d2val = data2[n][c][h2][w2];
- atomicAdd(data1[n][c][h1][w1].data(), h0lambda * w0lambda * d2val);
- atomicAdd(data1[n][c][h1][w1+w1p].data(), h0lambda * w1lambda * d2val);
- atomicAdd(data1[n][c][h1+h1p][w1].data(), h1lambda * w0lambda * d2val);
+ const Dtype d2val = data2[n][c][h2][w2];
+ atomicAdd(data1[n][c][h1][w1].data(),
+ ScalarConvert<Acctype, Dtype>::to(h0lambda * w0lambda * d2val));
+ atomicAdd(data1[n][c][h1][w1+w1p].data(),
+ ScalarConvert<Acctype, Dtype>::to(h0lambda * w1lambda * d2val));
+ atomicAdd(data1[n][c][h1+h1p][w1].data(),
+ ScalarConvert<Acctype, Dtype>::to(h1lambda * w0lambda * d2val));
atomicAdd(data1[n][c][h1+h1p][w1+w1p].data(),
- h1lambda * w1lambda * d2val);
+ ScalarConvert<Acctype, Dtype>::to(h1lambda * w1lambda * d2val));
}
}
}
}
-void THNN_CudaSpatialUpSamplingBilinear_updateGradInput(
- THCState *state,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- int nbatch,
- int nchannels,
- int inputHeight,
- int inputWidth,
- int outputHeight,
- int outputWidth) {
- gradInput = THCudaTensor_newContiguous(state, gradInput);
- gradOutput = THCudaTensor_newContiguous(state, gradOutput);
- THCUNN_assertSameGPU(state, 2, gradOutput, gradInput);
- THCudaTensor_zero(state, gradInput);
- THCDeviceTensor<float, 4> data1 = toDeviceTensor<float, 4>(state, gradInput);
- THCDeviceTensor<float, 4> data2 = toDeviceTensor<float, 4>(state, gradOutput);
- int height1 = data1.getSize(2);
- int width1 = data1.getSize(3);
- int height2 = data2.getSize(2);
- int width2 = data2.getSize(3);
- assert(height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0);
- const float rheight= (height2 > 1) ? (float)(height1 - 1)/(height2 - 1) : 0.f;
- const float rwidth = (width2 > 1) ? (float)(width1 - 1) / (width2 - 1) : 0.f;
- const int num_kernels = height2 * width2;
- const int num_threads =
- THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock;
- cudaStream_t stream = THCState_getCurrentStream(state);
- caffe_gpu_interp2_kernel_backward<<<THCCeilDiv(num_kernels, num_threads),
- num_threads, 0, stream>>>(num_kernels, rheight, rwidth, data1, data2);
- THCudaCheck(cudaGetLastError());
- THCudaTensor_free(state, gradInput);
- THCudaTensor_free(state, gradOutput);
-}
+#include "generic/SpatialUpSamplingBilinear.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/SpatialUpSamplingNearest.cu b/lib/THCUNN/SpatialUpSamplingNearest.cu
index c188d3a..5753c6e 100644
--- a/lib/THCUNN/SpatialUpSamplingNearest.cu
+++ b/lib/THCUNN/SpatialUpSamplingNearest.cu
@@ -6,6 +6,8 @@
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
/*
* Description:
@@ -46,7 +48,8 @@ __device__ int translate_idx_inv(int ii, int d1, int d2, int d3, int scale_facto
}
-__global__ void upscale(float *input, float *output, long no_elements,
+template <typename Dtype>
+__global__ void upscale(Dtype *input, Dtype *output, long no_elements,
int scale_factor, int d1, int d2, int d3)
{
// output offset:
@@ -57,121 +60,26 @@ __global__ void upscale(float *input, float *output, long no_elements,
output[ii]=input[ipidx];
}
-
-void THNN_CudaSpatialUpSamplingNearest_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, int scale_factor)
-{
- THCudaTensor_zero(state, output);
-
- THCUNN_assertSameGPU(state, 2, input, output);
-
- input = THCudaTensor_newContiguous(state, input);
- // This is for allocating output Tensor
- long no_elements = 1;
- for(int i = 0; i < input->nDimension; i++){
- no_elements *= input->size[i];
- }
- no_elements *= scale_factor * scale_factor;
-
- int d1;
- int d2;
- int d3;
-
- if (input->nDimension == 3) {
- d1 = output->size[0];
- d2 = output->size[1];
- d3 = output->size[2];
- } else {
- d1 = output->size[1];
- d2 = output->size[2];
- d3 = output->size[3];
- }
-
- float *input_data = THCudaTensor_data(state, input);
- float *output_data = THCudaTensor_data(state, output);
-
- // cuda blocks & threads:
- long nthreads = 256;
- // Max number of blocks: http://en.wikipedia.org/wiki/CUDA
- // 65535 for SM 2.x, 2^32 -1 for >= 3.0
- // TODO: When we move to SM 3.5 we should update this
- long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
- long n_yblocks = (long)ceil((float)no_elements / (float)(n_xblocks * nthreads));
- if (n_yblocks > 65535) {
- THError("Input size is too large! aborting");
- }
- dim3 blocks(n_xblocks, n_yblocks);
- dim3 threads(nthreads);
-
- // kernel:
- upscale<<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (input_data, output_data, no_elements, scale_factor, d1, d2, d3);
- THCudaCheck(cudaGetLastError());
-
- // final cut:
- THCudaTensor_free(state, input);
-}
-
/*
* Description:
*/
-__global__ void downscale(float *gradInput_data, float *gradOutput_data, long no_elements,
+template <typename Dtype, typename Acctype>
+__global__ void downscale(Dtype *gradInput_data, Dtype *gradOutput_data, long no_elements,
int scale_factor, int d1, int d2, int d3)
{
// output offset:
long ii = threadIdx.x + blockDim.x * blockIdx.x;
ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y;
if (ii >= no_elements) return;
+ Acctype sum = Acctype(0);
for (int i=0; i < scale_factor; i++){
for(int j=0; j < scale_factor; j++){
int ipidx = translate_idx_inv(ii, d1, d2, d3, scale_factor, i, j);
- gradInput_data[ii] += gradOutput_data[ipidx];
+ sum += gradOutput_data[ipidx];
}
}
+ gradInput_data[ii] += ScalarConvert<Acctype, Dtype>::to(sum);
}
-
-void THNN_CudaSpatialUpSamplingNearest_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, int scale_factor)
-{
- THCUNN_assertSameGPU(state, 2, gradOutput, gradInput);
-
- THCudaTensor_zero(state, gradInput);
-
- float *gradInput_data = THCudaTensor_data(state, gradInput);
- float *gradOutput_data = THCudaTensor_data(state, gradOutput);
-
- long no_elements = 1;
- for(int i = 0; i < gradInput->nDimension; i++){
- no_elements *= gradInput->size[i];
- }
-
- int d1;
- int d2;
- int d3;
-
- if (gradInput->nDimension == 3) {
- d1 = gradInput->size[0];
- d2 = gradInput->size[1];
- d3 = gradInput->size[2];
- } else {
- d1 = gradInput->size[1];
- d2 = gradInput->size[2];
- d3 = gradInput->size[3];
- }
-
- // cuda blocks & threads:
- long nthreads = 256;
- // Max number of blocks: http://en.wikipedia.org/wiki/CUDA
- // 65535 for SM 2.x, 2^32 -1 for >= 3.0
- // TODO: When we move to SM 3.5 we should update this
- long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
- long n_yblocks = (long)ceil((float)no_elements / (float)(n_xblocks * nthreads));
- if (n_yblocks > 65535) {
- THError("Input size is too large! aborting");
- }
- dim3 blocks(n_xblocks, n_yblocks);
- dim3 threads(nthreads);
-
- // kernel:
- downscale<<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data, no_elements,
- scale_factor, d1, d2, d3);
- THCudaCheck(cudaGetLastError());
-}
+#include "generic/SpatialUpSamplingNearest.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/Sqrt.cu b/lib/THCUNN/Sqrt.cu
index e1a4fe3..9d0dddc 100644
--- a/lib/THCUNN/Sqrt.cu
+++ b/lib/THCUNN/Sqrt.cu
@@ -1,40 +1,32 @@
#include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+template <typename T>
struct sqrtupdateOutput_functor
{
- const float bias;
+ const T bias;
- sqrtupdateOutput_functor(float bias_)
+ sqrtupdateOutput_functor(T bias_)
: bias(bias_)
{}
- __device__ void operator()(float *output, const float *input) const
+ __device__ void operator()(T *output, const T *input) const
{
*output = sqrt(*input + bias);
}
};
-void THNN_CudaSqrt_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, float eps)
-{
- THCUNN_assertSameGPU(state, 2, input, output);
- THCudaTensor_resizeAs(state, output, input);
- THC_pointwiseApply2(state, output, input, sqrtupdateOutput_functor(eps));
-}
-
+template <typename T>
struct sqrtupdateGradInput_functor
{
sqrtupdateGradInput_functor() {}
- __device__ void operator()(float *gradInput, const float *output, const float *gradOutput) const
+ __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const
{
- *gradInput = (*output == 0.0f) ? 0.0f : ((0.5f * *gradOutput) / *output);
+ *gradInput = (THCNumerics<T>::eq(*output,ScalarConvert<float, T>::to(0.0f))) ? ScalarConvert<float, T>::to(0.0f) : ((ScalarConvert<float, T>::to(0.5f) * *gradOutput) / *output);
}
};
-void THNN_CudaSqrt_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *output)
-{
- THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
- THCudaTensor_resizeAs(state, gradInput, output);
- THC_pointwiseApply3(state, gradInput, output, gradOutput, sqrtupdateGradInput_functor());
-}
+#include "generic/Sqrt.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/Square.cu b/lib/THCUNN/Square.cu
index a6d147c..4acf67f 100644
--- a/lib/THCUNN/Square.cu
+++ b/lib/THCUNN/Square.cu
@@ -1,32 +1,24 @@
#include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+template <typename T>
struct squareupdateOutput_functor
{
- __device__ void operator()(float* output, const float* input) const
+ __device__ void operator()(T* output, const T* input) const
{
*output = (*input) * (*input);
}
};
-void THNN_CudaSquare_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output)
-{
- THCUNN_assertSameGPU(state, 2, input, output);
- THCudaTensor_resizeAs(state, output, input);
- THC_pointwiseApply2(state, output, input, squareupdateOutput_functor());
-}
-
+template <typename T>
struct squareupdateGradInput_functor
{
- __device__ void operator()(float* gradInput, const float* input, const float* gradOutput) const
+ __device__ void operator()(T* gradInput, const T* input, const T* gradOutput) const
{
- *gradInput = 2.0 * (*gradOutput) * (*input);
+ *gradInput = ScalarConvert<double, T>::to(2.0) * (*gradOutput) * (*input);
}
};
-void THNN_CudaSquare_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput)
-{
- THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
- THCudaTensor_resizeAs(state, gradInput, input);
- THC_pointwiseApply3(state, gradInput, input, gradOutput, squareupdateGradInput_functor());
-}
+#include "generic/Square.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/THCHalfAutoNumerics.cuh b/lib/THCUNN/THCHalfAutoNumerics.cuh
new file mode 100644
index 0000000..89a9602
--- /dev/null
+++ b/lib/THCUNN/THCHalfAutoNumerics.cuh
@@ -0,0 +1,248 @@
+#ifndef THC_HALF_AUTO_NUMERICS_INC
+#define THC_HALF_AUTO_NUMERICS_INC
+
+#include "THCHalf.h"
+#include "THCNumerics.cuh"
+
+// Half numerics functions are defined here as free functions, so cunn code can be
+// written generically, i.e. without excessive calls to THCNumerics<half> functions.
+
+#ifdef CUDA_HALF_TENSOR
+
+// these functions should move to THCNumerics
+
+inline __host__ __device__ half fmaxType(half x, half y) {
+ return THCNumerics<half>::ge(x, y) ? x : y;
+}
+
+inline __host__ __device__ float fmaxType(float x, half y) {
+ return fmaxf(x, ScalarConvert<half, float>::to(y));
+}
+
+inline __host__ __device__ float fmaxType(float x, float y) {
+ return fmaxf(x, y);
+}
+
+inline __host__ __device__ double fmaxType(double x, double y) {
+ return fmax(x, y);
+}
+
+inline __host__ __device__ half mul(half a, half b) {
+ #ifdef __CUDA_ARCH__
+ #ifdef CUDA_HALF_INSTRUCTIONS
+ return __hmul(a, b);
+ #else
+ float fa = __half2float(a);
+ float fb = __half2float(b);
+ return __float2half( fa * fb );
+ #endif
+ #else // __CUDA_ARCH__
+ return THC_float2half(THC_half2float(a) * THC_half2float(b));
+ #endif
+}
+
+inline __host__ __device__ half div(half a, half b) {
+ #ifdef __CUDA_ARCH__
+ #ifdef CUDA_HALF_INSTRUCTIONS
+ return __hdiv(a, b);
+ #else
+ float fa = __half2float(a);
+ float fb = __half2float(b);
+ return __float2half( fa / fb );
+ #endif
+ #else // __CUDA_ARCH__
+ return THC_float2half(THC_half2float(a) / THC_half2float(b));
+ #endif
+}
+
+// arithmetic functions
+
+inline __host__ __device__ half operator+(half a, half b) {
+ return THCNumerics<half>::add(a, b);
+}
+
+inline __host__ __device__ float operator+(half a, float b) {
+ return ScalarConvert<half, float>::to(a) + b;
+}
+
+inline __host__ __device__ float operator+(float a, half b) {
+ return a + ScalarConvert<half, float>::to(b);
+}
+
+inline __host__ __device__ double operator+(double a, half b) {
+ return a + ScalarConvert<half, double>::to(b);
+}
+
+inline __host__ __device__ half operator-(half a) {
+ return THCNumerics<half>::neg(a);
+}
+
+inline __host__ __device__ half operator-(half a, half b) {
+ return THCNumerics<half>::add(a, THCNumerics<half>::neg(b));
+}
+
+inline __host__ __device__ half operator-(half a, int b) {
+ return THCNumerics<half>::add(a, THCNumerics<half>::neg(ScalarConvert<int, half>::to(b)));
+}
+
+inline __host__ __device__ float operator-(half a, float b) {
+ return ScalarConvert<half, float>::to(a) - b;
+}
+
+inline __host__ __device__ double operator-(half a, double b) {
+ return ScalarConvert<half, double>::to(a) - b;
+}
+
+inline __host__ __device__ half operator-(int a, half b) {
+ return THCNumerics<half>::add(ScalarConvert<int, half>::to(a), THCNumerics<half>::neg(b));
+}
+
+inline __host__ __device__ float operator-(float a, half b) {
+ return a - ScalarConvert<half, float>::to(b);
+}
+
+inline __host__ __device__ double operator-(double a, half b) {
+ return a - ScalarConvert<half, double>::to(b);
+}
+
+inline __host__ __device__ half operator*(half a, half b) {
+ return mul(a, b);
+}
+
+inline __host__ __device__ float operator*(half a, float b) {
+ return ScalarConvert<half, float>::to(a) * b;
+}
+
+inline __host__ __device__ double operator*(half a, double b) {
+ return ScalarConvert<half, double>::to(a) * b;
+}
+
+inline __host__ __device__ half operator*(half a, int b) {
+ return a * ScalarConvert<int, half>::to(b);
+}
+
+inline __host__ __device__ float operator*(float a, half b) {
+ return a * ScalarConvert<half, float>::to(b);
+}
+
+inline __host__ __device__ double operator*(double a, half b) {
+ return a * ScalarConvert<half, double>::to(b);
+}
+
+inline __host__ __device__ half operator/(half a, half b) {
+ return div(a, b);
+}
+
+inline __host__ __device__ float operator/(float a, half b) {
+ return a / ScalarConvert<half, float>::to(b);
+}
+
+inline __host__ __device__ double operator/(double a, half b) {
+ return a / ScalarConvert<half, double>::to(b);
+}
+
+inline __host__ __device__ half operator/(int a, half b) {
+ return ScalarConvert<int, half>::to(a) / b;
+}
+
+inline __host__ __device__ float operator/(half a, float b) {
+ return ScalarConvert<half, float>::to(a) / b;
+}
+
+inline __host__ __device__ double operator/(half a, double b) {
+ return ScalarConvert<half, double>::to(a) / b;
+}
+
+inline __host__ __device__ half operator/(half a, int b) {
+ return a / ScalarConvert<int, half>::to(b);
+}
+
+inline __host__ __device__ half& operator+=(half &lhs, const half &rhs) {
+ lhs = lhs + rhs;
+ return lhs;
+}
+inline __host__ __device__ float& operator+=(float &lhs, const half &rhs) {
+ lhs = lhs + rhs;
+ return lhs;
+}
+
+inline __host__ __device__ float& operator-=(float &lhs, const half &rhs) {
+ lhs = lhs - rhs;
+ return lhs;
+}
+
+inline __host__ __device__ half& operator*=(half &lhs, const half &rhs) {
+ lhs = lhs * rhs;
+ return lhs;
+}
+
+inline __host__ __device__ half& operator/=(half &lhs, const int &rhs) {
+ lhs = lhs / rhs;
+ return lhs;
+}
+
+inline __host__ __device__ half& operator/=(half &lhs, const half &rhs) {
+ lhs = lhs / rhs;
+ return lhs;
+}
+
+inline __host__ __device__ half abs(half a) {
+ return THCNumerics<half>::abs(a);
+}
+
+inline __host__ __device__ half exp(half a) {
+ return THCNumerics<half>::exp(a);
+}
+
+inline __host__ __device__ half log1p(half a) {
+ return THCNumerics<half>::log1p(a);
+}
+
+inline __host__ __device__ half pow(half a, half b) {
+ return THCNumerics<half>::pow(a, b);
+}
+
+inline __host__ __device__ half sqrt(half a) {
+ return THCNumerics<half>::sqrt(a);
+}
+
+inline __host__ __device__ half tanh(half a) {
+ return THCNumerics<half>::tanh(a);
+}
+
+// comparison functions
+
+inline __host__ __device__ bool operator<(half a, half b) {
+ return THCNumerics<half>::lt(a, b);
+}
+
+inline __host__ __device__ bool operator<=(half a, half b) {
+ return THCNumerics<half>::le(a, b);
+}
+
+inline __host__ __device__ bool operator<=(half a, int b) {
+ return THCNumerics<half>::le(a, ScalarConvert<int, half>::to(b));
+}
+
+inline __host__ __device__ bool operator<(half a, int b) {
+ return THCNumerics<half>::lt(a, ScalarConvert<int, half>::to(b));
+}
+
+inline __host__ __device__ bool operator>(half a, half b) {
+ return THCNumerics<half>::gt(a, b);
+}
+
+inline __host__ __device__ bool operator>(half a, int b) {
+ return THCNumerics<half>::gt(a, ScalarConvert<int, half>::to(b));
+}
+
+inline __host__ __device__ bool operator>=(half a, half b) {
+ return THCNumerics<half>::ge(a, b);
+}
+
+inline __host__ __device__ bool operator>=(half a, int b) {
+ return THCNumerics<half>::ge(a, ScalarConvert<int ,half>::to(b));
+}
+
+#endif
+#endif
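
This header is what lets the templated functors elsewhere in the merge keep writing plain expressions such as *gradOutput * (1 - *output * *output) when T is half: the free-function operators and math overloads above forward to THCNumerics<half>, so the generic body never mentions half explicitly. The self-contained sketch below mimics that idea with a hypothetical Half16 wrapper; it illustrates the overloading trick only and does not use CUDA's real half type or the THC headers.

    // half_autonumerics_sketch.cpp -- free operators make a generic functor
    // body written for float compile unchanged for a wrapper scalar type.
    #include <cstdio>

    struct Half16 { float v; };   // stand-in storage type with no built-in operators

    // Free-function overloads, in the spirit of operator*(half, half) above.
    inline Half16 operator*(Half16 a, Half16 b) { return Half16{a.v * b.v}; }
    inline Half16 operator-(int a, Half16 b)    { return Half16{float(a) - b.v}; }

    // The same generic body THCUNN now uses for float, double and half
    // (compare tanhupdateGradInput_functor below).
    template <typename T>
    T tanh_grad(T output, T gradOutput) {
      return gradOutput * (1 - output * output);
    }

    int main() {
      float  f = tanh_grad(0.5f, 1.0f);
      Half16 h = tanh_grad(Half16{0.5f}, Half16{1.0f});
      printf("%g %g\n", f, h.v);   // 0.75 0.75
      return 0;
    }
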
diff --git a/lib/THCUNN/THCUNN.h b/lib/THCUNN/THCUNN.h
index 9aeef2d..4e95b06 100644
--- a/lib/THCUNN/THCUNN.h
+++ b/lib/THCUNN/THCUNN.h
@@ -1,1090 +1,11 @@
#include <THC/THC.h>
#include <THC/THCApply.cuh>
-#define THIndexTensor THCudaLongTensor
-#define THIndexTensor_(NAME) THCudaLongTensor_ ## NAME
+#define THCIndexTensor THCudaLongTensor
+#define THCIndexTensor_(NAME) THCudaLongTensor_ ## NAME
+typedef long THCIndex_t;
-TH_API void THNN_CudaAbs_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output);
-TH_API void THNN_CudaAbs_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput);
+#define THNN_(NAME) TH_CONCAT_3(THNN_, CReal, NAME)
-TH_API void THNN_CudaAbsCriterion_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *output,
- bool sizeAverage);
-TH_API void THNN_CudaAbsCriterion_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *gradInput,
- bool sizeAverage);
-
-TH_API void THNN_CudaBCECriterion_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *output,
- bool sizeAverage,
- THCudaTensor *weights); // [OPTIONAL]
-TH_API void THNN_CudaBCECriterion_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *gradInput,
- bool sizeAverage,
- THCudaTensor *weights); // [OPTIONAL]
-
-TH_API void THNN_CudaClassNLLCriterion_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THIndexTensor *target,
- THCudaTensor *output,
- bool sizeAverage,
- THCudaTensor *weights, // [OPTIONAL]
- THCudaTensor *total_weight);
-TH_API void THNN_CudaClassNLLCriterion_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THIndexTensor *target,
- THCudaTensor *gradInput,
- bool sizeAverage,
- THCudaTensor *weights, // [OPTIONAL]
- THCudaTensor *total_weight);
-
-TH_API void THNN_CudaSpatialClassNLLCriterion_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THIndexTensor *target,
- THCudaTensor *output,
- bool sizeAverage,
- THCudaTensor *weights, // [OPTIONAL]
- THCudaTensor *total_weight);
-TH_API void THNN_CudaSpatialClassNLLCriterion_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THIndexTensor *target,
- THCudaTensor *gradInput,
- bool sizeAverage,
- THCudaTensor *weights, // [OPTIONAL]
- THCudaTensor *total_weight);
-
-TH_API void THNN_CudaDistKLDivCriterion_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *output,
- bool sizeAverage);
-TH_API void THNN_CudaDistKLDivCriterion_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *gradInput,
- bool sizeAverage);
-
-TH_API void THNN_CudaELU_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- float alpha,
- bool inplace);
-TH_API void THNN_CudaELU_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *output,
- float alpha,
- bool inplace);
-
-TH_API void THNN_CudaHardTanh_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- float min_val,
- float max_val,
- bool inplace);
-TH_API void THNN_CudaHardTanh_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- float min_val,
- float max_val,
- bool inplace);
-
-TH_API void THNN_CudaL1Cost_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output);
-TH_API void THNN_CudaL1Cost_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput, // [OPTIONAL]
- THCudaTensor *gradInput);
-
-TH_API void THNN_CudaLeakyReLU_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- double negval, bool inplace);
-TH_API void THNN_CudaLeakyReLU_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- double negval,
- bool inplace);
-
-TH_API void THNN_CudaLogSigmoid_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *buffer);
-TH_API void THNN_CudaLogSigmoid_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *buffer);
-
-TH_API void THNN_CudaLogSoftMax_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output);
-TH_API void THNN_CudaLogSoftMax_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *output);
-
-TH_API void THNN_CudaLookupTable_accGradParameters(
- THCState *state,
- THIndexTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradWeight,
- THIndexTensor *count,
- THIndexTensor *sorted, // [OPTIONAL]
- THIndexTensor *indices, // [OPTIONAL]
- bool scaleGradByFreq,
- int paddingValue,
- float scale);
-
-TH_API void THNN_CudaLookupTable_renorm(
- THCState *state,
- THIndexTensor *idx,
- THCudaTensor *weight,
- float maxNorm,
- float normType);
-
-TH_API void THNN_CudaMarginCriterion_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *output,
- bool sizeAverage,
- float margin);
-TH_API void THNN_CudaMarginCriterion_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *gradInput,
- bool sizeAverage,
- float margin);
-
-TH_API void THNN_CudaSoftMarginCriterion_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *output,
- int sizeAverage);
-
-TH_API void THNN_CudaSoftMarginCriterion_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *gradInput,
- int sizeAverage);
-
-TH_API void THNN_CudaMSECriterion_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *output,
- bool sizeAverage);
-TH_API void THNN_CudaMSECriterion_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *gradInput,
- bool sizeAverage);
-
-TH_API void THNN_CudaMultiMarginCriterion_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *output,
- bool sizeAverage,
- int p,
- THCudaTensor *weights, // [OPTIONAL]
- float margin);
-TH_API void THNN_CudaMultiMarginCriterion_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *gradInput,
- bool sizeAverage,
- int p,
- THCudaTensor *weights, // [OPTIONAL]
- float margin);
-
-TH_API void THNN_CudaMultiLabelMarginCriterion_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *output,
- THCudaTensor *istarget,
- bool sizeAverage);
-TH_API void THNN_CudaMultiLabelMarginCriterion_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *gradInput,
- THCudaTensor *istarget,
- bool sizeAverage);
-
-TH_API void THNN_CudaPReLU_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight,
- long nOutputPlane);
-TH_API void THNN_CudaPReLU_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *weight,
- long nOutputPlane);
-TH_API void THNN_CudaPReLU_accGradParameters(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *weight,
- THCudaTensor *gradWeight,
- THCudaTensor *gradWeightBuf,
- THCudaTensor *gradWeightBuf2,
- long nOutputPlane,
- float scale);
-
-TH_API void THNN_CudaRReLU_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *noise,
- double lower,
- double upper,
- bool train,
- bool inplace,
- void *generator);
-TH_API void THNN_CudaRReLU_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *noise,
- double lower,
- double upper,
- bool train,
- bool inplace);
-
-TH_API void THNN_CudaSigmoid_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output);
-TH_API void THNN_CudaSigmoid_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *output);
-
-TH_API void THNN_CudaSmoothL1Criterion_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *output,
- bool sizeAverage);
-TH_API void THNN_CudaSmoothL1Criterion_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *target,
- THCudaTensor *gradInput,
- bool sizeAverage);
-
-TH_API void THNN_CudaSoftMax_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output);
-TH_API void THNN_CudaSoftMax_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *output);
-
-TH_API void THNN_CudaSoftPlus_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- float beta,
- float threshold);
-TH_API void THNN_CudaSoftPlus_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *output,
- float beta,
- float threshold);
-
-TH_API void THNN_CudaSoftShrink_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- double lambda);
-TH_API void THNN_CudaSoftShrink_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- double lambda);
-
-TH_API void THNN_CudaSqrt_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- float eps);
-TH_API void THNN_CudaSqrt_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *output);
-
-TH_API void THNN_CudaSquare_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output);
-TH_API void THNN_CudaSquare_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput);
-
-TH_API void THNN_CudaTanh_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output);
-TH_API void THNN_CudaTanh_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *output);
-
-TH_API void THNN_CudaThreshold_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- double threshold,
- double val,
- bool inplace);
-TH_API void THNN_CudaThreshold_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- double threshold,
- double val,
- bool inplace);
-
-TH_API void THNN_CudaTemporalConvolution_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight,
- THCudaTensor *bias,
- int kW, int dW,
- int inputFrameSize,
- int outputFrameSize);
-
-TH_API void THNN_CudaTemporalConvolution_updateGradInput(
- THCState* state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *weight,
- int kW, int dW);
-
-TH_API void THNN_CudaTemporalConvolution_accGradParameters(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias,
- int kW, int dW,
- float scale);
-
-TH_API void THNN_CudaTemporalMaxPooling_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *indices,
- int kW, int dW);
-
-TH_API void THNN_CudaTemporalMaxPooling_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *indices,
- int kW, int dW);
-
-TH_API void THNN_CudaSparseLinear_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight,
- THCudaTensor *bias);
-TH_API void THNN_CudaSparseLinear_accGradParameters(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias,
- THCudaTensor *weight,
- THCudaTensor *bias,
- double weightDecay,
- double scale);
-TH_API void THNN_CudaSparseLinear_legacyUpdateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight,
- THCudaTensor *bias);
-TH_API void THNN_CudaSparseLinear_legacyAccGradParameters(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias,
- THCudaTensor *weight,
- THCudaTensor *bias,
- double weightDecay,
- double scale);
-TH_API void THNN_CudaSparseLinear_zeroGradParameters(
- THCState *state,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias,
- THCudaTensor *lastInput);
-TH_API void THNN_CudaSparseLinear_updateParameters(
- THCState *state,
- THCudaTensor *weight,
- THCudaTensor *bias,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias,
- THCudaTensor *lastInput,
- double learningRate);
-
-TH_API void THNN_CudaBatchNormalization_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight, // [OPTIONAL]
- THCudaTensor *bias, // [OPTIONAL]
- THCudaTensor *runningMean,
- THCudaTensor *runningVar,
- THCudaTensor *saveMean,
- THCudaTensor *saveStd,
- bool train,
- double momentum,
- double eps);
-TH_API void THNN_CudaBatchNormalization_backward(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput, // [OPTIONAL]
- THCudaTensor *gradWeight, // [OPTIONAL]
- THCudaTensor *gradBias, // [OPTIONAL]
- THCudaTensor *weight, // [OPTIONAL]
- THCudaTensor *running_mean,
- THCudaTensor *running_var,
- THCudaTensor *save_mean,
- THCudaTensor *save_std,
- bool train,
- float scale,
- double eps);
-
-TH_API void THNN_CudaSpatialConvolutionMM_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight,
- THCudaTensor *bias, // [OPTIONAL]
- THCudaTensor *columns,
- THCudaTensor *ones,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH);
-TH_API void THNN_CudaSpatialConvolutionMM_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *weight,
- THCudaTensor *columns,
- THCudaTensor *ones,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH);
-TH_API void THNN_CudaSpatialConvolutionMM_accGradParameters(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias, // [OPTIONAL]
- THCudaTensor *columns,
- THCudaTensor *ones,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- float scale);
-
-TH_API void THNN_CudaSpatialConvolutionLocal_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight,
- THCudaTensor *bias,
- THCudaTensor *finput,
- THCudaTensor *fgradInput,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- long inputWidth, long inputHeight,
- long outputWidth, long outputHeight);
-TH_API void THNN_CudaSpatialConvolutionLocal_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *weight,
- THCudaTensor *finput,
- THCudaTensor *fgradInput,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- long inputWidth, long inputHeight,
- long outputWidth, long outputHeight);
-TH_API void THNN_CudaSpatialConvolutionLocal_accGradParameters(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias,
- THCudaTensor *finput,
- THCudaTensor *fgradInput,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- long inputWidth, long inputHeight,
- long outputWidth, long outputHeight,
- float scale);
-
-TH_API void THNN_CudaSpatialFullConvolution_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight,
- THCudaTensor *bias, // [OPTIONAL]
- THCudaTensor *columns,
- THCudaTensor *ones,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- int adjW, int adjH);
-TH_API void THNN_CudaSpatialFullConvolution_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *weight,
- THCudaTensor *gradColumns,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- int adjW, int adjH);
-TH_API void THNN_CudaSpatialFullConvolution_accGradParameters(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias, // [OPTIONAL]
- THCudaTensor *columns,
- THCudaTensor *ones,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- int adjW, int adjH,
- float scale);
-
-TH_API void THNN_CudaSpatialDilatedConvolution_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight,
- THCudaTensor *bias, // [OPTIONAL]
- THCudaTensor *columns,
- THCudaTensor *ones,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- int dilationW, int dilationH);
-
-TH_API void THNN_CudaSpatialDilatedConvolution_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *weight,
- THCudaTensor *gradColumns,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- int dilationW, int dilationH );
-
-TH_API void THNN_CudaSpatialDilatedConvolution_accGradParameters(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias, // [OPTIONAL]
- THCudaTensor *columns,
- THCudaTensor *ones,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- int dilationW, int dilationH,
- float scale);
-
-TH_API void THNN_CudaSpatialCrossMapLRN_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *scale,
- int size,
- float alpha,
- float beta,
- float k);
-TH_API void THNN_CudaSpatialCrossMapLRN_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *scale,
- THCudaTensor *output,
- int size,
- float alpha,
- float beta,
- float k);
-
-TH_API void THNN_CudaSpatialAdaptiveMaxPooling_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *indices,
- int nOutputCols,
- int nOutputRows);
-TH_API void THNN_CudaSpatialAdaptiveMaxPooling_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *indices);
-
-TH_API void THNN_CudaSpatialAveragePooling_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- bool ceil_mode,
- bool count_include_pad);
-TH_API void THNN_CudaSpatialAveragePooling_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- bool ceil_mode,
- bool count_include_pad);
-
-TH_API void THNN_CudaSpatialMaxPooling_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *indices,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- bool ceil_mode);
-TH_API void THNN_CudaSpatialMaxPooling_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *indices,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- bool ceil_mode);
-
-TH_API void THNN_CudaSpatialDilatedMaxPooling_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *indices,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- int dilationW, int dilationH,
- bool ceil_mode);
-TH_API void THNN_CudaSpatialDilatedMaxPooling_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *indices,
- int kW, int kH,
- int dW, int dH,
- int padW, int padH,
- int dilationW, int dilationH,
- bool ceil_mode);
-
-TH_API void THNN_CudaSpatialMaxUnpooling_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *indices,
- int owidth, int oheight);
-TH_API void THNN_CudaSpatialMaxUnpooling_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *indices,
- int owidth, int oheight);
-
-TH_API void THNN_CudaSpatialFractionalMaxPooling_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- int outputW, int outputH,
- int poolSizeW, int poolSizeH,
- THCudaTensor *indices,
- THCudaTensor *randomSamples);
-TH_API void THNN_CudaSpatialFractionalMaxPooling_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- int outputW, int outputH,
- int poolSizeW, int poolSizeH,
- THCudaTensor *indices);
-
-TH_API void THNN_CudaSpatialSubSampling_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight,
- THCudaTensor *bias,
- int kW, int kH,
- int dW, int dH);
-TH_API void THNN_CudaSpatialSubSampling_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *weight,
- int kW, int kH,
- int dW, int dH);
-TH_API void THNN_CudaSpatialSubSampling_accGradParameters(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias,
- int kW, int kH,
- int dW, int dH,
- float scale);
-
-TH_API void THNN_CudaSpatialUpSamplingNearest_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- int scale_factor);
-TH_API void THNN_CudaSpatialUpSamplingNearest_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- int scale_factor);
-
-TH_API void THNN_CudaSpatialUpSamplingBilinear_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- int outputHeight,
- int outputWidth);
-TH_API void THNN_CudaSpatialUpSamplingBilinear_updateGradInput(
- THCState *state,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- int nbatch,
- int nchannels,
- int inputHeight,
- int inputWidth,
- int outputHeight,
- int outputWidth);
-
-TH_API void THNN_CudaVolumetricAveragePooling_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- int kT, int kW, int kH,
- int dT, int dW, int dH);
-TH_API void THNN_CudaVolumetricAveragePooling_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- int kT, int kW, int kH,
- int dT, int dW, int dH);
-
-TH_API void THNN_CudaVolumetricConvolution_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight,
- THCudaTensor *bias,
- THCudaTensor *finput,
- THCudaTensor *fgradInput,
- int dT, int dW, int dH,
- int padT, int padW, int padH);
-TH_API void THNN_CudaVolumetricConvolution_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *weight,
- THCudaTensor *finput,
- int dT, int dW, int dH,
- int padT, int padW, int padH);
-TH_API void THNN_CudaVolumetricConvolution_accGradParameters(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias,
- THCudaTensor *finput,
- THCudaTensor *fgradInput,
- int dT, int dW, int dH,
- int padT, int padW, int padH,
- float scale);
-
-TH_API void THNN_CudaVolumetricFullConvolution_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight,
- THCudaTensor *bias,
- THCudaTensor *finput,
- THCudaTensor *fgradInput,
- int dT, int dW, int dH,
- int padT, int padW, int padH,
- int adjT, int adjW, int adjH);
-TH_API void THNN_CudaVolumetricFullConvolution_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *weight,
- THCudaTensor *finput,
- THCudaTensor *fgradInput,
- int dT, int dW, int dH,
- int padT, int padW, int padH,
- int adjT, int adjW, int adjH);
-TH_API void THNN_CudaVolumetricFullConvolution_accGradParameters(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias,
- THCudaTensor *finput,
- THCudaTensor *fgradInput,
- int dT, int dW, int dH,
- int padT, int padW, int padH,
- int adjT, int adjW, int adjH,
- float scale);
-
-TH_API void THNN_CudaVolumetricDilatedConvolution_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight,
- THCudaTensor *bias,
- THCudaTensor *columns,
- THCudaTensor *ones,
- int kT, int kW, int kH,
- int dT, int dW, int dH,
- int padT, int padW, int padH,
- int dilationT, int dilationW, int dilationH);
-
-TH_API void THNN_CudaVolumetricDilatedConvolution_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *weight,
- THCudaTensor *gradColumns,
- int kT, int kW, int kH,
- int dT, int dW, int dH,
- int padT, int padW, int padH,
- int dilationT, int dilationW, int dilationH);
-
-TH_API void THNN_CudaVolumetricDilatedConvolution_accGradParameters(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias,
- THCudaTensor *columns,
- THCudaTensor *ones,
- int kT, int kW, int kH,
- int dT, int dW, int dH,
- int padT, int padW, int padH,
- int dilationT, int dilationW, int dilationH,
- float scale);
-
-TH_API void THNN_CudaVolumetricMaxPooling_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *indices,
- int kT, int kW, int kH,
- int dT, int dW, int dH,
- int padT, int padW, int padH,
- bool ceilMode);
-TH_API void THNN_CudaVolumetricMaxPooling_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *indices,
- int dT, int dW, int dH,
- int padT, int padW, int padH);
-
-TH_API void THNN_CudaVolumetricDilatedMaxPooling_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *indices,
- int kT, int kW, int kH,
- int dT, int dW, int dH,
- int padT, int padW, int padH,
- int dilationT, int dilationW, int dilationH,
- bool ceilMode);
-TH_API void THNN_CudaVolumetricDilatedMaxPooling_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *indices,
- int dT, int dW, int dH,
- int padT, int padW, int padH,
- int dilationT, int dilationW, int dilationH);
-
-TH_API void THNN_CudaVolumetricMaxUnpooling_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *indices,
- int outputTime, int outputWidth, int outputHeight,
- int dT, int dW, int dH,
- int padT, int padW, int padH);
-TH_API void THNN_CudaVolumetricMaxUnpooling_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *indices,
- int outputTime, int outputWidth, int outputHeight,
- int dT, int dW, int dH,
- int padT, int padW, int padH);
-
-TH_API void THNN_CudaSpatialReflectionPadding_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- int padL, int padR,
- int padT, int padB);
-TH_API void THNN_CudaSpatialReflectionPadding_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- int padL, int padR,
- int padT, int padB);
-
-TH_API void THNN_CudaSpatialReplicationPadding_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- int padL, int padR,
- int padT, int padB);
-TH_API void THNN_CudaSpatialReplicationPadding_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- int padL, int padR,
- int padT, int padB);
-
-TH_API void THNN_CudaVolumetricReplicationPadding_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- int pleft, int pright,
- int ptop, int pbottom,
- int pfront, int pback);
-TH_API void THNN_CudaVolumetricReplicationPadding_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- int pleft, int pright,
- int ptop, int pbottom,
- int pfront, int pback);
+#include "generic/THCUNN.h"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/Tanh.cu b/lib/THCUNN/Tanh.cu
index 726169a..0bd1ee8 100644
--- a/lib/THCUNN/Tanh.cu
+++ b/lib/THCUNN/Tanh.cu
@@ -1,32 +1,24 @@
#include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+template <typename T>
struct tanhupdateOutput_functor
{
- __device__ void operator()(float *output, const float *input) const
+ __device__ void operator()(T *output, const T *input) const
{
*output = tanh(*input);
}
};
-void THNN_CudaTanh_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output)
-{
- THCUNN_assertSameGPU(state, 2, input, output);
- THCudaTensor_resizeAs(state, output, input);
- THC_pointwiseApply2(state, output, input, tanhupdateOutput_functor());
-}
-
+template <typename T>
struct tanhupdateGradInput_functor
{
- __device__ void operator()(float *gradInput, const float *output, const float *gradOutput) const
+ __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const
{
*gradInput = *gradOutput * (1 - *output * *output);
}
};
-void THNN_CudaTanh_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *output)
-{
- THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
- THCudaTensor_resizeAs(state, gradInput, output);
- THC_pointwiseApply3(state, gradInput, output, gradOutput, tanhupdateGradInput_functor());
-}
+#include "generic/Tanh.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/TemporalConvolution.cu b/lib/THCUNN/TemporalConvolution.cu
index 078fd7b..f4e9c69 100644
--- a/lib/THCUNN/TemporalConvolution.cu
+++ b/lib/THCUNN/TemporalConvolution.cu
@@ -1,345 +1,7 @@
#include "THCUNN.h"
#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
-void THNN_CudaTemporalConvolution_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight,
- THCudaTensor *bias,
- int kW, int dW,
- int inputFrameSize,
- int outputFrameSize) {
-
- THCudaTensor *outputWindow, *inputWindow;
- int nInputFrame, nOutputFrame;
- long k, i;
-
- int dimS = 0; // sequence dimension
- int dimF = 1; // feature dimension
-
- THCUNN_assertSameGPU(state, 4, input, output, weight, bias);
- THArgCheck( input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");
-
- if (input->nDimension == 3)
- {
- dimS = 1;
- dimF = 2;
- }
- THArgCheck( input->size[dimF] == inputFrameSize, 2, "invalid input frame size");
- THArgCheck( input->size[dimS] >= kW, 2, "input sequence smaller than kernel size");
-
- input = THCudaTensor_newContiguous(state, input);
- outputWindow = THCudaTensor_new(state);
- inputWindow = THCudaTensor_new(state);
-
- nInputFrame = input->size[dimS];
- nOutputFrame = (nInputFrame - kW) / dW + 1;
-
- if (input->nDimension == 2)
- {
- THCudaTensor_resize2d(state, output,
- nOutputFrame,
- outputFrameSize);
-
- /* bias first */
- for(k = 0; k < nOutputFrame; k++)
- {
- THCudaTensor_select(state, outputWindow, output, 0, k);
- THCudaTensor_copy(state, outputWindow, bias);
- }
-
-
- /* ouch */
- for(k = 0; nOutputFrame > 0; k++)
- {
- long outputFrameStride = (kW-1)/dW+1;
- long inputFrameStride = outputFrameStride*dW;
- long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
- nOutputFrame -= nFrame;
-
- THCudaTensor_setStorage2d(state, inputWindow, input->storage,
- input->storageOffset+k*dW*input->size[1],
- nFrame, inputFrameStride*input->size[1],
- kW*input->size[1], 1);
-
- THCudaTensor_setStorage2d(state, outputWindow, output->storage,
- output->storageOffset + k*output->size[1],
- nFrame, outputFrameStride*output->size[1],
- output->size[1], 1);
-
- THCudaTensor_transpose(state, weight, NULL, 0, 1);
- THCudaTensor_addmm(state, outputWindow, 1, outputWindow, 1, inputWindow, weight);
- THCudaTensor_transpose(state, weight, NULL, 0, 1);
- }
- }
- else
- {
- THCudaTensor *outputSample = THCudaTensor_new(state);
- THCudaTensor *inputSample = THCudaTensor_new(state);
- int nBatchFrame = input->size[0];
-
- THCudaTensor_resize3d(state, output,
- nBatchFrame,
- nOutputFrame,
- outputFrameSize);
-
- for(i = 0; i < nBatchFrame; i++)
- {
- THCudaTensor_select(state, outputSample, output, 0, i);
- THCudaTensor_select(state, inputSample, input, 0, i);
- long nOutputSampleFrame = nOutputFrame;
-
- /* bias first */
- for(k = 0; k < nOutputFrame; k++)
- {
- THCudaTensor_select(state, outputWindow, outputSample, 0, k);
- THCudaTensor_copy(state, outputWindow, bias);
- }
-
- /* ouch */
- for(k = 0; nOutputSampleFrame > 0; k++)
- {
- long outputFrameStride = (kW-1)/dW+1;
- long inputFrameStride = outputFrameStride*dW;
- long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
- nOutputSampleFrame -= nFrame;
-
- THCudaTensor_setStorage2d(state, inputWindow, inputSample->storage,
- inputSample->storageOffset+k*dW*inputSample->size[1],
- nFrame, inputFrameStride*inputSample->size[1],
- kW*inputSample->size[1], 1);
-
- THCudaTensor_setStorage2d(state, outputWindow, outputSample->storage,
- outputSample->storageOffset + k*outputSample->size[1],
- nFrame, outputFrameStride*outputSample->size[1],
- outputSample->size[1], 1);
-
- THCudaTensor_transpose(state, weight, NULL, 0, 1);
- THCudaTensor_addmm(state, outputWindow, 1, outputWindow, 1, inputWindow, weight);
- THCudaTensor_transpose(state, weight, NULL, 0, 1);
- }
- }
- THCudaTensor_free(state, outputSample);
- THCudaTensor_free(state, inputSample);
- }
-
- THCudaTensor_free(state, outputWindow);
- THCudaTensor_free(state, inputWindow);
- THCudaTensor_free(state, input);
-
-}
-
-void THNN_CudaTemporalConvolution_updateGradInput(
- THCState* state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *weight,
- int kW, int dW) {
-
- long nInputFrame;
- long nOutputFrame;
-
- THCudaTensor *gradOutputWindow;
- THCudaTensor *gradInputWindow;
- long k, i;
-
- int dimS = 0; // sequence dimension
-
- THCUNN_assertSameGPU(state, 4, input, gradOutput, weight, gradInput);
-
- if (gradOutput->nDimension == 3)
- {
- dimS = 1;
- }
-
- nInputFrame = input->size[dimS];
- nOutputFrame = gradOutput->size[dimS];
-
-
- /* Not necessary with partial backprop: */
- gradOutputWindow = THCudaTensor_new(state);
- gradInputWindow = THCudaTensor_new(state);
-
- THCudaTensor_resizeAs(state, gradInput, input);
- THCudaTensor_zero(state, gradInput);
-
- if (gradOutput->nDimension == 2)
- {
- /* ouch */
- for(k = 0; nOutputFrame > 0; k++)
- {
- long outputFrameStride = (kW-1)/dW+1;
- long inputFrameStride = outputFrameStride*dW;
- long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
- nOutputFrame -= nFrame;
-
- THCudaTensor_setStorage2d(state, gradOutputWindow, gradOutput->storage,
- gradOutput->storageOffset + k*gradOutput->size[1],
- nFrame, outputFrameStride*gradOutput->size[1],
- gradOutput->size[1], 1);
-
- THCudaTensor_setStorage2d(state, gradInputWindow, gradInput->storage,
- gradInput->storageOffset+k*dW*gradInput->size[1],
- nFrame, inputFrameStride*gradInput->size[1],
- kW*gradInput->size[1], 1);
-
- THCudaTensor_addmm(state, gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight);
- }
- }
- else
- {
- THCudaTensor *gradOutputSample = THCudaTensor_new(state);
- THCudaTensor *gradInputSample = THCudaTensor_new(state);
- long nBatchFrame = input->size[0];
- for(i = 0; i < nBatchFrame; i++)
- {
- THCudaTensor_select(state, gradOutputSample, gradOutput, 0, i);
- THCudaTensor_select(state, gradInputSample, gradInput, 0, i);
- long nOutputSampleFrame = nOutputFrame;
-
- /* ouch */
- for(k = 0; nOutputSampleFrame > 0; k++)
- {
- long outputFrameStride = (kW-1)/dW+1;
- long inputFrameStride = outputFrameStride*dW;
- long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
- nOutputSampleFrame -= nFrame;
-
- THCudaTensor_setStorage2d(state, gradOutputWindow, gradOutputSample->storage,
- gradOutputSample->storageOffset + k*gradOutputSample->size[1],
- nFrame, outputFrameStride*gradOutputSample->size[1],
- gradOutputSample->size[1], 1);
-
- THCudaTensor_setStorage2d(state, gradInputWindow, gradInputSample->storage,
- gradInputSample->storageOffset+k*dW*gradInputSample->size[1],
- nFrame, inputFrameStride*gradInputSample->size[1],
- kW*gradInputSample->size[1], 1);
-
- THCudaTensor_addmm(state, gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight);
- }
- }
- THCudaTensor_free(state, gradOutputSample);
- THCudaTensor_free(state, gradInputSample);
- }
-
- THCudaTensor_free(state, gradOutputWindow);
- THCudaTensor_free(state, gradInputWindow);
-
-}
-
-void THNN_CudaTemporalConvolution_accGradParameters(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias,
- int kW, int dW,
- float scale) {
-
- long nInputFrame;
- long nOutputFrame;
-
- THCudaTensor *gradOutputWindow;
- THCudaTensor *inputWindow;
- long k, i;
-
- int dimS = 0; // sequence dimension
-
- if (gradOutput->nDimension == 3)
- {
- dimS = 1;
- }
-
- nInputFrame = input->size[dimS];
- nOutputFrame = gradOutput->size[dimS];
-
- /* Not necessary with partial backprop: */
- input = THCudaTensor_newContiguous(state, input);
- gradOutputWindow = THCudaTensor_new(state);
- inputWindow = THCudaTensor_new(state);
-
- if (input->nDimension == 2)
- {
- /* bias first */
- for(k = 0; k < nOutputFrame; k++)
- {
- THCudaTensor_select(state, gradOutputWindow, gradOutput, 0, k);
- THCudaTensor_cadd(state, gradBias, gradBias, scale, gradOutputWindow);
- }
-
- /* ouch */
- for(k = 0; nOutputFrame > 0; k++)
- {
- long outputFrameStride = (kW-1)/dW+1;
- long inputFrameStride = outputFrameStride*dW;
- long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
- nOutputFrame -= nFrame;
-
- THCudaTensor_setStorage2d(state, inputWindow, input->storage,
- input->storageOffset+k*dW*input->size[1],
- nFrame, inputFrameStride*input->size[1],
- kW*input->size[1], 1);
-
- THCudaTensor_setStorage2d(state, gradOutputWindow, gradOutput->storage,
- gradOutput->storageOffset + k*gradOutput->size[1],
- nFrame, outputFrameStride*gradOutput->size[1],
- gradOutput->size[1], 1);
-
- THCudaTensor_transpose(state, gradOutputWindow, NULL, 0, 1);
- THCudaTensor_addmm(state, gradWeight, 1, gradWeight, scale, gradOutputWindow, inputWindow);
- THCudaTensor_transpose(state, gradOutputWindow, NULL, 0, 1);
- }
- }
- else
- {
- THCudaTensor *gradOutputSample = THCudaTensor_new(state);
- THCudaTensor *inputSample = THCudaTensor_new(state);
- long nBatchFrame = input->size[0];
-
- for(i = 0; i < nBatchFrame; i++)
- {
- THCudaTensor_select(state, gradOutputSample, gradOutput, 0, i);
- THCudaTensor_select(state, inputSample, input, 0, i);
- long nOutputSampleFrame = nOutputFrame;
-
- /* bias first */
- for(k = 0; k < nOutputFrame; k++)
- {
- THCudaTensor_select(state, gradOutputWindow, gradOutputSample, 0, k);
- THCudaTensor_cadd(state, gradBias, gradBias, scale, gradOutputWindow);
- }
-
- /* ouch */
- for(k = 0; nOutputSampleFrame > 0; k++)
- {
- long outputFrameStride = (kW-1)/dW+1;
- long inputFrameStride = outputFrameStride*dW;
- long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
- nOutputSampleFrame -= nFrame;
-
- THCudaTensor_setStorage2d(state, inputWindow, inputSample->storage,
- inputSample->storageOffset+k*dW*inputSample->size[1],
- nFrame, inputFrameStride*inputSample->size[1],
- kW*inputSample->size[1], 1);
-
- THCudaTensor_setStorage2d(state, gradOutputWindow, gradOutputSample->storage,
- gradOutputSample->storageOffset + k*gradOutputSample->size[1],
- nFrame, outputFrameStride*gradOutputSample->size[1],
- gradOutputSample->size[1], 1);
-
- THCudaTensor_transpose(state, gradOutputWindow, NULL, 0, 1);
- THCudaTensor_addmm(state, gradWeight, 1, gradWeight, scale, gradOutputWindow, inputWindow);
- THCudaTensor_transpose(state, gradOutputWindow, NULL, 0, 1);
- }
- }
- THCudaTensor_free(state, gradOutputSample);
- THCudaTensor_free(state, inputSample);
- }
-
- THCudaTensor_free(state, gradOutputWindow);
- THCudaTensor_free(state, inputWindow);
- THCudaTensor_free(state, input);
-
-}
+#include "generic/TemporalConvolution.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/TemporalMaxPooling.cu b/lib/THCUNN/TemporalMaxPooling.cu
index 0b31c60..384e409 100644
--- a/lib/THCUNN/TemporalMaxPooling.cu
+++ b/lib/THCUNN/TemporalMaxPooling.cu
@@ -1,28 +1,32 @@
#include "THCUNN.h"
#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
#define TEMPORAL_MAX_POOLING_THREADS 1024
-__global__ void cunn_TemporalMaxPooling_updateOutputKernel(float *input, float *output, float *indices, int input_w, int input_n, int output_w, int kW, int dW) {
+template <typename Dtype>
+__global__ void cunn_TemporalMaxPooling_updateOutputKernel(Dtype *input, Dtype *output, THCIndex_t *indices, int input_w, int input_n, int output_w, int kW, int dW) {
// Block idx is the batch index, thread idx + block idx y * MAX_THREADS is the time index
- float *input_data = input + blockIdx.x * input_w * input_n + (
+ Dtype *input_data = input + blockIdx.x * input_w * input_n + (
threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n * dW;
- float *output_data = output + blockIdx.x * output_w * input_n + (
+ Dtype *output_data = output + blockIdx.x * output_w * input_n + (
threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n;
- float *indices_data = indices + blockIdx.x * output_w * input_n + (
+ THCIndex_t *indices_data = indices + blockIdx.x * output_w * input_n + (
threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n;
int feat = 0;
int time = 0;
int max_time = input_n * kW;
- float max_value;
- float max_index = 0.0;
+ Dtype max_value;
+ THCIndex_t max_index = 0;
if (threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS < output_w) {
// For all features
for (feat = 0; feat < input_n; ++feat) {
- max_value = -FLT_MAX;
+ max_value = THCNumerics<Dtype>::min();
// For all values in the kernel space
for (time = 0; time < max_time; time += input_n) {
if (max_value < input_data[time + feat]) {
@@ -31,18 +35,19 @@ __global__ void cunn_TemporalMaxPooling_updateOutputKernel(float *input, float *
}
}
output_data[feat] = max_value;
- indices_data[feat] = (float)max_index;
+ indices_data[feat] = max_index;
}
}
}
-__global__ void cunn_TemporalMaxPooling_updateGradInputKernel(float *gradInput, float *gradOutput, float *indices, int input_w, int input_n, int output_w, int kW, int dW) {
+template <typename Dtype>
+__global__ void cunn_TemporalMaxPooling_updateGradInputKernel(Dtype *gradInput, Dtype *gradOutput, THCIndex_t *indices, int input_w, int input_n, int output_w, int kW, int dW) {
// Block idx is the batch index, thread idx + block idx y * MAX_THREADS is the time index
- float *gradInput_data = gradInput + blockIdx.x * input_w * input_n + (
+ Dtype *gradInput_data = gradInput + blockIdx.x * input_w * input_n + (
threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n * dW;
- float *gradOutput_data = gradOutput + blockIdx.x * output_w * input_n + (
+ Dtype *gradOutput_data = gradOutput + blockIdx.x * output_w * input_n + (
threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n;
- float *indices_data = indices + blockIdx.x * output_w * input_n + (
+ THCIndex_t *indices_data = indices + blockIdx.x * output_w * input_n + (
threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n;
int feat = 0;
@@ -50,18 +55,19 @@ __global__ void cunn_TemporalMaxPooling_updateGradInputKernel(float *gradInput,
if (threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS < output_w) {
// For all features
for (feat = 0; feat < input_n; ++feat) {
- gradInput_data[(int)indices_data[feat] * input_n + feat] += gradOutput_data[feat];
+ gradInput_data[indices_data[feat] * input_n + feat] += gradOutput_data[feat];
}
}
}
-__global__ void cunn_TemporalMaxPooling_updateGradInputKernelAtomic(float *gradInput, float *gradOutput, float *indices, int input_w, int input_n, int output_w, int kW, int dW) {
+template <typename Dtype>
+__global__ void cunn_TemporalMaxPooling_updateGradInputKernelAtomic(Dtype *gradInput, Dtype *gradOutput, THCIndex_t *indices, int input_w, int input_n, int output_w, int kW, int dW) {
// Block idx is the batch index, thread idx + block idx y * MAX_THREADS is the time index
- float *gradInput_data = gradInput + blockIdx.x * input_w * input_n + (
+ Dtype *gradInput_data = gradInput + blockIdx.x * input_w * input_n + (
threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n * dW;
- float *gradOutput_data = gradOutput + blockIdx.x * output_w * input_n + (
+ Dtype *gradOutput_data = gradOutput + blockIdx.x * output_w * input_n + (
threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n;
- float *indices_data = indices + blockIdx.x * output_w * input_n + (
+ THCIndex_t *indices_data = indices + blockIdx.x * output_w * input_n + (
threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n;
int feat = 0;
@@ -69,153 +75,10 @@ __global__ void cunn_TemporalMaxPooling_updateGradInputKernelAtomic(float *gradI
if (threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS < output_w) {
// For all features
for (feat = 0; feat < input_n; ++feat) {
- atomicAdd(&gradInput_data[(int)indices_data[feat] * input_n + feat], gradOutput_data[feat]);
+ atomicAdd(&gradInput_data[indices_data[feat] * input_n + feat], gradOutput_data[feat]);
}
}
}
-void THNN_CudaTemporalMaxPooling_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *indices,
- int kW, int dW) {
-
- int dimT = 0; // Temporal dimension
- int dimF = 1; // Feature dimension
-
- int batch = 1;
- int input_w;
- int input_n;
- int output_w;
- int nthreads;
-
- float *input_data;
- float *output_data;
- float *indices_data;
-
- THCUNN_assertSameGPU(state, 3, input, output, indices);
- THArgCheck( input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");
-
- if (input->nDimension == 3)
- {
- dimT = 1;
- dimF = 2;
- batch = input->size[0];
- }
- THArgCheck( input->size[dimT] >= kW, 2, "input sequence smaller than kernel size");
-
- input = THCudaTensor_newContiguous(state, input);
-
- input_w = input->size[dimT];
- input_n = input->size[dimF];
- output_w = (input_w - kW) / dW + 1;
-
- if (input->nDimension == 2)
- {
- THCudaTensor_resize2d(state, output, output_w, input->size[dimF]);
- THCudaTensor_resize2d(state, indices, output_w, input->size[dimF]);
- }
- else
- {
- THCudaTensor_resize3d(state, output, batch, output_w, input->size[dimF]);
- THCudaTensor_resize3d(state, indices, batch, output_w, input->size[dimF]);
- }
-
- input_data = THCudaTensor_data(state, input);
- output_data = THCudaTensor_data(state, output);
- indices_data = THCudaTensor_data(state, indices);
-
- dim3 blocks(batch);
- nthreads = (output_w / 32) * 32;
- if (output_w % 32 > 0) {
- nthreads += 32;
- }
-
- if (nthreads > TEMPORAL_MAX_POOLING_THREADS) {
- blocks.y = nthreads / TEMPORAL_MAX_POOLING_THREADS;
- if (nthreads % TEMPORAL_MAX_POOLING_THREADS > 0) {
- blocks.y += 1;
- }
- nthreads = TEMPORAL_MAX_POOLING_THREADS;
- }
-
- dim3 threads(nthreads);
- cunn_TemporalMaxPooling_updateOutputKernel <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>(
- input_data, output_data, indices_data, input_w, input_n, output_w, kW, dW);
- THCudaCheck(cudaGetLastError());
- THCudaTensor_free(state, input);
-
-}
-
-void THNN_CudaTemporalMaxPooling_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *indices,
- int kW, int dW) {
-
- int dimT = 0; // Temporal dimension
- int dimF = 1; // Feature dimension
-
- int batch = 1;
- int input_w;
- int input_n;
- int output_w;
- int nthreads;
-
- float *gradInput_data;
- float *gradOutput_data;
- float *indices_data;
-
- THCUNN_assertSameGPU(state, 4, input, gradOutput, gradInput, indices);
- THArgCheck( input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");
-
- THCudaTensor_resizeAs(state, gradInput, input);
- THCudaTensor_zero(state, gradInput);
-
- if (input->nDimension == 3)
- {
- dimT = 1;
- dimF = 2;
- batch = input->size[0];
- }
- THArgCheck( input->size[dimT] >= kW, 2, "input sequence smaller than kernel size");
-
- gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-
- input_w = input->size[dimT];
- input_n = input->size[dimF];
- output_w = (input_w - kW) / dW + 1;
-
- gradInput_data = THCudaTensor_data(state, gradInput);
- gradOutput_data = THCudaTensor_data(state, gradOutput);
- indices_data = THCudaTensor_data(state, indices);
-
- dim3 blocks(batch);
- nthreads = (output_w / 32) * 32;
- if (output_w % 32 > 0) {
- nthreads += 32;
- }
-
- if (nthreads > TEMPORAL_MAX_POOLING_THREADS) {
- blocks.y = nthreads / TEMPORAL_MAX_POOLING_THREADS;
- if (nthreads % TEMPORAL_MAX_POOLING_THREADS > 0) {
- blocks.y += 1;
- }
- nthreads = TEMPORAL_MAX_POOLING_THREADS;
- }
-
- dim3 threads(nthreads);
- if (kW <= dW) {
- cunn_TemporalMaxPooling_updateGradInputKernel <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>(
- gradInput_data, gradOutput_data, indices_data, input_w, input_n, output_w, kW, dW);
- } else {
- cunn_TemporalMaxPooling_updateGradInputKernelAtomic <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>(
- gradInput_data, gradOutput_data, indices_data, input_w, input_n, output_w, kW, dW);
- }
- THCudaCheck(cudaGetLastError());
- THCudaTensor_free(state, gradOutput);
-
-}
+#include "generic/TemporalMaxPooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/Threshold.cu b/lib/THCUNN/Threshold.cu
index 623add8..47fe794 100644
--- a/lib/THCUNN/Threshold.cu
+++ b/lib/THCUNN/Threshold.cu
@@ -1,112 +1,74 @@
#include "THCUNN.h"
-#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+template <typename T>
struct ThresholdUpdateOutput
{
- const float threshold_;
- const float val_;
+ const T threshold_;
+ const T val_;
- ThresholdUpdateOutput(float threshold, float val)
+ ThresholdUpdateOutput(T threshold, T val)
: threshold_(threshold)
, val_(val)
{}
- __device__ __forceinline__ void operator()(float *out, float *in)
+ __device__ __forceinline__ void operator()(T *out, T *in)
{
- float x = *in;
+ T x = *in;
*out = (x > threshold_) ? x : val_;
}
};
// in-place variant
+template <typename T>
struct ThresholdUpdateOutputIP
{
- const float threshold_;
- const float val_;
+ const T threshold_;
+ const T val_;
- ThresholdUpdateOutputIP(float threshold, float val)
+ ThresholdUpdateOutputIP(T threshold, T val)
: threshold_(threshold)
, val_(val)
{}
- __device__ __forceinline__ void operator()(float *x)
+ __device__ __forceinline__ void operator()(T *x)
{
*x = (*x > threshold_) ? *x : val_;
}
};
-void THNN_CudaThreshold_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output,
- double threshold, double val, bool inplace)
-{
- THCUNN_assertSameGPU(state, 2, input, output);
-
- if (inplace)
- {
- THC_pointwiseApply1(state, input,
- ThresholdUpdateOutputIP(threshold, val)
- );
- THCudaTensor_set(state, output, input);
- }
- else
- {
- THCudaTensor_resizeAs(state, output, input);
- THC_pointwiseApply2(state, output, input,
- ThresholdUpdateOutput(threshold, val)
- );
- }
-
- THCudaCheck(cudaGetLastError());
-}
-
+template <typename T>
struct ThresholdUpdateGradInput
{
- const float threshold_;
+ const T threshold_;
- ThresholdUpdateGradInput(float threshold)
+ ThresholdUpdateGradInput(T threshold)
: threshold_(threshold)
{}
__device__ __forceinline__ void operator()(
- float *gradInput, float *input, float *gradOutput) const
+ T *gradInput, T *input, T *gradOutput) const
{
- *gradInput = (*input > threshold_) ? *gradOutput : 0;
+ *gradInput = (*input > threshold_) ? *gradOutput : ScalarConvert<int, T>::to(0);
}
};
+template <typename T>
struct ThresholdUpdateGradInputIP
{
- const float threshold_;
+ const T threshold_;
- ThresholdUpdateGradInputIP(float threshold)
+ ThresholdUpdateGradInputIP(T threshold)
: threshold_(threshold)
{}
__device__ __forceinline__ void operator()(
- float *gradOutput, float *input) const
+ T *gradOutput, T *input) const
{
- *gradOutput = (*input > threshold_) ? *gradOutput : 0;
+ *gradOutput = (*input > threshold_) ? *gradOutput : ScalarConvert<int, T>::to(0);
}
};
-void THNN_CudaThreshold_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput,
- THCudaTensor *gradInput, double threshold, double val, bool inplace)
-{
- THCUNN_assertSameGPU(state, 3, input, gradInput, gradOutput);
-
- if (inplace)
- {
- THC_pointwiseApply2(state, gradOutput, input,
- ThresholdUpdateGradInputIP(threshold)
- );
- THCudaTensor_set(state, gradInput, gradOutput);
- }
- else
- {
- THCudaTensor_resizeAs(state, gradInput, input);
- THC_pointwiseApply3(state, gradInput, input, gradOutput,
- ThresholdUpdateGradInput(threshold)
- );
- }
-
- THCudaCheck(cudaGetLastError());
-}
+#include "generic/Threshold.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/VolumetricAveragePooling.cu b/lib/THCUNN/VolumetricAveragePooling.cu
index 9542232..f584dcf 100644
--- a/lib/THCUNN/VolumetricAveragePooling.cu
+++ b/lib/THCUNN/VolumetricAveragePooling.cu
@@ -3,10 +3,14 @@
#include "THCDeviceTensor.cuh"
#include "THCDeviceTensorUtils.cuh"
#include "THCDeviceUtils.cuh"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
+template <typename Dtype, typename Acctype>
__global__ void cuda_VolumetricAveragePooling_updateOutput(
- THCDeviceTensor<float, 4> input, THCDeviceTensor<float, 4> output,
- int kT, int kH, int kW, int dT, int dH, int dW, float normFactor, int offsetZ)
+ THCDeviceTensor<Dtype, 4> input, THCDeviceTensor<Dtype, 4> output,
+ int kT, int kH, int kW, int dT, int dH, int dW, Acctype normFactor, int offsetZ)
{
int oCol = blockIdx.x * blockDim.x + threadIdx.x;
int oRow = blockIdx.y * blockDim.y + threadIdx.y;
@@ -15,7 +19,7 @@ __global__ void cuda_VolumetricAveragePooling_updateOutput(
if (oRow < output.getSize(2) && oCol < output.getSize(3))
{
- float sum = 0.0;
+ Acctype sum = 0.0;
int iColumn = oCol * dW;
int iRow = oRow * dH;
@@ -33,7 +37,7 @@ __global__ void cuda_VolumetricAveragePooling_updateOutput(
{
if (iColumn + column < input.getSize(3))
{
- float val = input[slice][iFrame + frame][iRow + row][iColumn + column];
+ Dtype val = input[slice][iFrame + frame][iRow + row][iColumn + column];
sum += val;
}
}
@@ -42,17 +46,17 @@ __global__ void cuda_VolumetricAveragePooling_updateOutput(
}
}
- output[slice][oFrame][oRow][oCol] = sum * normFactor;
+ output[slice][oFrame][oRow][oCol] = ScalarConvert<Acctype, Dtype>::to(sum * normFactor);
}
}
// Inner-most loop size (kW) passed as template parameter for
// performance reasons.
//
-template<int KERNEL_WIDTH>
+template<int KERNEL_WIDTH, typename Dtype, typename Acctype>
__global__ void cuda_VolumetricAveragePooling_updateOutput(
- THCDeviceTensor<float, 4> input, THCDeviceTensor<float, 4> output,
- int kT, int kH, int dT, int dH, int dW, float normFactor, int offsetZ)
+ THCDeviceTensor<Dtype, 4> input, THCDeviceTensor<Dtype, 4> output,
+ int kT, int kH, int dT, int dH, int dW, Acctype normFactor, int offsetZ)
{
int oCol = blockIdx.x * blockDim.x + threadIdx.x;
int oRow = blockIdx.y * blockDim.y + threadIdx.y;
@@ -61,7 +65,7 @@ __global__ void cuda_VolumetricAveragePooling_updateOutput(
if (oRow < output.getSize(2) && oCol < output.getSize(3))
{
- float sum = 0.0;
+ Acctype sum = 0.0;
int iColumn = oCol * dW;
int iRow = oRow * dH;
@@ -79,7 +83,7 @@ __global__ void cuda_VolumetricAveragePooling_updateOutput(
{
if (iColumn + column < input.getSize(3))
{
- float val = input[slice][iFrame + frame][iRow + row][iColumn + column];
+ Dtype val = input[slice][iFrame + frame][iRow + row][iColumn + column];
sum += val;
}
}
@@ -88,7 +92,7 @@ __global__ void cuda_VolumetricAveragePooling_updateOutput(
}
}
- output[slice][oFrame][oRow][oCol] = sum * normFactor;
+ output[slice][oFrame][oRow][oCol] = ScalarConvert<Acctype, Dtype>::to(sum * normFactor);
}
}
@@ -97,127 +101,11 @@ __global__ void cuda_VolumetricAveragePooling_updateOutput(
cudaInput, cudaOutput, kT, kH, dT, dH, dW, normFactor, offsetZ); \
break
-
-void THNN_CudaVolumetricAveragePooling_updateOutput(
- THCState *state, THCudaTensor *input, THCudaTensor *output,
- int kT, int kW, int kH,
- int dT, int dW, int dH)
-{
- int batchSize;
- int inputSlices;
- int inputTime;
- int inputHeight;
- int inputWidth;
-
- if (THCudaTensor_nDimension(state, input) == 4)
- {
- THArgCheck(
- THCudaTensor_size(state, input, 1) >= kT &&
- THCudaTensor_size(state, input, 2) >= kH &&
- THCudaTensor_size(state, input, 3) >= kW, 2,
- "input image smaller than kernel size"
- );
-
- /* sizes */
- batchSize = 1;
- inputSlices = THCudaTensor_size(state, input, 0);
- inputTime = THCudaTensor_size(state, input, 1);
- inputHeight = THCudaTensor_size(state, input, 2);
- inputWidth = THCudaTensor_size(state, input, 3);
- }
- else if (THCudaTensor_nDimension(state, input) == 5)
- {
- THArgCheck(
- THCudaTensor_size(state, input, 2) >= kT &&
- THCudaTensor_size(state, input, 3) >= kH &&
- THCudaTensor_size(state, input, 4) >= kW, 2,
- "input image smaller than kernel size"
-
- );
- /* sizes */
- batchSize = THCudaTensor_size(state, input, 0);
- inputSlices = THCudaTensor_size(state, input, 1);
- inputTime = THCudaTensor_size(state, input, 2);
- inputHeight = THCudaTensor_size(state, input, 3);
- inputWidth = THCudaTensor_size(state, input, 4);
- }
- else
- {
- THArgCheck(false, 2, "4D or 5D tensor expected");
- }
-
- int outputTime = (inputTime - kT) / dT + 1;
- int outputHeight = (inputHeight - kH) / dH + 1;
- int outputWidth = (inputWidth - kW) / dW + 1;
-
- if (input->nDimension == 4) /* 4D */
- {
- /* resize output */
- THCudaTensor_resize4d(state, output, inputSlices,
- outputTime, outputHeight, outputWidth);
- }
- else /* 5D */
- {
- THCudaTensor_resize5d(state, output, batchSize, inputSlices,
- outputTime, outputHeight, outputWidth);
- }
-
- input = THCudaTensor_newContiguous(state, input);
-
- // Collapse batch and feature dimensions
- THCDeviceTensor<float, 4> cudaInput;
- THCDeviceTensor<float, 4> cudaOutput;
- if (THCudaTensor_nDimension(state, input) == 4)
- {
- cudaInput = toDeviceTensor<float, 4>(state, input);
- cudaOutput = toDeviceTensor<float, 4>(state, output);
- }
- else
- {
- cudaInput = toDeviceTensor<float, 5>(state, input).downcastOuter<4>();
- cudaOutput = toDeviceTensor<float, 5>(state, output).downcastOuter<4>();
- }
-
- int totalZ = outputTime * inputSlices * batchSize;
- int offsetZ = 0;
- dim3 block(32, 8);
- while (totalZ > 0) {
- dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)),
- THCCeilDiv(outputHeight, static_cast<int>(block.y)),
- totalZ > 65535 ? 65535 : totalZ);
-
- float normFactor = 1.0f / static_cast<float>(kT * kH * kW);
- switch (kW)
- {
- LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(1);
- LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(2);
- LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(3);
- LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(4);
- LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(5);
- LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(6);
- LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(7);
- default:
- cuda_VolumetricAveragePooling_updateOutput<<<grid, block>>>(
- cudaInput,
- cudaOutput,
- kT, kH, kW,
- dT, dH, dW,
- normFactor,
- offsetZ
- );
- break;
- }
- totalZ -= 65535;
- offsetZ += 65535;
- THCudaCheck(cudaGetLastError());
- }
- THCudaTensor_free(state, input);
-}
-
+template <typename Dtype, typename Acctype>
__global__ void cuda_VolumetricAveragePooling_updateGradInput_Stride1(
- THCDeviceTensor<float, 4> gradOutput,
- THCDeviceTensor<float, 4> gradInput,
- int kT, int kH, int kW, float normFactor, int offsetZ)
+ THCDeviceTensor<Dtype, 4> gradOutput,
+ THCDeviceTensor<Dtype, 4> gradInput,
+ int kT, int kH, int kW, Acctype normFactor, int offsetZ)
{
int iCol = blockIdx.x * blockDim.x + threadIdx.x;
int iRow = blockIdx.y * blockDim.y + threadIdx.y;
@@ -227,8 +115,8 @@ __global__ void cuda_VolumetricAveragePooling_updateGradInput_Stride1(
// guard against over-tiled threads
if (iRow < gradInput.getSize(2) && iCol < gradInput.getSize(3))
{
- float sum = 0.0;
- float *gOut = &gradOutput[slice][max(0, iFrame - kT + 1)]
+ Acctype sum = 0.0;
+ Dtype *gOut = &gradOutput[slice][max(0, iFrame - kT + 1)]
[max(0, iRow - kH + 1)][max(0, iCol - kW + 1)];
int frameOffset = 0;
for (int oFrame = max(0, iFrame - kT + 1);
@@ -252,13 +140,14 @@ __global__ void cuda_VolumetricAveragePooling_updateGradInput_Stride1(
}
frameOffset += gradOutput.getSize(2) * gradOutput.getSize(3);
}
- gradInput[slice][iFrame][iRow][iCol] = sum * normFactor;
+ gradInput[slice][iFrame][iRow][iCol] = ScalarConvert<Acctype, Dtype>::to(sum * normFactor);
}
}
+template <typename Dtype, typename Acctype>
__global__ void cuda_VolumetricAveragePooling_updateGradInput_atomicAdd(
- THCDeviceTensor<float, 4> gradOutput,
- THCDeviceTensor<float, 4> gradInput,
+ THCDeviceTensor<Dtype, 4> gradOutput,
+ THCDeviceTensor<Dtype, 4> gradInput,
int kT, int kH, int kW, int dT, int dH, int dW, int offsetZ)
{
int oCol = blockIdx.x * blockDim.x + threadIdx.x;
@@ -269,7 +158,8 @@ __global__ void cuda_VolumetricAveragePooling_updateGradInput_atomicAdd(
// guard against over-tiled threads
if (oRow < gradOutput.getSize(2) && oCol < gradOutput.getSize(3))
{
- float val = gradOutput[slice][oFrame][oRow][oCol] / (kT * kH * kW);
+ Dtype val = ScalarConvert<Acctype, Dtype>::to(
+ ScalarConvert<Dtype, Acctype>::to(gradOutput[slice][oFrame][oRow][oCol]) / (kT * kH * kW));
for (int iFrame = oFrame * dT; iFrame < oFrame * dT + kT; ++iFrame)
{
for (int iRow = oRow * dH; iRow < oRow * dH + kH; ++iRow)
@@ -283,9 +173,10 @@ __global__ void cuda_VolumetricAveragePooling_updateGradInput_atomicAdd(
}
}
+template <typename Dtype, typename Acctype>
__global__ void cuda_VolumetricAveragePooling_updateGradInput(
- THCDeviceTensor<float, 4> gradOutput,
- THCDeviceTensor<float, 4> gradInput,
+ THCDeviceTensor<Dtype, 4> gradOutput,
+ THCDeviceTensor<Dtype, 4> gradInput,
int kT, int kH, int kW,
int dT, int dH, int dW, int offsetZ)
{
@@ -297,7 +188,8 @@ __global__ void cuda_VolumetricAveragePooling_updateGradInput(
// guard against over-tiled threads
if (oRow < gradOutput.getSize(2) && oCol < gradOutput.getSize(3))
{
- float val = gradOutput[slice][oFrame][oRow][oCol] / (kT * kH * kW);
+ Dtype val = ScalarConvert<Acctype, Dtype>::to(
+ ScalarConvert<Dtype, Acctype>::to(gradOutput[slice][oFrame][oRow][oCol]) / (kT * kH * kW));
for (int iFrame = oFrame * dT; iFrame < oFrame * dT + kT; ++iFrame)
{
for (int iRow = oRow * dH; iRow < oRow * dH + kH; ++iRow)
@@ -311,116 +203,5 @@ __global__ void cuda_VolumetricAveragePooling_updateGradInput(
}
}
-void THNN_CudaVolumetricAveragePooling_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- int kT, int kW, int kH,
- int dT, int dW, int dH)
-{
- bool kernelsOverlap = (dT < kT) || (dH < kH) || (dW < kW);
-
- // Resize and initialize result tensor.
- THCudaTensor_resizeAs(state, gradInput, input);
- THCudaTensor_zero(state, gradInput);
-
- int batchSize;
- int inputSlices;
- int inputTime;
- int inputHeight;
- int inputWidth;
-
- int outputTime;
- int outputHeight;
- int outputWidth;
-
- if (THCudaTensor_nDimension(state, input) == 4) /* 4D */
- {
- batchSize = 1;
- inputSlices = THCudaTensor_size(state, input, 0);
- inputTime = THCudaTensor_size(state, input, 1);
- inputHeight = THCudaTensor_size(state, input, 2);
- inputWidth = THCudaTensor_size(state, input, 3);
-
- outputTime = THCudaTensor_size(state, gradOutput, 1);
- outputHeight = THCudaTensor_size(state, gradOutput, 2);
- outputWidth = THCudaTensor_size(state, gradOutput, 3);
- }
- else
- {
- batchSize = THCudaTensor_size(state, input, 0);
- inputSlices = THCudaTensor_size(state, input, 1);
- inputTime = THCudaTensor_size(state, input, 2);
- inputHeight = THCudaTensor_size(state, input, 3);
- inputWidth = THCudaTensor_size(state, input, 4);
-
- outputTime = THCudaTensor_size(state, gradOutput, 2);
- outputHeight = THCudaTensor_size(state, gradOutput, 3);
- outputWidth = THCudaTensor_size(state, gradOutput, 4);
- }
-
- gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-
- // Collapse batch and feature dimensions
- THCDeviceTensor<float, 4> cudaGradInput;
- THCDeviceTensor<float, 4> cudaGradOutput;
- if (THCudaTensor_nDimension(state, input) == 4)
- {
- cudaGradInput = toDeviceTensor<float, 4>(state, gradInput);
- cudaGradOutput = toDeviceTensor<float, 4>(state, gradOutput);
- }
- else
- {
- cudaGradInput =
- toDeviceTensor<float, 5>(state, gradInput).downcastOuter<4>();
- cudaGradOutput =
- toDeviceTensor<float, 5>(state, gradOutput).downcastOuter<4>();
- }
-
- dim3 block(32, 8);
-
- // Optimizing for stride 1 is probably only of limited value, but this
- // specialization yields 3x speedup over the atomicAdd implementation.
- if (dT == 1 && dH == 1 && dW == 1)
- {
- int totalZ = inputTime * inputSlices * batchSize;
- int offsetZ = 0;
- while (totalZ > 0) {
- dim3 grid(THCCeilDiv(inputWidth, static_cast<int>(block.x)),
- THCCeilDiv(inputHeight, static_cast<int>(block.y)),
- totalZ > 65535 ? 65535 : totalZ);
- cuda_VolumetricAveragePooling_updateGradInput_Stride1<<<grid, block>>>(
- cudaGradOutput, cudaGradInput, kT, kH, kW, 1.0f/(kT * kH * kW), offsetZ);
- THCudaCheck(cudaGetLastError());
- totalZ -= 65535;
- offsetZ += 65535;
- }
- }
- else
- {
- int totalZ = outputTime * inputSlices * batchSize;
- int offsetZ = 0;
- while (totalZ > 0) {
-
- dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)),
- THCCeilDiv(outputHeight, static_cast<int>(block.y)),
- totalZ > 65535 ? 65535 : totalZ);
- if (kernelsOverlap)
- {
- cuda_VolumetricAveragePooling_updateGradInput_atomicAdd<<<grid, block>>>(
- cudaGradOutput, cudaGradInput, kT, kH, kW, dT, dH, dW, offsetZ);
- }
- else
- {
- cuda_VolumetricAveragePooling_updateGradInput<<<grid, block>>>(
- cudaGradOutput, cudaGradInput, kT, kH, kW, dT, dH, dW, offsetZ);
- }
- THCudaCheck(cudaGetLastError());
- totalZ -= 65535;
- offsetZ += 65535;
- }
- }
-
- THCudaTensor_free(state, gradOutput);
-}
+#include "generic/VolumetricAveragePooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/VolumetricConvolution.cu b/lib/THCUNN/VolumetricConvolution.cu
index 8576ae8..78f45f9 100644
--- a/lib/THCUNN/VolumetricConvolution.cu
+++ b/lib/THCUNN/VolumetricConvolution.cu
@@ -1,16 +1,19 @@
#include "THCUNN.h"
#include "common.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
// Kernel for fast unfold+copy
// Borrowed from Theano
// Authors: Arjun Jain, Frédéric Bastien, Jan Schlüter, Nicolas Ballas
-__global__ void im3d2col_kernel(const int n, const float* data_im,
+template <typename Dtype>
+__global__ void im3d2col_kernel(const int n, const Dtype* data_im,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
- float* data_col)
+ Dtype* data_col)
{
CUDA_KERNEL_LOOP(index, n)
{
@@ -29,11 +32,11 @@ __global__ void im3d2col_kernel(const int n, const float* data_im,
int w_in = w_out * stride_w - pad_w;
int d_in = d_out * stride_d - pad_d;
- float* data_col_ptr = data_col;
+ Dtype* data_col_ptr = data_col;
data_col_ptr += channel_out * (height_col * width_col * depth_col) +
h_out * (width_col * depth_col) + w_out * depth_col + d_out;
- const float* data_im_ptr = data_im;
+ const Dtype* data_im_ptr = data_im;
data_im_ptr += channel_in * (height * width * depth) +
h_in * (width * depth) + w_in * depth + d_in;
@@ -48,7 +51,7 @@ __global__ void im3d2col_kernel(const int n, const float* data_im,
int d = d_in + k;
*data_col_ptr = (h >= 0 && w >= 0 && d >= 0 &&
h < height && w < width && d < depth) ?
- data_im_ptr[i * (width * depth) + j *depth + k] : 0;
+ data_im_ptr[i * (width * depth) + j *depth + k] : ScalarConvert<int, Dtype>::to(0);
data_col_ptr += height_col * width_col * depth_col;
}
}
@@ -56,12 +59,13 @@ __global__ void im3d2col_kernel(const int n, const float* data_im,
}
}
-void im3d2col(cudaStream_t stream, const float* data_im, const int channels,
+template <typename Dtype>
+void im3d2col(cudaStream_t stream, const Dtype* data_im, const int channels,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
- float* data_col)
+ Dtype* data_col)
{
// We are going to launch channels * height_col * width_col * depth_col kernels, each
// kernel responsible for copying a single-channel grid.
@@ -80,19 +84,19 @@ void im3d2col(cudaStream_t stream, const float* data_im, const int channels,
THCudaCheck(cudaGetLastError());
}
-
-__global__ void col2im3d_kernel(const int n, const float* data_col,
+template <typename Dtype, typename Acctype>
+__global__ void col2im3d_kernel(const int n, const Dtype* data_col,
const int height, const int width, const int depth,
const int channels,
const int patch_h, const int patch_w, const int patch_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
- float* data_im)
+ Dtype* data_im)
{
CUDA_KERNEL_LOOP(index, n)
{
- float val = 0;
+ Acctype val = 0;
int d = index % depth + pad_d;
int w_index = index / depth;
int w = w_index % width + pad_w;
@@ -120,16 +124,17 @@ __global__ void col2im3d_kernel(const int n, const float* data_col,
val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col];
}
}
- data_im[index] = val;
+ data_im[index] = ScalarConvert<Acctype, Dtype>::to(val);
}
}
-void col2im3d(cudaStream_t stream, const float* data_col, const int channels,
+template <typename Dtype, typename Acctype>
+void col2im3d(cudaStream_t stream, const Dtype* data_col, const int channels,
const int height, const int width, const int depth,
const int patch_h, const int patch_w, const int patch_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
- float* data_im)
+ Dtype* data_im)
{
int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
@@ -138,7 +143,7 @@ void col2im3d(cudaStream_t stream, const float* data_col, const int channels,
// To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions.
- col2im3d_kernel<<<GET_BLOCKS(num_kernels),
+ col2im3d_kernel<Dtype, Acctype><<<GET_BLOCKS(num_kernels),
CUDA_NUM_THREADS, 0, stream>>>(num_kernels, data_col,
height, width, depth, channels,
patch_h, patch_w, patch_d,
@@ -149,371 +154,5 @@ void col2im3d(cudaStream_t stream, const float* data_col, const int channels,
THCudaCheck(cudaGetLastError());
}
-void THNN_CudaVolumetricConvolution_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight,
- THCudaTensor *bias,
- THCudaTensor *finput,
- THCudaTensor *fgradInput,
- int dT, int dW, int dH,
- int padT, int padW, int padH)
-{
- THCudaTensor *columns = finput;
- THCudaTensor *ones = fgradInput;
- THCUNN_assertSameGPU(state, 6, input, output, weight, bias, columns, ones);
-
- THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
- "4D or 5D (batch mode) tensor is expected"
- );
-
- THArgCheck(weight->nDimension == 5, 4,
- "5D weight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)"
- );
-
- int nOutputPlane = (int)weight->size[0];
- int nInputPlane = (int)weight->size[1];
- int kT = (int)weight->size[2];
- int kH = (int)weight->size[3];
- int kW = (int)weight->size[4];
-
- int batch = 1;
- if (input->nDimension == 4)
- {
- // Force batch
- batch = 0;
- THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1],
- input->size[2], input->size[3]);
- }
-
- long inputWidth = input->size[3];
- long inputHeight = input->size[2];
- long inputDepth = input->size[4];
- long outputWidth = (inputWidth + 2*padH - kH) / dH + 1;
- long outputHeight = (inputHeight + 2*padT - kT) / dT + 1;
- long outputDepth = (inputDepth + 2*padW - kW) / dW + 1;
-
- // Batch size + input planes
- long batchSize = input->size[0];
-
- // Resize output
- THCudaTensor_resize5d(state, output, batchSize, nOutputPlane,
- outputHeight, outputWidth, outputDepth);
-
- // Resize temporary columns
- THCudaTensor_resize2d(state, columns, nInputPlane*kW*kH*kT, outputDepth*outputHeight*outputWidth);
-
- // Define a buffer of ones, for bias accumulation
- // Note: this buffer can be shared with other modules, it only ever gets increased,
- // and always contains ones.
- if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth)
- {
- // Resize plane and fill with ones...
- THCudaTensor_resize3d(state, ones, outputHeight, outputWidth, outputDepth);
- THCudaTensor_fill(state, ones, 1);
- }
-
- // Helpers
- THCudaTensor *input_n = THCudaTensor_new(state);
- THCudaTensor *output_n = THCudaTensor_new(state);
-
- // For each elt in batch, do:
- for (int elt = 0; elt < batchSize; elt ++)
- {
- // Matrix mulitply per output:
- THCudaTensor_select(state, input_n, input, 0, elt);
- THCudaTensor_select(state, output_n, output, 0, elt);
-
- // Do Bias first:
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m_ = nOutputPlane;
- long n_ = outputDepth * outputHeight * outputWidth;
- long k_ = 1;
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- THCudaBlas_Sgemm(
- state,
- 't', 'n',
- n_, m_, k_,
- 1,
- THCudaTensor_data(state, ones), k_,
- THCudaTensor_data(state, bias), k_,
- 0,
- THCudaTensor_data(state, output_n), n_
- );
-
- // Extract columns:
- im3d2col(
- THCState_getCurrentStream(state),
- THCudaTensor_data(state, input_n),
- nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
- THCudaTensor_data(state, columns)
- );
-
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m = weight->size[0];
- long n = columns->size[1];
- long k = weight->size[1]*weight->size[2]*weight->size[3]*weight->size[4];
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- THCudaBlas_Sgemm(
- state,
- 'n', 'n',
- n, m, k,
- 1,
- THCudaTensor_data(state, columns), n,
- THCudaTensor_data(state, weight), k,
- 1,
- THCudaTensor_data(state, output_n), n
- );
- }
-
- // Free
- THCudaTensor_free(state, input_n);
- THCudaTensor_free(state, output_n);
-
- // Resize output
- if (batch == 0)
- {
- THCudaTensor_resize4d(state, output, nOutputPlane, outputHeight, outputWidth, outputDepth);
- THCudaTensor_resize4d(state, input, nInputPlane, inputHeight, inputWidth, inputDepth);
- }
-}
-
-void THNN_CudaVolumetricConvolution_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *weight,
- THCudaTensor *finput,
- int dT, int dW, int dH,
- int padT, int padW, int padH)
-{
- THArgCheck(weight->nDimension == 5, 4,
- "5D weight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)"
- );
-
- int nOutputPlane = (int)weight->size[0];
- int nInputPlane = (int)weight->size[1];
- int kT = (int)weight->size[2];
- int kH = (int)weight->size[3];
- int kW = (int)weight->size[4];
-
- THCudaTensor *gradColumns = finput;
-
- THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, gradColumns, gradInput);
- THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
- "4D or 5D (batch mode) tensor is expected"
- );
-
- int batch = 1;
- if (input->nDimension == 4)
- {
- // Force batch
- batch = 0;
- THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
- THCudaTensor_resize5d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
- }
-
- long inputWidth = input->size[3];
- long inputHeight = input->size[2];
- long inputDepth = input->size[4];
- long outputWidth = (inputWidth + 2*padH - kH) / dH + 1;
- long outputHeight = (inputHeight + 2*padT - kT) / dT + 1;
- long outputDepth = (inputDepth + 2*padW - kW) / dW + 1;
-
- // Batch size + input planes
- long batchSize = input->size[0];
-
- // Resize output
- THCudaTensor_resize5d(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth, inputDepth);
-
- // Resize temporary columns
- THCudaTensor_resize2d(state, gradColumns, nInputPlane*kH*kT*kW, outputDepth*outputHeight*outputWidth);
-
- // Helpers
- THCudaTensor *input_n = THCudaTensor_new(state);
- THCudaTensor *gradInput_n = THCudaTensor_new(state);
- THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
- // For each elt in batch, do:
- for (int elt = 0; elt < batchSize; elt ++)
- {
- // Matrix mulitply per sample:
- THCudaTensor_select(state, input_n, input, 0, elt);
- THCudaTensor_select(state, gradInput_n, gradInput, 0, elt);
- THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m = weight->size[1]*weight->size[2]*weight->size[3]*weight->size[4];
- long n = gradColumns->size[1];
- long k = weight->size[0];
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- THCudaBlas_Sgemm(
- state,
- 'n', 't',
- n, m, k,
- 1,
- THCudaTensor_data(state, gradOutput_n), n,
- THCudaTensor_data(state, weight), m,
- 0,
- THCudaTensor_data(state, gradColumns), n
- );
-
- // Unpack columns back into input:
- col2im3d(
- THCState_getCurrentStream(state),
- THCudaTensor_data(state, gradColumns),
- nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
- THCudaTensor_data(state, gradInput_n)
- );
- }
-
- // Free
- THCudaTensor_free(state, input_n);
- THCudaTensor_free(state, gradInput_n);
- THCudaTensor_free(state, gradOutput_n);
-
- // Resize output
- if (batch == 0)
- {
- THCudaTensor_resize4d(state, gradOutput, nOutputPlane, outputHeight, outputWidth, outputDepth);
- THCudaTensor_resize4d(state, input, nInputPlane, inputHeight, inputWidth, inputDepth);
- THCudaTensor_resize4d(state, gradInput, nInputPlane, inputHeight, inputWidth, inputDepth);
- }
-}
-
-void THNN_CudaVolumetricConvolution_accGradParameters(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias,
- THCudaTensor *finput,
- THCudaTensor *fgradInput,
- int dT, int dW, int dH,
- int padT, int padW, int padH,
- float scale)
-{
- THCudaTensor *columns = finput;
- THCudaTensor *ones = fgradInput;
- THCUNN_assertSameGPU(state, 6, input, gradOutput, gradWeight, gradBias, columns, ones);
-
- THArgCheck(gradWeight->nDimension == 5, 4,
- "5D gradWeight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)"
- );
-
- int nOutputPlane = (int)gradWeight->size[0];
- int nInputPlane = (int)gradWeight->size[1];
- int kT = (int)gradWeight->size[2];
- int kH = (int)gradWeight->size[3];
- int kW = (int)gradWeight->size[4];
-
- THArgCheck(
- input->nDimension == 4 || input->nDimension == 5, 2,
- "3D or 4D (batch mode) tensor is expected"
- );
-
- int batch = 1;
- if (input->nDimension == 4)
- {
- // Force batch
- batch = 0;
- THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
- THCudaTensor_resize5d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
- }
-
- long inputWidth = input->size[3];
- long inputHeight = input->size[2];
- long inputDepth = input->size[4];
- long outputWidth = (inputWidth + 2*padH - kH) / dH + 1;
- long outputHeight = (inputHeight + 2*padT - kT) / dT + 1;
- long outputDepth = (inputDepth + 2*padW - kW) / dW + 1;
-
- // Batch size + input planes
- long batchSize = input->size[0];
-
- // Define a buffer of ones, for bias accumulation
- if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth)
- {
- // Resize plane and fill with ones...
- THCudaTensor_resize3d(state, ones, outputHeight, outputWidth, outputDepth);
- THCudaTensor_fill(state, ones, 1);
- }
-
- // Resize temporary columns
- THCudaTensor_resize2d(state, columns, nInputPlane*kH*kT*kW, outputDepth*outputHeight*outputWidth);
-
- // Helpers
- THCudaTensor *input_n = THCudaTensor_new(state);
- THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
- // For each elt in batch, do:
- for (int elt = 0; elt < batchSize; elt ++)
- {
- // Matrix mulitply per output:
- THCudaTensor_select(state, input_n, input, 0, elt);
- THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
- // Extract columns:
- im3d2col(
- THCState_getCurrentStream(state),
- THCudaTensor_data(state, input_n),
- nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
- THCudaTensor_data(state, columns)
- );
-
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m = gradWeight->size[0];
- long n = gradWeight->size[1]*gradWeight->size[2]*gradWeight->size[3]*gradWeight->size[4];
- long k = columns->size[1];
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- THCudaBlas_Sgemm(
- state,
- 't', 'n',
- n, m, k,
- scale,
- THCudaTensor_data(state, columns), k,
- THCudaTensor_data(state, gradOutput_n), k,
- 1,
- THCudaTensor_data(state, gradWeight), n
- );
-
- // Do Bias:
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m_ = nOutputPlane;
- long k_ = outputDepth * outputHeight * outputWidth;
-
- // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
- THCudaBlas_Sgemv(
- state,
- 't',
- k_, m_,
- scale,
- THCudaTensor_data(state, gradOutput_n), k_,
- THCudaTensor_data(state, ones), 1,
- 1,
- THCudaTensor_data(state, gradBias), 1
- );
- }
-
- // Free
- THCudaTensor_free(state, input_n);
- THCudaTensor_free(state, gradOutput_n);
-
- // Resize
- if (batch == 0)
- {
- THCudaTensor_resize4d(state, gradOutput, nOutputPlane, outputHeight, outputWidth, outputDepth);
- THCudaTensor_resize4d(state, input, nInputPlane, inputHeight, inputWidth, inputDepth);
- }
-}
+#include "generic/VolumetricConvolution.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/VolumetricDilatedConvolution.cu b/lib/THCUNN/VolumetricDilatedConvolution.cu
index 72caba7..d82e02d 100644
--- a/lib/THCUNN/VolumetricDilatedConvolution.cu
+++ b/lib/THCUNN/VolumetricDilatedConvolution.cu
@@ -1,367 +1,8 @@
#include "THCUNN.h"
#include "common.h"
#include "vol2col.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
-
-void THNN_CudaVolumetricDilatedConvolution_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight,
- THCudaTensor *bias,
- THCudaTensor *columns,
- THCudaTensor *ones,
- int kT, int kW, int kH,
- int dT, int dW, int dH,
- int padT, int padW, int padH,
- int dilationT, int dilationW, int dilationH) {
-
- THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones);
- if (bias) {
- THCUNN_assertSameGPU(state, 2, weight, bias);
- }
- THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected, but got: %d", input->nDimension);
- THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
- THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
- THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
- THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
-
- // Params:
- int nInputPlane = weight->size[1];
- int nOutputPlane = weight->size[0];
-
- int batch = 1;
- if (input->nDimension == 4) {
- THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
- // Force batch
- batch = 0;
- THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
- } else {
- THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
- }
-
- long inputDepth = input->size[2];
- long inputHeight = input->size[3];
- long inputWidth = input->size[4];
- long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
- long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
- long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
-
- if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
- THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
- nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth);
-
- // Batch size + input planes
- long batchSize = input->size[0];
-
- // Resize output
- THCudaTensor_resize5d(state, output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth);
-
- // Resize temporary columns
- THCudaTensor_resize2d(state, columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
-
- // Define a buffer of ones, for bias accumulation
- // Note: this buffer can be shared with other modules, it only ever gets increased,
- // and always contains ones.
- if (ones->nDimension != 2 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
- // Resize plane and fill with ones...
- THCudaTensor_resize3d(state, ones, outputDepth, outputHeight, outputWidth);
- THCudaTensor_fill(state, ones, 1);
- }
-
- // Helpers
- THCudaTensor *input_n = THCudaTensor_new(state);
- THCudaTensor *output_n = THCudaTensor_new(state);
-
- // For each elt in batch, do:
- for (int elt = 0; elt < batchSize; elt ++) {
- // Matrix mulitply per output:
- THCudaTensor_select(state, input_n, input, 0, elt);
- THCudaTensor_select(state, output_n, output, 0, elt);
-
- // Do Bias first:
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m_ = nOutputPlane;
- long n_ = outputDepth * outputHeight * outputWidth;
- long k_ = 1;
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- if (bias) {
- THCudaBlas_Sgemm(
- state,
- 't', 'n',
- n_, m_, k_,
- 1,
- THCudaTensor_data(state, ones), k_,
- THCudaTensor_data(state, bias), k_,
- 0,
- THCudaTensor_data(state, output_n), n_
- );
- } else {
- THCudaTensor_zero(state, output_n);
- }
-
- // Extract columns:
- vol2col(
- THCState_getCurrentStream(state),
- THCudaTensor_data(state, input_n),
- nInputPlane, inputDepth, inputHeight, inputWidth,
- kT, kH, kW, padT, padH, padW, dT, dH, dW,
- dilationT, dilationH, dilationW,
- THCudaTensor_data(state, columns)
- );
-
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m = nOutputPlane;
- long n = columns->size[1];
- long k = nInputPlane*kT*kH*kW;
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- THCudaBlas_Sgemm(
- state,
- 'n', 'n',
- n, m, k,
- 1,
- THCudaTensor_data(state, columns), n,
- THCudaTensor_data(state, weight), k,
- 1,
- THCudaTensor_data(state, output_n), n
- );
- }
-
- // Free
- THCudaTensor_free(state, input_n);
- THCudaTensor_free(state, output_n);
-
- // Resize output
- if (batch == 0) {
- THCudaTensor_resize4d(state, output, nOutputPlane, outputDepth, outputHeight, outputWidth);
- THCudaTensor_resize4d(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
- }
-}
-
-void THNN_CudaVolumetricDilatedConvolution_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *weight,
- THCudaTensor *gradColumns,
- int kT, int kW, int kH,
- int dT, int dW, int dH,
- int padT, int padW, int padH,
- int dilationT, int dilationW, int dilationH) {
-
- THCUNN_assertSameGPU(state, 5, input, gradOutput, weight,
- gradColumns, gradInput);
- THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
- THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected");
- THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
- THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
- THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
-
- // Params
- int nInputPlane = weight->size[1];
- int nOutputPlane = weight->size[0];
-
- int batch = 1;
- if (input->nDimension == 4) {
- // Force batch
- batch = 0;
- THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
- THCudaTensor_resize5d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
- }
-
- long inputDepth = input->size[2];
- long inputWidth = input->size[4];
- long inputHeight = input->size[3];
- long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
- long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
- long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
-
- // Batch size + input planes
- long batchSize = input->size[0];
-
- // Resize output
- THCudaTensor_resize5d(state, gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
-
- // Resize temporary columns
- THCudaTensor_resize2d(state, gradColumns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
-
- // Helpers
- THCudaTensor *gradInput_n = THCudaTensor_new(state);
- THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
- // For each elt in batch, do:
- for (int elt = 0; elt < batchSize; elt ++) {
- // Matrix multiply per sample:
- THCudaTensor_select(state, gradInput_n, gradInput, 0, elt);
- THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m = nInputPlane*kT*kW*kH;
- long n = gradColumns->size[1];
- long k = nOutputPlane;
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- THCudaBlas_Sgemm(
- state,
- 'n', 't',
- n, m, k,
- 1,
- THCudaTensor_data(state, gradOutput_n), n,
- THCudaTensor_data(state, weight), m,
- 0,
- THCudaTensor_data(state, gradColumns), n
- );
-
- // Unpack columns back into input:
- col2vol(
- THCState_getCurrentStream(state),
- THCudaTensor_data(state, gradColumns),
- nInputPlane, inputDepth, inputHeight, inputWidth,
- kT, kH, kW, padT, padH, padW, dT, dH, dW,
- dilationT, dilationH, dilationW,
- THCudaTensor_data(state, gradInput_n)
- );
- }
-
- // Free
- THCudaTensor_free(state, gradInput_n);
- THCudaTensor_free(state, gradOutput_n);
-
- // Resize output
- if (batch == 0) {
- THCudaTensor_resize4d(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
- THCudaTensor_resize4d(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
- THCudaTensor_resize4d(state, gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
- }
-}
-
-void THNN_CudaVolumetricDilatedConvolution_accGradParameters(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias,
- THCudaTensor *columns,
- THCudaTensor *ones,
- int kT, int kW, int kH,
- int dT, int dW, int dH,
- int padT, int padW, int padH,
- int dilationT, int dilationW, int dilationH,
- float scale) {
-
- THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, columns, ones);
- if (gradBias) {
- THCUNN_assertSameGPU(state, 2, gradWeight, gradBias);
- }
- THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
- THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected");
- THArgCheck(gradWeight->nDimension == 5, 4, "gradWeight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
- THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
- THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
-
- // Params
- int nInputPlane = gradWeight->size[1];
- int nOutputPlane = gradWeight->size[0];
-
- int batch = 1;
- if (input->nDimension == 4) {
- // Force batch
- batch = 0;
- THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
- THCudaTensor_resize5d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
- }
-
- long inputDepth = input->size[2];
- long inputWidth = input->size[4];
- long inputHeight = input->size[3];
- long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
- long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
- long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
-
- // Batch size + input planes
- long batchSize = input->size[0];
-
- // Define a buffer of ones, for bias accumulation
- if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
- // Resize plane and fill with ones...
- THCudaTensor_resize3d(state, ones, outputDepth, outputHeight, outputWidth);
- THCudaTensor_fill(state, ones, 1);
- }
-
- // Resize temporary columns
- THCudaTensor_resize2d(state, columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
-
- // Helpers
- THCudaTensor *input_n = THCudaTensor_new(state);
- THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
- // For each elt in batch, do:
- for (int elt = 0; elt < batchSize; elt ++) {
- // Matrix multiply per output:
- THCudaTensor_select(state, input_n, input, 0, elt);
- THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
- // Extract columns:
- vol2col(
- THCState_getCurrentStream(state),
- THCudaTensor_data(state, input_n),
- nInputPlane, inputDepth, inputHeight, inputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
- dilationT, dilationH, dilationW,
- THCudaTensor_data(state, columns)
- );
-
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m = nOutputPlane;
- long n = nInputPlane*kT*kW*kH;
- long k = columns->size[1];
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- THCudaBlas_Sgemm(
- state,
- 't', 'n',
- n, m, k,
- scale,
- THCudaTensor_data(state, columns), k,
- THCudaTensor_data(state, gradOutput_n), k,
- 1,
- THCudaTensor_data(state, gradWeight), n
- );
-
- // Do Bias:
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m_ = nOutputPlane;
- long k_ = outputDepth * outputHeight * outputWidth;
-
- // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
- if (gradBias) {
- THCudaBlas_Sgemv(
- state,
- 't',
- k_, m_,
- scale,
- THCudaTensor_data(state, gradOutput_n), k_,
- THCudaTensor_data(state, ones), 1,
- 1,
- THCudaTensor_data(state, gradBias), 1
- );
- }
- }
-
- // Free
- THCudaTensor_free(state, input_n);
- THCudaTensor_free(state, gradOutput_n);
-
- // Resize output
- if (batch == 0) {
- THCudaTensor_resize4d(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
- THCudaTensor_resize4d(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
- }
-}
+#include "generic/VolumetricDilatedConvolution.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/VolumetricDilatedMaxPooling.cu b/lib/THCUNN/VolumetricDilatedMaxPooling.cu
index 49c56e7..0cff32c 100644
--- a/lib/THCUNN/VolumetricDilatedMaxPooling.cu
+++ b/lib/THCUNN/VolumetricDilatedMaxPooling.cu
@@ -3,16 +3,20 @@
#include "THCDeviceTensor.cuh"
#include "THCDeviceTensorUtils.cuh"
#include "THCDeviceUtils.cuh"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
#include <cfloat>
+template <typename Dtype>
__global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
- THCDeviceTensor<float, 4> input,
- THCDeviceTensor<float, 4> indices,
- THCDeviceTensor<float, 4> output,
+ THCDeviceTensor<Dtype, 4> input,
+ THCDeviceTensor<THCIndex_t, 4> indices,
+ THCDeviceTensor<Dtype, 4> output,
int kT, int kH, int kW,
int dT, int dH, int dW,
- int padT, int padH, int padW,
+ int padT, int padH, int padW,
int dilationT, int dilationH, int dilationW,
int offsetZ)
{
@@ -31,7 +35,7 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
int maxRow = 0;
int maxFrame = 0;
- float max = -FLT_MAX;
+ Dtype max = THCNumerics<Dtype>::min();
for (int frame = 0; frame < kT; ++frame)
{
@@ -45,7 +49,7 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
{
if (iColumn + column * dilationW < input.getSize(3) && iColumn + column * dilationW >= 0)
{
- float val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW];
+ Dtype val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW];
if (max < val)
{
@@ -62,7 +66,7 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
}
output[slice][oFrame][oRow][oColumn] = max;
- float *idx = &indices[slice][oFrame][oRow][oColumn];
+ THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn];
((unsigned char*)(idx))[0] = maxFrame;
((unsigned char*)(idx))[1] = maxRow;
((unsigned char*)(idx))[2] = maxColumn;
@@ -70,13 +74,13 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
}
}
-template <int KERNEL_WIDTH>
+template <int KERNEL_WIDTH, typename Dtype>
__global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
- THCDeviceTensor<float, 4> input, THCDeviceTensor<float, 4> indices,
- THCDeviceTensor<float, 4> output,
+ THCDeviceTensor<Dtype, 4> input, THCDeviceTensor<THCIndex_t, 4> indices,
+ THCDeviceTensor<Dtype, 4> output,
int kT, int kH,
int dT, int dH, int dW,
- int padT, int padH, int padW,
+ int padT, int padH, int padW,
int dilationT, int dilationH, int dilationW,
int offsetZ)
{
@@ -95,7 +99,7 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
int maxRow = 0;
int maxFrame;
- float max = -FLT_MAX;
+ Dtype max = THCNumerics<Dtype>::min();
for (int frame = 0; frame < kT; ++frame)
{
@@ -109,7 +113,7 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
{
if (iColumn + column * dilationW < input.getSize(3) && iColumn + column * dilationW >= 0)
{
- float val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW];
+ Dtype val = input[slice][iFrame + frame * dilationT][iRow + row * dilationH][iColumn + column * dilationW];
if (max < val)
{
@@ -126,7 +130,7 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
}
output[slice][oFrame][oRow][oColumn] = max;
- float *idx = &indices[slice][oFrame][oRow][oColumn];
+ THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn];
((unsigned char*)(idx))[0] = maxFrame;
((unsigned char*)(idx))[1] = maxRow;
((unsigned char*)(idx))[2] = maxColumn;
@@ -134,193 +138,13 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateOutput(
}
}
-#define UPDATE_OUTPUT_KERNEL_WIDTH(KW) case KW: \
- cuda_VolumetricDilatedMaxPooling_updateOutput<KW><<<grid, block, \
- 0, THCState_getCurrentStream(state)>>>( \
- cudaInput, cudaIndices, cudaOutput, kT, kH, dT, dH, dW, padT, padH, padW,\
- dilationT, dilationH, dilationW, offsetZ); \
- break
-
-
-void THNN_CudaVolumetricDilatedMaxPooling_updateOutput(
- THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices,
- int kT, int kW, int kH,
- int dT, int dW, int dH,
- int padT, int padW, int padH,
- int dilationT, int dilationW, int dilationH,
- bool ceilMode)
-{
- int batchSize;
- int inputSlices;
- int inputTime;
- int inputHeight;
- int inputWidth;
- int outputTime;
- int outputHeight;
- int outputWidth;
-
- THCUNN_assertSameGPU(state, 3, input, indices, output);
-
- if (THCudaTensor_nDimension(state, input) == 4)
- {
- THArgCheck(
- THCudaTensor_size(state, input, 1) >= kT &&
- THCudaTensor_size(state, input, 2) >= kH &&
- THCudaTensor_size(state, input, 3) >= kW, 2,
- "input image smaller than kernel size"
- );
-
- /* sizes */
- batchSize = 1;
- inputSlices = THCudaTensor_size(state, input, 0);
- inputTime = THCudaTensor_size(state, input, 1);
- inputHeight = THCudaTensor_size(state, input, 2);
- inputWidth = THCudaTensor_size(state, input, 3);
- }
- else if (THCudaTensor_nDimension(state, input) == 5)
- {
- THArgCheck(
- THCudaTensor_size(state, input, 4) >= kW &&
- THCudaTensor_size(state, input, 3) >= kH &&
- THCudaTensor_size(state, input, 2) >= kT, 2,
- "input image smaller than kernel size"
- );
-
- /* sizes */
- batchSize = THCudaTensor_size(state, input, 0);
- inputSlices = THCudaTensor_size(state, input, 1);
- inputTime = THCudaTensor_size(state, input, 2);
- inputHeight = THCudaTensor_size(state, input, 3);
- inputWidth = THCudaTensor_size(state, input, 4);
- }
- else
- {
- THArgCheck(false, 2, "4D or 5D tensor expected");
- }
-
- THArgCheck(kT/2 >= padT && kW/2 >= padW && kH/2 >= padH, 2,
- "pad should be smaller than half of kernel size"
- );
-
- if (ceilMode)
- {
- outputTime = (int)(ceil((float)(inputTime - (dilationT * (kT - 1) + 1) + 2*padT) / dT)) + 1;
- outputHeight = (int)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
- outputWidth = (int)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
- }
- else
- {
- outputTime = (int)(floor((float)(inputTime - (dilationT * (kT - 1) + 1) + 2*padT) / dT)) + 1;
- outputHeight = (int)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
- outputWidth = (int)(floor((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
- }
-
- if (outputTime < 1 || outputHeight < 1 || outputWidth < 1)
- THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
- inputSlices,inputTime,inputHeight,inputWidth,inputSlices,outputTime,outputHeight,outputWidth);
-
- if (padT || padW || padH)
- {
- if ((outputTime - 1)*dT >= inputTime + padT)
- --outputTime;
- if ((outputHeight - 1)*dH >= inputHeight + padH)
- --outputHeight;
- if ((outputWidth - 1)*dW >= inputWidth + padW)
- --outputWidth;
- }
-
- if (input->nDimension == 4) /* 4D */
- {
- /* resize output */
- THCudaTensor_resize4d(state, output, inputSlices,
- outputTime, outputHeight, outputWidth);
- /* indices pack ti,i,j locations for each output point as uchar into
- each float of the tensor */
- THCudaTensor_resize4d(state, indices, inputSlices,
- outputTime, outputHeight, outputWidth);
- }
- else
- { /* 5D */
- THCudaTensor_resize5d(state, output, batchSize, inputSlices,
- outputTime, outputHeight, outputWidth);
- // Index tensor packs index offsets as uchars into floats
- THCudaTensor_resize5d(state, indices, batchSize, inputSlices,
- outputTime, outputHeight, outputWidth);
- }
-
- input = THCudaTensor_newContiguous(state, input);
-
- // Collapse batch and feature dimensions
- THCDeviceTensor<float, 4> cudaInput;
- THCDeviceTensor<float, 4> cudaOutput;
- if (THCudaTensor_nDimension(state, input) == 4)
- {
- cudaInput = toDeviceTensor<float, 4>(state, input);
- cudaOutput = toDeviceTensor<float, 4>(state, output);
- }
- else
- {
- cudaInput = toDeviceTensor<float, 5>(state, input).downcastOuter<4>();
- cudaOutput = toDeviceTensor<float, 5>(state, output).downcastOuter<4>();
- }
-
- THLongStorage *indicesSize = THLongStorage_newWithSize(4);
- long indicesSizeRaw[4] = { batchSize * inputSlices,
- outputTime, outputHeight, outputWidth };
- THLongStorage_rawCopy(indicesSize, indicesSizeRaw);
-
- THCudaTensor *indices1 = THCudaTensor_newWithStorage(
- state, THCudaTensor_storage(state, indices),
- THCudaTensor_storageOffset(state, indices),
- indicesSize, NULL);
-
- THLongStorage_free(indicesSize);
-
- THCDeviceTensor<float, 4> cudaIndices =
- toDeviceTensor<float, 4>(state, indices1);
-
- int totalZ = outputTime * inputSlices * batchSize;
- int offsetZ = 0;
- dim3 block(32, 8);
-
- while (totalZ > 0) {
- dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)),
- THCCeilDiv(outputHeight, static_cast<int>(block.y)),
- totalZ > 65535 ? 65535 : totalZ);
-
- switch (kW)
- {
- UPDATE_OUTPUT_KERNEL_WIDTH(1);
- UPDATE_OUTPUT_KERNEL_WIDTH(2);
- UPDATE_OUTPUT_KERNEL_WIDTH(3);
- UPDATE_OUTPUT_KERNEL_WIDTH(4);
- UPDATE_OUTPUT_KERNEL_WIDTH(5);
- UPDATE_OUTPUT_KERNEL_WIDTH(6);
- UPDATE_OUTPUT_KERNEL_WIDTH(7);
- default:
- cuda_VolumetricDilatedMaxPooling_updateOutput<<<grid, block,
- 0, THCState_getCurrentStream(state)>>>(
- cudaInput, cudaIndices, cudaOutput,
- kT, kH, kW, dT, dH, dW,
- padT, padH, padW, dilationT, dilationH, dilationW, offsetZ);
- }
- THCudaCheck(cudaGetLastError());
- totalZ -= 65535;
- offsetZ += 65535;
- }
-
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, indices1);
-}
-
-#undef UPDATE_OUTPUT_KERNEL_WIDTH
-
+template <typename Dtype>
__global__ void cuda_VolumetricDilatedMaxPooling_updateGradInput(
- THCDeviceTensor<float, 4> gradOutput,
- THCDeviceTensor<float, 4> indices,
- THCDeviceTensor<float, 4> gradInput,
+ THCDeviceTensor<Dtype, 4> gradOutput,
+ THCDeviceTensor<THCIndex_t, 4> indices,
+ THCDeviceTensor<Dtype, 4> gradInput,
int dT, int dH, int dW,
- int padT, int padH, int padW,
+ int padT, int padH, int padW,
int dilationT, int dilationH, int dilationW,
int offsetZ)
{
@@ -331,7 +155,7 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateGradInput(
if (oRow < gradOutput.getSize(2) && oColumn < gradOutput.getSize(3))
{
- float *idx = &indices[slice][oFrame][oRow][oColumn];
+ THCIndex_t *idx = &indices[slice][oFrame][oRow][oColumn];
int iFrame = ((unsigned char*)(idx))[0] * dilationT + oFrame * dT - padT;
int iRow = ((unsigned char*)(idx))[1] * dilationH + oRow * dH - padH;
int iColumn = ((unsigned char*)(idx))[2] * dilationW + oColumn * dW - padW;
@@ -340,98 +164,5 @@ __global__ void cuda_VolumetricDilatedMaxPooling_updateGradInput(
}
}
-void THNN_CudaVolumetricDilatedMaxPooling_updateGradInput(
- THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput,
- THCudaTensor *indices,
- int dT, int dW, int dH,
- int padT, int padW, int padH,
- int dilationT, int dilationW, int dilationH)
-{
- // Resize and initialize result tensor.
- THCudaTensor_resizeAs(state, gradInput, input);
- THCudaTensor_zero(state, gradInput);
-
- int batchSize;
- int inputSlices;
-
- int outputTime;
- int outputHeight;
- int outputWidth;
-
- THCUNN_assertSameGPU(state, 4, input, indices, gradOutput, gradInput);
-
- if (THCudaTensor_nDimension(state, input) == 4) /* 4D */
- {
- batchSize = 1;
- inputSlices = THCudaTensor_size(state, input, 0);
-
- outputTime = THCudaTensor_size(state, gradOutput, 1);
- outputHeight = THCudaTensor_size(state, gradOutput, 2);
- outputWidth = THCudaTensor_size(state, gradOutput, 3);
- }
- else
- {
- batchSize = THCudaTensor_size(state, input, 0);
- inputSlices = THCudaTensor_size(state, input, 1);
-
- outputTime = THCudaTensor_size(state, gradOutput, 2);
- outputHeight = THCudaTensor_size(state, gradOutput, 3);
- outputWidth = THCudaTensor_size(state, gradOutput, 4);
- }
-
- gradOutput = THCudaTensor_newContiguous(state, gradOutput);
-
- // Collapse batch and feature dimensions
- THCDeviceTensor<float, 4> cudaGradInput;
- THCDeviceTensor<float, 4> cudaGradOutput;
- if (THCudaTensor_nDimension(state, input) == 4)
- {
- cudaGradInput = toDeviceTensor<float, 4>(state, gradInput);
- cudaGradOutput = toDeviceTensor<float, 4>(state, gradOutput);
- }
- else
- {
- cudaGradInput =
- toDeviceTensor<float, 5>(state, gradInput).downcastOuter<4>();
- cudaGradOutput =
- toDeviceTensor<float, 5>(state, gradOutput).downcastOuter<4>();
- }
-
- THLongStorage *indicesSize = THLongStorage_newWithSize(4);
- long indicesSizeRaw[4] = { batchSize * inputSlices,
- outputTime, outputHeight, outputWidth };
- THLongStorage_rawCopy(indicesSize, indicesSizeRaw);
- THCudaTensor *indices1 = THCudaTensor_newWithStorage(
- state, THCudaTensor_storage(state, indices),
- THCudaTensor_storageOffset(state, indices), indicesSize, NULL);
- THLongStorage_free(indicesSize);
-
- THCDeviceTensor<float, 4> cudaIndices =
- toDeviceTensor<float, 4>(state, indices1);
-
- int totalZ = outputTime * inputSlices * batchSize;
- int offsetZ = 0;
- dim3 block(32, 8);
-
- while (totalZ > 0) {
- dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)),
- THCCeilDiv(outputHeight, static_cast<int>(block.y)),
- totalZ > 65535 ? 65535 : totalZ);
-
- cuda_VolumetricDilatedMaxPooling_updateGradInput<<<grid, block,
- 0, THCState_getCurrentStream(state)>>>(
- cudaGradOutput,
- cudaIndices,
- cudaGradInput,
- dT, dH, dW,
- padT, padH, padW,
- dilationT, dilationH, dilationW, offsetZ);
- THCudaCheck(cudaGetLastError());
- totalZ -= 65535;
- offsetZ += 65535;
- }
-
- // cleanup
- THCudaTensor_free(state, gradOutput);
- THCudaTensor_free(state, indices1);
-}
+#include "generic/VolumetricDilatedMaxPooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/VolumetricFullConvolution.cu b/lib/THCUNN/VolumetricFullConvolution.cu
index ae414b7..93c4c0f 100644
--- a/lib/THCUNN/VolumetricFullConvolution.cu
+++ b/lib/THCUNN/VolumetricFullConvolution.cu
@@ -1,356 +1,8 @@
#include "THCUNN.h"
#include "common.h"
#include "vol2col.h"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
-
-void THNN_CudaVolumetricFullConvolution_updateOutput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- THCudaTensor *weight,
- THCudaTensor *bias,
- THCudaTensor *finput,
- THCudaTensor *fgradInput,
- int dT, int dW, int dH,
- int padT, int padW, int padH,
- int adjT, int adjW, int adjH)
-{
-
- THCudaTensor *columns = finput;
- THCudaTensor *ones = fgradInput;
-
- int nInputPlane = THCudaTensor_size(state, weight, 0);
- int nOutputPlane = THCudaTensor_size(state, weight, 1);
- const int kT = (int)weight->size[2];
- const int kH = (int)weight->size[3];
- const int kW = (int)weight->size[4];
-
- THCUNN_assertSameGPU(state, 6, input, output, weight,
- bias, columns, ones);
- THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
-
- int batch = 1;
- if (input->nDimension == 4) {
- THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane don't match");
- // Force batch
- batch = 0;
- THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
- } else {
- THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane don't match");
- }
-
- long inputWidth = input->size[4];
- long inputHeight = input->size[3];
- long inputDepth = input->size[2];
- long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
- long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
- long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT;
-
- // Batch size + input planes
- long batchSize = input->size[0];
-
- // Resize output
- THCudaTensor_resize5d(state, output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth);
-
- // Resize temporary columns
- THCudaTensor_resize2d(state, columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
-
- // Define a buffer of ones, for bias accumulation
- // Note: this buffer can be shared with other modules, it only ever gets increased,
- // and always contains ones.
- if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
- // Resize plane and fill with ones...
- THCudaTensor_resize3d(state, ones, outputDepth, outputHeight, outputWidth);
- THCudaTensor_fill(state, ones, 1);
- }
-
- // Helpers
- THCudaTensor *input_n = THCudaTensor_new(state);
- THCudaTensor *output_n = THCudaTensor_new(state);
-
- // For each elt in batch, do:
- for (int elt = 0; elt < batchSize; elt ++) {
- // Matrix multiply per output:
- THCudaTensor_select(state, input_n, input, 0, elt);
- THCudaTensor_select(state, output_n, output, 0, elt);
-
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
- long n = columns->size[1];
- long k = weight->size[0];
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- THCudaBlas_Sgemm(
- state,
- 'n', 't',
- n, m, k,
- 1,
- THCudaTensor_data(state, input_n), n,
- THCudaTensor_data(state, weight), m,
- 0,
- THCudaTensor_data(state, columns), n
- );
-
- // Unpack columns back into input:
- col2vol(
- THCState_getCurrentStream(state),
- THCudaTensor_data(state, columns),
- nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
- 1,1,1,
- THCudaTensor_data(state, output_n)
- );
-
- // Do Bias after:
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m_ = nOutputPlane;
- long n_ = outputDepth * outputHeight * outputWidth;
- long k_ = 1;
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- THCudaBlas_Sgemm(
- state,
- 't', 'n',
- n_, m_, k_,
- 1,
- THCudaTensor_data(state, ones), k_,
- THCudaTensor_data(state, bias), k_,
- 1,
- THCudaTensor_data(state, output_n), n_
- );
-
- }
-
- // Free
- THCudaTensor_free(state, input_n);
- THCudaTensor_free(state, output_n);
-
- // Resize output
- if (batch == 0) {
- THCudaTensor_resize4d(state, output, nOutputPlane, outputDepth, outputHeight, outputWidth);
- THCudaTensor_resize4d(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
- }
-}
-
-void THNN_CudaVolumetricFullConvolution_updateGradInput(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradInput,
- THCudaTensor *weight,
- THCudaTensor *finput,
- THCudaTensor *fgradInput,
- int dT, int dW, int dH,
- int padT, int padW, int padH,
- int adjT, int adjW, int adjH)
-{
- THCudaTensor *gradColumns = finput;
-
- int nInputPlane = THCudaTensor_size(state, weight, 0);
- int nOutputPlane = THCudaTensor_size(state, weight, 1);
- const int kT = (int)weight->size[2];
- const int kH = (int)weight->size[3];
- const int kW = (int)weight->size[4];
-
- THCUNN_assertSameGPU(state, 5, input, gradOutput, weight,
- gradColumns, gradInput);
- THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
-
- int batch = 1;
- if (input->nDimension == 4) {
- // Force batch
- batch = 0;
- THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
- THCudaTensor_resize5d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
- }
-
- long inputWidth = input->size[4];
- long inputHeight = input->size[3];
- long inputDepth = input->size[2];
- long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
- long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
- long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT;
-
- // Batch size + input planes
- long batchSize = input->size[0];
-
- // Resize output
- THCudaTensor_resize5d(state, gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
-
- // Resize temporary columns
- THCudaTensor_resize2d(state, gradColumns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
-
- // Helpers
- THCudaTensor *gradInput_n = THCudaTensor_new(state);
- THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
- // For each elt in batch, do:
- for (int elt = 0; elt < batchSize; elt ++) {
- // Matrix multiply per sample:
- THCudaTensor_select(state, gradInput_n, gradInput, 0, elt);
- THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
- // Extract columns:
- vol2col(
- THCState_getCurrentStream(state),
- THCudaTensor_data(state, gradOutput_n),
- nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
- 1,1,1,
- THCudaTensor_data(state, gradColumns)
- );
-
-
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m = weight->size[0];
- long n = gradColumns->size[1];
- long k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- THCudaBlas_Sgemm(
- state,
- 'n', 'n',
- n, m, k,
- 1,
- THCudaTensor_data(state, gradColumns), n,
- THCudaTensor_data(state, weight), k,
- 0,
- THCudaTensor_data(state, gradInput_n), n
- );
- }
-
-
- // Free
- THCudaTensor_free(state, gradInput_n);
- THCudaTensor_free(state, gradOutput_n);
-
- // Resize output
- if (batch == 0) {
- THCudaTensor_resize4d(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
- THCudaTensor_resize4d(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
- THCudaTensor_resize4d(state, gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
- }
-}
-
-
-void THNN_CudaVolumetricFullConvolution_accGradParameters(
- THCState *state,
- THCudaTensor *input,
- THCudaTensor *gradOutput,
- THCudaTensor *gradWeight,
- THCudaTensor *gradBias,
- THCudaTensor *finput,
- THCudaTensor *fgradInput,
- int dT, int dW, int dH,
- int padT, int padW, int padH,
- int adjT, int adjW, int adjH,
- float scale)
-{
- THCudaTensor *columns = finput;
- THCudaTensor *ones = fgradInput;
-
- int nInputPlane = THCudaTensor_size(state, gradWeight, 0);
- int nOutputPlane = THCudaTensor_size(state, gradWeight, 1);
- const int kT = (int)gradWeight->size[2];
- const int kH = (int)gradWeight->size[3];
- const int kW = (int)gradWeight->size[4];
-
- THCUNN_assertSameGPU(state, 6, input, gradOutput, gradWeight,
- gradBias, columns, ones);
- THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
-
- int batch = 1;
- if (input->nDimension == 4) {
- // Force batch
- batch = 0;
- THCudaTensor_resize5d(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
- THCudaTensor_resize5d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
- }
-
- long inputWidth = input->size[4];
- long inputHeight = input->size[3];
- long inputDepth = input->size[2];
- long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
- long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
- long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT;
-
- // Batch size + input planes
- long batchSize = input->size[0];
-
- // Define a buffer of ones, for bias accumulation
- if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
- // Resize plane and fill with ones...
- THCudaTensor_resize3d(state, ones, outputDepth, outputHeight, outputWidth);
- THCudaTensor_fill(state, ones, 1);
- }
-
- // Resize temporary columns
- THCudaTensor_resize2d(state, columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
-
- // Helpers
- THCudaTensor *input_n = THCudaTensor_new(state);
- THCudaTensor *gradOutput_n = THCudaTensor_new(state);
-
- // For each elt in batch, do:
- for (int elt = 0; elt < batchSize; elt ++) {
- // Matrix multiply per output:
- THCudaTensor_select(state, input_n, input, 0, elt);
- THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
-
- // Extract columns:
- vol2col(
- THCState_getCurrentStream(state),
- THCudaTensor_data(state, gradOutput_n),
- nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
- 1,1,1,
- THCudaTensor_data(state, columns)
- );
-
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long n = columns->size[0]; // nOutputPlane * kt * kh * kw
- long m = input_n->size[0]; // nInputPlane
- long k = columns->size[1]; // inputHeight * inputWidth
-
- // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
- THCudaBlas_Sgemm(
- state,
- 't', 'n',
- n, m, k,
- scale,
- THCudaTensor_data(state, columns), k,
- THCudaTensor_data(state, input_n), k,
- 1,
- THCudaTensor_data(state, gradWeight), n
- );
-
- // Do Bias:
- // M,N,K are dims of matrix A and B
- // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
- long m_ = nOutputPlane;
- long k_ = outputDepth * outputHeight * outputWidth;
-
- // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
- THCudaBlas_Sgemv(
- state,
- 't',
- k_, m_,
- scale,
- THCudaTensor_data(state, gradOutput_n), k_,
- THCudaTensor_data(state, ones), 1,
- 1,
- THCudaTensor_data(state, gradBias), 1
- );
- }
-
- // Free
- THCudaTensor_free(state, input_n);
- THCudaTensor_free(state, gradOutput_n);
-
- // Resize
- if (batch == 0) {
- THCudaTensor_resize4d(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
- THCudaTensor_resize4d(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
- }
-}
+#include "generic/VolumetricFullConvolution.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/VolumetricMaxPooling.cu b/lib/THCUNN/VolumetricMaxPooling.cu
index 892bef9..2f7de7b 100644
--- a/lib/THCUNN/VolumetricMaxPooling.cu
+++ b/lib/THCUNN/VolumetricMaxPooling.cu
@@ -6,27 +6,5 @@
#include <cfloat>
-void THNN_CudaVolumetricMaxPooling_updateOutput(
- THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices,
- int kT, int kW, int kH,
- int dT, int dW, int dH,
- int padT, int padW, int padH,
- bool ceilMode)
-{
- THNN_CudaVolumetricDilatedMaxPooling_updateOutput(
- state, input, output, indices,
- kT, kW, kH, dT, dW, dH, padT, padW, padH, 1, 1, 1, ceilMode);
-
-}
-
-void THNN_CudaVolumetricMaxPooling_updateGradInput(
- THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput,
- THCudaTensor *indices,
- int dT, int dW, int dH,
- int padT, int padW, int padH)
-{
- THNN_CudaVolumetricDilatedMaxPooling_updateGradInput(
- state, input, gradOutput, gradInput, indices,
- dT, dW, dH, padT, padW, padH, 1, 1, 1);
-
-}
+#include "generic/VolumetricMaxPooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/VolumetricMaxUnpooling.cu b/lib/THCUNN/VolumetricMaxUnpooling.cu
index b583211..5d2b8b4 100644
--- a/lib/THCUNN/VolumetricMaxUnpooling.cu
+++ b/lib/THCUNN/VolumetricMaxUnpooling.cu
@@ -3,13 +3,16 @@
#include "THCDeviceTensor.cuh"
#include "THCDeviceTensorUtils.cuh"
#include "THCDeviceUtils.cuh"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
#include <cfloat>
+template <typename Dtype>
__global__ void cuda_VolumetricMaxUnpooling_updateOutput(
- THCDeviceTensor<float, 4> input,
- THCDeviceTensor<float, 4> indices,
- THCDeviceTensor<float, 4> output,
+ THCDeviceTensor<Dtype, 4> input,
+ THCDeviceTensor<THCIndex_t, 4> indices,
+ THCDeviceTensor<Dtype, 4> output,
int dT, int dH, int dW,
int padT, int padH, int padW, int offsetZ)
{
@@ -24,9 +27,9 @@ __global__ void cuda_VolumetricMaxUnpooling_updateOutput(
long start_h = iRow * dH - padH;
long start_w = iColumn * dW - padW;
- float val = input[slice][iFrame][iRow][iColumn];
-
- float *idx = &indices[slice][iFrame][iRow][iColumn];
+ Dtype val = input[slice][iFrame][iRow][iColumn];
+
+ THCIndex_t *idx = &indices[slice][iFrame][iRow][iColumn];
long maxz = ((unsigned char*)(idx))[0];
long maxy = ((unsigned char*)(idx))[1];
long maxx = ((unsigned char*)(idx))[2];
@@ -34,104 +37,11 @@ __global__ void cuda_VolumetricMaxUnpooling_updateOutput(
}
}
-void THNN_CudaVolumetricMaxUnpooling_updateOutput(
- THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices,
- int outputTime, int outputWidth, int outputHeight,
- int dT, int dW, int dH,
- int padT, int padW, int padH)
-{
- int batchSize;
- int inputSlices;
- int inputTime;
- int inputHeight;
- int inputWidth;
-
- THCUNN_assertSameGPU(state, 3, input, indices, output);
-
- if (THCudaTensor_nDimension(state, input) == 4)
- {
- /* sizes */
- batchSize = 1;
- inputSlices = THCudaTensor_size(state, input, 0);
- inputTime = THCudaTensor_size(state, input, 1);
- inputHeight = THCudaTensor_size(state, input, 2);
- inputWidth = THCudaTensor_size(state, input, 3);
- }
- else if (THCudaTensor_nDimension(state, input) == 5)
- {
- /* sizes */
- batchSize = THCudaTensor_size(state, input, 0);
- inputSlices = THCudaTensor_size(state, input, 1);
- inputTime = THCudaTensor_size(state, input, 2);
- inputHeight = THCudaTensor_size(state, input, 3);
- inputWidth = THCudaTensor_size(state, input, 4);
- }
- else
- {
- THArgCheck(false, 2, "4D or 5D tensor expected");
- }
-
- if (input->nDimension == 4) /* 4D */
- {
- /* resize output */
- THCudaTensor_resize4d(state, output, inputSlices,
- outputTime, outputHeight, outputWidth);
- }
- else
- { /* 5D */
- THCudaTensor_resize5d(state, output, batchSize, inputSlices,
- outputTime, outputHeight, outputWidth);
- }
-
- input = THCudaTensor_newContiguous(state, input);
- indices = THCudaTensor_newContiguous(state, indices);
- THCudaTensor_zero(state, output);
-
- // Collapse batch and feature dimensions
- THCDeviceTensor<float, 4> cudaInput;
- THCDeviceTensor<float, 4> cudaOutput;
- THCDeviceTensor<float, 4> cudaIndices;
-
- if (THCudaTensor_nDimension(state, input) == 4)
- {
- cudaInput = toDeviceTensor<float, 4>(state, input);
- cudaOutput = toDeviceTensor<float, 4>(state, output);
- cudaIndices = toDeviceTensor<float, 4>(state, indices);
- }
- else
- {
- cudaInput = toDeviceTensor<float, 5>(state, input).downcastOuter<4>();
- cudaOutput = toDeviceTensor<float, 5>(state, output).downcastOuter<4>();
- cudaIndices = toDeviceTensor<float, 5>(state, indices).downcastOuter<4>();
- }
-
- int totalZ = inputTime * inputSlices * batchSize;
- int offsetZ = 0;
- dim3 block(32, 8);
-
- while (totalZ > 0) {
- dim3 grid(THCCeilDiv(inputWidth, static_cast<int>(block.x)),
- THCCeilDiv(inputHeight, static_cast<int>(block.y)),
- totalZ > 65535 ? 65535 : totalZ);
-
- cuda_VolumetricMaxUnpooling_updateOutput<<<grid, block,
- 0, THCState_getCurrentStream(state)>>>(
- cudaInput, cudaIndices, cudaOutput,
- dT, dH, dW,
- padT, padH, padW, offsetZ);
- THCudaCheck(cudaGetLastError());
- totalZ -= 65535;
- offsetZ += 65535;
- }
-
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, indices);
-}
-
+template <typename Dtype>
__global__ void cuda_VolumetricMaxUnpooling_updateGradInput(
- THCDeviceTensor<float, 4> gradOutput,
- THCDeviceTensor<float, 4> indices,
- THCDeviceTensor<float, 4> gradInput,
+ THCDeviceTensor<Dtype, 4> gradOutput,
+ THCDeviceTensor<THCIndex_t, 4> indices,
+ THCDeviceTensor<Dtype, 4> gradInput,
int dT, int dH, int dW,
int padT, int padH, int padW, int offsetZ)
{
@@ -142,105 +52,21 @@ __global__ void cuda_VolumetricMaxUnpooling_updateGradInput(
if (iRow < gradInput.getSize(2) && iColumn < gradInput.getSize(3))
{
-
+
long start_t = iFrame * dT - padT;
long start_h = iRow * dH - padH;
long start_w = iColumn * dW - padW;
- float *idx = &indices[slice][iFrame][iRow][iColumn];
+ THCIndex_t *idx = &indices[slice][iFrame][iRow][iColumn];
long maxz = ((unsigned char*)(idx))[0];
long maxy = ((unsigned char*)(idx))[1];
long maxx = ((unsigned char*)(idx))[2];
- float grad_val = gradOutput[slice][start_t + maxz][start_h + maxy][start_w + maxx];
+ Dtype grad_val = gradOutput[slice][start_t + maxz][start_h + maxy][start_w + maxx];
gradInput[slice][iFrame][iRow][iColumn] = grad_val;
}
}
-void THNN_CudaVolumetricMaxUnpooling_updateGradInput(
- THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput,
- THCudaTensor *indices,
- int outputTime, int outputWidth, int outputHeight,
- int dT, int dW, int dH,
- int padT, int padW, int padH)
-{
-
- int batchSize;
- int inputSlices;
- int inputTime;
- int inputHeight;
- int inputWidth;
-
- THCUNN_assertSameGPU(state, 4, input, indices, gradOutput, gradInput);
-
- if (THCudaTensor_nDimension(state, input) == 4) /* 4D */
- {
- batchSize = 1;
- inputSlices = THCudaTensor_size(state, input, 0);
- inputTime = THCudaTensor_size(state, input, 1);
- inputHeight = THCudaTensor_size(state, input, 2);
- inputWidth = THCudaTensor_size(state, input, 3);
- }
- else
- {
- batchSize = THCudaTensor_size(state, input, 0);
- inputSlices = THCudaTensor_size(state, input, 1);
- inputTime = THCudaTensor_size(state, input, 2);
- inputHeight = THCudaTensor_size(state, input, 3);
- inputWidth = THCudaTensor_size(state, input, 4);
- }
-
- input = THCudaTensor_newContiguous(state, input);
- indices = THCudaTensor_newContiguous(state, indices);
- gradOutput = THCudaTensor_newContiguous(state, gradOutput);
- THCudaTensor_resizeAs(state, gradInput, input);
- THCudaTensor_zero(state, gradInput);
-
- // Collapse batch and feature dimensions
- THCDeviceTensor<float, 4> cudaGradInput;
- THCDeviceTensor<float, 4> cudaGradOutput;
- THCDeviceTensor<float, 4> cudaIndices;
-
- if (THCudaTensor_nDimension(state, input) == 4)
- {
- cudaGradInput = toDeviceTensor<float, 4>(state, gradInput);
- cudaGradOutput = toDeviceTensor<float, 4>(state, gradOutput);
- cudaIndices = toDeviceTensor<float, 4>(state, indices);
- }
- else
- {
- cudaGradInput =
- toDeviceTensor<float, 5>(state, gradInput).downcastOuter<4>();
- cudaGradOutput =
- toDeviceTensor<float, 5>(state, gradOutput).downcastOuter<4>();
- cudaIndices =
- toDeviceTensor<float, 5>(state, indices).downcastOuter<4>();
- }
-
- int totalZ = inputTime * inputSlices * batchSize;
- int offsetZ = 0;
- dim3 block(32, 8);
-
- while (totalZ > 0) {
- dim3 grid(THCCeilDiv(inputWidth, static_cast<int>(block.x)),
- THCCeilDiv(inputHeight, static_cast<int>(block.y)),
- totalZ > 65535 ? 65535 : totalZ);
-
- cuda_VolumetricMaxUnpooling_updateGradInput<<<grid, block,
- 0, THCState_getCurrentStream(state)>>>(
- cudaGradOutput,
- cudaIndices,
- cudaGradInput,
- dT, dH, dW,
- padT, padH, padW, offsetZ);
- THCudaCheck(cudaGetLastError());
- totalZ -= 65535;
- offsetZ += 65535;
- }
-
- // cleanup
- THCudaTensor_free(state, input);
- THCudaTensor_free(state, gradOutput);
- THCudaTensor_free(state, indices);
-} \ No newline at end of file
+#include "generic/VolumetricMaxUnpooling.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/VolumetricReplicationPadding.cu b/lib/THCUNN/VolumetricReplicationPadding.cu
index cca51d5..39c37b4 100644
--- a/lib/THCUNN/VolumetricReplicationPadding.cu
+++ b/lib/THCUNN/VolumetricReplicationPadding.cu
@@ -4,10 +4,14 @@
#include "THCDeviceTensorUtils.cuh"
#include "THCDeviceUtils.cuh"
#include "THCReduceApplyUtils.cuh"
+#include "THCHalf.h"
+#include "THCHalfAutoNumerics.cuh"
+#include "THCAtomics.cuh"
+template <typename Dtype>
__global__ void VolumetricReplicationPadding_updateOutput(
- THCDeviceTensor<float, 5> input,
- THCDeviceTensor<float, 5> output,
+ THCDeviceTensor<Dtype, 5> input,
+ THCDeviceTensor<Dtype, 5> output,
int pfront, int pback, int ptop, int pbottom, int pleft, int pright) {
int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
@@ -35,76 +39,15 @@ __global__ void VolumetricReplicationPadding_updateOutput(
int inputPointZ = min(max(pfront, outputPointZ),
input.getSize(2) + pfront - 1) - oStartZ + iStartZ;
- float valueToCopy =
+ Dtype valueToCopy =
input[batch][plane][inputPointZ][inputPointY][inputPointX];
output[batch][plane][outputPointZ][outputPointY][outputPointX] = valueToCopy;
}
-void THNN_CudaVolumetricReplicationPadding_updateOutput(THCState *state,
- THCudaTensor *input,
- THCudaTensor *output,
- int pleft, int pright,
- int ptop, int pbottom,
- int pfront, int pback) {
- THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input), 2,
- "input tensor must fit into 32-bit index math");
-
- int planeDim = 0;
- int dimd = 1;
- int dimh = 2;
- int dimw = 3;
- int numBatch = 1;
-
- int numInputDims = THCudaTensor_nDimension(state, input);
- THArgCheck(numInputDims == 4 || numInputDims == 5, 2,
- "input must be 4 or 5-dimensional");
-
- if (numInputDims == 5) {
- numBatch = THCudaTensor_size(state, input, 0);
- planeDim++;
- dimd++;
- dimh++;
- dimw++;
- }
-
- int numPlanes = THCudaTensor_size(state, input, planeDim);
- int inputD = THCudaTensor_size(state, input, dimd);
- int inputH = THCudaTensor_size(state, input, dimh);
- int inputW = THCudaTensor_size(state, input, dimw);
- int outputD = inputD + pfront + pback;
- int outputH = inputH + ptop + pbottom;
- int outputW = inputW + pleft + pright;
-
- THCDeviceTensor<float, 5> devInput;
- THCDeviceTensor<float, 5> devOutput;
-
- if (numInputDims == 4) {
- THCudaTensor_resize4d(state, output, numPlanes, outputD, outputH, outputW);
-
- devInput = toDeviceTensor<float, 4>(state, input).upcastOuter<5>();
- devOutput = toDeviceTensor<float, 4>(state, output).upcastOuter<5>();
- } else {
- THCudaTensor_resize5d(state, output, numBatch, numPlanes, outputD, outputH,
- outputW);
-
- devInput = toDeviceTensor<float, 5>(state, input);
- devOutput = toDeviceTensor<float, 5>(state, output);
- }
-
- int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3) *
- devOutput.getSize(4);
- dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
- devOutput.getSize(1),
- devOutput.getSize(0));
- dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
-
- VolumetricReplicationPadding_updateOutput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
- devInput, devOutput, pfront, pback, ptop, pbottom, pleft, pright);
-}
-
+template <typename Dtype>
__global__ void VolumetricReplicationPadding_updateGradInput(
- THCDeviceTensor<float, 5> gradInput,
- THCDeviceTensor<float, 5> gradOutput,
+ THCDeviceTensor<Dtype, 5> gradInput,
+ THCDeviceTensor<Dtype, 5> gradOutput,
int pfront, int pback, int ptop, int pbottom, int pleft, int pright) {
int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
int plane = blockIdx.y;
@@ -134,56 +77,12 @@ __global__ void VolumetricReplicationPadding_updateGradInput(
int inputPointZ = min(max(pfront, outputPointZ),
gradInput.getSize(2) + pfront - 1) - oStartZ + iStartZ;
- float valueToCopy =
+ Dtype valueToCopy =
gradOutput[batch][plane][outputPointZ][outputPointY][outputPointX];
atomicAdd(&gradInput[batch][plane][inputPointZ][inputPointY][inputPointX],
valueToCopy);
}
-void THNN_CudaVolumetricReplicationPadding_updateGradInput(
- THCState *state, THCudaTensor *input, THCudaTensor *gradOutput,
- THCudaTensor *gradInput, int pleft, int pright, int ptop, int pbottom,
- int pfront, int pback) {
- THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, input), 2,
- "input tensor must fit into 32-bit index math");
- THArgCheck(TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, gradOutput),
- 3, "output gradient tensor must fit into 32-bit index math");
-
- int planeDim = 0;
- int dimd = 1;
- int dimh = 2;
- int dimw = 3;
-
- int numInputDims = THCudaTensor_nDimension(state, input);
- if (numInputDims == 5) {
- planeDim++;
- dimd++;
- dimh++;
- dimw++;
- }
-
- THCudaTensor_resizeAs(state, gradInput, input);
- THCudaTensor_zero(state, gradInput);
- THCDeviceTensor<float, 5> devGradInput;
- THCDeviceTensor<float, 5> devGradOutput;
-
- if (numInputDims == 4) {
- devGradInput = toDeviceTensor<float, 4>(state, gradInput).upcastOuter<5>();
- devGradOutput =
- toDeviceTensor<float, 4>(state, gradOutput).upcastOuter<5>();
- } else {
- devGradInput = toDeviceTensor<float, 5>(state, gradInput);
- devGradOutput = toDeviceTensor<float, 5>(state, gradOutput);
- }
-
- int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3) *
- devGradOutput.getSize(4);
- dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
- devGradOutput.getSize(1),
- devGradOutput.getSize(0));
- dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
-
- VolumetricReplicationPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
- devGradInput, devGradOutput, pfront, pback, ptop, pbottom, pleft, pright);
-}
+#include "generic/VolumetricReplicationPadding.cu"
+#include "THCGenerateFloatTypes.h"
diff --git a/lib/THCUNN/common.h b/lib/THCUNN/common.h
index e097535..12fbdd1 100644
--- a/lib/THCUNN/common.h
+++ b/lib/THCUNN/common.h
@@ -8,6 +8,10 @@
#define THCUNN_assertSameGPU(...) THAssertMsg(THCudaTensor_checkGPU(__VA_ARGS__), \
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
+// _generic can be removed once everything is genericized
+#define THCUNN_assertSameGPU_generic(...) THAssertMsg(THCTensor_(checkGPU)(__VA_ARGS__), \
+ "Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
+
// Use 1024 threads per block, which requires cuda sm_2x or above
const int CUDA_NUM_THREADS = 1024;
@@ -17,4 +21,12 @@ inline int GET_BLOCKS(const int N)
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
+#define THCUNN_resizeAs_indices(STATE, I1, I2) \
+ THLongStorage *size2 = THCTensor_(newSizeOf)(STATE, I2); \
+ if (!THCudaLongTensor_isSize(STATE, I1, size2)) \
+ { \
+ THCudaLongTensor_resize(STATE, I1, size2, NULL); \
+ } \
+ THLongStorage_free(size2);
+
#endif
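common.h keeps the 1-D launch helpers used throughout these kernels: CUDA_NUM_THREADS threads per block and GET_BLOCKS to cover N elements. A compilable sketch of how the pair is typically used to launch a pointwise kernel on the current stream; fill_kernel and fill are illustrative, not THCUNN functions:

#include <cuda_runtime.h>

const int CUDA_NUM_THREADS = 1024;

inline int GET_BLOCKS(const int N) {
  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}

// One thread per element; threads past the end simply return.
__global__ void fill_kernel(float *out, float value, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = value;
}

void fill(float *d_out, float value, int n, cudaStream_t stream) {
  fill_kernel<<<GET_BLOCKS(n), CUDA_NUM_THREADS, 0, stream>>>(d_out, value, n);
}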
diff --git a/lib/THCUNN/generic/Abs.cu b/lib/THCUNN/generic/Abs.cu
new file mode 100644
index 0000000..dc1e9a2
--- /dev/null
+++ b/lib/THCUNN/generic/Abs.cu
@@ -0,0 +1,28 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/Abs.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(Abs_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, output);
+ THCTensor_(resizeAs)(state, output, input);
+ THC_pointwiseApply2(state, output, input, absupdateOutput_functor<real>());
+}
+
+void THNN_(Abs_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput)
+{
+ THCUNN_assertSameGPU_generic(state, 3, input, gradOutput, gradInput);
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THC_pointwiseApply3(state, gradInput, input, gradOutput, absupdateGradInput_functor<real>());
+}
+
+#endif
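generic/Abs.cu carries no concrete type names of its own: THCGenerateFloatTypes.h re-includes it with real, accreal, THCTensor and the THNN_ name macro redefined, so the single body yields one entry point per supported floating type (float, double, and half when enabled). A self-contained sketch of the same token-pasting idea, compressed into one macro for illustration; the real generic headers achieve it by repeated inclusion rather than a function-like macro:

#include <cstdio>

// Paste the same body once per scalar type, suffixing the function name,
// roughly what the per-type expansion of THNN_(Abs_updateOutput) produces.
#define DEFINE_ABS(real, suffix)                                        \
  void abs_updateOutput_##suffix(const real *in, real *out, long n) {   \
    for (long i = 0; i < n; ++i) out[i] = in[i] < 0 ? -in[i] : in[i];   \
  }

DEFINE_ABS(float, Float)
DEFINE_ABS(double, Double)

int main() {
  float in[3] = {-1.f, 0.f, 2.f}, out[3];
  abs_updateOutput_Float(in, out, 3);
  printf("%g %g %g\n", out[0], out[1], out[2]);  // 1 0 2
  return 0;
}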
diff --git a/lib/THCUNN/generic/AbsCriterion.cu b/lib/THCUNN/generic/AbsCriterion.cu
new file mode 100644
index 0000000..904fbba
--- /dev/null
+++ b/lib/THCUNN/generic/AbsCriterion.cu
@@ -0,0 +1,59 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/AbsCriterion.cu"
+#else
+
+void THNN_(AbsCriterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *output,
+ bool sizeAverage)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, target);
+
+ long size = THCTensor_(nElement)(state, input);
+
+ input = THCTensor_(newContiguous)(state, input);
+ target = THCTensor_(newContiguous)(state, target);
+
+ thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+ thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
+ accreal sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal)0, thrust::plus<accreal>(), abs_functor<real, accreal>());
+
+ if (sizeAverage)
+ sum /= size;
+
+ THCTensor_(free)(state, input);
+ THCTensor_(free)(state, target);
+
+ THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
+}
+
+void THNN_(AbsCriterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *gradInput,
+ bool sizeAverage)
+{
+ THCUNN_assertSameGPU_generic(state, 3, input, target, gradInput);
+
+ long size = THCTensor_(nElement)(state, input);
+ real norm = ScalarConvert<double, real>::to(sizeAverage ? 1./size : 1.);
+
+ input = THCTensor_(newContiguous)(state, input);
+ target = THCTensor_(newContiguous)(state, target);
+
+ THCTensor_(resizeAs)(state, gradInput, input);
+
+ thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+ thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
+ thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
+
+ thrust::transform(input_data, input_data+size, target_data, gradInput_data, abs_updateGradInput_functor<real>(norm));
+
+ THCTensor_(free)(state, input);
+ THCTensor_(free)(state, target);
+}
+
+#endif
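The forward pass reduces |input - target| over the flattened tensors with thrust::inner_product: plus<accreal> is the outer reduction and abs_functor supplies the pairwise term. A standalone sketch of that reduction pattern; the functor here is an illustrative stand-in and the sizeAverage division is shown inline:

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/inner_product.h>
#include <cmath>
#include <cstdio>

// Pairwise |x - y| term, standing in for abs_functor above.
struct abs_diff {
  __host__ __device__ float operator()(float x, float y) const {
    return fabsf(x - y);
  }
};

int main() {
  thrust::device_vector<float> input(4, 1.0f);
  thrust::device_vector<float> target(4, 0.25f);
  float sum = thrust::inner_product(input.begin(), input.end(),
                                    target.begin(), 0.0f,
                                    thrust::plus<float>(), abs_diff());
  printf("sum=%g mean=%g\n", sum, sum / input.size());  // sum=3 mean=0.75
  return 0;
}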
diff --git a/lib/THCUNN/generic/BCECriterion.cu b/lib/THCUNN/generic/BCECriterion.cu
new file mode 100644
index 0000000..a6f76d6
--- /dev/null
+++ b/lib/THCUNN/generic/BCECriterion.cu
@@ -0,0 +1,99 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/BCECriterion.cu"
+#else
+
+void THNN_(BCECriterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *output,
+ bool sizeAverage,
+ THCTensor *weights)
+{
+ THCUNN_assertSameGPU_generic(state, 3, input, target, weights);
+
+ long size = THCTensor_(nElement)(state, input);
+
+ input = THCTensor_(newContiguous)(state, input);
+ target = THCTensor_(newContiguous)(state, target);
+
+ thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+ thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
+
+ accreal sum;
+ if (weights) {
+ weights = THCTensor_(newContiguous)(state, weights);
+ thrust::device_ptr<real> weights_data(THCTensor_(data)(state, weights));
+ sum = thrust::transform_reduce(
+ thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data, weights_data)),
+ thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size, weights_data+size)),
+ bce_functor_weights<real, accreal>(),
+ (accreal) 0,
+ thrust::plus<accreal>()
+ );
+ THCTensor_(free)(state, weights);
+ } else {
+ sum = thrust::transform_reduce(
+ thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data)),
+ thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size)),
+ bce_functor<real, accreal>(),
+ (accreal) 0,
+ thrust::plus<accreal>()
+ );
+ }
+
+ if (sizeAverage)
+ sum /= size;
+
+ THCTensor_(free)(state, input);
+ THCTensor_(free)(state, target);
+
+ THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
+}
+
+void THNN_(BCECriterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *gradInput,
+ bool sizeAverage,
+ THCTensor *weights)
+{
+ THCUNN_assertSameGPU_generic(state, 4, input, target, gradInput, weights);
+
+ long size = THCTensor_(nElement)(state, input);
+ real norm = ScalarConvert<accreal, real>::to(sizeAverage ? accreal(1)/size : accreal(1));
+
+ input = THCTensor_(newContiguous)(state, input);
+ target = THCTensor_(newContiguous)(state, target);
+
+ THCTensor_(resizeAs)(state, gradInput, input);
+
+ thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+ thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
+ thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
+
+ if (weights) {
+ weights = THCTensor_(newContiguous)(state, weights);
+ thrust::device_ptr<real> weights_data(THCTensor_(data)(state, weights));
+ thrust::transform(
+ thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data, weights_data)),
+ thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size, weights_data+size)),
+ gradInput_data,
+ bce_updateGradInput_functor_weights<real, accreal>(norm)
+ );
+ THCTensor_(free)(state, weights);
+ } else {
+ thrust::transform(
+ thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data)),
+ thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size)),
+ gradInput_data,
+ bce_updateGradInput_functor<real, accreal>(norm)
+ );
+ }
+
+ THCTensor_(free)(state, input);
+ THCTensor_(free)(state, target);
+}
+
+#endif
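Per element, the forward reduction above contributes -w * (t*log(x) + (1-t)*log(1-x)) and the backward pass writes -w * norm * (t - x) / (x * (1 - x)), where norm is 1/size when sizeAverage is set and w defaults to 1 without weights. A host-side sketch of those two expressions; the eps clamping used by the actual functors is omitted:

#include <cmath>
#include <cstdio>

// Per-element binary cross-entropy term and its gradient w.r.t. the input.
static double bce_term(double x, double t, double w) {
  return -w * (t * std::log(x) + (1.0 - t) * std::log(1.0 - x));
}

static double bce_grad(double x, double t, double w, double norm) {
  return -w * norm * (t - x) / (x * (1.0 - x));
}

int main() {
  // x=0.8 prediction, t=1 target, unit weight, no averaging.
  printf("loss=%g grad=%g\n", bce_term(0.8, 1.0, 1.0),
         bce_grad(0.8, 1.0, 1.0, 1.0));  // loss ~0.223, grad -1.25
  return 0;
}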
diff --git a/lib/THCUNN/generic/BatchNormalization.cu b/lib/THCUNN/generic/BatchNormalization.cu
new file mode 100644
index 0000000..80bb937
--- /dev/null
+++ b/lib/THCUNN/generic/BatchNormalization.cu
@@ -0,0 +1,97 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/BatchNormalization.cu"
+#else
+
+#define DeviceTensor3 THCDeviceTensor<real, 3>
+#define DeviceTensor1 THCDeviceTensor<real, 1>
+
+template <int Dim>
+static THCDeviceTensor<real, Dim> devicetensor(THCState *state, THCTensor *t) {
+ if (!t) {
+ return THCDeviceTensor<real, Dim>();
+ }
+
+ int inDim = THCTensor_(nDimension)(state, t);
+ if (inDim == Dim) {
+ return toDeviceTensor<real, Dim>(state, t);
+ }
+
+ // View in which the last dimensions are collapsed or expanded as needed
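+ // e.g. a contiguous (N, C, H, W) tensor requested as Dim == 3 becomes the view (N, C, H*W)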
+ THAssert(THCTensor_(isContiguous)(state, t));
+ int size[Dim];
+ for (int i = 0; i < Dim || i < inDim; ++i) {
+ if (i < Dim && i < inDim) {
+ size[i] = t->size[i];
+ } else if (i < Dim) {
+ size[i] = 1;
+ } else {
+ size[Dim - 1] *= t->size[i];
+ }
+ }
+ return THCDeviceTensor<real, Dim>(THCTensor_(data)(state, t), size);
+}
+
+void THNN_(BatchNormalization_updateOutput)(
+ THCState *state, THCTensor *input_, THCTensor *output_,
+ THCTensor *weight_, THCTensor *bias_, THCTensor *runningMean_,
+ THCTensor *runningVar_, THCTensor *saveMean_, THCTensor *saveStd_,
+ bool train, double momentum, double eps) {
+
+ DeviceTensor3 input = devicetensor<3>(state, input_);
+ DeviceTensor3 output = devicetensor<3>(state, output_);
+ DeviceTensor1 weight = devicetensor<1>(state, weight_);
+ DeviceTensor1 bias = devicetensor<1>(state, bias_);
+ DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_);
+ DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_);
+ DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_);
+ DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_);
+
+ cudaStream_t s = THCState_getCurrentStream(state);
+ cudaDeviceProp *prop = THCState_getCurrentDeviceProperties(state);
+
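+ // one block per feature plane (dimension 1 of the collapsed 3-D view); the threads cover the
+ // elements of that plane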
+ if (!train) {
+ dim3 blocks(input.getSize(1));
+ dim3 threads(getNumThreads(input.getSize(2)));
+ BatchNormalizationUpdateOutputInference_kernel<real, accreal, DeviceTensor1, DeviceTensor3> <<<blocks, threads, 0, s>>>(
+ input, output, runningMean, runningVar, weight, bias, eps);
+ } else {
+ dim3 blocks(input.getSize(1));
+ dim3 threads(getNumThreads(input.getSize(2)));
+ BatchNormalizationUpdateOutput_kernel<real, accreal, DeviceTensor1, DeviceTensor3> <<<blocks, threads, 0, s>>>(
+ input, output, weight, bias, eps, momentum, runningMean, runningVar,
+ saveMean, saveStd);
+ }
+ THCudaCheck(cudaGetLastError());
+}
+
+void THNN_(BatchNormalization_backward)(
+ THCState *state, THCTensor *input_, THCTensor *gradOutput_,
+ THCTensor *gradInput_, THCTensor *gradWeight_, THCTensor *gradBias_,
+ THCTensor *weight_, THCTensor *runningMean_, THCTensor *runningVar_,
+ THCTensor *saveMean_, THCTensor *saveStd_, bool train, float scale, double eps) {
+
+ DeviceTensor3 input = devicetensor<3>(state, input_);
+ DeviceTensor3 gradOutput = devicetensor<3>(state, gradOutput_);
+ DeviceTensor3 gradInput = devicetensor<3>(state, gradInput_);
+ DeviceTensor1 gradWeight = devicetensor<1>(state, gradWeight_);
+ DeviceTensor1 gradBias = devicetensor<1>(state, gradBias_);
+ DeviceTensor1 weight = devicetensor<1>(state, weight_);
+ DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_);
+ DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_);
+ DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_);
+ DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_);
+
+ cudaStream_t s = THCState_getCurrentStream(state);
+
+ dim3 blocks(gradOutput.getSize(1));
+ dim3 threads(getNumThreads(gradOutput.getSize(2)));
+ BatchNormalizationBackward_kernel<real, accreal, DeviceTensor1, DeviceTensor3> <<<blocks, threads, 0, s>>>(
+ input, gradOutput, gradInput, gradWeight, gradBias, weight, runningMean, runningVar,
+ saveMean, saveStd, train, scale, eps);
+ THCudaCheck(cudaGetLastError());
+}
+
+#undef DeviceTensor3
+#undef DeviceTensor1
+
+#endif
diff --git a/lib/THCUNN/generic/ClassNLLCriterion.cu b/lib/THCUNN/generic/ClassNLLCriterion.cu
new file mode 100644
index 0000000..62925a8
--- /dev/null
+++ b/lib/THCUNN/generic/ClassNLLCriterion.cu
@@ -0,0 +1,156 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/ClassNLLCriterion.cu"
+#else
+
+void THNN_(ClassNLLCriterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCIndexTensor *target,
+ THCTensor *output,
+ bool sizeAverage,
+ THCTensor *weights,
+ THCTensor *total_weight) {
+ if (THCIndexTensor_(nDimension)(state, target) > 1) {
+ THError("multi-target not supported");
+ }
+
+ int n_dims = THCTensor_(nDimension)(state, input);
+ int n_classes = THCTensor_(size)(state, input, n_dims - 1);
+
+ if (weights) {
+ THCUNN_assertSameGPU_generic(
+ state, 5, input, target, weights, output, total_weight
+ );
+ } else {
+ THCUNN_assertSameGPU_generic(
+ state, 4, input, target, output, total_weight
+ );
+ }
+
+ if (THCTensor_(nDimension)(state, input) > 2) {
+ THArgCheck(0, 2, "vector or matrix expected");
+ }
+ if (weights && THCTensor_(nElement)(state, weights) != n_classes) {
+ THError("weight tensor should be defined either for all or no classes");
+ }
+
+ input = THCTensor_(newContiguous)(state, input);
+ weights = weights ? THCTensor_(newContiguous)(state, weights) : NULL;
+ target = THCIndexTensor_(newContiguous)(state, target);
+
+ real *input_data = THCTensor_(data)(state, input);
+ real *weights_data = weights ? THCTensor_(data)(state, weights) : NULL;
+ THCIndex_t *target_data = THCIndexTensor_(data)(state, target);
+ real *output_data = THCTensor_(data)(state, output);
+ real *total_weight_data = THCTensor_(data)(state, total_weight);
+
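+ // 1-D input: a single-thread kernel handles the lone sample; 2-D input: one block of NTHREADS
+ // threads processes the whole batch and accumulates the per-sample losses.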
+ if (THCTensor_(nDimension)(state, input) == 1) {
+ cunn_ClassNLLCriterion_updateOutput_kernel1<real>
+ <<<1, 1, 0, THCState_getCurrentStream(state)>>>(
+ output_data,
+ total_weight_data,
+ input_data,
+ target_data,
+ weights_data,
+ sizeAverage,
+ n_classes
+ );
+
+ } else if (THCTensor_(nDimension)(state, input) == 2) {
+ cunn_ClassNLLCriterion_updateOutput_kernel<real, accreal>
+ <<<1, NTHREADS, 0, THCState_getCurrentStream(state)>>>(
+ output_data,
+ total_weight_data,
+ input_data,
+ target_data,
+ weights_data,
+ sizeAverage,
+ THCTensor_(size)(state, input, 0),
+ THCTensor_(size)(state, input, 1),
+ n_classes
+ );
+ }
+ THCudaCheck(cudaGetLastError());
+
+ if (weights) {
+ THCTensor_(free)(state, weights);
+ }
+ THCIndexTensor_(free)(state, target);
+ THCTensor_(free)(state, input);
+}
+
+void THNN_(ClassNLLCriterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCIndexTensor *target,
+ THCTensor *gradInput,
+ bool sizeAverage,
+ THCTensor *weights,
+ THCTensor *total_weight) {
+ if (THCIndexTensor_(nDimension)(state, target) > 1) {
+ THError("multi-target not supported");
+ }
+
+ int n_dims = THCTensor_(nDimension)(state, input);
+ int n_classes = THCTensor_(size)(state, input, n_dims - 1);
+
+ THArgCheck(THCTensor_(isContiguous)(state, gradInput), 4, "gradInput must be contiguous");
+
+ if (weights) {
+ THCUNN_assertSameGPU(
+ state, 5, weights, input, target, gradInput, total_weight
+ );
+ }
+ else {
+ THCUNN_assertSameGPU(
+ state, 4, input, target, gradInput, total_weight
+ );
+ }
+
+ if (THCTensor_(nDimension)(state, input) > 2) {
+ THArgCheck(0, 2, "vector or matrix expected");
+ }
+ if (weights && THCTensor_(nElement)(state, weights) != n_classes) {
+ THError("weight tensor should be defined either for all or no classes");
+ }
+
+ weights = weights ? THCTensor_(newContiguous)(state, weights) : NULL;
+ target = THCIndexTensor_(newContiguous)(state, target);
+
+ real *weights_data = weights ? THCTensor_(data)(state, weights) : NULL;
+ real *gradInput_data = THCTensor_(data)(state, gradInput);
+ THCIndex_t *target_data = THCIndexTensor_(data)(state, target);
+ real *total_weight_data = THCTensor_(data)(state, total_weight);
+
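+ // same dispatch as the forward pass: one thread for a vector input, one block for a batch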
+ if (THCTensor_(nDimension)(state, input) == 1) {
+ cunn_ClassNLLCriterion_updateGradInput_kernel1<real>
+ <<<1, 1, 0, THCState_getCurrentStream(state)>>>(
+ gradInput_data,
+ weights_data,
+ target_data,
+ total_weight_data,
+ sizeAverage,
+ n_classes
+ );
+ } else {
+ cunn_ClassNLLCriterion_updateGradInput_kernel<real>
+ <<<1, NTHREADS, 0, THCState_getCurrentStream(state)>>>(
+ gradInput_data,
+ target_data,
+ weights_data,
+ total_weight_data,
+ sizeAverage,
+ THCTensor_(size)(state, input, 0),
+ THCTensor_(size)(state, input, 1),
+ n_classes
+ );
+ }
+ THCudaCheck(cudaGetLastError());
+
+ if (weights) {
+ THCTensor_(free)(state, weights);
+ }
+ THCIndexTensor_(free)(state, target);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/DistKLDivCriterion.cu b/lib/THCUNN/generic/DistKLDivCriterion.cu
new file mode 100644
index 0000000..e06fe9e
--- /dev/null
+++ b/lib/THCUNN/generic/DistKLDivCriterion.cu
@@ -0,0 +1,67 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/DistKLDivCriterion.cu"
+#else
+
+void THNN_(DistKLDivCriterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *output,
+ bool sizeAverage)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, target);
+
+ THArgCheck(THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2,
+ "input and target need to have the same number of elements");
+
+ accreal sum;
+
+ long size = THCTensor_(nElement)(state, input);
+
+ input = THCTensor_(newContiguous)(state, input);
+ target = THCTensor_(newContiguous)(state, target);
+
+ thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+ thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
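+ // inner_product acts as zip + reduce here: kl_functor presumably computes t * (log(t) - x)
+ // per element (input presumably holds log-probabilities) and the terms are summed.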
+ sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal) 0, thrust::plus<accreal>(), kl_functor<real, accreal>());
+
+ if (sizeAverage)
+ sum /= size;
+
+ THCTensor_(free)(state, input);
+ THCTensor_(free)(state, target);
+
+ THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
+}
+
+void THNN_(DistKLDivCriterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *gradInput,
+ bool sizeAverage)
+{
+ THCUNN_assertSameGPU_generic(state, 3, input, target, gradInput);
+
+ THArgCheck(THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2,
+ "input and target need to have the same number of elements");
+
+ long size = THCTensor_(nElement)(state, input);
+ real norm = (sizeAverage ? ScalarConvert<accreal, real>::to(accreal(1)/size) : ScalarConvert<int, real>::to(1));
+
+ input = THCTensor_(newContiguous)(state, input);
+ target = THCTensor_(newContiguous)(state, target);
+
+ THCTensor_(resizeAs)(state, gradInput, input);
+
+ thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+ thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
+ thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
+
+ thrust::transform(input_data, input_data+size, target_data, gradInput_data, kl_updateGradInput_functor<real>(norm));
+
+ THCTensor_(free)(state, input);
+ THCTensor_(free)(state, target);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/ELU.cu b/lib/THCUNN/generic/ELU.cu
new file mode 100644
index 0000000..291f26e
--- /dev/null
+++ b/lib/THCUNN/generic/ELU.cu
@@ -0,0 +1,53 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/ELU.cu"
+#else
+
+#include "../common.h"
+
+
+void THNN_(ELU_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ real alpha,
+ bool inplace)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, output);
+
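+ // in-place path: mutate input directly, then make output an alias of it via THCTensor_(set)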
+ if (inplace)
+ {
+ THC_pointwiseApply1(state, input, ELUupdateOutputIP_functor<real>(alpha));
+ THCTensor_(set)(state, output, input);
+ }
+ else
+ {
+ THCTensor_(resizeAs)(state, output, input);
+ THC_pointwiseApply2(state, output, input, ELUupdateOutput_functor<real>(alpha));
+ }
+}
+
+
+void THNN_(ELU_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *output,
+ real alpha,
+ bool inplace)
+{
+ THCUNN_assertSameGPU_generic(state, 3, output, gradOutput, gradInput);
+
+ if (inplace)
+ {
+ THC_pointwiseApply2(state, gradOutput, output, ELUupdateGradInputIP_functor<real>(alpha));
+ THCTensor_(set)(state, gradInput, gradOutput);
+ }
+ else
+ {
+ THCTensor_(resizeAs)(state, gradInput, output);
+ THC_pointwiseApply3(state, gradInput, output, gradOutput, ELUupdateGradInput_functor<real>(alpha));
+ }
+}
+
+#endif
diff --git a/lib/THCUNN/generic/HardTanh.cu b/lib/THCUNN/generic/HardTanh.cu
new file mode 100644
index 0000000..8899476
--- /dev/null
+++ b/lib/THCUNN/generic/HardTanh.cu
@@ -0,0 +1,54 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/HardTanh.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(HardTanh_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ real min_val,
+ real max_val,
+ bool inplace)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, output);
+ if(inplace)
+ {
+ THCTensor_(set)(state, output, input);
+ THC_pointwiseApply1(state, output, hardtanhupdateOutput_functor<real>(min_val, max_val));
+ }
+ else
+ {
+ THCTensor_(resizeAs)(state, output, input);
+ THC_pointwiseApply2(state, output, input,
+ hardtanhupdateOutput_functor<real>(min_val, max_val));
+ }
+}
+
+void THNN_(HardTanh_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ real min_val,
+ real max_val,
+ bool inplace)
+{
+ THCUNN_assertSameGPU_generic(state, 3, input, gradOutput, gradInput);
+
+ if (inplace)
+ {
+ THCTensor_(set)(state, gradInput, gradOutput);
+ THC_pointwiseApply2(state, gradInput, input,
+ hardtanhupdateGradInput_functor<real>(min_val, max_val));
+ }
+ else
+ {
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THC_pointwiseApply3(state, gradInput, input, gradOutput,
+ hardtanhupdateGradInput_functor<real>(min_val, max_val));
+ }
+}
+
+#endif
diff --git a/lib/THCUNN/generic/L1Cost.cu b/lib/THCUNN/generic/L1Cost.cu
new file mode 100644
index 0000000..de15189
--- /dev/null
+++ b/lib/THCUNN/generic/L1Cost.cu
@@ -0,0 +1,42 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/L1Cost.cu"
+#else
+
+void THNN_(L1Cost_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output)
+{
+ THCUNN_assertSameGPU_generic(state, 1, input);
+ accreal sum;
+ long size = THCTensor_(nElement)(state, input);
+ input = THCTensor_(newContiguous)(state, input);
+ thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+ sum = thrust::transform_reduce(input_data, input_data+size, l1cost_functor<real, accreal>(), accreal(0), thrust::plus<accreal>());
+
+ THCTensor_(free)(state, input);
+
+ THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
+}
+
+void THNN_(L1Cost_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, gradInput);
+ long size = THCTensor_(nElement)(state, input);
+
+ input = THCTensor_(newContiguous)(state, input);
+ THCTensor_(resizeAs)(state, gradInput, input);
+
+ thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+ thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
+
+ thrust::transform(input_data, input_data+size, gradInput_data, l1cost_updateGradInput_functor<real>());
+
+ THCTensor_(free)(state, input);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/LeakyReLU.cu b/lib/THCUNN/generic/LeakyReLU.cu
new file mode 100644
index 0000000..20fb778
--- /dev/null
+++ b/lib/THCUNN/generic/LeakyReLU.cu
@@ -0,0 +1,54 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/LeakyReLU.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(LeakyReLU_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ real negval,
+ bool inplace)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, output);
+
+ if (inplace)
+ {
+ THC_pointwiseApply1(state, input, LeakyReLUUpdateOutputIP<real>(negval));
+ THCTensor_(set)(state, output, input);
+ }
+ else
+ {
+ THCTensor_(resizeAs)(state, output, input);
+ THC_pointwiseApply2(state, output, input, LeakyReLUUpdateOutput<real>(negval));
+ }
+
+ THCudaCheck(cudaGetLastError());
+}
+
+void THNN_(LeakyReLU_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ real negval,
+ bool inplace)
+{
+ THCUNN_assertSameGPU_generic(state, 3, input, gradInput, gradOutput);
+
+ if (inplace)
+ {
+ THC_pointwiseApply2(state, gradOutput, input, LeakyReLUUpdateGradInputIP<real>(negval));
+ THCTensor_(set)(state, gradInput, gradOutput);
+ }
+ else
+ {
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THC_pointwiseApply3(state, gradInput, input, gradOutput, LeakyReLUUpdateGradInput<real>(negval));
+ }
+
+ THCudaCheck(cudaGetLastError());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/LogSigmoid.cu b/lib/THCUNN/generic/LogSigmoid.cu
new file mode 100644
index 0000000..4a6a4c9
--- /dev/null
+++ b/lib/THCUNN/generic/LogSigmoid.cu
@@ -0,0 +1,30 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/LogSigmoid.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(LogSigmoid_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *buffer)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, output);
+ THCTensor_(resizeAs)(state, output, input);
+ THC_pointwiseApply2(state, output, input, logSigmoid_updateOutput_functor<real>());
+}
+
+void THNN_(LogSigmoid_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *buffer)
+{
+ THCUNN_assertSameGPU_generic(state, 3, input, gradOutput, gradInput);
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THC_pointwiseApply3(state, gradInput, input, gradOutput, logSigmoid_updateGradInput_functor<real>());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/LogSoftMax.cu b/lib/THCUNN/generic/LogSoftMax.cu
new file mode 100644
index 0000000..5755ea2
--- /dev/null
+++ b/lib/THCUNN/generic/LogSoftMax.cu
@@ -0,0 +1,238 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/LogSoftMax.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(LogSoftMax_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, output);
+
+ THCTensor_(resizeAs)(state, output, input);
+
+ bool spatial = false;
+ int batchSize = 1;
+ int classSize = 0;
+ int height = 0;
+ int width = 0;
+
+ int ndims = THCTensor_(nDimension)(state, input);
+
+ if (ndims == 1)
+ {
+ classSize = THCTensor_(size)(state, input, 0);
+ input = THCTensor_(newContiguous)(state, input);
+ }
+ else if (ndims == 2)
+ {
+ batchSize = THCTensor_(size)(state, input, 0);
+ classSize = THCTensor_(size)(state, input, 1);
+ input = THCTensor_(newContiguous)(state, input);
+ }
+ else if (ndims == 3)
+ {
+ spatial = true;
+ classSize = THCTensor_(size)(state, input, 0);
+ height = THCTensor_(size)(state, input, 1);
+ width = THCTensor_(size)(state, input, 2);
+
+ // create contiguous tensor with cuda layout from tensor with torch layout
+ // C x H x W -> W x H x C
+ THCTensor_(transpose)(state, input, input, 0, 2);
+ // W x H x C -> H x W x C
+ THCTensor_(transpose)(state, input, input, 0, 1);
+ THCTensor *transposedInput = THCTensor_(newContiguous)(state, input);
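+ // the two transposes below undo the view changes so the caller's tensor is left untouched;
+ // transposedInput keeps the contiguous H x W x C copy used by the kernel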
+ THCTensor_(transpose)(state, input, input, 0, 1);
+ THCTensor_(transpose)(state, input, input, 0, 2);
+ input = transposedInput;
+ }
+ else if (ndims == 4)
+ {
+ spatial = true;
+ batchSize = THCTensor_(size)(state, input, 0);
+ classSize = THCTensor_(size)(state, input, 1);
+ height = THCTensor_(size)(state, input, 2);
+ width = THCTensor_(size)(state, input, 3);
+
+ // create contiguous tensor with cuda layout from tensor with torch layout
+ // B x C x H x W -> B x W x H x C
+ THCTensor_(transpose)(state, input, input, 1, 3);
+ // B x W x H x C -> B x H x W x C
+ THCTensor_(transpose)(state, input, input, 1, 2);
+ THCTensor *transposedInput = THCTensor_(newContiguous)(state, input);
+ THCTensor_(transpose)(state, input, input, 1, 2);
+ THCTensor_(transpose)(state, input, input, 1, 3);
+ input = transposedInput;
+ }
+ else
+ {
+ THError("1D, 2D, 3D or 4D Tensor expected");
+ }
+
+ if (!spatial)
+ {
+ dim3 grid(batchSize);
+ dim3 block(1024);
+
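+ // one block per batch sample; the dynamic shared memory (one accreal per thread) presumably
+ // backs the block-wide max/sum reductions inside the kernel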
+ cunn_LogSoftMax_updateOutput_kernel<2, real, accreal>
+ <<<grid, block, block.x * sizeof(accreal), THCState_getCurrentStream(state)>>>(
+ THCTensor_(data)(state, output),
+ THCTensor_(data)(state, input),
+ classSize
+ );
+ }
+ else
+ {
+ dim3 grid(batchSize);
+ dim3 block(1024);
+
+ cunn_SpatialLogSoftMax_updateOutput_kernel<real, accreal>
+ <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+ THCTensor_(data)(state, output),
+ THCTensor_(data)(state, input),
+ classSize, height, width
+ );
+ }
+
+ cudaError errcode = cudaGetLastError();
+ if (errcode != cudaSuccess)
+ {
+ THError(cudaGetErrorString(errcode));
+ }
+
+ THCTensor_(free)(state, input);
+}
+
+void THNN_(LogSoftMax_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *output)
+{
+ THCUNN_assertSameGPU_generic(state, 3, output, gradOutput, gradInput);
+
+ THCTensor_(resizeAs)(state, gradInput, output);
+
+ bool spatial = false;
+ int batchSize = 1;
+ int classSize = 0;
+ int height = 0;
+ int width = 0;
+
+ int ndims = THCTensor_(nDimension)(state, input);
+
+ if (ndims == 1)
+ {
+ classSize = THCTensor_(size)(state, gradInput, 0);
+ output = THCTensor_(newContiguous)(state, output);
+ gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+ }
+ else if (ndims == 2)
+ {
+ batchSize = THCTensor_(size)(state, gradInput, 0);
+ classSize = THCTensor_(size)(state, gradInput, 1);
+ output = THCTensor_(newContiguous)(state, output);
+ gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+ }
+ else if (ndims == 3)
+ {
+ spatial = true;
+ classSize = THCTensor_(size)(state, input, 0);
+ height = THCTensor_(size)(state, input, 1);
+ width = THCTensor_(size)(state, input, 2);
+
+ // create contiguous tensor with cuda layout from tensor with torch layout
+ // C x H x W -> W x H x C
+ THCTensor_(transpose)(state, output, output, 0, 2);
+ // W x H x C -> H x W x C
+ THCTensor_(transpose)(state, output, output, 0, 1);
+ THCTensor *transposedOutput = THCTensor_(newContiguous)(state, output);
+ THCTensor_(transpose)(state, output, output, 0, 1);
+ THCTensor_(transpose)(state, output, output, 0, 2);
+ output = transposedOutput;
+
+ // create contiguous tensor with cuda layout from tensor with torch layout
+ // C x H x W -> W x H x C
+ THCTensor_(transpose)(state, gradOutput, gradOutput, 0, 2);
+ // W x H x C -> H x W x C
+ THCTensor_(transpose)(state, gradOutput, gradOutput, 0, 1);
+ THCTensor *transposedGradOutput = THCTensor_(newContiguous)(state, gradOutput);
+ THCTensor_(transpose)(state, gradOutput, gradOutput, 0, 1);
+ THCTensor_(transpose)(state, gradOutput, gradOutput, 0, 2);
+ gradOutput = transposedGradOutput;
+ }
+ else if (ndims == 4)
+ {
+ spatial = true;
+ batchSize = THCTensor_(size)(state, gradInput, 0);
+ classSize = THCTensor_(size)(state, input, 1);
+ height = THCTensor_(size)(state, input, 2);
+ width = THCTensor_(size)(state, input, 3);
+
+ // create contiguous tensor with cuda layout from tensor with torch layout
+ // B x C x H x W -> B x W x H x C
+ THCTensor_(transpose)(state, output, output, 1, 3);
+ // B x W x H x C -> B x H x W x C
+ THCTensor_(transpose)(state, output, output, 1, 2);
+ THCTensor *transposedOutput = THCTensor_(newContiguous)(state, output);
+ THCTensor_(transpose)(state, output, output, 1, 2);
+ THCTensor_(transpose)(state, output, output, 1, 3);
+ output = transposedOutput;
+
+ // create contiguous tensor with cuda layout from tensor with torch layout
+ // B x C x H x W -> B x W x H x C
+ THCTensor_(transpose)(state, gradOutput, gradOutput, 1, 3);
+ // B x W x H x C -> B x H x W x C
+ THCTensor_(transpose)(state, gradOutput, gradOutput, 1, 2);
+ THCTensor *transposedGradOutput = THCTensor_(newContiguous)(state, gradOutput);
+ THCTensor_(transpose)(state, gradOutput, gradOutput, 1, 2);
+ THCTensor_(transpose)(state, gradOutput, gradOutput, 1, 3);
+ gradOutput = transposedGradOutput;
+ }
+ else
+ {
+ THError("1D, 2D, 3D or 4D Tensor expected");
+ }
+
+ if (!spatial)
+ {
+ dim3 grid(batchSize);
+ dim3 block(1024);
+
+ cunn_LogSoftMax_updateGradInput_kernel<2, real, accreal>
+ <<<grid, block, block.x * sizeof(accreal), THCState_getCurrentStream(state)>>>(
+ THCTensor_(data)(state, gradInput),
+ THCTensor_(data)(state, output),
+ THCTensor_(data)(state, gradOutput),
+ classSize
+ );
+ }
+ else
+ {
+ dim3 grid(batchSize);
+ dim3 block(1024);
+
+ cunn_SpatialLogSoftMax_updateGradInput_kernel<real, accreal>
+ <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+ THCTensor_(data)(state, gradInput),
+ THCTensor_(data)(state, output),
+ THCTensor_(data)(state, gradOutput),
+ classSize, height, width
+ );
+ }
+
+ cudaError errcode = cudaGetLastError();
+ if (errcode != cudaSuccess)
+ {
+ THError(cudaGetErrorString(errcode));
+ }
+
+ THCTensor_(free)(state, gradOutput);
+ THCTensor_(free)(state, output);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/LookupTable.cu b/lib/THCUNN/generic/LookupTable.cu
new file mode 100644
index 0000000..2027425
--- /dev/null
+++ b/lib/THCUNN/generic/LookupTable.cu
@@ -0,0 +1,157 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/LookupTable.cu"
+#else
+
+void THNN_(LookupTable_accGradParameters)(
+ THCState *state,
+ THCIndexTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCIndexTensor *count,
+ THCIndexTensor *sorted,
+ THCIndexTensor *indices,
+ bool scaleGradByFreq,
+ int paddingValue,
+ real scale)
+{
+ THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, gradWeight, sorted, indices);
+ if (!(THCIndexTensor_(isContiguous)(state, input) &&
+ THCTensor_(isContiguous)(state, gradOutput) &&
+ THCTensor_(isContiguous)(state, gradWeight)))
+ {
+ THError("Tensors must be contiguous");
+ }
+
+ int nDim = THCIndexTensor_(nDimension)(state, input);
+ if (nDim != 1 && nDim != 2)
+ THError("input must be a vector or matrix");
+
+ long numel = THCIndexTensor_(nElement)(state, input);
+ long stride = gradWeight->stride[0];
+
+ cudaStream_t stream = THCState_getCurrentStream(state);
+
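+ // fast path for small index counts: skip the sort below and accumulate gradients directly,
+ // parallelizing over the feature (stride) dimension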
+ if (numel <= 768 && !scaleGradByFreq) {
+ cunn_LookupTable_accGradParametersKernelByFeature<<<DIVUP(stride,4), 128, 0, stream>>>(
+ THCIndexTensor_(data)(state, input),
+ THCTensor_(data)(state, gradOutput),
+ THCTensor_(data)(state, gradWeight),
+ scale,
+ numel,
+ stride,
+ paddingValue);
+ THCudaCheck(cudaGetLastError());
+ return;
+ }
+
+ THLongStorage *inputSize = THCIndexTensor_(newSizeOf)(state, input);
+ THCIndexTensor_(resize)(state, sorted, inputSize, NULL);
+ THCIndexTensor_(resize)(state, indices, inputSize, NULL);
+ THLongStorage_free(inputSize);
+
+ // Sort the inputs into sorted with the corresponding indices
+ THCIndexTensor_(sort)(state, sorted, indices, input, 0, 0);
+
+ THCIndex_t *sorted_data = THCIndexTensor_(data)(state, sorted);
+ THCIndex_t *indices_data = THCIndexTensor_(data)(state, indices);
+ THCIndex_t *count_data = NULL;
+
+ if (scaleGradByFreq)
+ {
+ THCIndexTensor_(resizeAs)(state, count, input);
+ count_data = THCIndexTensor_(data)(state, count);
+
+ thrust::device_ptr<THCIndex_t> sorted_ptr(sorted_data);
+ thrust::device_ptr<THCIndex_t> count_ptr(count_data);
+
+ // Compute an increasing sequence per unique item in sorted:
+ // sorted: 2 5 5 5 7 7 8 9 9
+ // count: 1 1 2 3 1 2 1 1 2
+ thrust::inclusive_scan_by_key(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par.on(THCState_getCurrentStream(state)),
+#endif
+ sorted_ptr,
+ sorted_ptr + numel,
+ thrust::make_constant_iterator(1),
+ count_ptr
+ );
+
+ // Take the maximum of each count per unique key in reverse:
+ // sorted: 2 5 5 5 7 7 8 9 9
+ // count: 1 3 3 3 2 2 1 2 2
+ thrust::inclusive_scan_by_key(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par.on(THCState_getCurrentStream(state)),
+#endif
+ thrust::make_reverse_iterator(sorted_ptr + numel),
+ thrust::make_reverse_iterator(sorted_ptr),
+ thrust::make_reverse_iterator(count_ptr + numel),
+ thrust::make_reverse_iterator(count_ptr + numel),
+ thrust::equal_to<long>(),
+ thrust::maximum<long>()
+ );
+ }
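+ // count now holds, for every element of sorted, how many times that index occurs in total;
+ // the kernel below can divide the accumulated gradient by it when scaleGradByFreq is set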
+
+ dim3 grid(DIVUP(numel,4), DIVUP(stride,128));
+ dim3 block(32, 4);
+ cunn_LookupTable_accGradParametersKernel<real, accreal><<<grid, block, 0, stream>>>(
+ sorted_data,
+ indices_data,
+ THCTensor_(data)(state, gradOutput),
+ THCTensor_(data)(state, gradWeight),
+ count_data,
+ scale,
+ numel,
+ stride,
+ paddingValue
+ );
+ THCudaCheck(cudaGetLastError());
+}
+
+void THNN_(LookupTable_renorm)(
+ THCState *state,
+ THCIndexTensor *idx,
+ THCTensor *weight,
+ real maxNorm,
+ real normType)
+{
+ THCUNN_assertSameGPU_generic(state, 2, idx, weight);
+ if (!(THCIndexTensor_(isContiguous)(state, idx) &&
+ THCTensor_(isContiguous)(state, weight)))
+ {
+ THError("Tensors must be contiguous");
+ }
+ if (THCIndexTensor_(nDimension)(state, idx) != 1)
+ THError("idx must be a vector");
+ if (normType <= 0)
+ THError("non-positive-norm not supported");
+
+ THCIndex_t numel = THCIndexTensor_(nElement)(state, idx);
+ long stride = weight->stride[0];
+
+ // get the unique indices
+ thrust::device_ptr<real> weight_ptr(THCTensor_(data)(state, weight));
+ thrust::device_ptr<THCIndex_t> idx_ptr(THCIndexTensor_(data)(state, idx));
+ thrust::device_ptr<THCIndex_t> end_ptr = thrust::unique(idx_ptr, idx_ptr+numel);
+ numel = end_ptr - idx_ptr;
+
+ pow_v<real, accreal> unary_pow(normType);
+ thrust::plus<accreal> binary_plus;
+ // numel << stride, since idx usually contains sparse row indices
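+ // for each unique row: reduce it with pow_v (presumably |w|^normType), take the normType-th
+ // root, and rescale the row when the resulting norm exceeds maxNorm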
+ for (long i = 0; i < numel; i++)
+ {
+ THCIndex_t k = idx_ptr[i] - TH_INDEX_BASE;
+ thrust::device_ptr<real> row_ptr = weight_ptr + k * stride;
+ accreal norm = thrust::transform_reduce(row_ptr, row_ptr + stride,
+ unary_pow, 0, binary_plus);
+ norm = std::pow(norm, (accreal) (1.0 / normType));
+ if (norm > ScalarConvert<real, accreal>::to(maxNorm))
+ {
+ multiply_s<real> unary_mul(ScalarConvert<accreal, real>::to(maxNorm / (norm + 1e-7)));
+ thrust::transform(row_ptr, row_ptr + stride, row_ptr, unary_mul);
+ }
+ }
+}
+
+#endif
diff --git a/lib/THCUNN/generic/MSECriterion.cu b/lib/THCUNN/generic/MSECriterion.cu
new file mode 100644
index 0000000..f4a38de
--- /dev/null
+++ b/lib/THCUNN/generic/MSECriterion.cu
@@ -0,0 +1,75 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/MSECriterion.cu"
+#else
+
+void THNN_(MSECriterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *output,
+ bool sizeAverage)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, target);
+ THArgCheck(THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2,
+ "input and target need to have the same number of elements"
+ );
+
+ long size = THCTensor_(nElement)(state, input);
+
+ input = THCTensor_(newContiguous)(state, input);
+ target = THCTensor_(newContiguous)(state, target);
+
+ thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+ thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
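+ // accumulate sum_i (x_i - t_i)^2; inner_product uses mse_functor as the element-wise "multiply"
+ // and runs on the current stream when built against CUDA >= 7.0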
+ accreal sum = thrust::inner_product(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par.on(THCState_getCurrentStream(state)),
+#endif
+ input_data, input_data+size, target_data, (accreal) 0,
+ thrust::plus<accreal>(), mse_functor<real, accreal>());
+
+ if (sizeAverage)
+ sum /= size;
+
+ THCTensor_(free)(state, input);
+ THCTensor_(free)(state, target);
+
+ THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
+}
+
+void THNN_(MSECriterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *gradInput,
+ bool sizeAverage)
+{
+ THCUNN_assertSameGPU_generic(state, 3, input, target, gradInput);
+ THArgCheck(THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2,
+ "input and target need to have the same number of elements"
+ );
+
+ long size = THCTensor_(nElement)(state, input);
+ accreal norm = sizeAverage ? (accreal)(2)/size : (accreal)(2);
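+ // d/dx (x - t)^2 = 2*(x - t); norm folds in the optional 1/N averaging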
+
+ input = THCTensor_(newContiguous)(state, input);
+ target = THCTensor_(newContiguous)(state, target);
+
+ THCTensor_(resizeAs)(state, gradInput, input);
+
+ thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+ thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
+ thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
+
+ thrust::transform(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par.on(THCState_getCurrentStream(state)),
+#endif
+ input_data, input_data+size, target_data, gradInput_data,
+ mse_updateGradInput_functor<real, accreal>(norm));
+
+ THCTensor_(free)(state, input);
+ THCTensor_(free)(state, target);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/MarginCriterion.cu b/lib/THCUNN/generic/MarginCriterion.cu
new file mode 100644
index 0000000..4b7e85f
--- /dev/null
+++ b/lib/THCUNN/generic/MarginCriterion.cu
@@ -0,0 +1,64 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/MarginCriterion.cu"
+#else
+
+void THNN_(MarginCriterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *output,
+ bool sizeAverage,
+ real margin)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, target);
+
+ long size = THCTensor_(nElement)(state, input);
+
+ input = THCTensor_(newContiguous)(state, input);
+ target = THCTensor_(newContiguous)(state, target);
+
+ thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+ thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
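+ // margin_functor presumably computes the hinge term max(0, margin - x*y) for each element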
+ accreal sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal) 0, thrust::plus<accreal>(),
+ margin_functor<real, accreal>(ScalarConvert<real, accreal>::to(margin)));
+
+ if (sizeAverage)
+ sum /= size;
+
+ THCTensor_(free)(state, input);
+ THCTensor_(free)(state, target);
+
+ THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
+}
+
+
+void THNN_(MarginCriterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *gradInput,
+ bool sizeAverage,
+ real margin)
+{
+ THCUNN_assertSameGPU_generic(state, 3, input, target, gradInput);
+
+ long size = THCTensor_(nElement)(state, input);
+ accreal norm = sizeAverage ? accreal(1)/size : accreal(1);
+
+ input = THCTensor_(newContiguous)(state, input);
+ target = THCTensor_(newContiguous)(state, target);
+
+ THCTensor_(resizeAs)(state, gradInput, input);
+
+ thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+ thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
+ thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
+
+ thrust::transform(input_data, input_data+size, target_data, gradInput_data,
+ margin_updateGradInput_functor<real, accreal>(ScalarConvert<real, accreal>::to(margin), norm));
+
+ THCTensor_(free)(state, input);
+ THCTensor_(free)(state, target);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/MultiLabelMarginCriterion.cu b/lib/THCUNN/generic/MultiLabelMarginCriterion.cu
new file mode 100644
index 0000000..c9a56c4
--- /dev/null
+++ b/lib/THCUNN/generic/MultiLabelMarginCriterion.cu
@@ -0,0 +1,111 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/MultiLabelMarginCriterion.cu"
+#else
+
+void THNN_(MultiLabelMarginCriterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCIndexTensor *target,
+ THCTensor *output,
+ THCTensor *istarget,
+ bool sizeaverage)
+{
+ input = THCTensor_(newContiguous)(state, input);
+ target = THCIndexTensor_(newContiguous)(state, target);
+ istarget = THCTensor_(newContiguous)(state, istarget);
+ THCTensor_(resizeAs)(state, istarget, input);
+
+ if(input->nDimension == 1)
+ {
+ THCTensor_(resize1d)(state, output, 1);
+
+ dim3 blocks(1);
+ dim3 threads(MULTILABELMARGIN_THREADS);
+
+ cunn_MultiLabelMarginCriterion_updateOutput_kernel<real, accreal> <<<blocks,threads>>>(
+ THCTensor_(data)(state, output),
+ THCTensor_(data)(state, input),
+ THCIndexTensor_(data)(state, target),
+ THCTensor_(data)(state, istarget),
+ 1, input->size[0],
+ sizeaverage
+ );
+ THCudaCheck(cudaGetLastError());
+ }
+ else if(input->nDimension == 2)
+ {
+ THCTensor *output_tmp = THCTensor_(newWithSize1d)(state, input->size[0]);
+
+ dim3 blocks(input->size[0]);
+ dim3 threads(MULTILABELMARGIN_THREADS);
+
+ cunn_MultiLabelMarginCriterion_updateOutput_kernel<real, accreal> <<<blocks,threads>>>(
+ THCTensor_(data)(state, output_tmp),
+ THCTensor_(data)(state, input),
+ THCIndexTensor_(data)(state, target),
+ THCTensor_(data)(state, istarget),
+ input->size[0], input->size[1],
+ sizeaverage
+ );
+ THCudaCheck(cudaGetLastError());
+ THCTensor_(resize1d)(state, output, 1);
+ THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(THCTensor_(sumall)(state, output_tmp)));
+ THCTensor_(free)(state, output_tmp);
+ }
+ else
+ THError("vector or matrix expected");
+
+ THCTensor_(free)(state, input);
+ THCIndexTensor_(free)(state, target);
+ THCTensor_(free)(state, istarget);
+}
+
+void THNN_(MultiLabelMarginCriterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCIndexTensor *target,
+ THCTensor *gradInput,
+ THCTensor *istarget,
+ bool sizeaverage)
+{
+ input = THCTensor_(newContiguous)(state, input);
+ target = THCIndexTensor_(newContiguous)(state, target);
+ istarget = THCTensor_(newContiguous)(state, istarget);
+ THCTensor_(resizeAs)(state, gradInput, input);
+
+ if(gradInput->nDimension == 1)
+ {
+ dim3 blocks(1);
+ dim3 threads(MULTILABELMARGIN_THREADS);
+
+ cunn_MultiLabelMarginCriterion_updateGradInput_kernel<real, accreal> <<<blocks,threads>>>(THCTensor_(data)(state, gradInput),
+ THCTensor_(data)(state, input),
+ THCIndexTensor_(data)(state, target),
+ THCTensor_(data)(state, istarget),
+ 1, gradInput->size[0],
+ sizeaverage);
+
+ }
+ else if(gradInput->nDimension == 2)
+ {
+ dim3 blocks(gradInput->size[0]);
+ dim3 threads(MULTILABELMARGIN_THREADS);
+
+ cunn_MultiLabelMarginCriterion_updateGradInput_kernel<real, accreal> <<<blocks,threads>>>(THCTensor_(data)(state, gradInput),
+ THCTensor_(data)(state, input),
+ THCIndexTensor_(data)(state, target),
+ THCTensor_(data)(state, istarget),
+ gradInput->size[0], gradInput->size[1],
+ sizeaverage);
+ }
+ else
+ THError("vector or matrix expected");
+
+ THCudaCheck(cudaGetLastError());
+
+ THCTensor_(free)(state, input);
+ THCIndexTensor_(free)(state, target);
+ THCTensor_(free)(state, istarget);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/MultiMarginCriterion.cu b/lib/THCUNN/generic/MultiMarginCriterion.cu
new file mode 100644
index 0000000..303c9a9
--- /dev/null
+++ b/lib/THCUNN/generic/MultiMarginCriterion.cu
@@ -0,0 +1,181 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/MultiMarginCriterion.cu"
+#else
+
+void THNN_(MultiMarginCriterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCIndexTensor *target,
+ THCTensor *output,
+ bool sizeAverage,
+ int p,
+ THCTensor *weights,
+ real margin)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, target);
+ input = THCTensor_(newContiguous)(state, input);
+ if(weights)
+ weights = THCTensor_(newContiguous)(state, weights);
+ if (input->nDimension == 1)
+ {
+ dim3 blocks(1);
+ dim3 threads(MULTIMARGIN_THREADS);
+ if (p == 1)
+ {
+ cunn_MultiMarginCriterion_updateOutput_kernel<1, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
+ THCTensor_(data)(state, output),
+ THCTensor_(data)(state, input),
+ THCIndexTensor_(data)(state, target),
+ weights ? THCTensor_(data)(state, weights) : NULL,
+ 1, input->size[0],
+ sizeAverage,
+ margin
+ );
+ }
+ else if (p == 2)
+ {
+ cunn_MultiMarginCriterion_updateOutput_kernel<2, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
+ THCTensor_(data)(state, output),
+ THCTensor_(data)(state, input),
+ THCIndexTensor_(data)(state, target),
+ weights ? THCTensor_(data)(state, weights) : NULL,
+ 1, input->size[0],
+ sizeAverage,
+ margin
+ );
+ }
+ THCudaCheck(cudaGetLastError());
+ }
+ else if (input->nDimension == 2)
+ {
+ THCTensor *output_ = THCTensor_(newWithSize1d)(state, input->size[0]); // tmp output buffer
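+ // each block computes the loss of one sample into output_; the per-sample results are summed below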
+ dim3 blocks(input->size[0]);
+ dim3 threads(MULTIMARGIN_THREADS);
+ if (p == 1)
+ {
+ cunn_MultiMarginCriterion_updateOutput_kernel<1, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
+ THCTensor_(data)(state, output_),
+ THCTensor_(data)(state, input),
+ THCIndexTensor_(data)(state, target),
+ weights ? THCTensor_(data)(state, weights) : NULL,
+ input->size[0], input->size[1],
+ sizeAverage,
+ margin
+ );
+ }
+ else if (p == 2)
+ {
+ cunn_MultiMarginCriterion_updateOutput_kernel<2, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
+ THCTensor_(data)(state, output_),
+ THCTensor_(data)(state, input),
+ THCIndexTensor_(data)(state, target),
+ weights ? THCTensor_(data)(state, weights) : NULL,
+ input->size[0], input->size[1],
+ sizeAverage,
+ margin
+ );
+ }
+ THCudaCheck(cudaGetLastError());
+ accreal sum = THCTensor_(sumall)(state, output_);
+ THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
+ THCTensor_(free)(state, output_);
+ }
+ else
+ {
+ THError("vector or matrix expected");
+ }
+
+ THCTensor_(free)(state, input);
+ if(weights)
+ THCTensor_(free)(state, weights);
+}
+
+void THNN_(MultiMarginCriterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCIndexTensor *target,
+ THCTensor *gradInput,
+ bool sizeAverage,
+ int p,
+ THCTensor *weights,
+ real margin)
+{
+ THCUNN_assertSameGPU_generic(state, 3, input, gradInput, target);
+ input = THCTensor_(newContiguous)(state, input);
+ THCTensor_(resizeAs)(state, gradInput, input);
+ if(weights)
+ weights = THCTensor_(newContiguous)(state, weights);
+
+ if (input->nDimension == 1)
+ {
+ dim3 blocks(1);
+ dim3 threads(MULTIMARGIN_THREADS);
+
+ if (p == 1)
+ {
+ cunn_MultiMarginCriterion_updateGradInput_kernel<1, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
+ THCTensor_(data)(state, gradInput),
+ THCTensor_(data)(state, input),
+ THCIndexTensor_(data)(state, target),
+ weights ? THCTensor_(data)(state, weights) : NULL,
+ 1, gradInput->size[0],
+ sizeAverage,
+ margin
+ );
+ }
+ else if (p == 2)
+ {
+ cunn_MultiMarginCriterion_updateGradInput_kernel<2, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
+ THCTensor_(data)(state, gradInput),
+ THCTensor_(data)(state, input),
+ THCIndexTensor_(data)(state, target),
+ weights ? THCTensor_(data)(state, weights) : NULL,
+ 1, gradInput->size[0],
+ sizeAverage,
+ margin
+ );
+ }
+ THCudaCheck(cudaGetLastError());
+ }
+ else if (input->nDimension == 2)
+ {
+ dim3 blocks(gradInput->size[0]);
+ dim3 threads(MULTIMARGIN_THREADS);
+
+ if (p == 1)
+ {
+ cunn_MultiMarginCriterion_updateGradInput_kernel<1, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
+ THCTensor_(data)(state, gradInput),
+ THCTensor_(data)(state, input),
+ THCIndexTensor_(data)(state, target),
+ weights ? THCTensor_(data)(state, weights) : NULL,
+ gradInput->size[0], gradInput->size[1],
+ sizeAverage,
+ margin
+ );
+ }
+ else if (p == 2)
+ {
+ cunn_MultiMarginCriterion_updateGradInput_kernel<2, real, accreal> <<<blocks,threads, 0, THCState_getCurrentStream(state)>>>(
+ THCTensor_(data)(state, gradInput),
+ THCTensor_(data)(state, input),
+ THCIndexTensor_(data)(state, target),
+ weights ? THCTensor_(data)(state, weights) : NULL,
+ gradInput->size[0], gradInput->size[1],
+ sizeAverage,
+ margin
+ );
+ }
+ THCudaCheck(cudaGetLastError());
+ }
+ else
+ {
+ THError("vector or matrix expected");
+ }
+
+ THCTensor_(free)(state, input);
+ if(weights)
+ THCTensor_(free)(state, weights);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/PReLU.cu b/lib/THCUNN/generic/PReLU.cu
new file mode 100644
index 0000000..8f2f670
--- /dev/null
+++ b/lib/THCUNN/generic/PReLU.cu
@@ -0,0 +1,154 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/PReLU.cu"
+#else
+
+void THNN_(PReLU_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ long nOutputPlane)
+{
+ THCTensor_(resizeAs)(state, output, input);
+
+ real *w = THCTensor_(data)(state, weight);
+
+ if (nOutputPlane == 0)
+ {
+ THC_pointwiseApply2(state, output, input, PReLUUpdateOutput<real>(w));
+ }
+ else
+ {
+ int ndim = THCTensor_(nDimension)(state, input);
+ input = THCTensor_(newContiguous)(state, input);
+
+ int n = THCTensor_(nElement)(state, input);
+ int mapSize = 1;
+ if (ndim == 3)
+ mapSize = (input->size[1] * input->size[2]);
+ else if (ndim == 4)
+ mapSize = (input->size[2] * input->size[3]);
+ int nElemsPerSample = nOutputPlane * mapSize;
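+ // mapSize = spatial positions per feature map; nElemsPerSample presumably lets the kernel map a
+ // flat element index back to its channel so it can pick the matching weight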
+ preluForward<<<GET_BLOCKS(n), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
+ THCTensor_(data)(state, output),
+ THCTensor_(data)(state, input),
+ w,
+ n, nElemsPerSample, mapSize
+ );
+ THCudaCheck(cudaGetLastError());
+ THCTensor_(free)(state, input);
+ }
+}
+
+void THNN_(PReLU_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ long nOutputPlane)
+{
+ THCTensor_(resizeAs)(state, gradInput, input);
+
+ real *w = THCTensor_(data)(state, weight);
+ if (nOutputPlane == 0)
+ {
+ THC_pointwiseApply3(state, gradInput, gradOutput, input, PReLUUpdateGradInput<real>(w));
+ }
+ else
+ {
+ int ndim = THCTensor_(nDimension)(state, input);
+ input = THCTensor_(newContiguous)(state, input);
+ gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+ int n = THCTensor_(nElement)(state, input);
+ int mapSize = 1;
+ if (ndim == 3)
+ mapSize = (input->size[1] * input->size[2]);
+ else if (ndim == 4)
+ mapSize = (input->size[2] * input->size[3]);
+ int nElemsPerSample = nOutputPlane * mapSize;
+ preluBackward<<<GET_BLOCKS(n), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
+ THCTensor_(data)(state, gradInput),
+ THCTensor_(data)(state, input),
+ w,
+ THCTensor_(data)(state, gradOutput),
+ n, nElemsPerSample, mapSize
+ );
+ THCudaCheck(cudaGetLastError());
+ THCTensor_(free)(state, input);
+ THCTensor_(free)(state, gradOutput);
+ }
+}
+
+void THNN_(PReLU_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ THCTensor *gradWeight,
+ THCTensor *gradWeightBuf,
+ THCTensor *gradWeightBuf2,
+ long nOutputPlane,
+ real scale)
+{
+ // use grad input for temporary storage, then call updateGradInput again
+
+ if (nOutputPlane == 0)
+ {
+ THC_pointwiseApply3(state, gradInput, input, gradOutput, PReLUAccGradParametersShared<real>());
+
+ // introduces a sync point
+ real sum = ScalarConvert<accreal, real>::to(THCTensor_(sumall)(state, gradInput));
+ real w = THCTensor_(get1d)(state, gradWeight, 0);
+ THCTensor_(set1d)(state, gradWeight, 0, w + sum * scale);
+
+ // restore gradInput
+ THNN_(PReLU_updateGradInput)(state, input, gradOutput, gradInput, weight, nOutputPlane);
+ }
+ else
+ {
+ int ndim = THCTensor_(nDimension)(state, input);
+
+ if (ndim == 1)
+ {
+ THC_pointwiseApply3(state, gradWeight, input, gradOutput, PReLUAccGradParameters1to1<real>(scale));
+ }
+ else
+ {
+ THC_pointwiseApply3(state, gradInput, input, gradOutput, PReLUAccGradParameters<real>(scale));
+ THCTensor *sumbuf = gradWeightBuf2;
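+ // reduce gradInput (temporarily holding the per-element dL/dw contributions) to one value per
+ // output plane, then accumulate into gradWeight with the given scale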
+ THCTensor_(resizeAs)(state, gradWeightBuf, gradWeight);
+
+ if (ndim == 2)
+ {
+ THCTensor_(sum)(state, gradWeightBuf, gradInput, 0);
+ THCTensor_(cadd)(state, gradWeight, gradWeight, scale, gradWeightBuf);
+ }
+ else if (ndim == 3)
+ {
+ THCTensor *buffer = THCTensor_(newContiguous)(state, gradInput);
+ THCTensor_(resize2d)(state, buffer, nOutputPlane, input->size[1] * input->size[2]);
+ THCTensor_(sum)(state, gradWeightBuf, buffer, 1);
+ THCTensor_(cadd)(state, gradWeight, gradWeight, scale, gradWeightBuf);
+ THCTensor_(free)(state, buffer);
+ }
+ else if (ndim == 4)
+ {
+ THCTensor *buffer = THCTensor_(newContiguous)(state, gradInput);
+ THCTensor_(resize3d)(state, buffer, input->size[0], nOutputPlane, input->size[2] * input->size[3]);
+ THCTensor_(resize2d)(state, sumbuf, input->size[0], nOutputPlane);
+ THCTensor_(sum)(state, sumbuf, buffer, 2);
+ THCTensor_(sum)(state, gradWeightBuf, sumbuf, 0);
+ THCTensor_(cadd)(state, gradWeight, gradWeight, scale, gradWeightBuf);
+ THCTensor_(free)(state, buffer);
+ }
+
+ // restore gradInput
+ THNN_(PReLU_updateGradInput)(state, input, gradOutput, gradInput, weight, nOutputPlane);
+ }
+ }
+}
+
+#endif
diff --git a/lib/THCUNN/generic/RReLU.cu b/lib/THCUNN/generic/RReLU.cu
new file mode 100644
index 0000000..93ad99b
--- /dev/null
+++ b/lib/THCUNN/generic/RReLU.cu
@@ -0,0 +1,108 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/RReLU.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(RReLU_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *noise,
+ double lower,
+ double upper,
+ bool train,
+ bool inplace,
+ void *generator)
+{
+ THCUNN_assertSameGPU_generic(state, 3, input, output, noise);
+ struct curandStateMtgp32* gen_states = THCRandom_generatorStates(state);
+
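+ // training: sample a per-element negative slope from [lower, upper] and keep it in noise;
+ // evaluation: use the fixed expected slope (lower + upper) / 2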
+ if (train)
+ {
+ input = THCTensor_(newContiguous)(state, input);
+ THCTensor_(resizeAs)(state, noise, input);
+ real *input_data = THCTensor_(data)(state, input);
+ real *noise_data = THCTensor_(data)(state, noise);
+ long n = THCTensor_(nElement)(state, input);
+ if (inplace)
+ {
+ rreluUpdateOutputTrain<<<NUM_BLOCKS(n), BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
+ n, gen_states, input_data, noise_data, input_data, lower, upper);
+ THCTensor_(set)(state, output, input);
+ }
+ else
+ {
+ THCTensor_(resizeAs)(state, output, input);
+ real *output_data = THCTensor_(data)(state, output);
+ rreluUpdateOutputTrain<<<NUM_BLOCKS(n), BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
+ n, gen_states, input_data, noise_data, output_data, lower, upper);
+ }
+ THCudaCheck(cudaGetLastError());
+ THCTensor_(free)(state, input);
+ }
+ else
+ {
+ const real negSlope = ScalarConvert<double, real>::to((lower + upper) / 2);
+ if (inplace)
+ {
+ THC_pointwiseApply1(state, input, RReLUUpdateOutputEvalIP_functor<real>(negSlope));
+ THCTensor_(set)(state, output, input);
+ }
+ else
+ {
+ THCTensor_(resizeAs)(state, output, input);
+ THC_pointwiseApply2(state, output, input, RReLUUpdateOutputEval_functor<real>(negSlope));
+ }
+ }
+}
+
+void THNN_(RReLU_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *noise,
+ double lower,
+ double upper,
+ bool train,
+ bool inplace)
+{
+ THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, gradInput, noise);
+
+ gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+ if (train && upper - lower > 1E-6) // e.g. if upper == lower, RReLU behaves like LeakyReLU
+ {
+ // multiply the gradient by the noise tensor
+ if (inplace)
+ {
+ THCTensor_(cmul)(state, gradOutput, gradOutput, noise);
+ THCTensor_(set)(state, gradInput, gradOutput);
+ }
+ else
+ {
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THCTensor_(cmul)(state, gradInput, gradOutput, noise);
+ }
+ }
+ else
+ {
+ // use constant factor for negative input values
+ const real negSlope = ScalarConvert<double, real>::to((lower + upper) / 2);
+ if (inplace)
+ {
+ THC_pointwiseApply2(state, gradOutput, input, RReLUupdateGradInputEvalIP_functor<real>(negSlope));
+ THCTensor_(set)(state, gradInput, gradOutput);
+ }
+ else
+ {
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THC_pointwiseApply3(state, gradInput, gradOutput, input, RReLUupdateGradInputEval_functor<real>(negSlope));
+ }
+ }
+
+ THCTensor_(free)(state, gradOutput);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/Sigmoid.cu b/lib/THCUNN/generic/Sigmoid.cu
new file mode 100644
index 0000000..7bc5739
--- /dev/null
+++ b/lib/THCUNN/generic/Sigmoid.cu
@@ -0,0 +1,29 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/Sigmoid.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(Sigmoid_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, output);
+ THCTensor_(resizeAs)(state, output, input);
+ THC_pointwiseApply2(state, output, input, sigmoidupdateOutput_functor<real>());
+}
+
+void THNN_(Sigmoid_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *output)
+{
+ THCUNN_assertSameGPU_generic(state, 3, output, gradOutput, gradInput);
+ THCTensor_(resizeAs)(state, gradInput, output);
+ THC_pointwiseApply3(state, gradInput, output, gradOutput, sigmoidupdateGradInput_functor<real>());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SmoothL1Criterion.cu b/lib/THCUNN/generic/SmoothL1Criterion.cu
new file mode 100644
index 0000000..8bf6865
--- /dev/null
+++ b/lib/THCUNN/generic/SmoothL1Criterion.cu
@@ -0,0 +1,79 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SmoothL1Criterion.cu"
+#else
+
+void THNN_(SmoothL1Criterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *output,
+ bool sizeAverage)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, target);
+ THArgCheck(
+ THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2,
+ "input and target need to have the same number of elements"
+ );
+
+ long size = THCTensor_(nElement)(state, input);
+
+ input = THCTensor_(newContiguous)(state, input);
+ target = THCTensor_(newContiguous)(state, target);
+
+ thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+ thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
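+ // smoothl1_functor presumably computes the Huber term: 0.5*(x-t)^2 when |x-t| < 1, |x-t| - 0.5 otherwise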
+ accreal sum = thrust::inner_product(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par.on(THCState_getCurrentStream(state)),
+#endif
+ input_data, input_data+size, target_data, (accreal) 0,
+ thrust::plus<accreal>(), smoothl1_functor<real, accreal>()
+ );
+
+ if (sizeAverage)
+ sum /= size;
+
+ THCTensor_(free)(state, input);
+ THCTensor_(free)(state, target);
+
+ THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
+}
+
+void THNN_(SmoothL1Criterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *gradInput,
+ bool sizeAverage)
+{
+ THCUNN_assertSameGPU_generic(state, 3, input, target, gradInput);
+ THArgCheck(
+ THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2,
+ "input and target need to have the same number of elements"
+ );
+
+ long size = THCTensor_(nElement)(state, input);
+ real norm = ScalarConvert<accreal, real>::to(sizeAverage ? accreal(1)/size : accreal(1));
+
+ input = THCTensor_(newContiguous)(state, input);
+ target = THCTensor_(newContiguous)(state, target);
+
+ THCTensor_(resizeAs)(state, gradInput, input);
+
+ thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+ thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
+ thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
+
+ thrust::transform(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par.on(THCState_getCurrentStream(state)),
+#endif
+ input_data, input_data+size, target_data, gradInput_data,
+ smoothl1_updateGradInput_functor<real>(norm)
+ );
+
+ THCTensor_(free)(state, input);
+ THCTensor_(free)(state, target);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SoftMarginCriterion.cu b/lib/THCUNN/generic/SoftMarginCriterion.cu
new file mode 100644
index 0000000..b5b5329
--- /dev/null
+++ b/lib/THCUNN/generic/SoftMarginCriterion.cu
@@ -0,0 +1,60 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SoftMarginCriterion.cu"
+#else
+
+void THNN_(SoftMarginCriterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *output,
+ int sizeAverage)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, target);
+ accreal sum;
+
+ long size = THCTensor_(nElement)(state, input);
+
+ input = THCTensor_(newContiguous)(state, input);
+ target = THCTensor_(newContiguous)(state, target);
+
+ thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+ thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
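+ // softmargin_functor presumably computes log(1 + exp(-y*x)) per element; the terms are summed here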
+ sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal) 0, thrust::plus<accreal>(), softmargin_functor<real, accreal>());
+
+ if(sizeAverage)
+ sum /= size;
+
+ THCTensor_(free)(state, input);
+ THCTensor_(free)(state, target);
+
+ THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
+}
+
+void THNN_(SoftMarginCriterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *gradInput,
+ int sizeAverage)
+{
+ THCUNN_assertSameGPU_generic(state, 3, input, target, gradInput);
+
+ long size = THCTensor_(nElement)(state, input);
+ accreal norm = (sizeAverage ? 1./size : 1.);
+
+ input = THCTensor_(newContiguous)(state, input);
+ target = THCTensor_(newContiguous)(state, target);
+
+ THCTensor_(resizeAs)(state, gradInput, input);
+
+ thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
+ thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
+ thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
+
+ thrust::transform(input_data, input_data+size, target_data, gradInput_data, softmargin_updateGradInput_functor<real, accreal>(norm));
+
+ THCTensor_(free)(state, input);
+ THCTensor_(free)(state, target);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SoftMax.cu b/lib/THCUNN/generic/SoftMax.cu
new file mode 100644
index 0000000..917e3f0
--- /dev/null
+++ b/lib/THCUNN/generic/SoftMax.cu
@@ -0,0 +1,153 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SoftMax.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(SoftMax_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, output);
+
+ input = THCTensor_(newContiguous)(state, input);
+ THCTensor_(resizeAs)(state, output, input);
+ long batchSize, dim, stride0, stride1 = 1;
+ long blocksY = 1, blocksZ = 1;
+
+ if (input->nDimension == 1)
+ {
+ batchSize = 1;
+ dim = input->size[0];
+ stride0 = 1;
+ }
+ else if (input->nDimension == 2)
+ {
+ batchSize = input->size[0];
+ dim = input->size[1];
+ stride0 = 1;
+ }
+ else if (input->nDimension == 3)
+ {
+ batchSize = 1;
+ dim = input->size[0];
+ blocksY = input->size[1];
+ blocksZ = input->size[2];
+ stride0 = blocksY * blocksZ;
+ stride1 = blocksZ;
+ }
+ else if (input->nDimension == 4)
+ {
+ batchSize = input->size[0];
+ dim = input->size[1];
+ blocksY = input->size[2];
+ blocksZ = input->size[3];
+ stride0 = blocksY * blocksZ;
+ stride1 = blocksZ;
+ }
+ else
+ {
+ THError("1D, 2D, 3D or 4D tensor expected");
+ }
+
+ // When possible, use only a 2D grid of thread blocks to stay compatible with compute capability 2.x devices.
+ if (blocksY * blocksZ < 65536)
+ {
+ blocksY *= blocksZ;
+ blocksZ = 1;
+ if (input->nDimension == 3 || input->nDimension == 4) {
+ stride0 = blocksY * blocksZ;
+ stride1 = blocksZ;
+ }
+ }
+
+ dim3 blocks(batchSize, blocksY, blocksZ);
+ dim3 threads(SOFTMAX_THREADS);
+ cunn_SoftMax_updateOutput_kernel<real, accreal><<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(
+ THCTensor_(data)(state, output),
+ THCTensor_(data)(state, input),
+ batchSize, dim, stride0, stride1
+ );
+ THCudaCheck(cudaGetLastError());
+
+ THCTensor_(free)(state, input);
+}
+
+void THNN_(SoftMax_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *output)
+{
+ THCUNN_assertSameGPU_generic(state, 3, output, gradOutput, gradInput);
+
+ output = THCTensor_(newContiguous)(state, output);
+ gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+ THCTensor_(resizeAs)(state, gradInput, output);
+ long batchSize, dim, stride0, stride1 = 1;
+ long blocksY = 1, blocksZ = 1;
+
+ if (gradInput->nDimension == 1)
+ {
+ batchSize = 1;
+ dim = gradInput->size[0];
+ stride0 = 1;
+ }
+ else if (gradInput->nDimension == 2)
+ {
+ batchSize = gradInput->size[0];
+ dim = gradInput->size[1];
+ stride0 = 1;
+ }
+ else if (gradInput->nDimension == 3)
+ {
+ batchSize = 1;
+ dim = gradInput->size[0];
+ blocksY = gradInput->size[1];
+ blocksZ = gradInput->size[2];
+ stride0 = blocksY * blocksZ;
+ stride1 = blocksZ;
+ }
+ else if (gradInput->nDimension == 4)
+ {
+ batchSize = gradInput->size[0];
+ dim = gradInput->size[1];
+ blocksY = gradInput->size[2];
+ blocksZ = gradInput->size[3];
+ stride0 = blocksY * blocksZ;
+ stride1 = blocksZ;
+ }
+ else
+ {
+ THError("1D, 2D, 3D or 4D tensor expected");
+ }
+
+ // When possible, use only a 2D grid of thread blocks to stay compatible with compute capability 2.x devices.
+ if (blocksY * blocksZ < 65536)
+ {
+ blocksY *= blocksZ;
+ blocksZ = 1;
+ if (gradInput->nDimension == 3 || gradInput->nDimension == 4) {
+ stride0 = blocksY * blocksZ;
+ stride1 = blocksZ;
+ }
+ }
+
+ dim3 blocks(batchSize, blocksY, blocksZ);
+ dim3 threads(SOFTMAX_THREADS);
+ cunn_SoftMax_updateGradInput_kernel<real, accreal><<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(
+ THCTensor_(data)(state, gradInput),
+ THCTensor_(data)(state, output),
+ THCTensor_(data)(state, gradOutput),
+ batchSize, dim, stride0, stride1
+ );
+ THCudaCheck(cudaGetLastError());
+
+ THCTensor_(free)(state, gradOutput);
+ THCTensor_(free)(state, output);
+}
+
+#endif
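
The stride bookkeeping above flattens a contiguous 4D input of shape (batchSize, dim, H, W) so each block reduces over dim while blockIdx.y/z walk the spatial positions; after the 2D-grid collapse for compute capability 2.x devices, stride1 becomes 1 and blockIdx.y runs over H*W. A small host-side check (a sketch, not part of the kernel) that the collapsed indexing still addresses the contiguous layout:

#include <cassert>

int main() {
  long batch = 2, dim = 3, H = 4, W = 5;
  long stride0 = H * W, stride1 = 1;          // values after the 2D-grid collapse
  for (long b = 0; b < batch; ++b)
    for (long d = 0; d < dim; ++d)
      for (long yz = 0; yz < H * W; ++yz)     // yz plays the role of blockIdx.y
        assert(b * dim * stride0 + d * stride0 + yz * stride1 ==
               ((b * dim + d) * H + yz / W) * W + yz % W);  // contiguous (b,d,y,z) offset
  return 0;
}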
diff --git a/lib/THCUNN/generic/SoftPlus.cu b/lib/THCUNN/generic/SoftPlus.cu
new file mode 100644
index 0000000..39794b0
--- /dev/null
+++ b/lib/THCUNN/generic/SoftPlus.cu
@@ -0,0 +1,33 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SoftPlus.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(SoftPlus_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ real beta,
+ real threshold)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, output);
+ THCTensor_(resizeAs)(state, output, input);
+ THC_pointwiseApply2(state, output, input, softPlusupdateOutput_functor<real>(threshold, beta));
+}
+
+void THNN_(SoftPlus_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *output,
+ real beta,
+ real threshold)
+{
+ THCUNN_assertSameGPU_generic(state, 4, input, output, gradOutput, gradInput);
+ THCTensor_(resizeAs)(state, gradInput, output);
+ THC_pointwiseApply3(state, gradInput, output, gradOutput, softPlusupdateGradInput_functor<real>(threshold, beta));
+}
+
+#endif
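
THC_pointwiseApply2/3 above delegate the math to softPlusupdateOutput_functor and softPlusupdateGradInput_functor, defined outside this hunk. A hedged scalar sketch of the thresholded SoftPlus forward and backward they are assumed to compute; note the backward is expressed in terms of the saved forward output y:

#include <cmath>
#include <cstdio>

// Sketch only: assumed SoftPlus math, not the actual THCUNN functors.
static double softplus_fwd(double x, double beta, double threshold) {
  return (beta * x > threshold) ? x : log1p(exp(beta * x)) / beta;
}
static double softplus_bwd(double gradOut, double y, double beta, double threshold) {
  double z = exp(beta * y);                   // y is the saved forward output
  return (beta * y > threshold) ? gradOut : gradOut * (z - 1.0) / z;
}

int main() {
  double y = softplus_fwd(0.5, 1.0, 20.0);
  printf("y=%f dL/dx=%f\n", y, softplus_bwd(1.0, y, 1.0, 20.0));
  return 0;
}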
diff --git a/lib/THCUNN/generic/SoftShrink.cu b/lib/THCUNN/generic/SoftShrink.cu
new file mode 100644
index 0000000..aa530bb
--- /dev/null
+++ b/lib/THCUNN/generic/SoftShrink.cu
@@ -0,0 +1,32 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SoftShrink.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(SoftShrink_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ real lambda)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, output);
+ THCTensor_(resizeAs)(state, output, input);
+ THC_pointwiseApply2(state, output, input, SoftShrinkUpdateOutput<real>(lambda));
+ THCudaCheck(cudaGetLastError());
+}
+
+void THNN_(SoftShrink_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ real lambda)
+{
+ THCUNN_assertSameGPU_generic(state, 3, input, gradOutput, gradInput);
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THC_pointwiseApply3(state, gradInput, input, gradOutput, SoftShrinkUpdateGradInput<real>(lambda));
+ THCudaCheck(cudaGetLastError());
+}
+
+#endif
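
SoftShrinkUpdateOutput and SoftShrinkUpdateGradInput are likewise simple element-wise functors defined outside this hunk. A scalar sketch of the assumed math: shrink toward zero by lambda, with zero gradient inside the dead zone |x| <= lambda:

#include <cstdio>

// Sketch only: assumed SoftShrink math, not the actual THCUNN functors.
static float softshrink_fwd(float x, float lambda) {
  if (x >  lambda) return x - lambda;
  if (x < -lambda) return x + lambda;
  return 0.f;
}
static float softshrink_bwd(float gradOut, float x, float lambda) {
  return (x > lambda || x < -lambda) ? gradOut : 0.f;
}

int main() {
  printf("%f %f\n", softshrink_fwd(0.7f, 0.5f), softshrink_bwd(1.f, 0.3f, 0.5f));  // 0.2 0.0
  return 0;
}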
diff --git a/lib/THCUNN/generic/SparseLinear.cu b/lib/THCUNN/generic/SparseLinear.cu
new file mode 100644
index 0000000..438a8e4
--- /dev/null
+++ b/lib/THCUNN/generic/SparseLinear.cu
@@ -0,0 +1,269 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SparseLinear.cu"
+#else
+
+static bool checkInput(THCTensor* t)
+{
+ return t->nDimension == 2 && t->size[1] == 3;
+}
+
+static bool checkSize2D(THCTensor* t, long size0, long size1)
+{
+ return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1;
+}
+
+static bool checkSize1D(THCTensor* t, long size0)
+{
+ return t->nDimension == 1 && t->size[0] == size0;
+}
+
+static inline void copyCudaFloatingType(THCState *state, THCudaIntTensor *buf, THCTensor *t) {
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaIntTensor_copyCudaFloat(state, buf, t);
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaIntTensor_copyCudaDouble(state, buf, t);
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaIntTensor_copyCudaHalf(state, buf, t);
+ #endif
+}
+
+void THNN_(SparseLinear_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias)
+{
+ THAssert(THCTensor_(checkGPU)(state, 4, input, output, weight, bias));
+
+ long h;
+ long outDim = THCTensor_(size)(state, weight, 0);
+ long inDim = THCTensor_(size)(state, weight, 1);
+
+ THArgCheck(checkInput(input), 2, "input size must be nnz x 3");
+ THArgCheck(THCTensor_(nDimension)(state, output) == 2, 3, "output must be batchsize x outputsize");
+ THArgCheck(checkSize1D(bias, outDim), 5, "bias size wrong");
+
+ long batchnum = THCTensor_(size)(state, output, 0);
+ long nnz = THCTensor_(size)(state, input, 0);
+
+ THCTensor *buffer = THCTensor_(new)(state);
+ THCTensor *sel = THCTensor_(new)(state);
+ THCTensor *values = THCTensor_(new)(state);
+ THCudaIntTensor *rowbuf = THCudaIntTensor_new(state);
+ THCudaIntTensor *csrPtrs = THCudaIntTensor_new(state);
+ THCudaIntTensor *colInds = THCudaIntTensor_new(state);
+
+ THCTensor_(resize1d)(state, values, nnz);
+ THCudaIntTensor_resize1d(state, rowbuf, nnz);
+ THCudaIntTensor_resize1d(state, colInds, nnz);
+ THCudaIntTensor_resize1d(state, csrPtrs, batchnum+1);
+
+ // Get data ready for cusparse, need CudaInt buffers
+ // We do not need to sort, since rows are already in order
+ // If rows might get out of order in future implementations, or if cusparse
+ // complains with an illegal memory access, sort like we do in AccGradParameters
+ THCTensor_(select)(state, sel, input, 1, 0);
+ copyCudaFloatingType(state, rowbuf, sel);
+ THCTensor_(select)(state, sel, input, 1, 1);
+ copyCudaFloatingType(state, colInds, sel);
+ THCTensor_(select)(state, sel, input, 1, 2);
+ THCTensor_(copyCuda)(state, values, sel);
+
+ init_cusparse();
+ cusparseXcoo2csr(cusparse_handle,
+ THCudaIntTensor_data(state, rowbuf), nnz, batchnum,
+ THCudaIntTensor_data(state, csrPtrs), CUSPARSE_INDEX_BASE_ONE);
+
+ // output = bias
+ THCTensor_(resize2d)(state, buffer, outDim, batchnum);
+ THCTensor_(zero)(state, buffer);
+ for (h=0; h<batchnum; h++) {
+ THCTensor_(select)(state, sel, buffer, 1, h);
+ THCTensor_(copy)(state, sel, bias);
+ }
+
+ // output = W * x
+ real one = ScalarConvert<int, real>::to(1);
+ cusparseMatDescr_t descr = 0;
+ cusparseCreateMatDescr(&descr);
+ cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
+ cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
+ #ifdef THC_REAL_IS_FLOAT
+ cusparseScsrmm(cusparse_handle,
+ #elif defined(THC_REAL_IS_DOUBLE)
+ cusparseDcsrmm(cusparse_handle,
+ #endif
+ CUSPARSE_OPERATION_NON_TRANSPOSE,
+ batchnum, outDim, inDim, nnz,
+ &one,
+ descr,
+ THCTensor_(data)(state, values),
+ THCudaIntTensor_data(state, csrPtrs),
+ THCudaIntTensor_data(state, colInds),
+ THCTensor_(data)(state, weight), inDim,
+ &one, THCTensor_(data)(state, buffer), batchnum
+ );
+ THCTensor_(transpose)(state, buffer, NULL, 0, 1);
+
+ // We do work in the buffer to keep the output contiguous
+ THCTensor_(copy)(state, output, buffer);
+
+ cusparseDestroyMatDescr(descr);
+ descr = 0;
+ THCTensor_(free)(state, buffer);
+ THCTensor_(free)(state, sel);
+ THCTensor_(free)(state, values);
+ THCudaIntTensor_free(state, rowbuf);
+ THCudaIntTensor_free(state, colInds);
+ THCudaIntTensor_free(state, csrPtrs);
+}
+
+void THNN_(SparseLinear_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ THCTensor *weight,
+ THCTensor *bias,
+ double weightDecay,
+ double scale)
+{
+ long outDim = THCTensor_(size)(state, weight, 0);
+ long inDim = THCTensor_(size)(state, weight, 1);
+
+ THArgCheck(checkInput(input), 2, "input size must be nnz x 3");
+ THArgCheck(checkSize2D(gradWeight, outDim, inDim), 4, "gradWeight size wrong");
+ THArgCheck(checkSize1D(gradBias, outDim), 5, "gradBias size wrong");
+ THArgCheck(THCTensor_(isContiguous)(state, gradOutput), 3,
+ "gradOutput must be contiguous");
+
+ long nnz = THCTensor_(size)(state, input, 0);
+ long batchnum = THCTensor_(size)(state, gradOutput, 0);
+
+ THCTensor *buf = THCTensor_(new)(state);
+ THCTensor *cols = THCTensor_(new)(state);
+ THCTensor *sel = THCTensor_(new)(state);
+ THCudaLongTensor *inds = THCudaLongTensor_new(state);
+ THCTensor *values = THCTensor_(new)(state);
+ THCudaIntTensor *colbuf = THCudaIntTensor_new(state);
+ THCudaIntTensor *colPtrs = THCudaIntTensor_new(state);
+ THCudaIntTensor *rowInds = THCudaIntTensor_new(state);
+
+ THCTensor_(select)(state, sel, input, 1, 0); // rowInds
+ THCTensor_(select)(state, cols, input, 1, 1); // colInds
+ THCTensor_(cadd)(state, buf, sel, batchnum, cols); // colInds * batchnum + rowInds
+ THCTensor_(sort)(state, buf, inds, buf, 0, 0); // Indices are now in inds
+ THCTensor_(indexSelect)(state, buf, input, 0, inds);
+
+ THCTensor_(resize1d)(state, values, nnz);
+ THCudaIntTensor_resize1d(state, colbuf, nnz);
+ THCudaIntTensor_resize1d(state, rowInds, nnz);
+ THCudaIntTensor_resize1d(state, colPtrs, inDim+1);
+
+ // Get data ready for cusparse, need CudaInt buffers
+ THCTensor_(select)(state, sel, buf, 1, 0);
+ copyCudaFloatingType(state, rowInds, sel);
+ THCTensor_(select)(state, sel, buf, 1, 1);
+ copyCudaFloatingType(state, colbuf, sel);
+ THCTensor_(select)(state, sel, buf, 1, 2);
+ THCTensor_(copyCuda)(state, values, sel);
+
+ init_cusparse();
+ // This is effectively coo2csc: build CSC-style column pointers from the sorted column indices
+ cusparseXcoo2csr(cusparse_handle,
+ THCudaIntTensor_data(state, colbuf), nnz, inDim,
+ THCudaIntTensor_data(state, colPtrs), CUSPARSE_INDEX_BASE_ONE);
+
+ // FORTRAN expects contiguous col-major matrices
+ THCTensor_(transpose)(state, gradOutput, NULL, 0, 1);
+ THCTensor_(resize2d)(state, buf, batchnum, outDim);
+ THCTensor_(copy)(state, buf, gradOutput);
+ THCTensor_(transpose)(state, gradOutput, NULL, 0, 1); // Restore gradOutput
+
+ real one = ScalarConvert<int, real>::to(1);
+ cusparseMatDescr_t descr = 0;
+ cusparseCreateMatDescr(&descr);
+ cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
+ cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
+ #ifdef THC_REAL_IS_FLOAT
+ cusparseScsrmm(cusparse_handle,
+ #elif defined(THC_REAL_IS_DOUBLE)
+ cusparseDcsrmm(cusparse_handle,
+ #endif
+ CUSPARSE_OPERATION_NON_TRANSPOSE,
+ inDim, outDim, batchnum, nnz,
+ &one,
+ descr,
+ THCTensor_(data)(state, values),
+ THCudaIntTensor_data(state, colPtrs),
+ THCudaIntTensor_data(state, rowInds),
+ THCTensor_(data)(state, buf), batchnum,
+ &one, THCTensor_(data)(state, gradWeight), inDim
+ );
+
+ THCTensor_(sum)(state, buf, gradOutput, 0);
+ THCTensor_(resize1d)(state, buf, outDim);
+ THCTensor_(cadd)(state, gradBias, gradBias, scale, buf);
+
+ if (weightDecay != 0)
+ {
+ THCTensor_(cadd)(state, gradWeight, gradWeight, weightDecay, weight);
+ THCTensor_(cadd)(state, gradBias, gradBias, weightDecay, bias);
+ }
+
+ THCTensor_(free)(state, buf);
+ THCTensor_(free)(state, sel);
+ THCTensor_(free)(state, cols);
+ THCudaLongTensor_free(state, inds);
+ THCTensor_(free)(state, values);
+ THCudaIntTensor_free(state, colbuf);
+ THCudaIntTensor_free(state, rowInds);
+ THCudaIntTensor_free(state, colPtrs);
+}
+
+void THNN_(SparseLinear_legacyUpdateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias) {
+ THError("CUDA does not support legacy input format, please use a table of nnz x 2 vectors");
+}
+void THNN_(SparseLinear_legacyAccGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ THCTensor *weight,
+ THCTensor *bias,
+ double weightDecay,
+ double scale) {
+ THError("CUDA does not support legacy input format, please use a table of nnz x 2 vectors");
+}
+
+// Dense updates are pretty fast on the GPU
+void THNN_(SparseLinear_zeroGradParameters)(
+ THCState *state,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ THCTensor *lastInput) {
+ THCTensor_(zero)(state, gradWeight);
+ THCTensor_(zero)(state, gradBias);
+}
+
+void THNN_(SparseLinear_updateParameters)(
+ THCState *state,
+ THCTensor *weight,
+ THCTensor *bias,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ THCTensor *lastInput,
+ double learningRate) {
+ THCTensor_(cadd)(state, weight, weight, -learningRate, gradWeight);
+ THCTensor_(cadd)(state, bias, bias, -learningRate, gradBias);
+}
+
+#endif
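
For reference, cusparseXcoo2csr above turns the (already sorted) one-based COO row indices from the nnz x 3 input into CSR row pointers of length batchnum+1: csrPtrs[r-1] is the position of the first nonzero of row r, and the last entry is nnz+1. A CPU sketch of the same conversion on a small hypothetical example:

#include <cstdio>

int main() {
  const int nnz = 5, nrows = 3;
  int cooRows[nnz] = {1, 1, 2, 3, 3};       // one-based row index of each nonzero, sorted
  int csrPtrs[nrows + 1];
  int p = 0;
  for (int r = 1; r <= nrows + 1; ++r) {    // csrPtrs[r-1] = first nonzero of row r (one-based)
    while (p < nnz && cooRows[p] < r) ++p;
    csrPtrs[r - 1] = p + 1;
  }
  for (int i = 0; i <= nrows; ++i) printf("%d ", csrPtrs[i]);  // prints: 1 3 4 6
  printf("\n");
  return 0;
}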
diff --git a/lib/THCUNN/generic/SpatialAdaptiveMaxPooling.cu b/lib/THCUNN/generic/SpatialAdaptiveMaxPooling.cu
new file mode 100644
index 0000000..fc6fe6d
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialAdaptiveMaxPooling.cu
@@ -0,0 +1,188 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialAdaptiveMaxPooling.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCIndexTensor *indices,
+ int nOutputCols,
+ int nOutputRows)
+{
+ THCUNN_assertSameGPU_generic(state, 3, input, output, indices);
+
+ THCIndex_t *indices_data;
+ real *output_data;
+ real *input_data;
+
+ THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
+
+ if (input->nDimension == 3) {
+ long nInputCols = input->size[2];
+ long nInputRows = input->size[1];
+ long nInputPlane = input->size[0];
+
+ long istride_d = input->stride[0];
+ long istride_h = input->stride[1];
+ long istride_w = input->stride[2];
+
+ input_data = THCTensor_(data)(state, input);
+
+ THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols);
+ THCIndexTensor_(resize4d)(state, indices, 2, nInputPlane, nOutputRows, nOutputCols);
+
+ indices_data = THCIndexTensor_(data)(state, indices);
+ output_data = THCTensor_(data)(state, output);
+
+ // cuda blocks & threads:
+ int yblocks = (int)(16L / nInputPlane);
+ yblocks = yblocks < 1 ? 1 : yblocks;
+ dim3 blocks(nInputPlane,yblocks);
+ dim3 threads(32,8);
+
+ // run maxpool kernel
+ adaptivemaxpool <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (input_data, output_data,
+ indices_data+nInputPlane*nOutputCols*nOutputRows, indices_data,
+ nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+ istride_h, istride_w, istride_d);
+ THCudaCheck(cudaGetLastError());
+
+ } else {
+ long nInputCols = input->size[3];
+ long nInputRows = input->size[2];
+ long nInputPlane = input->size[1];
+ long nbatch = input->size[0];
+
+ long istride_d = input->stride[1];
+ long istride_h = input->stride[2];
+ long istride_w = input->stride[3];
+
+ input = THCTensor_(newContiguous)(state, input);
+ input_data = THCTensor_(data)(state, input);
+
+ THCTensor_(resize4d)(state, output, nbatch, nInputPlane, nOutputRows, nOutputCols);
+ THCIndexTensor_(resize5d)(state, indices, 2, nbatch, nInputPlane, nOutputRows, nOutputCols);
+
+ indices_data = THCIndexTensor_(data)(state, indices);
+ output_data = THCTensor_(data)(state, output);
+
+ // cuda blocks & threads:
+ int yblocks = (int)(16L / nInputPlane);
+ yblocks = yblocks < 1 ? 1 : yblocks;
+ dim3 blocks(nInputPlane*nbatch,yblocks);
+ dim3 threads(32,8);
+
+ // run maxpool kernel
+ adaptivemaxpool <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (input_data, output_data,
+ indices_data+nbatch*nInputPlane*nOutputCols*nOutputRows, indices_data,
+ nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+ istride_h, istride_w, istride_d);
+ THCudaCheck(cudaGetLastError());
+ // clean
+ THCTensor_(free)(state, input);
+ }
+}
+
+void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCIndexTensor *indices)
+{
+ bool atomic = true; // suboptimal, but without atomic it doesn't pass the tests
+
+ THCUNN_assertSameGPU_generic(state, 4, input, indices, gradOutput, gradInput);
+
+ THCIndex_t *indices_data;
+ real *gradInput_data;
+ real *gradOutput_data;
+
+ gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+ if (input->nDimension == 3) {
+ long nInputCols = input->size[2];
+ long nInputRows = input->size[1];
+ long nInputPlane = input->size[0];
+ long nOutputCols = gradOutput->size[2];
+ long nOutputRows = gradOutput->size[1];
+
+ //bool atomic = (nInputCols%nOutputCols != 0) || (nInputRows%nOutputRows != 0);
+
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THCTensor_(zero)(state, gradInput);
+
+ indices_data = THCIndexTensor_(data)(state, indices);
+ gradOutput_data = THCTensor_(data)(state, gradOutput);
+ gradInput_data = THCTensor_(data)(state, gradInput);
+
+ // cuda blocks & threads:
+ int yblocks = (int)(16L / nInputPlane);
+ yblocks = yblocks < 1 ? 1 : yblocks;
+ dim3 blocks(nInputPlane,yblocks);
+ dim3 threads(32,8);
+
+ if(atomic)
+ {
+ // run updateGradInput kernel, accumulate gradients atomically
+ atomicadaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data,
+ indices_data+nInputPlane*nOutputCols*nOutputRows, indices_data,
+ nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
+ }
+ else
+ {
+ // run updateGradInput kernel (this path also uses the atomic kernel)
+ atomicadaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data,
+ indices_data+nInputPlane*nOutputCols*nOutputRows, indices_data,
+ nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
+ }
+ THCudaCheck(cudaGetLastError());
+ } else {
+ long nInputCols = input->size[3];
+ long nInputRows = input->size[2];
+ long nInputPlane = input->size[1];
+ long nbatch = input->size[0];
+ long nOutputCols = gradOutput->size[3];
+ long nOutputRows = gradOutput->size[2];
+
+ //bool atomic = (nInputCols%nOutputCols != 0) || (nInputRows%nOutputRows != 0);
+
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THCTensor_(zero)(state, gradInput);
+
+ indices_data = THCIndexTensor_(data)(state, indices);
+ gradOutput_data = THCTensor_(data)(state, gradOutput);
+ gradInput_data = THCTensor_(data)(state, gradInput);
+
+ // cuda blocks & threads:
+ int yblocks = (int)(16L / nInputPlane);
+ yblocks = yblocks < 1 ? 1 : yblocks;
+ dim3 blocks(nInputPlane*nbatch,yblocks);
+ dim3 threads(32,8);
+
+ if(atomic)
+ {
+ // run updateGradInput kernel, accumulate gradients atomically
+ atomicadaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data,
+ indices_data+nbatch*nInputPlane*nOutputCols*nOutputRows, indices_data,
+ nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
+ }
+ else
+ {
+ // run updateGradInput kernel
+ adaptivemaxgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data,
+ indices_data+nbatch*nInputPlane*nOutputCols*nOutputRows, indices_data,
+ nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols);
+ }
+ THCudaCheck(cudaGetLastError());
+ }
+
+ // clean
+ THCTensor_(free)(state,gradOutput);
+
+}
+
+#endif
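
The adaptivemaxpool kernel called above is defined outside this hunk; the usual adaptive-pooling convention (assumed here) is that output cell o pools over the input window [floor(o*inSize/outSize), ceil((o+1)*inSize/outSize)), so the windows tile the input evenly regardless of the requested output size. A small sketch of that window arithmetic:

#include <cstdio>

int main() {
  int inSize = 10, outSize = 4;
  for (int o = 0; o < outSize; ++o) {
    int start = (o * inSize) / outSize;                       // floor
    int end   = ((o + 1) * inSize + outSize - 1) / outSize;   // ceil
    printf("out %d pools over [%d, %d)\n", o, start, end);
  }
  return 0;  // windows: [0,3) [2,5) [5,8) [7,10)
}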
diff --git a/lib/THCUNN/generic/SpatialAveragePooling.cu b/lib/THCUNN/generic/SpatialAveragePooling.cu
new file mode 100644
index 0000000..09f539e
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialAveragePooling.cu
@@ -0,0 +1,164 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialAveragePooling.cu"
+#else
+
+void THNN_(SpatialAveragePooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ bool ceil_mode,
+ bool count_include_pad)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, output);
+ THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
+
+ long nInputCols, nInputRows, nInputPlane, batchSize;
+ long nOutputCols, nOutputRows;
+
+ if (input->nDimension == 3) {
+ nInputCols = input->size[2];
+ nInputRows = input->size[1];
+ nInputPlane = input->size[0];
+ batchSize = 1;
+ }
+ else
+ {
+ nInputCols = input->size[3];
+ nInputRows = input->size[2];
+ nInputPlane = input->size[1];
+ batchSize = input->size[0];
+ }
+
+ THArgCheck(nInputCols >= kW - 2*padW && nInputRows >= kH - 2*padH, 2, "input image smaller than kernel size");
+ THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");
+
+ if(ceil_mode) {
+ nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
+ nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
+ }
+ else {
+ nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
+ nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
+ }
+ if (padW || padH)
+ {
+ // ensure that the last pooling starts inside the image
+ // needed to avoid problems in ceil mode
+ if ((nOutputRows - 1)*dH >= nInputRows + padH)
+ --nOutputRows;
+ if ((nOutputCols - 1)*dW >= nInputCols + padW)
+ --nOutputCols;
+ }
+
+ input = THCTensor_(newContiguous)(state, input);
+ real* input_data = THCTensor_(data)(state, input);
+
+ THCTensor_(resize4d)(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols);
+
+ real* output_data = THCTensor_(data)(state, output);
+
+ int count = THCTensor_(nElement)(state, output);
+
+ if(count_include_pad)
+ AvePoolForward<real, accreal, true>
+ <<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>(
+ count, input_data,
+ batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+ kH, kW, dH, dW, padH, padW, output_data);
+ else
+ AvePoolForward<real, accreal, false>
+ <<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>(
+ count, input_data,
+ batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+ kH, kW, dH, dW, padH, padW, output_data);
+ THCudaCheck(cudaGetLastError());
+
+ if(input->nDimension == 3)
+ THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols);
+
+ THCTensor_(free)(state, input);
+
+}
+
+void THNN_(SpatialAveragePooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ bool ceil_mode,
+ bool count_include_pad)
+{
+ THCUNN_assertSameGPU_generic(state, 3, input, gradOutput, gradInput);
+
+ input = THCTensor_(newContiguous)(state, input);
+ gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+ long nInputCols, nInputRows, nInputPlane, batchSize;
+ long nOutputCols, nOutputRows;
+
+ if (input->nDimension == 3) {
+ nInputCols = input->size[2];
+ nInputRows = input->size[1];
+ nInputPlane = input->size[0];
+ batchSize = 1;
+ }
+ else
+ {
+ nInputCols = input->size[3];
+ nInputRows = input->size[2];
+ nInputPlane = input->size[1];
+ batchSize = input->size[0];
+ }
+
+ if(ceil_mode) {
+ nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
+ nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
+ }
+ else {
+ nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
+ nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
+ }
+ if (padW || padH)
+ {
+ // ensure that the last pooling starts inside the image
+ // needed to avoid problems in ceil mode
+ if ((nOutputRows - 1)*dH >= nInputRows + padH)
+ --nOutputRows;
+ if ((nOutputCols - 1)*dW >= nInputCols + padW)
+ --nOutputCols;
+ }
+
+ THCTensor_(resizeAs)(state, gradInput, input);
+
+ int count = THCTensor_(nElement)(state, input);
+
+ if(count_include_pad)
+ AvePoolBackward<real, accreal, true>
+ <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
+ (count,
+ THCTensor_(data)(state, gradOutput),
+ batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+ kH, kW, dH, dW, padH, padW,
+ THCTensor_(data)(state, gradInput));
+ else
+ AvePoolBackward<real, accreal, false>
+ <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
+ (count,
+ THCTensor_(data)(state, gradOutput),
+ batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+ kH, kW, dH, dW, padH, padW,
+ THCTensor_(data)(state, gradInput));
+ THCudaCheck(cudaGetLastError());
+
+ // clean
+ THCTensor_(free)(state, input);
+ THCTensor_(free)(state, gradOutput);
+}
+
+#endif
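
A worked example of the output-size arithmetic above: ceil mode rounds the strided division up, and the padding correction then drops the last window if it would start entirely inside the padding.

#include <cmath>
#include <cstdio>

int main() {
  long nIn = 7, k = 3, d = 2, pad = 1;                             // hypothetical sizes
  long nOut = (long)ceil(float(nIn - k + 2 * pad) / float(d)) + 1; // ceil mode
  if (pad && (nOut - 1) * d >= nIn + pad) --nOut;                  // keep last window inside image
  printf("nOut = %ld\n", nOut);                                    // prints: nOut = 4
  return 0;
}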
diff --git a/lib/THCUNN/generic/SpatialClassNLLCriterion.cu b/lib/THCUNN/generic/SpatialClassNLLCriterion.cu
new file mode 100644
index 0000000..edb3a0c
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialClassNLLCriterion.cu
@@ -0,0 +1,126 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialClassNLLCriterion.cu"
+#else
+
+void THNN_(SpatialClassNLLCriterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCIndexTensor *target,
+ THCTensor *output,
+ bool sizeAverage,
+ THCTensor *weights,
+ THCTensor *total_weight)
+{
+ THArgCheck(THCIndexTensor_(nDimension)(state, target) == 3, 1,
+ "only batches of spatial targets supported (3D tensors)");
+ THArgCheck(THCTensor_(nDimension)(state, input) == 4, 2,
+ "only batches of spatial inputs supported (4D tensors)");
+ if (weights && THCTensor_(nElement)(state, weights) != THCTensor_(size)(state, input, 1)) {
+ THError("weight tensor should be defined either for all or no classes");
+ }
+
+ if (weights)
+ THCUNN_assertSameGPU_generic(state, 5, input, target, weights, output, total_weight);
+ else
+ THCUNN_assertSameGPU_generic(state, 4, input, target, output, total_weight);
+
+ input = THCTensor_(newContiguous)(state, input);
+ weights = weights ? THCTensor_(newContiguous)(state, weights) : NULL;
+ target = THCIndexTensor_(newContiguous)(state, target);
+
+ real *input_data = THCTensor_(data)(state, input);
+ real *weights_data = weights ? THCTensor_(data)(state, weights) : NULL;
+ THCIndex_t *target_data = THCIndexTensor_(data)(state, target);
+ real *output_data = THCTensor_(data)(state, output);
+ real *total_weight_data = THCTensor_(data)(state, total_weight);
+
+ THCIndex_t batch_size = THCIndexTensor_(size)(state, target, 0);
+ THCIndex_t map_nelem = THCIndexTensor_(nElement)(state, target) / batch_size;
+ int blocks_per_sample = GET_BLOCKS(map_nelem) / 128;
+ blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample;
+ int total_blocks = blocks_per_sample * batch_size;
+
+ THCTensor_(fill)(state, output, ScalarConvert<int, real>::to(0));
+ THCTensor_(fill)(state, total_weight, ScalarConvert<int, real>::to(0));
+
+ cunn_SpatialClassNLLCriterion_updateOutput_kernel<real, accreal>
+ <<<total_blocks, CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
+ output_data,
+ total_weight_data,
+ input_data,
+ target_data,
+ weights_data,
+ sizeAverage,
+ THCTensor_(size)(state, input, 0),
+ THCTensor_(size)(state, input, 1),
+ THCTensor_(size)(state, input, 2) * THCTensor_(size)(state, input, 3),
+ blocks_per_sample
+ );
+ THCudaCheck(cudaGetLastError());
+
+ if (weights)
+ THCTensor_(free)(state, weights);
+ THCIndexTensor_(free)(state, target);
+ THCTensor_(free)(state, input);
+}
+
+void THNN_(SpatialClassNLLCriterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCIndexTensor *target,
+ THCTensor *gradInput,
+ bool sizeAverage,
+ THCTensor *weights,
+ THCTensor *total_weight)
+{
+ THArgCheck(THCIndexTensor_(nDimension)(state, target) == 3, 1,
+ "only batches of spatial targets supported (3D tensors)");
+ THArgCheck(THCTensor_(nDimension)(state, input) == 4, 2,
+ "only batches of spatial inputs supported (4D tensors)");
+ THArgCheck(THCTensor_(isContiguous)(state, gradInput), 4,
+ "gradInput must be contiguous");
+ if (weights && THCTensor_(nElement)(state, weights) != THCTensor_(size)(state, input, 1)) {
+ THError("weight tensor should be defined either for all or no classes");
+ }
+
+ if (weights)
+ THCUNN_assertSameGPU_generic(state, 5, weights, input, target, gradInput, total_weight);
+ else
+ THCUNN_assertSameGPU_generic(state, 4, input, target, gradInput, total_weight);
+
+ input = THCTensor_(newContiguous)(state, input);
+ weights = weights ? THCTensor_(newContiguous)(state, weights) : NULL;
+ target = THCIndexTensor_(newContiguous)(state, target);
+
+ real *weights_data = weights ? THCTensor_(data)(state, weights) : NULL;
+ real *gradInput_data = THCTensor_(data)(state, gradInput);
+ THCIndex_t *target_data = THCIndexTensor_(data)(state, target);
+ real *total_weight_data = THCTensor_(data)(state, total_weight);
+
+ THCIndex_t batch_size = THCIndexTensor_(size)(state, target, 0);
+ THCIndex_t map_nelem = THCIndexTensor_(nElement)(state, target) / batch_size;
+ int blocks_per_sample = GET_BLOCKS(map_nelem) / 128;
+ blocks_per_sample = (blocks_per_sample == 0) ? 1 : blocks_per_sample;
+ int total_blocks = blocks_per_sample * batch_size;
+
+ cunn_SpatialClassNLLCriterion_updateGradInput_kernel
+ <<<total_blocks, CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
+ gradInput_data,
+ target_data,
+ weights_data,
+ total_weight_data,
+ sizeAverage,
+ THCTensor_(size)(state, input, 0),
+ THCTensor_(size)(state, input, 1),
+ THCTensor_(size)(state, input, 2) * THCTensor_(size)(state, input, 3),
+ blocks_per_sample
+ );
+ THCudaCheck(cudaGetLastError());
+
+ if (weights)
+ THCTensor_(free)(state, weights);
+ THCIndexTensor_(free)(state, target);
+ THCTensor_(free)(state, input);
+}
+
+#endif
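
The launch sizing above splits each sample's H*W target map across blocks_per_sample = max(1, GET_BLOCKS(map_nelem) / 128) blocks, so large maps get several blocks per sample while small maps still get one; total_blocks then scales that by the batch size. A plain restatement, assuming CUDA_NUM_THREADS is 1024 (the actual value comes from common.h):

#include <cstdio>

#define CUDA_NUM_THREADS 1024   // assumption for this sketch; see lib/THCUNN/common.h
static int GET_BLOCKS(long n) { return (int)((n + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS); }

int main() {
  long map_nelem = 512L * 512L;                      // H*W of one target map
  int blocks_per_sample = GET_BLOCKS(map_nelem) / 128;
  if (blocks_per_sample == 0) blocks_per_sample = 1;
  int batch_size = 16;                               // hypothetical batch
  printf("blocks_per_sample=%d total_blocks=%d\n", blocks_per_sample, blocks_per_sample * batch_size);
  return 0;
}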
diff --git a/lib/THCUNN/generic/SpatialConvolutionLocal.cu b/lib/THCUNN/generic/SpatialConvolutionLocal.cu
new file mode 100644
index 0000000..d317fe2
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialConvolutionLocal.cu
@@ -0,0 +1,324 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialConvolutionLocal.cu"
+#else
+
+void THNN_(SpatialConvolutionLocal_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias,
+ THCTensor *finput,
+ THCTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight)
+{
+ THCUNN_assertSameGPU_generic(state, 5, input, output, weight,
+ bias, finput);
+
+ long nInputPlane = THCTensor_(size)(state,weight,2)/(kW*kH);
+ long nOutputPlane = THCTensor_(size)(state,weight,1);
+
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THCTensor_(resize4d)(state, input, 1, nInputPlane, inputHeight, inputWidth);
+ }
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth);
+
+ // Augment the input
+ THCTensor_(resize3d)(state, finput, batchSize, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+ // Helpers
+ THCTensor *input_n = THCTensor_(new)(state);
+ THCTensor *finput_n = THCTensor_(new)(state);
+ THCTensor *output_n = THCTensor_(new)(state);
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ THCTensor *finput3d, *output3d;
+ THCTensor *wslice = THCTensor_(new)(state);
+ THCTensor *islice = THCTensor_(new)(state);
+ THCTensor *oslice = THCTensor_(new)(state);
+
+ // Matrix multiply per output:
+ THCTensor_(select)(state, input_n, input, 0, elt);
+ THCTensor_(select)(state, finput_n, finput, 0, elt);
+ THCTensor_(select)(state, output_n, output, 0, elt);
+
+ // Extract columns:
+ im2col(
+ THCState_getCurrentStream(state),
+ THCTensor_(data)(state, input_n),
+ nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+ 1, 1, THCTensor_(data)(state, finput_n)
+ );
+
+ output3d = THCTensor_(newWithStorage3d)(state, output_n->storage, output_n->storageOffset,
+ outputHeight*outputWidth, 1,
+ nOutputPlane, outputHeight*outputWidth,
+ 1, nOutputPlane*outputHeight*outputWidth);
+
+ finput3d = THCTensor_(newWithStorage3d)(state, finput_n->storage, finput_n->storageOffset,
+ outputHeight*outputWidth, 1,
+ kW*kH*nInputPlane, outputHeight*outputWidth,
+ 1, kW*kH*nInputPlane*outputHeight*outputWidth);
+
+ THCTensor_(copy)(state, output_n, bias);
+
+ for (int i = 0; i < outputHeight; i++) {
+ for(int j = 0; j < outputWidth; j++) {
+ int sliceidx = i * outputWidth + j;
+ THCTensor_(select)(state, wslice, weight, 0, sliceidx);
+ THCTensor_(select)(state, islice, finput3d, 0, sliceidx);
+ THCTensor_(select)(state, oslice, output3d, 0, sliceidx);
+ THCTensor_(addmm)(state, oslice, ScalarConvert<int, real>::to(1), oslice, ScalarConvert<int, real>::to(1), wslice, islice);
+ }
+ }
+
+
+ // weight: oH*oW x nOutputPlane x nInputPlane*kH*kW
+ // finput3d: oH*oW x nInputPlane*kH*kW x 1
+ // THCTensor_(baddbmm)(state, output3d, 1.0, output3d, 1.0, weight, finput3d);
+ // output3d: oH*oW x nOutputPlane x 1
+
+ THCTensor_(free)(state, output3d);
+ THCTensor_(free)(state, finput3d);
+ THCTensor_(free)(state, wslice);
+ THCTensor_(free)(state, islice);
+ THCTensor_(free)(state, oslice);
+ }
+
+ // Free
+ THCTensor_(free)(state, input_n);
+ THCTensor_(free)(state, finput_n);
+ THCTensor_(free)(state, output_n);
+
+ // Resize output
+ if (batch == 0) {
+ THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth);
+ THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+ }
+}
+
+void THNN_(SpatialConvolutionLocal_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ THCTensor *finput,
+ THCTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight)
+{
+ THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight,
+ fgradInput, gradInput);
+
+ long nInputPlane = THCTensor_(size)(state,weight,2)/(kW*kH);
+ long nOutputPlane = THCTensor_(size)(state,weight,1);
+
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THCTensor_(resize4d)(state, input, 1, nInputPlane, inputHeight, inputWidth);
+ THCTensor_(resize4d)(state, gradOutput, 1, nOutputPlane, outputHeight, outputWidth);
+ }
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
+
+ // Resize temporary columns
+ THCTensor_(resize3d)(state, fgradInput, batchSize, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+ // Helpers
+ THCTensor *gradInput_n = THCTensor_(new)(state);
+ THCTensor *fgradInput_n = THCTensor_(new)(state);
+ THCTensor *gradOutput_n = THCTensor_(new)(state);
+
+ THCTensor_(transpose)(state, weight, weight, 1, 2);
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ THCTensor *gradOutput3d, *fgradInput3d;
+ THCTensor *wslice = THCTensor_(new)(state);
+ THCTensor *gislice = THCTensor_(new)(state);
+ THCTensor *goslice = THCTensor_(new)(state);
+
+ // Matrix multiply per sample:
+ THCTensor_(select)(state, gradInput_n, gradInput, 0, elt);
+ THCTensor_(select)(state, fgradInput_n, fgradInput, 0, elt);
+ THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+ gradOutput3d = THCTensor_(newWithStorage3d)(state, gradOutput_n->storage, gradOutput_n->storageOffset,
+ outputHeight*outputWidth, 1,
+ nOutputPlane, outputHeight*outputWidth,
+ 1, nOutputPlane*outputHeight*outputWidth);
+ fgradInput3d = THCTensor_(newWithStorage3d)(state, fgradInput_n->storage, fgradInput_n->storageOffset,
+ outputHeight*outputWidth, 1,
+ kW*kH*nInputPlane, outputHeight*outputWidth,
+ 1, kW*kH*nInputPlane*outputHeight*outputWidth);
+
+ for (int i = 0; i < outputHeight; i++) {
+ for(int j = 0; j < outputWidth; j++) {
+ int sliceidx = i * outputWidth + j;
+ THCTensor_(select)(state, wslice, weight, 0, sliceidx);
+ THCTensor_(select)(state, gislice, fgradInput3d, 0, sliceidx);
+ THCTensor_(select)(state, goslice, gradOutput3d, 0, sliceidx);
+ THCTensor_(addmm)(state, gislice, ScalarConvert<int, real>::to(0), gislice, ScalarConvert<int, real>::to(1), wslice, goslice);
+ }
+ }
+
+ // weight: oH*oW x nInputPlane*kH*kW x nOutputPlane
+ // gradOutput3d: oH*oW x nOutputPlane x 1
+ //THCTensor_(baddbmm)(state, fgradInput3d, 0.0, fgradInput3d, 1.0, weight, gradOutput3d);
+ // fgradInput3d: oH*oW x nInputPlane*kH*kW x 1
+
+ // Unpack columns back into input:
+ col2im<real, accreal>(
+ THCState_getCurrentStream(state),
+ THCTensor_(data)(state, fgradInput_n),
+ nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+ 1, 1, THCTensor_(data)(state, gradInput_n)
+ );
+
+ THCTensor_(free)(state, gradOutput3d);
+ THCTensor_(free)(state, fgradInput3d);
+ THCTensor_(free)(state, wslice);
+ THCTensor_(free)(state, gislice);
+ THCTensor_(free)(state, goslice);
+ }
+
+ // Free
+ THCTensor_(free)(state, gradInput_n);
+ THCTensor_(free)(state, fgradInput_n);
+ THCTensor_(free)(state, gradOutput_n);
+
+ // Resize output
+ if (batch == 0) {
+ THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
+ THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+ THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth);
+ }
+
+ THCTensor_(transpose)(state, weight, weight, 1, 2);
+}
+
+void THNN_(SpatialConvolutionLocal_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ THCTensor *finput,
+ THCTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight,
+ real scale)
+{
+ THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, gradWeight,
+ gradBias, finput);
+
+ long nInputPlane = THCTensor_(size)(state,gradWeight,2)/(kW*kH);
+ long nOutputPlane = THCTensor_(size)(state,gradWeight,1);
+
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THCTensor_(resize4d)(state, input, 1, nInputPlane, inputHeight, inputWidth);
+ THCTensor_(resize4d)(state, gradOutput, 1, nOutputPlane, outputHeight, outputWidth);
+ }
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Helpers
+ THCTensor *input_n = THCTensor_(new)(state);
+ THCTensor *finput_n = THCTensor_(new)(state);
+ THCTensor *gradOutput_n = THCTensor_(new)(state);
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ THCTensor *gradOutput3d, *finput3d;
+ THCTensor *gwslice = THCTensor_(new)(state);
+ THCTensor *islice = THCTensor_(new)(state);
+ THCTensor *goslice = THCTensor_(new)(state);
+
+ // Matrix multiply per output:
+ THCTensor_(select)(state, input_n, input, 0, elt);
+ THCTensor_(select)(state, finput_n, finput, 0, elt);
+ THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+ gradOutput3d = THCTensor_(newWithStorage3d)(state, gradOutput_n->storage, gradOutput_n->storageOffset,
+ outputHeight*outputWidth, 1,
+ nOutputPlane, outputHeight*outputWidth,
+ 1, nOutputPlane*outputHeight*outputWidth);
+ finput3d = THCTensor_(newWithStorage3d)(state, finput_n->storage, finput_n->storageOffset,
+ outputHeight*outputWidth, 1,
+ 1, kW*kH*nInputPlane*outputHeight*outputWidth,
+ kW*kH*nInputPlane, outputHeight*outputWidth);
+
+ // Extract columns:
+ im2col(
+ THCState_getCurrentStream(state),
+ THCTensor_(data)(state, input_n),
+ nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+ 1, 1, THCTensor_(data)(state, finput_n)
+ );
+
+ for (int i = 0; i < outputHeight; i++) {
+ for(int j = 0; j < outputWidth; j++) {
+ int sliceidx = i * outputWidth + j;
+ THCTensor_(select)(state, gwslice, gradWeight, 0, sliceidx);
+ THCTensor_(select)(state, goslice, gradOutput3d, 0, sliceidx);
+ THCTensor_(select)(state, islice, finput3d, 0, sliceidx);
+ THCTensor_(addmm)(state, gwslice, ScalarConvert<int, real>::to(1), gwslice, scale, goslice, islice);
+ }
+ }
+ // gradOutput3d: oH*oW x nOutputPlane x 1
+ // finput3d: oH*oW x 1 x kW*kH*nInputPlane
+ //THCTensor_(baddbmm)(state, gradWeight, 1.0, gradWeight, scale, gradOutput3d, finput3d);
+ // gradWeight: oH*oW x nOutputPlane x kW*kH*nInputPlane
+
+ THCTensor_(cadd)(state, gradBias, gradBias, scale, gradOutput_n);
+
+ THCTensor_(free)(state, gradOutput3d);
+ THCTensor_(free)(state, finput3d);
+ THCTensor_(free)(state, gwslice);
+ THCTensor_(free)(state, goslice);
+ THCTensor_(free)(state, islice);
+ }
+
+ // Free
+ THCTensor_(free)(state, input_n);
+ THCTensor_(free)(state, finput_n);
+ THCTensor_(free)(state, gradOutput_n);
+
+ // Resize
+ if (batch == 0) {
+ THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
+ THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+ }
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialConvolutionMM.cu b/lib/THCUNN/generic/SpatialConvolutionMM.cu
new file mode 100644
index 0000000..46a62fb
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialConvolutionMM.cu
@@ -0,0 +1,427 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialConvolutionMM.cu"
+#else
+
+void THNN_(SpatialConvolutionMM_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias,
+ THCTensor *columns,
+ THCTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH) {
+
+ THCUNN_assertSameGPU_generic(state, 5, input, output, weight, columns, ones);
+ if (bias) {
+ THCUNN_assertSameGPU_generic(state, 2, weight, bias);
+ }
+ THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+ THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
+ THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+ THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
+ THArgCheck(weight->nDimension == 2 || weight->nDimension == 4, 4, "weight tensor should be 2D or 4D");
+
+ int freeWeight = 0;
+
+ // Params:
+ int nInputPlane = weight->nDimension == 2 ? weight->size[1]/(kH*kW) : weight->size[1];
+ int nOutputPlane = weight->size[0];
+
+ if (weight->nDimension == 4) {
+ long s1 = weight->size[0];
+ long s2 = weight->size[1] * weight->size[2] * weight->size[3];
+ weight = THCTensor_(newWithStorage2d)(state, weight->storage, weight->storageOffset, s1, -1, s2, -1);
+ freeWeight = 1;
+ }
+
+ int batch = 1;
+ if (input->nDimension == 3) {
+ THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane don't match");
+ // Force batch
+ batch = 0;
+ THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
+ } else {
+ THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane don't match");
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+
+ if (outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
+ nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth);
+
+ // Resize temporary columns
+ THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+ // Define a buffer of ones, for bias accumulation
+ // Note: this buffer can be shared with other modules; it only ever gets
+ // increased and always contains ones.
+ if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THCTensor_(resize2d)(state, ones, outputHeight, outputWidth);
+ THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+ }
+
+ // Helpers
+ THCTensor *input_n = THCTensor_(new)(state);
+ THCTensor *output_n = THCTensor_(new)(state);
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THCTensor_(select)(state, input_n, input, 0, elt);
+ THCTensor_(select)(state, output_n, output, 0, elt);
+
+ // Do Bias first:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m_ = nOutputPlane;
+ long n_ = outputHeight * outputWidth;
+ long k_ = 1;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ if (bias) {
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 't', 'n',
+ n_, m_, k_,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, ones), k_,
+ THCTensor_(data)(state, bias), k_,
+ ScalarConvert<int, real>::to(0),
+ THCTensor_(data)(state, output_n), n_
+ );
+ } else {
+ THCTensor_(zero)(state, output_n);
+ }
+
+ // Extract columns:
+ im2col(
+ THCState_getCurrentStream(state),
+ THCTensor_(data)(state, input_n),
+ nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+ 1, 1, THCTensor_(data)(state, columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = nOutputPlane;
+ long n = columns->size[1];
+ long k = nInputPlane*kH*kW;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 'n', 'n',
+ n, m, k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, columns), n,
+ THCTensor_(data)(state, weight), k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, output_n), n
+ );
+ }
+
+ // Free
+ THCTensor_(free)(state, input_n);
+ THCTensor_(free)(state, output_n);
+ if (freeWeight)
+ THCTensor_(free)(state, weight);
+
+ // Resize output
+ if (batch == 0) {
+ THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth);
+ THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+ }
+}
+
+void THNN_(SpatialConvolutionMM_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ THCTensor *gradColumns,
+ THCTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH) {
+
+ THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight,
+ gradColumns, gradInput);
+ THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+ THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero");
+ THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero");
+ THArgCheck(weight->nDimension == 2 || weight->nDimension == 4, 4, "weight tensor should be 2D or 4D");
+
+ // Params
+ int nInputPlane = weight->nDimension == 2 ? weight->size[1]/(kW*kH) : weight->size[1];
+ int nOutputPlane = weight->size[0];
+
+ int freeWeight = 0;
+ if (weight->nDimension == 4) {
+ long s1 = weight->size[0];
+ long s2 = weight->size[1] * weight->size[2] * weight->size[3];
+ weight = THCTensor_(newWithStorage2d)(state, weight->storage, weight->storageOffset, s1, -1, s2, -1);
+ freeWeight = 1;
+ }
+
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
+ THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
+
+ // Resize temporary columns
+ THCTensor_(resize2d)(state, gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+ // Helpers
+ THCTensor *gradInput_n = THCTensor_(new)(state);
+ THCTensor *gradOutput_n = THCTensor_(new)(state);
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per sample:
+ THCTensor_(select)(state, gradInput_n, gradInput, 0, elt);
+ THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = nInputPlane*kW*kH;
+ long n = gradColumns->size[1];
+ long k = nOutputPlane;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 'n', 't',
+ n, m, k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradOutput_n), n,
+ THCTensor_(data)(state, weight), m,
+ ScalarConvert<int, real>::to(0),
+ THCTensor_(data)(state, gradColumns), n
+ );
+
+ // Unpack columns back into input:
+ col2im<real, accreal>(
+ THCState_getCurrentStream(state),
+ THCTensor_(data)(state, gradColumns),
+ nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+ 1, 1, THCTensor_(data)(state, gradInput_n)
+ );
+ }
+
+ // Free
+ THCTensor_(free)(state, gradInput_n);
+ THCTensor_(free)(state, gradOutput_n);
+ if (freeWeight)
+ THCTensor_(free)(state, weight);
+
+ // Resize output
+ if (batch == 0) {
+ THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
+ THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+ THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth);
+ }
+}
+
+void THNN_(SpatialConvolutionMM_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ THCTensor *columns,
+ THCTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ real scale) {
+
+ THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, gradWeight, columns, ones);
+ if (gradBias) {
+ THCUNN_assertSameGPU_generic(state, 2, gradWeight, gradBias);
+ }
+ THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+ THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias");
+ THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+ THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
+ THArgCheck(gradWeight->nDimension == 2 || gradWeight->nDimension == 4, 4, "gradWeight tensor should be 2D or 4D");
+
+ // Params
+ int nInputPlane = gradWeight->nDimension == 2 ? gradWeight->size[1]/(kW*kH) : gradWeight->size[1];
+ int nOutputPlane = gradWeight->size[0];
+
+ int freeWeight = 0;
+ if (gradWeight->nDimension == 4) {
+ long s1 = gradWeight->size[0];
+ long s2 = gradWeight->size[1] * gradWeight->size[2] * gradWeight->size[3];
+ gradWeight = THCTensor_(newWithStorage2d)(state, gradWeight->storage, gradWeight->storageOffset, s1, -1, s2, -1);
+ freeWeight = 1;
+ }
+
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
+ THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Define a buffer of ones, for bias accumulation
+ if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THCTensor_(resize2d)(state, ones, outputHeight, outputWidth);
+ THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+ }
+
+ // Resize temporary columns
+ THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+ // Helpers
+ THCTensor *input_n = THCTensor_(new)(state);
+ THCTensor *gradOutput_n = THCTensor_(new)(state);
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THCTensor_(select)(state, input_n, input, 0, elt);
+ THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ im2col(
+ THCState_getCurrentStream(state),
+ THCTensor_(data)(state, input_n),
+ nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+ 1, 1, THCTensor_(data)(state, columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = nOutputPlane;
+ long n = nInputPlane*kW*kH;
+ long k = columns->size[1];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 't', 'n',
+ n, m, k,
+ scale,
+ THCTensor_(data)(state, columns), k,
+ THCTensor_(data)(state, gradOutput_n), k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradWeight), n
+ );
+
+ // Do Bias:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m_ = nOutputPlane;
+ long k_ = outputHeight * outputWidth;
+
+ // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+ if (gradBias) {
+ #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemv(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemv(
+ #endif
+ state,
+ 't',
+ k_, m_,
+ scale,
+ THCTensor_(data)(state, gradOutput_n), k_,
+ THCTensor_(data)(state, ones), 1,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradBias), 1
+ );
+ #endif
+ #ifdef THC_REAL_IS_HALF
+ THCudaBlas_Hgemm(
+ state,
+ 't', 'n',
+ m_, 1, k_,
+ scale,
+ THCTensor_(data)(state, gradOutput_n), k_,
+ THCTensor_(data)(state, ones), k_,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradBias), m_
+ );
+ #endif
+ }
+ }
+
+ // Free
+ THCTensor_(free)(state, input_n);
+ THCTensor_(free)(state, gradOutput_n);
+ if (freeWeight)
+ THCTensor_(free)(state, gradWeight);
+
+ // Resize
+ if (batch == 0) {
+ THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
+ THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+ }
+}
+
+#endif
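The gradBias step above relies on the `ones` buffer: multiplying gradOutput, viewed as an nOutputPlane x (outputHeight*outputWidth) matrix, by a vector of ones sums each row, so a single transposed GEMV (or, for half precision, a degenerate GEMM) accumulates scale times the spatial sum into gradBias. A minimal host-side C sketch of that reduction, with hypothetical names and a plain loop standing in for the cuBLAS call:

#include <stddef.h>

/* Sketch: gradBias[j] += scale * sum_{hw} gradOutput[j][hw], which is what the
   transposed GEMV against the ones buffer computes on the GPU. */
static void acc_grad_bias_sketch(float *gradBias, const float *gradOutput,
                                 size_t nOutputPlane, size_t outputHW, float scale)
{
  for (size_t j = 0; j < nOutputPlane; ++j) {
    float sum = 0.f;
    for (size_t hw = 0; hw < outputHW; ++hw)
      sum += gradOutput[j * outputHW + hw];
    gradBias[j] += scale * sum;
  }
}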
diff --git a/lib/THCUNN/generic/SpatialCrossMapLRN.cu b/lib/THCUNN/generic/SpatialCrossMapLRN.cu
new file mode 100644
index 0000000..a09ea0b
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialCrossMapLRN.cu
@@ -0,0 +1,114 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialCrossMapLRN.cu"
+#else
+
+void LRNforward(THCState* state, THCTensor* input, THCTensor* output,
+ THCTensor* scale, int local_size, real alpha, real beta, real k)
+{
+ THCTensor_(resizeAs)(state, output, input);
+ THCTensor_(resizeAs)(state, scale, input);
+
+ int batchSize;
+ int nInputPlane;
+ int imsize_h;
+ int imsize_w;
+
+ if (input->nDimension == 3) {
+ batchSize = 1;
+ nInputPlane = input->size[0];
+ imsize_h = input->size[1];
+ imsize_w = input->size[2];
+ }
+ else
+ {
+ batchSize = input->size[0];
+ nInputPlane = input->size[1];
+ imsize_h = input->size[2];
+ imsize_w = input->size[3];
+ }
+
+ input = THCTensor_(newContiguous)(state, input);
+
+ int n_threads = batchSize * imsize_h * imsize_w;
+ LRNFillScale<real, accreal> <<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
+ n_threads, THCTensor_(data)(state, input), batchSize, nInputPlane, imsize_h, imsize_w, local_size,
+ alpha / local_size, k, THCTensor_(data)(state, scale));
+ n_threads *= nInputPlane;
+ THCudaCheck(cudaGetLastError());
+ LRNComputeOutput<<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
+ n_threads, THCTensor_(data)(state, input), THCTensor_(data)(state, scale), -beta, THCTensor_(data)(state, output));
+ THCudaCheck(cudaGetLastError());
+
+ THCTensor_(free)(state, input);
+}
+
+
+void LRNbackward(THCState* state, THCTensor* input, THCTensor* output,
+ THCTensor* gradOutput, THCTensor* gradInput, THCTensor* scale,
+ int local_size, real alpha, real beta, real k)
+{
+ THCTensor_(resizeAs)(state, gradInput, input);
+
+ int batchSize;
+ int nInputPlane;
+ int imsize_h;
+ int imsize_w;
+
+ if (input->nDimension == 3) {
+ batchSize = 1;
+ nInputPlane = input->size[0];
+ imsize_h = input->size[1];
+ imsize_w = input->size[2];
+ }
+ else
+ {
+ batchSize = input->size[0];
+ nInputPlane = input->size[1];
+ imsize_h = input->size[2];
+ imsize_w = input->size[3];
+ }
+
+ input = THCTensor_(newContiguous)(state, input);
+ gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+ int n_threads = batchSize * imsize_h * imsize_w;
+ LRNComputeDiff<real, accreal> <<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
+ n_threads, THCTensor_(data)(state, input), THCTensor_(data)(state, output),
+ THCTensor_(data)(state, scale), THCTensor_(data)(state, gradOutput), batchSize, nInputPlane, imsize_h, imsize_w,
+ local_size, -beta, ScalarConvert<int, real>::to(2) * alpha * beta / local_size,
+ THCTensor_(data)(state, gradInput));
+ THCudaCheck(cudaGetLastError());
+
+ THCTensor_(free)(state, input);
+ THCTensor_(free)(state, gradOutput);
+}
+
+void THNN_(SpatialCrossMapLRN_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *scale,
+ int size,
+ real alpha,
+ real beta,
+ real k)
+{
+ LRNforward(state, input, output, scale, size, alpha, beta, k);
+}
+
+void THNN_(SpatialCrossMapLRN_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *scale,
+ THCTensor *output,
+ int size,
+ real alpha,
+ real beta,
+ real k)
+{
+ LRNbackward(state, input, output, gradOutput, gradInput, scale, size, alpha, beta, k);
+}
+
+#endif
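The two kernels used above split cross-map LRN into two passes: LRNFillScale writes scale = k + (alpha/local_size) * sum of squared activations over a window of local_size neighbouring channels, and LRNComputeOutput raises that scale to the power -beta and multiplies by the input. A single-threaded C sketch of the same forward computation, assuming NCHW layout and a simple symmetric window clipped at the channel borders (hypothetical name; the exact border handling lives in the kernel):

#include <math.h>

/* out[c][p] = in[c][p] * (k + (alpha/size) * sum_{c' near c} in[c'][p]^2)^(-beta) */
static void lrn_forward_sketch(float *out, const float *in,
                               int channels, int plane /* = H*W */,
                               int size, float alpha, float beta, float k)
{
  int half = (size - 1) / 2;
  for (int c = 0; c < channels; ++c) {
    int c0 = c - half < 0 ? 0 : c - half;
    int c1 = c + half >= channels ? channels - 1 : c + half;
    for (int p = 0; p < plane; ++p) {
      float sq = 0.f;
      for (int cc = c0; cc <= c1; ++cc)
        sq += in[cc * plane + p] * in[cc * plane + p];
      out[c * plane + p] = in[c * plane + p] * powf(k + (alpha / size) * sq, -beta);
    }
  }
}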
diff --git a/lib/THCUNN/generic/SpatialDilatedConvolution.cu b/lib/THCUNN/generic/SpatialDilatedConvolution.cu
new file mode 100644
index 0000000..fb5fe51
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialDilatedConvolution.cu
@@ -0,0 +1,402 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialDilatedConvolution.cu"
+#else
+
+void THNN_(SpatialDilatedConvolution_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias,
+ THCTensor *columns,
+ THCTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH) {
+
+ THCUNN_assertSameGPU_generic(state, 5, input, output, weight, columns, ones);
+ if (bias) {
+ THCUNN_assertSameGPU_generic(state, 2, weight, bias);
+ }
+ THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+ THArgCheck(weight->nDimension == 4, 4, "weight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
+ THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
+ THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+ THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
+ THArgCheck(dilationW > 0 && dilationH > 0, 14, "dilation should be greater than 0");
+
+ // Params:
+ int nInputPlane = weight->size[1];
+ int nOutputPlane = weight->size[0];
+
+ int batch = 1;
+ if (input->nDimension == 3) {
+ THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane don't match");
+ // Force batch
+ batch = 0;
+ THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
+ } else {
+ THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane don't match");
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ if (outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
+ nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth);
+
+ // Resize temporary columns
+ THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+ // Define a buffer of ones, for bias accumulation
+ // Note: this buffer can be shared with other modules, it only ever gets increased,
+ // and always contains ones.
+ if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THCTensor_(resize2d)(state, ones, outputHeight, outputWidth);
+ THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+ }
+
+ // Helpers
+ THCTensor *input_n = THCTensor_(new)(state);
+ THCTensor *output_n = THCTensor_(new)(state);
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THCTensor_(select)(state, input_n, input, 0, elt);
+ THCTensor_(select)(state, output_n, output, 0, elt);
+
+ // Do Bias first:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m_ = nOutputPlane;
+ long n_ = outputHeight * outputWidth;
+ long k_ = 1;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ if (bias) {
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 't', 'n',
+ n_, m_, k_,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, ones), k_,
+ THCTensor_(data)(state, bias), k_,
+ ScalarConvert<int, real>::to(0),
+ THCTensor_(data)(state, output_n), n_
+ );
+ } else {
+ THCTensor_(zero)(state, output_n);
+ }
+
+ // Extract columns:
+ im2col(
+ THCState_getCurrentStream(state),
+ THCTensor_(data)(state, input_n),
+ nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+ dilationH, dilationW,
+ THCTensor_(data)(state, columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = nOutputPlane;
+ long n = columns->size[1];
+ long k = nInputPlane*kH*kW;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 'n', 'n',
+ n, m, k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, columns), n,
+ THCTensor_(data)(state, weight), k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, output_n), n
+ );
+ }
+
+ // Free
+ THCTensor_(free)(state, input_n);
+ THCTensor_(free)(state, output_n);
+
+ // Resize output
+ if (batch == 0) {
+ THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth);
+ THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+ }
+}
+
+void THNN_(SpatialDilatedConvolution_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ THCTensor *gradColumns,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH) {
+
+ THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight,
+ gradColumns, gradInput);
+ THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+ THArgCheck(weight->nDimension == 4, 4, "weight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
+ THArgCheck(kW > 0 && kH > 0, 9, "kernel size should be greater than zero");
+ THArgCheck(dW > 0 && dH > 0, 11, "stride should be greater than zero");
+
+ // Params
+ int nInputPlane = weight->size[1];
+ int nOutputPlane = weight->size[0];
+
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
+ THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
+
+ // Resize temporary columns
+ THCTensor_(resize2d)(state, gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+ // Helpers
+ THCTensor *gradInput_n = THCTensor_(new)(state);
+ THCTensor *gradOutput_n = THCTensor_(new)(state);
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per sample:
+ THCTensor_(select)(state, gradInput_n, gradInput, 0, elt);
+ THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = nInputPlane*kW*kH;
+ long n = gradColumns->size[1];
+ long k = nOutputPlane;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 'n', 't',
+ n, m, k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradOutput_n), n,
+ THCTensor_(data)(state, weight), m,
+ ScalarConvert<int, real>::to(0),
+ THCTensor_(data)(state, gradColumns), n
+ );
+
+ // Unpack columns back into input:
+ col2im<real, accreal>(
+ THCState_getCurrentStream(state),
+ THCTensor_(data)(state, gradColumns),
+ nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+ dilationH, dilationW,
+ THCTensor_(data)(state, gradInput_n)
+ );
+ }
+
+ // Free
+ THCTensor_(free)(state, gradInput_n);
+ THCTensor_(free)(state, gradOutput_n);
+
+ // Resize output
+ if (batch == 0) {
+ THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
+ THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+ THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth);
+ }
+}
+
+void THNN_(SpatialDilatedConvolution_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ THCTensor *columns,
+ THCTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH,
+ real scale) {
+
+ THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, gradWeight, columns, ones);
+ if (gradBias) {
+ THCUNN_assertSameGPU_generic(state, 2, gradWeight, gradBias);
+ }
+ THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+ THArgCheck(gradWeight->nDimension == 4, 4, "gradWeight tensor must be 4D (nOutputPlane,nInputPlane,kH,kW)");
+ THArgCheck(!gradBias || gradWeight->size[0] == gradBias->size[0], 4, "nOutputPlane mismatch in gradWeight and gradBias");
+ THArgCheck(kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+ THArgCheck(dW > 0 && dH > 0, 10, "stride should be greater than zero");
+
+ // Params
+ int nInputPlane = gradWeight->size[1];
+ int nOutputPlane = gradWeight->size[0];
+
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
+ THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Define a buffer of ones, for bias accumulation
+ if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THCTensor_(resize2d)(state, ones, outputHeight, outputWidth);
+ THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+ }
+
+ // Resize temporary columns
+ THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+ // Helpers
+ THCTensor *input_n = THCTensor_(new)(state);
+ THCTensor *gradOutput_n = THCTensor_(new)(state);
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THCTensor_(select)(state, input_n, input, 0, elt);
+ THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ im2col(
+ THCState_getCurrentStream(state),
+ THCTensor_(data)(state, input_n),
+ nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+ dilationH, dilationW,
+ THCTensor_(data)(state, columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = nOutputPlane;
+ long n = nInputPlane*kW*kH;
+ long k = columns->size[1];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 't', 'n',
+ n, m, k,
+ scale,
+ THCTensor_(data)(state, columns), k,
+ THCTensor_(data)(state, gradOutput_n), k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradWeight), n
+ );
+
+ // Do Bias:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m_ = nOutputPlane;
+ long k_ = outputHeight * outputWidth;
+
+ // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+ if (gradBias) {
+ #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemv(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemv(
+ #endif
+ state,
+ 't',
+ k_, m_,
+ scale,
+ THCTensor_(data)(state, gradOutput_n), k_,
+ THCTensor_(data)(state, ones), 1,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradBias), 1
+ );
+ #endif
+ #ifdef THC_REAL_IS_HALF
+ THCudaBlas_Hgemm(
+ state,
+ 't', 'n',
+ m_, 1, k_,
+ scale,
+ THCTensor_(data)(state, gradOutput_n), k_,
+ THCTensor_(data)(state, ones), k_,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradBias), m_
+ );
+ #endif
+ }
+ }
+
+ // Free
+ THCTensor_(free)(state, input_n);
+ THCTensor_(free)(state, gradOutput_n);
+
+ // Resize
+ if (batch == 0) {
+ THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
+ THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+ }
+}
+
+#endif
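All three entry points above derive the spatial output size from the effective kernel extent dilationW*(kW-1)+1 (and likewise for the height). The same arithmetic as a small C helper with a hypothetical name:

/* Output extent of a dilated convolution along one dimension. */
static long dilated_conv_out_size(long in, long pad, long k, long stride, long dilation)
{
  return (in + 2 * pad - (dilation * (k - 1) + 1)) / stride + 1;
}
/* e.g. in = 32, k = 3, stride = 1, pad = 1, dilation = 2  ->  30 */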
diff --git a/lib/THCUNN/generic/SpatialDilatedMaxPooling.cu b/lib/THCUNN/generic/SpatialDilatedMaxPooling.cu
new file mode 100644
index 0000000..b99b5e6
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialDilatedMaxPooling.cu
@@ -0,0 +1,155 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialDilatedMaxPooling.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(SpatialDilatedMaxPooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCIndexTensor *indices,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH,
+ bool ceil_mode)
+{
+
+ THCUNN_assertSameGPU_generic(state, 3, input, output, indices);
+ THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
+
+ long nInputCols, nInputRows, nInputPlane, batchSize;
+ long nOutputCols, nOutputRows;
+
+ if (input->nDimension == 3) {
+ nInputCols = input->size[2];
+ nInputRows = input->size[1];
+ nInputPlane = input->size[0];
+ batchSize = 1;
+ }
+ else
+ {
+ nInputCols = input->size[3];
+ nInputRows = input->size[2];
+ nInputPlane = input->size[1];
+ batchSize = input->size[0];
+ }
+
+ THArgCheck(nInputCols >= kW - padW && nInputRows >= kH - padH, 2, "input image smaller than kernel size");
+ THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");
+ THArgCheck(dilationW > 0 && dilationH > 0, 11, "dilation should be greater than 0");
+ if(ceil_mode) {
+ nOutputCols = ceil(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1;
+ nOutputRows = ceil(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1;
+ }
+ else {
+ nOutputCols = floor(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1;
+ nOutputRows = floor(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1;
+ }
+
+ if (nOutputCols < 1 || nOutputRows < 1)
+   THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
+           nInputPlane,nInputRows,nInputCols,nInputPlane,nOutputRows,nOutputCols);
+
+ if (padW || padH)
+ {
+   // ensure that the last pooling starts inside the image
+   if ((nOutputRows - 1)*dH >= nInputRows + padH)
+     --nOutputRows;
+   if ((nOutputCols - 1)*dW >= nInputCols + padW)
+     --nOutputCols;
+ }
+
+ input = THCTensor_(newContiguous)(state, input);
+ real* input_data = THCTensor_(data)(state, input);
+
+ THCTensor_(resize4d)(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols);
+ THCUNN_resizeAs_indices(state, indices, output);
+
+ THCIndex_t* indices_data = THCIndexTensor_(data)(state, indices);
+ real* output_data = THCTensor_(data)(state, output);
+
+ int count = THCTensor_(nElement)(state, output);
+
+ MaxPoolForward<real, accreal> <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
+ (count, input_data,
+ batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+ kH, kW, dH, dW, padH, padW, dilationH, dilationW, output_data, indices_data);
+ THCudaCheck(cudaGetLastError());
+
+ if(input->nDimension == 3)
+ THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols);
+
+ THCTensor_(free)(state, input);
+}
+
+void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCIndexTensor *indices,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH,
+ bool ceil_mode)
+{
+ THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, indices, gradInput);
+
+ input = THCTensor_(newContiguous)(state, input);
+ gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+ long nInputCols, nInputRows, nInputPlane, batchSize;
+ long nOutputCols, nOutputRows;
+
+ if (input->nDimension == 3) {
+ nInputCols = input->size[2];
+ nInputRows = input->size[1];
+ nInputPlane = input->size[0];
+ batchSize = 1;
+ }
+ else
+ {
+ nInputCols = input->size[3];
+ nInputRows = input->size[2];
+ nInputPlane = input->size[1];
+ batchSize = input->size[0];
+ }
+
+ if(ceil_mode) {
+ nOutputCols = ceil(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1;
+ nOutputRows = ceil(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1;
+ }
+ else {
+ nOutputCols = floor(float(nInputCols - (dilationW * (kW - 1) + 1) + 2*padW) / float(dW)) + 1;
+ nOutputRows = floor(float(nInputRows - (dilationH * (kH - 1) + 1) + 2*padH) / float(dH)) + 1;
+ }
+
+ if (nOutputCols < 1 || nOutputRows < 1)
+ THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
+ nInputPlane,nInputRows,nInputCols,nInputPlane,nOutputRows,nOutputCols);
+
+ gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+ THCTensor_(resizeAs)(state, gradInput, input);
+
+ int count = THCTensor_(nElement)(state, input);
+
+ MaxPoolBackward<real, accreal> <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
+ (count,
+ THCTensor_(data)(state, gradOutput),
+ THCIndexTensor_(data)(state, indices),
+ batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
+ kH, kW, dH, dW, padH, padW, dilationH, dilationW,
+ THCTensor_(data)(state, gradInput));
+ THCudaCheck(cudaGetLastError());
+
+ THCTensor_(free)(state, gradOutput);
+
+ // clean
+ THCTensor_(free)(state, input);
+ THCTensor_(free)(state, gradOutput);
+}
+
+#endif
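Both functions above compute the pooled output size with either floor (default) or ceil (ceil_mode), and when padding is used they shrink the result so the last window still starts inside the image. A C sketch of that arithmetic for one dimension (hypothetical name):

#include <math.h>

static long pooled_out_size(long in, long k, long stride, long pad,
                            long dilation, int ceil_mode)
{
  double span = (double)(in - (dilation * (k - 1) + 1) + 2 * pad) / (double)stride;
  long out = (long)(ceil_mode ? ceil(span) : floor(span)) + 1;
  if (pad && (out - 1) * stride >= in + pad)
    --out;  /* ensure the last pooling window starts inside the (padded) image */
  return out;
}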
diff --git a/lib/THCUNN/generic/SpatialFractionalMaxPooling.cu b/lib/THCUNN/generic/SpatialFractionalMaxPooling.cu
new file mode 100644
index 0000000..6e7fc99
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialFractionalMaxPooling.cu
@@ -0,0 +1,155 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialFractionalMaxPooling.cu"
+#else
+
+void THNN_(SpatialFractionalMaxPooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ int outputW, int outputH,
+ int poolSizeW, int poolSizeH,
+ THCIndexTensor *indices,
+ THCTensor *randomSamples)
+{
+ int planeDim = 0;
+ int dimh = 1;
+ int dimw = 2;
+ long numBatch = 1;
+
+ long numInputDims = THCTensor_(nDimension)(state, input);
+ THArgCheck(numInputDims == 3 || numInputDims == 4, 2,
+ "3D or 4D (batch mode) tensor expected");
+
+ if (numInputDims == 4) {
+ numBatch = THCTensor_(size)(state, input, 0);
+ planeDim++;
+ dimh++;
+ dimw++;
+ }
+
+ /* sizes */
+ long numPlanes = THCTensor_(size)(state, input, planeDim);
+ long inputH = THCTensor_(size)(state, input, dimh);
+ long inputW = THCTensor_(size)(state, input, dimw);
+
+ THArgCheck(outputH + poolSizeH - 1 < inputH, 6,
+ "poolSizeH too large relative to input height");
+ THArgCheck(outputW + poolSizeW - 1 < inputW, 5,
+ "poolSizeW too large relative to input width");
+
+ THCDeviceTensor<real, 4> devInput;
+ THCDeviceTensor<real, 4> devOutput;
+ THCDeviceTensor<THCIndex_t, 4> devIndices;
+ THCDeviceTensor<real, 3> devSamples =
+ toDeviceTensor<real, 3>(state, randomSamples);
+
+ if (numInputDims == 3) {
+ /* resize output */
+ THCTensor_(resize3d)(state, output, numPlanes, outputH, outputW);
+ /* indices will contain the locations for each output point */
+ THCIndexTensor_(resize3d)(state, indices, numPlanes, outputH, outputW);
+
+ devInput = toDeviceTensor<real, 3>(state, input).upcastOuter<4>();
+ devOutput = toDeviceTensor<real, 3>(state, output).upcastOuter<4>();
+ devIndices = toDeviceTensor<THCIndex_t, 3>(state, indices).upcastOuter<4>();
+ } else {
+ THCTensor_(resize4d)(state, output, numBatch, numPlanes, outputH, outputW);
+ /* indices will contain the locations for each output point */
+ THCIndexTensor_(resize4d)(state, indices, numBatch, numPlanes, outputH, outputW);
+
+ devInput = toDeviceTensor<real, 4>(state, input);
+ devOutput = toDeviceTensor<real, 4>(state, output);
+ devIndices = toDeviceTensor<THCIndex_t, 4>(state, indices);
+ }
+
+ // block is limited to 4 warps
+ // grid handles overflow per each plane
+ int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3);
+ dim3 grid(THCCeilDiv(outputPlaneSize, 128),
+ devInput.getSize(1),
+ devInput.getSize(0));
+ dim3 block(outputPlaneSize > 128 ? 128 : outputPlaneSize);
+
+#define SFMP_UPDATE_OUTPUT(POOL_W) \
+ SpatialFractionalMaxPooling_updateOutput<POOL_W, real, accreal> \
+ <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
+ devInput, devOutput, devIndices, devSamples, poolSizeW, poolSizeH);
+
+#define SFMP_UPDATE_OUTPUT_CASE(POOL_W) \
+ case POOL_W: SFMP_UPDATE_OUTPUT(POOL_W); break
+
+ switch (poolSizeW) {
+ SFMP_UPDATE_OUTPUT_CASE(2);
+ SFMP_UPDATE_OUTPUT_CASE(3);
+ SFMP_UPDATE_OUTPUT_CASE(4);
+ SFMP_UPDATE_OUTPUT_CASE(5);
+ SFMP_UPDATE_OUTPUT_CASE(6);
+ SFMP_UPDATE_OUTPUT_CASE(7);
+ default:
+ // dynamic pool width
+ SFMP_UPDATE_OUTPUT_CASE(-1);
+ }
+ THCudaCheck(cudaGetLastError());
+}
+
+void THNN_(SpatialFractionalMaxPooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ int outputW, int outputH,
+ int poolSizeW, int poolSizeH,
+ THCIndexTensor *indices)
+{
+ int dimh = 1;
+ int dimw = 2;
+
+ long numInputDims = THCTensor_(nDimension)(state, input);
+ if (numInputDims == 4) {
+ dimh++;
+ dimw++;
+ }
+
+ /* sizes */
+ long inputH = THCTensor_(size)(state, input, dimh);
+ long inputW = THCTensor_(size)(state, input, dimw);
+
+ THArgCheck(outputH == THCTensor_(size)(state, gradOutput, dimh), 3,
+ "gradOutput height unexpected");
+ THArgCheck(outputW == THCTensor_(size)(state, gradOutput, dimw), 3,
+ "gradOutput width unexpected");
+
+ /* resize */
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THCTensor_(zero)(state, gradInput);
+
+ THCDeviceTensor<real, 4> devGradInput;
+ THCDeviceTensor<real, 4> devGradOutput;
+ THCDeviceTensor<THCIndex_t, 4> devIndices;
+
+ /* backprop */
+ if (numInputDims == 3) {
+ devGradInput = toDeviceTensor<real, 3>(state, gradInput).upcastOuter<4>();
+ devGradOutput = toDeviceTensor<real, 3>(state, gradOutput).upcastOuter<4>();
+ devIndices = toDeviceTensor<THCIndex_t, 3>(state, indices).upcastOuter<4>();
+ } else {
+ devGradInput = toDeviceTensor<real, 4>(state, gradInput);
+ devGradOutput = toDeviceTensor<real, 4>(state, gradOutput);
+ devIndices = toDeviceTensor<THCIndex_t, 4>(state, indices);
+ }
+
+ // block is limited to 4 warps
+ // grid handles overflow per each plane
+ int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3);
+ dim3 grid(THCCeilDiv(outputPlaneSize, 128),
+ devGradInput.getSize(1),
+ devGradInput.getSize(0));
+ dim3 block(outputPlaneSize > 128 ? 128 : outputPlaneSize);
+
+ SpatialFractionalMaxPooling_updateGradInput
+ <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
+ devGradInput, devGradOutput, devIndices);
+ THCudaCheck(cudaGetLastError());
+}
+
+#endif
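The switch over poolSizeW above dispatches to a kernel templated on the pool width (with -1 as the generic, runtime-width fallback), presumably so the inner loop over the window can be resolved at compile time for the common small widths. The launch geometry, reused by the padding modules below, tiles each output plane with blocks of at most 128 threads and uses the remaining grid dimensions for plane and batch; a plain C sketch of that computation (hypothetical name):

/* grid[0] tiles the output plane in chunks of <=128 threads,
   grid[1] indexes the feature plane, grid[2] indexes the batch element. */
static void launch_geometry_sketch(int outputPlaneSize, int numPlanes, int numBatch,
                                   int grid[3], int block[1])
{
  grid[0]  = (outputPlaneSize + 127) / 128;   /* THCCeilDiv(outputPlaneSize, 128) */
  grid[1]  = numPlanes;
  grid[2]  = numBatch;
  block[0] = outputPlaneSize > 128 ? 128 : outputPlaneSize;
}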
diff --git a/lib/THCUNN/generic/SpatialFullConvolution.cu b/lib/THCUNN/generic/SpatialFullConvolution.cu
new file mode 100644
index 0000000..7f0ebc7
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialFullConvolution.cu
@@ -0,0 +1,381 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialFullConvolution.cu"
+#else
+
+void THNN_(SpatialFullConvolution_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias,
+ THCTensor *columns,
+ THCTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH)
+{
+
+ int nInputPlane = THCTensor_(size)(state, weight, 0);
+ int nOutputPlane = THCTensor_(size)(state, weight, 1);
+
+ THCUNN_assertSameGPU_generic(state, 6, input, output, weight,
+ bias, columns, ones);
+
+ THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+
+ int batch = 1;
+ if (input->nDimension == 3) {
+ THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane don't match");
+ // Force batch
+ batch = 0;
+ THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
+ } else {
+ THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane don't match");
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+ long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THCTensor_(resize4d)(state, output, batchSize, nOutputPlane, outputHeight, outputWidth);
+
+ // Resize temporary columns
+ THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
+
+ // Define a buffer of ones, for bias accumulation
+ // Note: this buffer can be shared with other modules, it only ever gets increased,
+ // and always contains ones.
+ if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THCTensor_(resize2d)(state, ones, outputHeight, outputWidth);
+ THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+ }
+
+ // Helpers
+ THCTensor *input_n = THCTensor_(new)(state);
+ THCTensor *output_n = THCTensor_(new)(state);
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THCTensor_(select)(state, input_n, input, 0, elt);
+ THCTensor_(select)(state, output_n, output, 0, elt);
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = weight->size[1] * weight->size[2] * weight->size[3];
+ long n = columns->size[1];
+ long k = weight->size[0];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 'n', 't',
+ n, m, k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, input_n), n,
+ THCTensor_(data)(state, weight), m,
+ ScalarConvert<int, real>::to(0),
+ THCTensor_(data)(state, columns), n
+ );
+
+ // Unpack columns back into output:
+ col2im<real, accreal>(
+ THCState_getCurrentStream(state),
+ THCTensor_(data)(state, columns),
+ nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
+ 1, 1, THCTensor_(data)(state, output_n)
+ );
+
+ // Do Bias after:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m_ = nOutputPlane;
+ long n_ = outputHeight * outputWidth;
+ long k_ = 1;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ if (bias) {
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 't', 'n',
+ n_, m_, k_,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, ones), k_,
+ THCTensor_(data)(state, bias), k_,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, output_n), n_
+ );
+ }
+
+ }
+
+ // Free
+ THCTensor_(free)(state, input_n);
+ THCTensor_(free)(state, output_n);
+
+ // Resize output
+ if (batch == 0) {
+ THCTensor_(resize3d)(state, output, nOutputPlane, outputHeight, outputWidth);
+ THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+ }
+}
+
+void THNN_(SpatialFullConvolution_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ THCTensor *gradColumns,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH)
+{
+ int nInputPlane = THCTensor_(size)(state, weight, 0);
+ int nOutputPlane = THCTensor_(size)(state, weight, 1);
+
+ THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight,
+ gradColumns, gradInput);
+ THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
+ THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+ long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THCTensor_(resize4d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
+
+ // Resize temporary columns
+ THCTensor_(resize2d)(state, gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth);
+
+ // Helpers
+ THCTensor *gradInput_n = THCTensor_(new)(state);
+ THCTensor *gradOutput_n = THCTensor_(new)(state);
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per sample:
+ THCTensor_(select)(state, gradInput_n, gradInput, 0, elt);
+ THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ im2col(
+ THCState_getCurrentStream(state),
+ THCTensor_(data)(state, gradOutput_n),
+ nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
+ 1, 1, THCTensor_(data)(state, gradColumns)
+ );
+
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = weight->size[0];
+ long n = gradColumns->size[1];
+ long k = weight->size[1] * weight->size[2] * weight->size[3];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 'n', 'n',
+ n, m, k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradColumns), n,
+ THCTensor_(data)(state, weight), k,
+ ScalarConvert<int, real>::to(0),
+ THCTensor_(data)(state, gradInput_n), n
+ );
+ }
+
+
+ // Free
+ THCTensor_(free)(state, gradInput_n);
+ THCTensor_(free)(state, gradOutput_n);
+
+ // Resize output
+ if (batch == 0) {
+ THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
+ THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+ THCTensor_(resize3d)(state, gradInput, nInputPlane, inputHeight, inputWidth);
+ }
+}
+
+
+void THNN_(SpatialFullConvolution_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ THCTensor *columns,
+ THCTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH,
+ real scale)
+{
+ int nInputPlane = THCTensor_(size)(state, gradWeight, 0);
+ int nOutputPlane = THCTensor_(size)(state, gradWeight, 1);
+
+ THCUNN_assertSameGPU_generic(state, 6, input, gradOutput, gradWeight,
+ gradBias, columns, ones);
+
+ THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");
+
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THCTensor_(resize4d)(state, input, 1, input->size[0], input->size[1], input->size[2]);
+ THCTensor_(resize4d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+ long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Define a buffer of ones, for bias accumulation
+ if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THCTensor_(resize2d)(state, ones, outputHeight, outputWidth);
+ THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+ }
+
+ // Resize temporary columns
+ THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
+
+ // Helpers
+ THCTensor *input_n = THCTensor_(new)(state);
+ THCTensor *gradOutput_n = THCTensor_(new)(state);
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THCTensor_(select)(state, input_n, input, 0, elt);
+ THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ im2col(
+ THCState_getCurrentStream(state),
+ THCTensor_(data)(state, gradOutput_n),
+ nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
+ 1, 1, THCTensor_(data)(state, columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long n = columns->size[0]; // nOutputPlane * kh * kw
+ long m = input_n->size[0]; // nInputPlane
+ long k = columns->size[1]; // inputHeight * inputWidth
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 't', 'n',
+ n, m, k,
+ scale,
+ THCTensor_(data)(state, columns), k,
+ THCTensor_(data)(state, input_n), k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradWeight), n
+ );
+
+ // Do Bias:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m_ = nOutputPlane;
+ long k_ = outputHeight * outputWidth;
+
+ // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+ if (gradBias) {
+ #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemv(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemv(
+ #endif
+ state,
+ 't',
+ k_, m_,
+ scale,
+ THCTensor_(data)(state, gradOutput_n), k_,
+ THCTensor_(data)(state, ones), 1,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradBias), 1
+ );
+ #endif
+ #ifdef THC_REAL_IS_HALF
+ THCudaBlas_Hgemm(
+ state,
+ 't', 'n',
+ m_, 1, k_,
+ scale,
+ THCTensor_(data)(state, gradOutput_n), k_,
+ THCTensor_(data)(state, ones), k_,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradBias), m_
+ );
+ #endif
+ }
+ }
+
+ // Free
+ THCTensor_(free)(state, input_n);
+ THCTensor_(free)(state, gradOutput_n);
+
+ // Resize
+ if (batch == 0) {
+ THCTensor_(resize3d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
+ THCTensor_(resize3d)(state, input, nInputPlane, inputHeight, inputWidth);
+ }
+}
+
+#endif
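SpatialFullConvolution runs the convolution "in reverse": updateOutput multiplies by the transposed weight and then uses col2im to scatter columns into the (larger) output, so the spatial size grows rather than shrinks. The output extent used throughout the file, as a small C helper with a hypothetical name:

/* Output extent of a full (transposed) convolution along one dimension. */
static long full_conv_out_size(long in, long pad, long k, long stride, long adj)
{
  return (in - 1) * stride - 2 * pad + k + adj;
}
/* e.g. in = 16, k = 4, stride = 2, pad = 1, adj = 0  ->  32 */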
diff --git a/lib/THCUNN/generic/SpatialMaxPooling.cu b/lib/THCUNN/generic/SpatialMaxPooling.cu
new file mode 100644
index 0000000..6be838d
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialMaxPooling.cu
@@ -0,0 +1,40 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialMaxPooling.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(SpatialMaxPooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCIndexTensor *indices,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ bool ceil_mode)
+{
+ THNN_(SpatialDilatedMaxPooling_updateOutput)(
+ state, input, output, indices,
+ kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode);
+
+}
+
+void THNN_(SpatialMaxPooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCIndexTensor *indices,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ bool ceil_mode)
+{
+ THNN_(SpatialDilatedMaxPooling_updateGradInput)(
+ state, input, gradOutput, gradInput, indices,
+ kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode);
+
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialMaxUnpooling.cu b/lib/THCUNN/generic/SpatialMaxUnpooling.cu
new file mode 100644
index 0000000..8b0c15e
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialMaxUnpooling.cu
@@ -0,0 +1,93 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialMaxUnpooling.cu"
+#else
+
+void THNN_(SpatialMaxUnpooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCIndexTensor *indices,
+ int owidth, int oheight)
+{
+ THCUNN_assertSameGPU_generic(state, 3, input, output, indices);
+ THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
+
+ long nInputCols, nInputRows, nInputPlane, batchSize;
+
+ if (input->nDimension == 3) {
+ nInputCols = input->size[2];
+ nInputRows = input->size[1];
+ nInputPlane = input->size[0];
+ batchSize = 1;
+ }
+ else
+ {
+ nInputCols = input->size[3];
+ nInputRows = input->size[2];
+ nInputPlane = input->size[1];
+ batchSize = input->size[0];
+ }
+
+ input = THCTensor_(newContiguous)(state, input);
+ indices = THCIndexTensor_(newContiguous)(state, indices);
+ THCTensor_(resize4d)(state, output, batchSize, nInputPlane, oheight, owidth);
+ THCTensor_(zero)(state, output);
+
+ int count = THCTensor_(nElement)(state, input);
+
+ MaxUnpoolForward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
+ (count, THCTensor_(data)(state, input), THCIndexTensor_(data)(state, indices),
+ batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCTensor_(data)(state, output));
+ THCudaCheck(cudaGetLastError());
+
+ if(input->nDimension == 3)
+ THCTensor_(resize3d)(state, output, nInputPlane, oheight, owidth);
+
+ THCTensor_(free)(state, input);
+
+}
+
+void THNN_(SpatialMaxUnpooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCIndexTensor *indices,
+ int owidth, int oheight)
+{
+ THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, indices, gradInput);
+
+ long nInputCols, nInputRows, nInputPlane, batchSize;
+
+ if (input->nDimension == 3) {
+ nInputCols = input->size[2];
+ nInputRows = input->size[1];
+ nInputPlane = input->size[0];
+ batchSize = 1;
+ }
+ else
+ {
+ nInputCols = input->size[3];
+ nInputRows = input->size[2];
+ nInputPlane = input->size[1];
+ batchSize = input->size[0];
+ }
+
+ input = THCTensor_(newContiguous)(state, input);
+ indices = THCIndexTensor_(newContiguous)(state, indices);
+ gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+ THCTensor_(resizeAs)(state, gradInput, input);
+
+ int count = THCTensor_(nElement)(state, input);
+
+ MaxUnpoolBackward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
+ (count, THCTensor_(data)(state, gradOutput), THCIndexTensor_(data)(state, indices),
+ batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCTensor_(data)(state, gradInput));
+ THCudaCheck(cudaGetLastError());
+
+ // clean
+ THCTensor_(free)(state, input);
+ THCTensor_(free)(state, gradOutput);
+}
+
+#endif
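MaxUnpoolForward writes each input value to the position recorded in `indices` by the preceding max pooling, leaving every other element of the zero-initialised output untouched; MaxUnpoolBackward gathers gradients back through the same indices. A host-side C sketch of the forward scatter for one plane, assuming the indices are flat offsets within the output plane (hypothetical name; the exact index encoding is whatever the pooling kernel stored):

#include <string.h>

static void max_unpool_plane_sketch(float *out, const float *in, const long *indices,
                                    long inputHW, long outputHW)
{
  memset(out, 0, (size_t)outputHW * sizeof(float));
  for (long i = 0; i < inputHW; ++i)
    out[indices[i]] = in[i];   /* scatter the kept maximum back to where it came from */
}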
diff --git a/lib/THCUNN/generic/SpatialReflectionPadding.cu b/lib/THCUNN/generic/SpatialReflectionPadding.cu
new file mode 100644
index 0000000..e4efa91
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialReflectionPadding.cu
@@ -0,0 +1,110 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialReflectionPadding.cu"
+#else
+
+void THNN_(SpatialReflectionPadding_updateOutput)(THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ int padL, int padR,
+ int padT, int padB) {
+ THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2,
+ "input tensor must fit into 32-bit index math");
+
+ int planeDim = 0;
+ int dimh = 1;
+ int dimw = 2;
+ int numBatch = 1;
+
+ int numInputDims = THCTensor_(nDimension)(state, input);
+ THArgCheck(numInputDims == 3 || numInputDims == 4, 2,
+ "input must be 3 or 4-dimensional");
+
+ if (numInputDims == 4) {
+ numBatch = THCTensor_(size)(state, input, 0);
+ planeDim++;
+ dimh++;
+ dimw++;
+ }
+
+ int numPlanes = THCTensor_(size)(state, input, planeDim);
+ int inputH = THCTensor_(size)(state, input, dimh);
+ int inputW = THCTensor_(size)(state, input, dimw);
+ int outputH = inputH + padT + padB;
+ int outputW = inputW + padL + padR;
+
+ THCDeviceTensor<real, 4> devInput;
+ THCDeviceTensor<real, 4> devOutput;
+
+ if (numInputDims == 3) {
+ THCTensor_(resize3d)(state, output, numPlanes, outputH, outputW);
+
+ devInput = toDeviceTensor<real, 3>(state, input).upcastOuter<4>();
+ devOutput = toDeviceTensor<real, 3>(state, output).upcastOuter<4>();
+ } else {
+ THCTensor_(resize4d)(state, output, numBatch, numPlanes, outputH, outputW);
+
+ devInput = toDeviceTensor<real, 4>(state, input);
+ devOutput = toDeviceTensor<real, 4>(state, output);
+ }
+
+ int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3);
+ dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
+ devOutput.getSize(1),
+ devOutput.getSize(0));
+ dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
+
+ SpatialReflectionPadding_updateOutput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
+ devInput, devOutput, padT, padB, padL, padR);
+ THCudaCheck(cudaGetLastError());
+}
+
+void THNN_(SpatialReflectionPadding_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ int padL, int padR,
+ int padT, int padB) {
+
+ THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2,
+ "input tensor must fit into 32-bit index math");
+ THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, gradOutput), 3,
+ "output gradient tensor must fit into 32-bit index math");
+
+ int planeDim = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ int numInputDims = THCTensor_(nDimension)(state, input);
+ if (numInputDims == 4) {
+ planeDim++;
+ dimh++;
+ dimw++;
+ }
+
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THCTensor_(zero)(state, gradInput);
+
+ THCDeviceTensor<real, 4> devGradInput;
+ THCDeviceTensor<real, 4> devGradOutput;
+
+ if (numInputDims == 3) {
+ devGradInput = toDeviceTensor<real, 3>(state, gradInput).upcastOuter<4>();
+ devGradOutput = toDeviceTensor<real, 3>(state, gradOutput).upcastOuter<4>();
+ } else {
+ devGradInput = toDeviceTensor<real, 4>(state, gradInput);
+ devGradOutput = toDeviceTensor<real, 4>(state, gradOutput);
+ }
+
+ int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3);
+ dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
+ devGradOutput.getSize(1),
+ devGradOutput.getSize(0));
+ dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
+
+ SpatialReflectionPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
+ devGradInput, devGradOutput, padT, padB, padL, padR);
+ THCudaCheck(cudaGetLastError());
+}
+
+#endif
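The padding kernels map every output coordinate back to the input coordinate it copies from; for reflection padding, coordinates that fall in the padded border are mirrored about the edge without repeating the border pixel. A one-dimensional C sketch of that mapping, assuming pad < inputSize (hypothetical name; the kernel applies the same idea to rows and columns independently):

static int reflect_index(int out_i, int pad, int inputSize)
{
  int i = out_i - pad;                               /* coordinate relative to the unpadded input */
  if (i < 0)          i = -i;                        /* mirror across the left edge  */
  if (i >= inputSize) i = 2 * (inputSize - 1) - i;   /* mirror across the right edge */
  return i;
}
/* Replication padding (next file) uses a clamp instead:
   i < 0 ? 0 : (i >= inputSize ? inputSize - 1 : i). */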
diff --git a/lib/THCUNN/generic/SpatialReplicationPadding.cu b/lib/THCUNN/generic/SpatialReplicationPadding.cu
new file mode 100644
index 0000000..cd775d3
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialReplicationPadding.cu
@@ -0,0 +1,111 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialReplicationPadding.cu"
+#else
+
+void THNN_(SpatialReplicationPadding_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ int padL, int padR,
+ int padT, int padB) {
+ THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2,
+ "input tensor must fit into 32-bit index math");
+
+ int planeDim = 0;
+ int dimh = 1;
+ int dimw = 2;
+ int numBatch = 1;
+
+ int numInputDims = THCTensor_(nDimension)(state, input);
+ THArgCheck(numInputDims == 3 || numInputDims == 4, 2,
+ "input must be 3 or 4-dimensional");
+
+ if (numInputDims == 4) {
+ numBatch = THCTensor_(size)(state, input, 0);
+ planeDim++;
+ dimh++;
+ dimw++;
+ }
+
+ int numPlanes = THCTensor_(size)(state, input, planeDim);
+ int inputH = THCTensor_(size)(state, input, dimh);
+ int inputW = THCTensor_(size)(state, input, dimw);
+ int outputH = inputH + padT + padB;
+ int outputW = inputW + padL + padR;
+
+ THCDeviceTensor<real, 4> devInput;
+ THCDeviceTensor<real, 4> devOutput;
+
+ if (numInputDims == 3) {
+ THCTensor_(resize3d)(state, output, numPlanes, outputH, outputW);
+
+ devInput = toDeviceTensor<real, 3>(state, input).upcastOuter<4>();
+ devOutput = toDeviceTensor<real, 3>(state, output).upcastOuter<4>();
+ } else {
+ THCTensor_(resize4d)(state, output, numBatch, numPlanes, outputH, outputW);
+
+ devInput = toDeviceTensor<real, 4>(state, input);
+ devOutput = toDeviceTensor<real, 4>(state, output);
+ }
+
+ int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3);
+ dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
+ devOutput.getSize(1),
+ devOutput.getSize(0));
+ dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
+
+ SpatialReplicationPadding_updateOutput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
+ devInput, devOutput, padT, padB, padL, padR);
+
+}
+
+void THNN_(SpatialReplicationPadding_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ int padL, int padR,
+ int padT, int padB) {
+
+ THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2,
+ "input tensor must fit into 32-bit index math");
+ THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, gradOutput), 3,
+ "output gradient tensor must fit into 32-bit index math");
+
+ int planeDim = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ int numInputDims = THCTensor_(nDimension)(state, input);
+ if (numInputDims == 4) {
+ planeDim++;
+ dimh++;
+ dimw++;
+ }
+
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THCTensor_(zero)(state, gradInput);
+
+ THCDeviceTensor<real, 4> devGradInput;
+ THCDeviceTensor<real, 4> devGradOutput;
+
+ if (numInputDims == 3) {
+ devGradInput = toDeviceTensor<real, 3>(state, gradInput).upcastOuter<4>();
+ devGradOutput = toDeviceTensor<real, 3>(state, gradOutput).upcastOuter<4>();
+ } else {
+ devGradInput = toDeviceTensor<real, 4>(state, gradInput);
+ devGradOutput = toDeviceTensor<real, 4>(state, gradOutput);
+ }
+
+ int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3);
+ dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
+ devGradOutput.getSize(1),
+ devGradOutput.getSize(0));
+ dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
+
+ SpatialReplicationPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
+ devGradInput, devGradOutput, padT, padB, padL, padR);
+
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialSubSampling.cu b/lib/THCUNN/generic/SpatialSubSampling.cu
new file mode 100644
index 0000000..4b5f238
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialSubSampling.cu
@@ -0,0 +1,230 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialSubSampling.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(SpatialSubSampling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias,
+ int kW, int kH,
+ int dW, int dH)
+{
+ real *weight_data = THCTensor_(data)(state, weight);
+ real *bias_data = THCTensor_(data)(state, bias);
+ real *output_data;
+ real *input_data;
+
+ int nInputPlane = THCTensor_(size)(state, weight, 0);
+
+ THCUNN_assertSameGPU_generic(state, 4, input, output, weight, bias);
+ THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");
+
+ if (input->nDimension == 3) {
+ long nInputCols = input->size[2];
+ long nInputRows = input->size[1];
+ long nOutputCols = (nInputCols - kW) / dW + 1;
+ long nOutputRows = (nInputRows - kH) / dH + 1;
+
+ THArgCheck(input->size[0] == nInputPlane, 2, "invalid number of input planes");
+ THArgCheck(nInputCols >= kW && nInputRows >= kH, 2, "input image smaller than kernel size");
+
+ input = THCTensor_(newContiguous)(state, input);
+ input_data = THCTensor_(data)(state, input);
+
+ THCTensor_(resize3d)(state, output, nInputPlane, nOutputRows, nOutputCols);
+ output_data = THCTensor_(data)(state, output);
+
+ // cuda blocks & threads:
+ int yblocks = (int)(16L / nInputPlane);
+ yblocks = yblocks < 1 ? 1 : yblocks;
+ dim3 blocks(nInputPlane,yblocks);
+ dim3 threads(32,8);
+
+ // run subsample kernel
+ subsample<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+ input_data, output_data, weight_data, bias_data,
+ nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
+ THCudaCheck(cudaGetLastError());
+ } else {
+ long nInputCols = input->size[3];
+ long nInputRows = input->size[2];
+ long nbatch = input->size[0];
+ long nOutputCols = (nInputCols - kW) / dW + 1;
+ long nOutputRows = (nInputRows - kH) / dH + 1;
+
+ THArgCheck(input->size[1] == nInputPlane, 2, "invalid number of input planes");
+ THArgCheck(nInputCols >= kW && nInputRows >= kH, 2, "input image smaller than kernel size");
+
+ input = THCTensor_(newContiguous)(state, input);
+ input_data = THCTensor_(data)(state, input);
+
+ THCTensor_(resize4d)(state, output, nbatch, nInputPlane, nOutputRows, nOutputCols);
+ output_data = THCTensor_(data)(state, output);
+
+ // cuda blocks & threads:
+ int yblocks = (int)(16L / nInputPlane);
+ yblocks = yblocks < 1 ? 1 : yblocks;
+ dim3 blocks(nInputPlane*nbatch,yblocks);
+ dim3 threads(32,8);
+
+ // run subsample kernel
+ subsample<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+ input_data, output_data, weight_data, bias_data,
+ nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
+ THCudaCheck(cudaGetLastError());
+ }
+
+ // clean
+ THCTensor_(free)(state, input);
+
+}
+
+void THNN_(SpatialSubSampling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ int kW, int kH,
+ int dW, int dH)
+{
+ THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, weight, gradInput);
+
+ int nInputPlane = THCTensor_(size)(state, weight, 0);
+
+ if (input->nDimension == 3) {
+ long nInputCols = input->size[2];
+ long nInputRows = input->size[1];
+
+ real *weight_data = THCTensor_(data)(state, weight);
+ real *gradOutput_data = THCTensor_(data)(state, gradOutput);
+ real *gradInput_data;
+
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THCTensor_(zero)(state, gradInput);
+ gradInput_data = THCTensor_(data)(state, gradInput);
+
+ // cuda blocks & threads:
+ int yblocks = (int)(16L / nInputPlane);
+ yblocks = yblocks < 1 ? 1 : yblocks;
+ dim3 blocks(nInputPlane,yblocks);
+ dim3 threads(32,8);
+
+ // run updateGradInput kernel
+ if (kH <= dH && kW <= dW) {
+ subgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+ gradInput_data, gradOutput_data, weight_data,
+ nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
+ } else {
+ subgradinputAtomic <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+ gradInput_data, gradOutput_data, weight_data,
+ nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
+ }
+ THCudaCheck(cudaGetLastError());
+ } else {
+ long nInputCols = input->size[3];
+ long nInputRows = input->size[2];
+ long nbatch = input->size[0];
+
+ real *weight_data = THCTensor_(data)(state, weight);
+ real *gradOutput_data = THCTensor_(data)(state, gradOutput);
+ real *gradInput_data;
+
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THCTensor_(zero)(state, gradInput);
+ gradInput_data = THCTensor_(data)(state, gradInput);
+
+ // cuda blocks & threads:
+ int yblocks = (int)(16L / nInputPlane);
+ yblocks = yblocks < 1 ? 1 : yblocks;
+ dim3 blocks(nInputPlane*nbatch,yblocks);
+ dim3 threads(32,8);
+
+ // run updateGradInput kernel
+ if (kH <= dH && kW <= dW) {
+ subgradinput <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+ gradInput_data, gradOutput_data, weight_data,
+ nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
+ } else {
+ subgradinputAtomic <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+ gradInput_data, gradOutput_data, weight_data,
+ nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
+ }
+ THCudaCheck(cudaGetLastError());
+ }
+}
+
+void THNN_(SpatialSubSampling_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ int kW, int kH,
+ int dW, int dH,
+ float scale)
+{
+ THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, gradWeight, gradBias);
+
+ int nInputPlane = THCTensor_(size)(state, gradWeight, 0);
+
+ if (input->nDimension == 3) {
+ long nInputCols = input->size[2];
+ long nInputRows = input->size[1];
+
+ real *gradWeight_data = THCTensor_(data)(state, gradWeight);
+ real *gradBias_data = THCTensor_(data)(state, gradBias);
+ real *gradOutput_data = THCTensor_(data)(state, gradOutput);
+ real *input_data;
+
+ input = THCTensor_(newContiguous)(state, input);
+ input_data = THCTensor_(data)(state, input);
+
+ // cuda blocks & threads:
+ dim3 blocks(nInputPlane);
+ dim3 threads(32,8);
+
+ // run gradweight kernel
+ subgradweight<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+ input_data, gradOutput_data, gradWeight_data, gradBias_data,
+ nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale);
+ THCudaCheck(cudaGetLastError());
+ } else {
+ long nInputCols = input->size[3];
+ long nInputRows = input->size[2];
+ long nbatch = input->size[0];
+
+ real *gradWeight_data = THCTensor_(data)(state, gradWeight);
+ real *gradBias_data = THCTensor_(data)(state, gradBias);
+ real *gradOutput_data = THCTensor_(data)(state, gradOutput);
+ real *input_data;
+
+ input = THCTensor_(newContiguous)(state, input);
+ input_data = THCTensor_(data)(state, input);
+
+ // cuda blocks & threads:
+ dim3 blocks(nInputPlane);
+ dim3 threads(32,8);
+
+ // run gradweight kernel
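+ // One launch per batch sample; gradWeight and gradBias accumulate across launches.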
+ long sl;
+ for (sl=0; sl<nbatch; sl++) {
+ subgradweight<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (
+ input_data + sl*input->stride[0],
+ gradOutput_data + sl*gradOutput->stride[0],
+ gradWeight_data, gradBias_data,
+ nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW, scale);
+ }
+ THCudaCheck(cudaGetLastError());
+ }
+
+ // clean
+ THCTensor_(free)(state, input);
+
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialUpSamplingBilinear.cu b/lib/THCUNN/generic/SpatialUpSamplingBilinear.cu
new file mode 100644
index 0000000..e21f269
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialUpSamplingBilinear.cu
@@ -0,0 +1,72 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialUpSamplingBilinear.cu"
+#else
+
+void THNN_(SpatialUpSamplingBilinear_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ int outputHeight,
+ int outputWidth)
+{
+ input = THCTensor_(newContiguous)(state, input);
+ output = THCTensor_(newContiguous)(state, output);
+ THCUNN_assertSameGPU_generic(state, 2, input, output);
+ THCTensor_(zero)(state, output);
+ THCDeviceTensor<real, 4> idata = toDeviceTensor<real, 4>(state, input);
+ THCDeviceTensor<real, 4> odata = toDeviceTensor<real, 4>(state, output);
+ int height1 = idata.getSize(2);
+ int width1 = idata.getSize(3);
+ int height2 = odata.getSize(2);
+ int width2 = odata.getSize(3);
+ assert(height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0);
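+ // Scaling ratios map output coordinates back to input coordinates using
+ // (inputSize - 1) / (outputSize - 1), so corner pixels stay aligned;
+ // a 1-pixel output dimension degenerates to a ratio of 0.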
+ const accreal rheight= (height2 > 1) ? (accreal)(height1 - 1)/(height2 - 1) : accreal(0);
+ const accreal rwidth = (width2 > 1) ? (accreal)(width1 - 1)/(width2 - 1) : accreal(0);
+ const int num_kernels = height2 * width2;
+ const int num_threads =
+ THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock;
+ cudaStream_t stream = THCState_getCurrentStream(state);
+ caffe_gpu_interp2_kernel<real, accreal> <<<THCCeilDiv(num_kernels, num_threads), num_threads,
+ 0, stream>>>(num_kernels, rheight, rwidth, idata, odata);
+ THCudaCheck(cudaGetLastError());
+ THCTensor_(free)(state, input);
+ THCTensor_(free)(state, output);
+}
+
+void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
+ THCState *state,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ int nbatch,
+ int nchannels,
+ int inputHeight,
+ int inputWidth,
+ int outputHeight,
+ int outputWidth)
+{
+ gradInput = THCTensor_(newContiguous)(state, gradInput);
+ gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+ THCUNN_assertSameGPU_generic(state, 2, gradOutput, gradInput);
+ THCTensor_(zero)(state, gradInput);
+ THCDeviceTensor<real, 4> data1 = toDeviceTensor<real, 4>(state, gradInput);
+ THCDeviceTensor<real, 4> data2 = toDeviceTensor<real, 4>(state, gradOutput);
+ int height1 = data1.getSize(2);
+ int width1 = data1.getSize(3);
+ int height2 = data2.getSize(2);
+ int width2 = data2.getSize(3);
+ assert(height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0);
+ const accreal rheight= (height2 > 1) ? (accreal)(height1 - 1)/(height2 - 1) : accreal(0);
+ const accreal rwidth = (width2 > 1) ? (accreal)(width1 - 1) / (width2 - 1) : accreal(0);
+ const int num_kernels = height2 * width2;
+ const int num_threads =
+ THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock;
+ cudaStream_t stream = THCState_getCurrentStream(state);
+ caffe_gpu_interp2_kernel_backward<real, accreal> <<<THCCeilDiv(num_kernels, num_threads),
+ num_threads, 0, stream>>>(num_kernels, rheight, rwidth, data1, data2);
+ THCudaCheck(cudaGetLastError());
+ THCTensor_(free)(state, gradInput);
+ THCTensor_(free)(state, gradOutput);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/SpatialUpSamplingNearest.cu b/lib/THCUNN/generic/SpatialUpSamplingNearest.cu
new file mode 100644
index 0000000..5bc4ff9
--- /dev/null
+++ b/lib/THCUNN/generic/SpatialUpSamplingNearest.cu
@@ -0,0 +1,115 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/SpatialUpSamplingNearest.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(SpatialUpSamplingNearest_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ int scale_factor)
+{
+ THCTensor_(zero)(state, output);
+
+ THCUNN_assertSameGPU_generic(state, 2, input, output);
+
+ input = THCTensor_(newContiguous)(state, input);
+ // Total number of output elements (input elements times scale_factor^2), used to size the launch
+ long no_elements = 1;
+ for(int i = 0; i < input->nDimension; i++){
+ no_elements *= input->size[i];
+ }
+ no_elements *= scale_factor * scale_factor;
+
+ int d1;
+ int d2;
+ int d3;
+
+ if (input->nDimension == 3) {
+ d1 = output->size[0];
+ d2 = output->size[1];
+ d3 = output->size[2];
+ } else {
+ d1 = output->size[1];
+ d2 = output->size[2];
+ d3 = output->size[3];
+ }
+
+ real *input_data = THCTensor_(data)(state, input);
+ real *output_data = THCTensor_(data)(state, output);
+
+ // cuda blocks & threads:
+ long nthreads = 256;
+ // Max number of blocks: http://en.wikipedia.org/wiki/CUDA
+ // 65535 for SM 2.x, 2^32 -1 for >= 3.0
+ // TODO: When we move to SM 3.5 we should update this
+ long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
+ long n_yblocks = (long)ceil((float)no_elements / (float)(n_xblocks * nthreads));
+ if (n_yblocks > 65535) {
+ THError("Input size is too large! aborting");
+ }
+ dim3 blocks(n_xblocks, n_yblocks);
+ dim3 threads(nthreads);
+
+ // kernel:
+ upscale<<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (input_data, output_data, no_elements, scale_factor, d1, d2, d3);
+ THCudaCheck(cudaGetLastError());
+
+ // final cut:
+ THCTensor_(free)(state, input);
+}
+
+void THNN_(SpatialUpSamplingNearest_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ int scale_factor)
+{
+ THCUNN_assertSameGPU_generic(state, 2, gradOutput, gradInput);
+
+ THCTensor_(zero)(state, gradInput);
+
+ real *gradInput_data = THCTensor_(data)(state, gradInput);
+ real *gradOutput_data = THCTensor_(data)(state, gradOutput);
+
+ long no_elements = 1;
+ for(int i = 0; i < gradInput->nDimension; i++){
+ no_elements *= gradInput->size[i];
+ }
+
+ int d1;
+ int d2;
+ int d3;
+
+ if (gradInput->nDimension == 3) {
+ d1 = gradInput->size[0];
+ d2 = gradInput->size[1];
+ d3 = gradInput->size[2];
+ } else {
+ d1 = gradInput->size[1];
+ d2 = gradInput->size[2];
+ d3 = gradInput->size[3];
+ }
+
+ // cuda blocks & threads:
+ long nthreads = 256;
+ // Max number of blocks: http://en.wikipedia.org/wiki/CUDA
+ // 65535 for SM 2.x, 2^32 -1 for >= 3.0
+ // TODO: When we move to SM 3.5 we should update this
+ long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
+ long n_yblocks = (long)ceil((float)no_elements / (float)(n_xblocks * nthreads));
+ if (n_yblocks > 65535) {
+ THError("Input size is too large! aborting");
+ }
+ dim3 blocks(n_xblocks, n_yblocks);
+ dim3 threads(nthreads);
+
+ // kernel:
+ downscale<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data, no_elements,
+ scale_factor, d1, d2, d3);
+ THCudaCheck(cudaGetLastError());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/Sqrt.cu b/lib/THCUNN/generic/Sqrt.cu
new file mode 100644
index 0000000..a49798f
--- /dev/null
+++ b/lib/THCUNN/generic/Sqrt.cu
@@ -0,0 +1,30 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/Sqrt.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(Sqrt_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ real eps)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, output);
+ THCTensor_(resizeAs)(state, output, input);
+ THC_pointwiseApply2(state, output, input, sqrtupdateOutput_functor<real>(eps));
+}
+
+void THNN_(Sqrt_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *output)
+{
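+ // The gradient of sqrt is expressed through the saved forward output
+ // (d/dx sqrt(x) = 1 / (2 * sqrt(x))), so gradInput is sized and computed from output rather than input.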
+ THCUNN_assertSameGPU_generic(state, 3, output, gradOutput, gradInput);
+ THCTensor_(resizeAs)(state, gradInput, output);
+ THC_pointwiseApply3(state, gradInput, output, gradOutput, sqrtupdateGradInput_functor<real>());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/Square.cu b/lib/THCUNN/generic/Square.cu
new file mode 100644
index 0000000..06018d7
--- /dev/null
+++ b/lib/THCUNN/generic/Square.cu
@@ -0,0 +1,28 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/Square.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(Square_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, output);
+ THCTensor_(resizeAs)(state, output, input);
+ THC_pointwiseApply2(state, output, input, squareupdateOutput_functor<real>());
+}
+
+void THNN_(Square_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput)
+{
+ THCUNN_assertSameGPU_generic(state, 3, input, gradOutput, gradInput);
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THC_pointwiseApply3(state, gradInput, input, gradOutput, squareupdateGradInput_functor<real>());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/THCUNN.h b/lib/THCUNN/generic/THCUNN.h
new file mode 100644
index 0000000..91d68ae
--- /dev/null
+++ b/lib/THCUNN/generic/THCUNN.h
@@ -0,0 +1,1171 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCUNN.h"
+#else
+
+TH_API void THNN_(Abs_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output);
+
+TH_API void THNN_(Abs_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput);
+
+TH_API void THNN_(AbsCriterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *output,
+ bool sizeAverage);
+
+TH_API void THNN_(AbsCriterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *gradInput,
+ bool sizeAverage);
+
+TH_API void THNN_(BatchNormalization_updateOutput)(
+ THCState *state,
+ THCTensor *input_,
+ THCTensor *output_,
+ THCTensor *weight_, // [OPTIONAL]
+ THCTensor *bias_, // [OPTIONAL]
+ THCTensor *runningMean_,
+ THCTensor *runningVar_,
+ THCTensor *saveMean_,
+ THCTensor *saveStd_,
+ bool train,
+ double momentum,
+ double eps);
+
+TH_API void THNN_(BatchNormalization_backward)(
+ THCState *state,
+ THCTensor *input_,
+ THCTensor *gradOutput_,
+ THCTensor *gradInput_, // [OPTIONAL]
+ THCTensor *gradWeight_, // [OPTIONAL]
+ THCTensor *gradBias_, // [OPTIONAL]
+ THCTensor *weight_, // [OPTIONAL]
+ THCTensor *runningMean_,
+ THCTensor *runningVar_,
+ THCTensor *saveMean_,
+ THCTensor *saveStd_,
+ bool train,
+ float scale,
+ double eps);
+
+TH_API void THNN_(BCECriterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *output,
+ bool sizeAverage,
+ THCTensor *weights); // [OPTIONAL]
+
+TH_API void THNN_(BCECriterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *gradInput,
+ bool sizeAverage,
+ THCTensor *weights); // [OPTIONAL]
+
+TH_API void THNN_(ClassNLLCriterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCIndexTensor *target,
+ THCTensor *output,
+ bool sizeAverage,
+ THCTensor *weights, // [OPTIONAL]
+ THCTensor *total_weight);
+
+TH_API void THNN_(ClassNLLCriterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCIndexTensor *target,
+ THCTensor *gradInput,
+ bool sizeAverage,
+ THCTensor *weights, // [OPTIONAL]
+ THCTensor *total_weight);
+
+TH_API void THNN_(DistKLDivCriterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *output,
+ bool sizeAverage);
+
+TH_API void THNN_(DistKLDivCriterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *gradInput,
+ bool sizeAverage);
+
+TH_API void THNN_(ELU_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ real alpha,
+ bool inplace);
+
+TH_API void THNN_(ELU_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *output,
+ real alpha,
+ bool inplace);
+
+TH_API void THNN_(HardTanh_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ real min_val,
+ real max_val,
+ bool inplace);
+
+TH_API void THNN_(HardTanh_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ real min_val,
+ real max_val,
+ bool inplace);
+
+TH_API void THNN_(LeakyReLU_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ real negval,
+ bool inplace);
+
+TH_API void THNN_(LeakyReLU_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ real negval,
+ bool inplace);
+
+TH_API void THNN_(LogSigmoid_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *buffer);
+
+TH_API void THNN_(LogSigmoid_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *buffer);
+
+TH_API void THNN_(LogSoftMax_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output);
+
+TH_API void THNN_(LogSoftMax_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *output);
+
+TH_API void THNN_(LookupTable_accGradParameters)(
+ THCState *state,
+ THCIndexTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCIndexTensor *count,
+ THCIndexTensor *sorted, // [OPTIONAL]
+ THCIndexTensor *indices, // [OPTIONAL]
+ bool scaleGradByFreq,
+ int paddingValue,
+ real scale);
+
+TH_API void THNN_(LookupTable_renorm)(
+ THCState *state,
+ THCIndexTensor *idx,
+ THCTensor *weight,
+ real maxNorm,
+ real normType);
+
+TH_API void THNN_(L1Cost_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output);
+
+TH_API void THNN_(L1Cost_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput, // [OPTIONAL]
+ THCTensor *gradInput);
+
+TH_API void THNN_(MarginCriterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *output,
+ bool sizeAverage,
+ real margin);
+
+TH_API void THNN_(MarginCriterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *gradInput,
+ bool sizeAverage,
+ real margin);
+
+TH_API void THNN_(MSECriterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *output,
+ bool sizeAverage);
+
+TH_API void THNN_(MSECriterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *gradInput,
+ bool sizeAverage);
+
+TH_API void THNN_(MultiLabelMarginCriterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCIndexTensor *target,
+ THCTensor *output,
+ THCTensor *istarget,
+ bool sizeaverage);
+
+TH_API void THNN_(MultiLabelMarginCriterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCIndexTensor *target,
+ THCTensor *gradInput,
+ THCTensor *istarget,
+ bool sizeaverage);
+
+TH_API void THNN_(MultiMarginCriterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCIndexTensor *target,
+ THCTensor *output,
+ bool sizeAverage,
+ int p,
+ THCTensor *weights, // [OPTIONAL]
+ real margin);
+
+TH_API void THNN_(MultiMarginCriterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCIndexTensor *target,
+ THCTensor *gradInput,
+ bool sizeAverage,
+ int p,
+ THCTensor *weights, // [OPTIONAL]
+ real margin);
+
+TH_API void THNN_(PReLU_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ long nOutputPlane);
+
+TH_API void THNN_(PReLU_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ long nOutputPlane);
+
+TH_API void THNN_(PReLU_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ THCTensor *gradWeight,
+ THCTensor *gradWeightBuf,
+ THCTensor *gradWeightBuf2,
+ long nOutputPlane,
+ real scale);
+
+TH_API void THNN_(SmoothL1Criterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *output,
+ bool sizeAverage);
+
+TH_API void THNN_(SmoothL1Criterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *gradInput,
+ bool sizeAverage);
+
+TH_API void THNN_(SparseLinear_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias);
+
+TH_API void THNN_(SparseLinear_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ THCTensor *weight,
+ THCTensor *bias,
+ double weightDecay,
+ double scale);
+
+TH_API void THNN_(SparseLinear_legacyUpdateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias);
+
+TH_API void THNN_(SparseLinear_legacyAccGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ THCTensor *weight,
+ THCTensor *bias,
+ double weightDecay,
+ double scale);
+
+TH_API void THNN_(SparseLinear_zeroGradParameters)(
+ THCState *state,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ THCTensor *lastInput);
+
+TH_API void THNN_(SparseLinear_updateParameters)(
+ THCState *state,
+ THCTensor *weight,
+ THCTensor *bias,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ THCTensor *lastInput,
+ double learningRate);
+
+TH_API void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCIndexTensor *indices,
+ int nOutputCols,
+ int nOutputRows);
+
+TH_API void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCIndexTensor *indices);
+
+TH_API void THNN_(SpatialAveragePooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ bool ceil_mode,
+ bool count_include_pad);
+
+TH_API void THNN_(SpatialAveragePooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ bool ceil_mode,
+ bool count_include_pad);
+
+TH_API void THNN_(SpatialClassNLLCriterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCIndexTensor *target,
+ THCTensor *output,
+ bool sizeAverage,
+ THCTensor *weights, // [OPTIONAL]
+ THCTensor *total_weight);
+
+TH_API void THNN_(SpatialClassNLLCriterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCIndexTensor *target,
+ THCTensor *gradInput,
+ bool sizeAverage,
+ THCTensor *weights, // [OPTIONAL]
+ THCTensor *total_weight);
+
+TH_API void THNN_(SpatialConvolutionLocal_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias,
+ THCTensor *finput,
+ THCTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight);
+
+TH_API void THNN_(SpatialConvolutionLocal_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ THCTensor *finput,
+ THCTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight);
+
+TH_API void THNN_(SpatialConvolutionLocal_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ THCTensor *finput,
+ THCTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight,
+ real scale);
+
+TH_API void THNN_(SpatialConvolutionMM_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias,
+ THCTensor *columns,
+ THCTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH);
+
+TH_API void THNN_(SpatialConvolutionMM_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ THCTensor *gradColumns,
+ THCTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH);
+
+TH_API void THNN_(SpatialConvolutionMM_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ THCTensor *columns,
+ THCTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ real scale);
+
+TH_API void THNN_(SpatialCrossMapLRN_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *scale,
+ int size,
+ real alpha,
+ real beta,
+ real k);
+
+TH_API void THNN_(SpatialCrossMapLRN_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *scale,
+ THCTensor *output,
+ int size,
+ real alpha,
+ real beta,
+ real k);
+
+TH_API void THNN_(SpatialDilatedConvolution_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias, // [OPTIONAL]
+ THCTensor *columns,
+ THCTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH);
+
+TH_API void THNN_(SpatialDilatedConvolution_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ THCTensor *gradColumns,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH);
+
+TH_API void THNN_(SpatialDilatedConvolution_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias, // [OPTIONAL]
+ THCTensor *columns,
+ THCTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH,
+ real scale);
+
+TH_API void THNN_(SpatialDilatedMaxPooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCIndexTensor *indices,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH,
+ bool ceil_mode);
+
+TH_API void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCIndexTensor *indices,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH,
+ bool ceil_mode);
+
+TH_API void THNN_(SpatialFractionalMaxPooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ int outputW, int outputH,
+ int poolSizeW, int poolSizeH,
+ THCIndexTensor *indices,
+ THCTensor *randomSamples);
+
+TH_API void THNN_(SpatialFractionalMaxPooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ int outputW, int outputH,
+ int poolSizeW, int poolSizeH,
+ THCIndexTensor *indices);
+
+TH_API void THNN_(SpatialFullConvolution_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias, // [OPTIONAL]
+ THCTensor *columns,
+ THCTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH);
+
+TH_API void THNN_(SpatialFullConvolution_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ THCTensor *gradColumns,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH);
+
+TH_API void THNN_(SpatialFullConvolution_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias, // [OPTIONAL]
+ THCTensor *columns,
+ THCTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH,
+ real scale);
+
+TH_API void THNN_(SpatialMaxPooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCIndexTensor *indices,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ bool ceil_mode);
+
+TH_API void THNN_(SpatialMaxPooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCIndexTensor *indices,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ bool ceil_mode);
+
+TH_API void THNN_(SpatialMaxUnpooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCIndexTensor *indices,
+ int owidth, int oheight);
+
+TH_API void THNN_(SpatialMaxUnpooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCIndexTensor *indices,
+ int owidth, int oheight);
+
+TH_API void THNN_(SpatialReflectionPadding_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ int padL, int padR,
+ int padT, int padB);
+
+TH_API void THNN_(SpatialReflectionPadding_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ int padL, int padR,
+ int padT, int padB);
+
+TH_API void THNN_(SpatialReplicationPadding_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ int padL, int padR,
+ int padT, int padB);
+
+TH_API void THNN_(SpatialReplicationPadding_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ int padL, int padR,
+ int padT, int padB);
+
+TH_API void THNN_(SpatialSubSampling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias,
+ int kW, int kH,
+ int dW, int dH);
+
+TH_API void THNN_(SpatialSubSampling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ int kW, int kH,
+ int dW, int dH);
+
+TH_API void THNN_(SpatialSubSampling_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ int kW, int kH,
+ int dW, int dH,
+ float scale);
+
+TH_API void THNN_(SpatialUpSamplingBilinear_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ int outputHeight,
+ int outputWidth);
+
+TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
+ THCState *state,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ int nbatch,
+ int nchannels,
+ int inputHeight,
+ int inputWidth,
+ int outputHeight,
+ int outputWidth);
+
+TH_API void THNN_(SpatialUpSamplingNearest_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ int scale_factor);
+
+TH_API void THNN_(SpatialUpSamplingNearest_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ int scale_factor);
+
+TH_API void THNN_(RReLU_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *noise,
+ double lower,
+ double upper,
+ bool train,
+ bool inplace,
+ void *generator);
+
+TH_API void THNN_(RReLU_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *noise,
+ double lower,
+ double upper,
+ bool train,
+ bool inplace);
+
+TH_API void THNN_(Sigmoid_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output);
+
+TH_API void THNN_(Sigmoid_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *output);
+
+TH_API void THNN_(SoftMarginCriterion_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *output,
+ int sizeAverage);
+
+TH_API void THNN_(SoftMarginCriterion_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *target,
+ THCTensor *gradInput,
+ int sizeAverage);
+
+TH_API void THNN_(SoftMax_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output);
+
+TH_API void THNN_(SoftMax_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *output);
+
+TH_API void THNN_(SoftPlus_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ real beta,
+ real threshold);
+
+TH_API void THNN_(SoftPlus_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *output,
+ real beta,
+ real threshold);
+
+TH_API void THNN_(SoftShrink_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ real lambda);
+
+TH_API void THNN_(SoftShrink_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ real lambda);
+
+TH_API void THNN_(Square_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output);
+
+TH_API void THNN_(Square_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput);
+
+TH_API void THNN_(Sqrt_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ real eps);
+
+TH_API void THNN_(Sqrt_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *output);
+
+TH_API void THNN_(Tanh_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output);
+
+TH_API void THNN_(Tanh_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *output);
+
+TH_API void THNN_(TemporalConvolution_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias,
+ int kW, int dW,
+ int inputFrameSize,
+ int outputFrameSize);
+
+TH_API void THNN_(TemporalConvolution_updateGradInput)(
+ THCState* state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ int kW, int dW);
+
+TH_API void THNN_(TemporalConvolution_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ int kW, int dW,
+ real scale);
+
+TH_API void THNN_(TemporalMaxPooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCIndexTensor *indices,
+ int kW, int dW);
+
+TH_API void THNN_(TemporalMaxPooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCIndexTensor *indices,
+ int kW, int dW);
+
+TH_API void THNN_(Threshold_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ real threshold,
+ real val,
+ bool inplace);
+
+TH_API void THNN_(Threshold_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ real threshold,
+ real val,
+ bool inplace);
+
+TH_API void THNN_(VolumetricAveragePooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH);
+
+TH_API void THNN_(VolumetricAveragePooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH);
+
+TH_API void THNN_(VolumetricReplicationPadding_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback);
+
+TH_API void THNN_(VolumetricConvolution_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias,
+ THCTensor *finput,
+ THCTensor *fgradInput,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH);
+
+TH_API void THNN_(VolumetricConvolution_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ THCTensor *finput,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH);
+
+TH_API void THNN_(VolumetricConvolution_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ THCTensor *finput,
+ THCTensor *fgradInput,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ real scale);
+
+TH_API void THNN_(VolumetricDilatedConvolution_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias,
+ THCTensor *columns,
+ THCTensor *ones,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH);
+
+TH_API void THNN_(VolumetricDilatedConvolution_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ THCTensor *gradColumns,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH);
+
+TH_API void THNN_(VolumetricDilatedConvolution_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ THCTensor *columns,
+ THCTensor *ones,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH,
+ real scale);
+
+TH_API void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCIndexTensor *indices,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH,
+ bool ceilMode);
+
+TH_API void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCIndexTensor *indices,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH);
+
+TH_API void THNN_(VolumetricFullConvolution_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias,
+ THCTensor *finput,
+ THCTensor *fgradInput,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int adjT, int adjW, int adjH);
+
+TH_API void THNN_(VolumetricFullConvolution_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ THCTensor *finput,
+ THCTensor *fgradInput,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int adjT, int adjW, int adjH);
+
+TH_API void THNN_(VolumetricFullConvolution_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ THCTensor *finput,
+ THCTensor *fgradInput,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int adjT, int adjW, int adjH,
+ real scale);
+
+TH_API void THNN_(VolumetricMaxPooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCIndexTensor *indices,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ bool ceilMode);
+
+TH_API void THNN_(VolumetricMaxPooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCIndexTensor *indices,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH);
+
+TH_API void THNN_(VolumetricMaxUnpooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCIndexTensor *indices,
+ int outputTime, int outputWidth, int outputHeight,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH);
+
+TH_API void THNN_(VolumetricMaxUnpooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCIndexTensor *indices,
+ int outputTime, int outputWidth, int outputHeight,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH);
+
+TH_API void THNN_(VolumetricReplicationPadding_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback);
+
+TH_API void THNN_(VolumetricReplicationPadding_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback);
+
+#endif
diff --git a/lib/THCUNN/generic/Tanh.cu b/lib/THCUNN/generic/Tanh.cu
new file mode 100644
index 0000000..3db4ce0
--- /dev/null
+++ b/lib/THCUNN/generic/Tanh.cu
@@ -0,0 +1,29 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/Tanh.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(Tanh_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, output);
+ THCTensor_(resizeAs)(state, output, input);
+ THC_pointwiseApply2(state, output, input, tanhupdateOutput_functor<real>());
+}
+
+void THNN_(Tanh_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *output)
+{
+ THCUNN_assertSameGPU_generic(state, 3, output, gradOutput, gradInput);
+ THCTensor_(resizeAs)(state, gradInput, output);
+ THC_pointwiseApply3(state, gradInput, output, gradOutput, tanhupdateGradInput_functor<real>());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/TemporalConvolution.cu b/lib/THCUNN/generic/TemporalConvolution.cu
new file mode 100644
index 0000000..f074cb7
--- /dev/null
+++ b/lib/THCUNN/generic/TemporalConvolution.cu
@@ -0,0 +1,348 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/TemporalConvolution.cu"
+#else
+
+void THNN_(TemporalConvolution_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias,
+ int kW, int dW,
+ int inputFrameSize,
+ int outputFrameSize) {
+
+ THCTensor *outputWindow, *inputWindow;
+ int nInputFrame, nOutputFrame;
+ long k, i;
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+
+ THCUNN_assertSameGPU_generic(state, 4, input, output, weight, bias);
+ THArgCheck( input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");
+
+ if (input->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+ THArgCheck( input->size[dimF] == inputFrameSize, 2, "invalid input frame size");
+ THArgCheck( input->size[dimS] >= kW, 2, "input sequence smaller than kernel size");
+
+ input = THCTensor_(newContiguous)(state, input);
+ outputWindow = THCTensor_(new)(state);
+ inputWindow = THCTensor_(new)(state);
+
+ nInputFrame = input->size[dimS];
+ nOutputFrame = (nInputFrame - kW) / dW + 1;
+
+ if (input->nDimension == 2)
+ {
+ THCTensor_(resize2d)(state, output,
+ nOutputFrame,
+ outputFrameSize);
+
+ /* bias first */
+ for(k = 0; k < nOutputFrame; k++)
+ {
+ THCTensor_(select)(state, outputWindow, output, 0, k);
+ THCTensor_(copy)(state, outputWindow, bias);
+ }
+
+ /* ouch */
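+ /* Each pass k processes the output frames k, k + outputFrameStride, ...; their
+ kW-wide input windows do not overlap, so one addmm on strided storage views
+ handles them all against the transposed weight. */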
+ for(k = 0; nOutputFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputFrame -= nFrame;
+
+ THCTensor_(setStorage2d)(state, inputWindow, input->storage,
+ input->storageOffset+k*dW*input->size[1],
+ nFrame, inputFrameStride*input->size[1],
+ kW*input->size[1], 1);
+
+ THCTensor_(setStorage2d)(state, outputWindow, output->storage,
+ output->storageOffset + k*output->size[1],
+ nFrame, outputFrameStride*output->size[1],
+ output->size[1], 1);
+
+ THCTensor_(transpose)(state, weight, NULL, 0, 1);
+ THCTensor_(addmm)(state, outputWindow, ScalarConvert<int, real>::to(1), outputWindow, ScalarConvert<int, real>::to(1), inputWindow, weight);
+ THCTensor_(transpose)(state, weight, NULL, 0, 1);
+ }
+ }
+ else
+ {
+ THCTensor *outputSample = THCTensor_(new)(state);
+ THCTensor *inputSample = THCTensor_(new)(state);
+ int nBatchFrame = input->size[0];
+
+ THCTensor_(resize3d)(state, output,
+ nBatchFrame,
+ nOutputFrame,
+ outputFrameSize);
+
+ for(i = 0; i < nBatchFrame; i++)
+ {
+ THCTensor_(select)(state, outputSample, output, 0, i);
+ THCTensor_(select)(state, inputSample, input, 0, i);
+ long nOutputSampleFrame = nOutputFrame;
+
+ /* bias first */
+ for(k = 0; k < nOutputFrame; k++)
+ {
+ THCTensor_(select)(state, outputWindow, outputSample, 0, k);
+ THCTensor_(copy)(state, outputWindow, bias);
+ }
+
+ /* ouch */
+ for(k = 0; nOutputSampleFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputSampleFrame -= nFrame;
+
+ THCTensor_(setStorage2d)(state, inputWindow, inputSample->storage,
+ inputSample->storageOffset+k*dW*inputSample->size[1],
+ nFrame, inputFrameStride*inputSample->size[1],
+ kW*inputSample->size[1], 1);
+
+ THCTensor_(setStorage2d)(state, outputWindow, outputSample->storage,
+ outputSample->storageOffset + k*outputSample->size[1],
+ nFrame, outputFrameStride*outputSample->size[1],
+ outputSample->size[1], 1);
+
+ THCTensor_(transpose)(state, weight, NULL, 0, 1);
+ THCTensor_(addmm)(state, outputWindow, ScalarConvert<int, real>::to(1), outputWindow, ScalarConvert<int, real>::to(1), inputWindow, weight);
+ THCTensor_(transpose)(state, weight, NULL, 0, 1);
+ }
+ }
+ THCTensor_(free)(state, outputSample);
+ THCTensor_(free)(state, inputSample);
+ }
+
+ THCTensor_(free)(state, outputWindow);
+ THCTensor_(free)(state, inputWindow);
+ THCTensor_(free)(state, input);
+
+}
+
+void THNN_(TemporalConvolution_updateGradInput)(
+ THCState* state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ int kW, int dW) {
+
+ long nInputFrame;
+ long nOutputFrame;
+
+ THCTensor *gradOutputWindow;
+ THCTensor *gradInputWindow;
+ long k, i;
+
+ int dimS = 0; // sequence dimension
+
+ THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, weight, gradInput);
+
+ if (gradOutput->nDimension == 3)
+ {
+ dimS = 1;
+ }
+
+ nInputFrame = input->size[dimS];
+ nOutputFrame = gradOutput->size[dimS];
+
+ /* Not necessary with partial backprop: */
+ gradOutputWindow = THCTensor_(new)(state);
+ gradInputWindow = THCTensor_(new)(state);
+
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THCTensor_(zero)(state, gradInput);
+
+ if (gradOutput->nDimension == 2)
+ {
+ /* ouch */
+ for(k = 0; nOutputFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputFrame -= nFrame;
+
+ THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutput->storage,
+ gradOutput->storageOffset + k*gradOutput->size[1],
+ nFrame, outputFrameStride*gradOutput->size[1],
+ gradOutput->size[1], 1);
+
+ THCTensor_(setStorage2d)(state, gradInputWindow, gradInput->storage,
+ gradInput->storageOffset+k*dW*gradInput->size[1],
+ nFrame, inputFrameStride*gradInput->size[1],
+ kW*gradInput->size[1], 1);
+
+ THCTensor_(addmm)(state, gradInputWindow, ScalarConvert<int, real>::to(1), gradInputWindow, ScalarConvert<int, real>::to(1), gradOutputWindow, weight);
+ }
+ }
+ else
+ {
+ THCTensor *gradOutputSample = THCTensor_(new)(state);
+ THCTensor *gradInputSample = THCTensor_(new)(state);
+ long nBatchFrame = input->size[0];
+ for(i = 0; i < nBatchFrame; i++)
+ {
+ THCTensor_(select)(state, gradOutputSample, gradOutput, 0, i);
+ THCTensor_(select)(state, gradInputSample, gradInput, 0, i);
+ long nOutputSampleFrame = nOutputFrame;
+
+ /* ouch */
+ for(k = 0; nOutputSampleFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputSampleFrame -= nFrame;
+
+ THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutputSample->storage,
+ gradOutputSample->storageOffset + k*gradOutputSample->size[1],
+ nFrame, outputFrameStride*gradOutputSample->size[1],
+ gradOutputSample->size[1], 1);
+
+ THCTensor_(setStorage2d)(state, gradInputWindow, gradInputSample->storage,
+ gradInputSample->storageOffset+k*dW*gradInputSample->size[1],
+ nFrame, inputFrameStride*gradInputSample->size[1],
+ kW*gradInputSample->size[1], 1);
+
+ THCTensor_(addmm)(state, gradInputWindow, ScalarConvert<int, real>::to(1), gradInputWindow, ScalarConvert<int, real>::to(1), gradOutputWindow, weight);
+ }
+ }
+ THCTensor_(free)(state, gradOutputSample);
+ THCTensor_(free)(state, gradInputSample);
+ }
+
+ THCTensor_(free)(state, gradOutputWindow);
+ THCTensor_(free)(state, gradInputWindow);
+
+}
+
+void THNN_(TemporalConvolution_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ int kW, int dW,
+ real scale) {
+
+ long nInputFrame;
+ long nOutputFrame;
+
+ THCTensor *gradOutputWindow;
+ THCTensor *inputWindow;
+ long k, i;
+
+ int dimS = 0; // sequence dimension
+
+ if (gradOutput->nDimension == 3)
+ {
+ dimS = 1;
+ }
+
+ nInputFrame = input->size[dimS];
+ nOutputFrame = gradOutput->size[dimS];
+
+ /* Not necessary with partial backprop: */
+ input = THCTensor_(newContiguous)(state, input);
+ gradOutputWindow = THCTensor_(new)(state);
+ inputWindow = THCTensor_(new)(state);
+
+ if (input->nDimension == 2)
+ {
+ /* bias first */
+ for(k = 0; k < nOutputFrame; k++)
+ {
+ THCTensor_(select)(state, gradOutputWindow, gradOutput, 0, k);
+ THCTensor_(cadd)(state, gradBias, gradBias, scale, gradOutputWindow);
+ }
+
+ /* ouch */
+ for(k = 0; nOutputFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputFrame -= nFrame;
+
+ THCTensor_(setStorage2d)(state, inputWindow, input->storage,
+ input->storageOffset+k*dW*input->size[1],
+ nFrame, inputFrameStride*input->size[1],
+ kW*input->size[1], 1);
+
+ THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutput->storage,
+ gradOutput->storageOffset + k*gradOutput->size[1],
+ nFrame, outputFrameStride*gradOutput->size[1],
+ gradOutput->size[1], 1);
+
+ THCTensor_(transpose)(state, gradOutputWindow, NULL, 0, 1);
+ THCTensor_(addmm)(state, gradWeight, ScalarConvert<int, real>::to(1), gradWeight, scale, gradOutputWindow, inputWindow);
+ THCTensor_(transpose)(state, gradOutputWindow, NULL, 0, 1);
+ }
+ }
+ else
+ {
+ THCTensor *gradOutputSample = THCTensor_(new)(state);
+ THCTensor *inputSample = THCTensor_(new)(state);
+ long nBatchFrame = input->size[0];
+
+ for(i = 0; i < nBatchFrame; i++)
+ {
+ THCTensor_(select)(state, gradOutputSample, gradOutput, 0, i);
+ THCTensor_(select)(state, inputSample, input, 0, i);
+ long nOutputSampleFrame = nOutputFrame;
+
+ /* bias first */
+ for(k = 0; k < nOutputFrame; k++)
+ {
+ THCTensor_(select)(state, gradOutputWindow, gradOutputSample, 0, k);
+ THCTensor_(cadd)(state, gradBias, gradBias, scale, gradOutputWindow);
+ }
+
+ /* ouch */
+ for(k = 0; nOutputSampleFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputSampleFrame -= nFrame;
+
+ THCTensor_(setStorage2d)(state, inputWindow, inputSample->storage,
+ inputSample->storageOffset+k*dW*inputSample->size[1],
+ nFrame, inputFrameStride*inputSample->size[1],
+ kW*inputSample->size[1], 1);
+
+ THCTensor_(setStorage2d)(state, gradOutputWindow, gradOutputSample->storage,
+ gradOutputSample->storageOffset + k*gradOutputSample->size[1],
+ nFrame, outputFrameStride*gradOutputSample->size[1],
+ gradOutputSample->size[1], 1);
+
+ THCTensor_(transpose)(state, gradOutputWindow, NULL, 0, 1);
+ THCTensor_(addmm)(state, gradWeight, ScalarConvert<int, real>::to(1), gradWeight, scale, gradOutputWindow, inputWindow);
+ THCTensor_(transpose)(state, gradOutputWindow, NULL, 0, 1);
+ }
+ }
+ THCTensor_(free)(state, gradOutputSample);
+ THCTensor_(free)(state, inputSample);
+ }
+
+ THCTensor_(free)(state, gradOutputWindow);
+ THCTensor_(free)(state, inputWindow);
+ THCTensor_(free)(state, input);
+
+}
+
+#endif
diff --git a/lib/THCUNN/generic/TemporalMaxPooling.cu b/lib/THCUNN/generic/TemporalMaxPooling.cu
new file mode 100644
index 0000000..d833c5a
--- /dev/null
+++ b/lib/THCUNN/generic/TemporalMaxPooling.cu
@@ -0,0 +1,151 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/TemporalMaxPooling.cu"
+#else
+
+void THNN_(TemporalMaxPooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCIndexTensor *indices,
+ int kW, int dW) {
+
+ int dimT = 0; // Temporal dimension
+ int dimF = 1; // Feature dimension
+
+ int batch = 1;
+ int input_w;
+ int input_n;
+ int output_w;
+ int nthreads;
+
+ real *input_data;
+ real *output_data;
+ THCIndex_t *indices_data;
+
+ THCUNN_assertSameGPU_generic(state, 3, input, output, indices);
+ THArgCheck( input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");
+
+ if (input->nDimension == 3)
+ {
+ dimT = 1;
+ dimF = 2;
+ batch = input->size[0];
+ }
+ THArgCheck( input->size[dimT] >= kW, 2, "input sequence smaller than kernel size");
+
+ input = THCTensor_(newContiguous)(state, input);
+
+ input_w = input->size[dimT];
+ input_n = input->size[dimF];
+ output_w = (input_w - kW) / dW + 1;
+
+ if (input->nDimension == 2)
+ {
+ THCTensor_(resize2d)(state, output, output_w, input->size[dimF]);
+ THCIndexTensor_(resize2d)(state, indices, output_w, input->size[dimF]);
+ }
+ else
+ {
+ THCTensor_(resize3d)(state, output, batch, output_w, input->size[dimF]);
+ THCIndexTensor_(resize3d)(state, indices, batch, output_w, input->size[dimF]);
+ }
+
+ input_data = THCTensor_(data)(state, input);
+ output_data = THCTensor_(data)(state, output);
+ indices_data = THCIndexTensor_(data)(state, indices);
+
+ dim3 blocks(batch);
+ nthreads = (output_w / 32) * 32;
+ if (output_w % 32 > 0) {
+ nthreads += 32;
+ }
+
+ if (nthreads > TEMPORAL_MAX_POOLING_THREADS) {
+ blocks.y = nthreads / TEMPORAL_MAX_POOLING_THREADS;
+ if (nthreads % TEMPORAL_MAX_POOLING_THREADS > 0) {
+ blocks.y += 1;
+ }
+ nthreads = TEMPORAL_MAX_POOLING_THREADS;
+ }
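+ // Threads cover output_w rounded up to a full warp (32) and are capped at
+ // TEMPORAL_MAX_POOLING_THREADS; any excess work is spread over grid.y.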
+
+ dim3 threads(nthreads);
+ cunn_TemporalMaxPooling_updateOutputKernel <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>(
+ input_data, output_data, indices_data, input_w, input_n, output_w, kW, dW);
+ THCudaCheck(cudaGetLastError());
+ THCTensor_(free)(state, input);
+
+}
+
+void THNN_(TemporalMaxPooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCIndexTensor *indices,
+ int kW, int dW) {
+
+ int dimT = 0; // Temporal dimension
+ int dimF = 1; // Feature dimension
+
+ int batch = 1;
+ int input_w;
+ int input_n;
+ int output_w;
+ int nthreads;
+
+ real *gradInput_data;
+ real *gradOutput_data;
+ THCIndex_t *indices_data;
+
+ THCUNN_assertSameGPU_generic(state, 4, input, gradOutput, gradInput, indices);
+ THArgCheck( input->nDimension == 2 || input->nDimension == 3, 2, "2D or 3D(batch mode) tensor expected");
+
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THCTensor_(zero)(state, gradInput);
+
+ if (input->nDimension == 3)
+ {
+ dimT = 1;
+ dimF = 2;
+ batch = input->size[0];
+ }
+ THArgCheck( input->size[dimT] >= kW, 2, "input sequence smaller than kernel size");
+
+ gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+ input_w = input->size[dimT];
+ input_n = input->size[dimF];
+ output_w = (input_w - kW) / dW + 1;
+
+ gradInput_data = THCTensor_(data)(state, gradInput);
+ gradOutput_data = THCTensor_(data)(state, gradOutput);
+ indices_data = THCIndexTensor_(data)(state, indices);
+
+ dim3 blocks(batch);
+ nthreads = (output_w / 32) * 32;
+ if (output_w % 32 > 0) {
+ nthreads += 32;
+ }
+
+ if (nthreads > TEMPORAL_MAX_POOLING_THREADS) {
+ blocks.y = nthreads / TEMPORAL_MAX_POOLING_THREADS;
+ if (nthreads % TEMPORAL_MAX_POOLING_THREADS > 0) {
+ blocks.y += 1;
+ }
+ nthreads = TEMPORAL_MAX_POOLING_THREADS;
+ }
+
+ dim3 threads(nthreads);
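+ // With kW > dW the pooling windows overlap, so several output gradients may map to
+ // the same input frame and the atomic kernel is required.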
+ if (kW <= dW) {
+ cunn_TemporalMaxPooling_updateGradInputKernel <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>(
+ gradInput_data, gradOutput_data, indices_data, input_w, input_n, output_w, kW, dW);
+ } else {
+ cunn_TemporalMaxPooling_updateGradInputKernelAtomic <<< blocks, threads, 0, THCState_getCurrentStream(state) >>>(
+ gradInput_data, gradOutput_data, indices_data, input_w, input_n, output_w, kW, dW);
+ }
+ THCudaCheck(cudaGetLastError());
+ THCTensor_(free)(state, gradOutput);
+
+}
+
+#endif
diff --git a/lib/THCUNN/generic/Threshold.cu b/lib/THCUNN/generic/Threshold.cu
new file mode 100644
index 0000000..0ab84c1
--- /dev/null
+++ b/lib/THCUNN/generic/Threshold.cu
@@ -0,0 +1,64 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/Threshold.cu"
+#else
+
+#include "../common.h"
+
+void THNN_(Threshold_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ real threshold,
+ real val,
+ bool inplace)
+{
+ THCUNN_assertSameGPU_generic(state, 2, input, output);
+
+ if (inplace)
+ {
+ THC_pointwiseApply1(state, input,
+ ThresholdUpdateOutputIP<real>(threshold, val)
+ );
+ THCTensor_(set)(state, output, input);
+ }
+ else
+ {
+ THCTensor_(resizeAs)(state, output, input);
+ THC_pointwiseApply2(state, output, input,
+ ThresholdUpdateOutput<real>(threshold, val)
+ );
+ }
+
+ THCudaCheck(cudaGetLastError());
+}
+
+void THNN_(Threshold_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ real threshold,
+ real val,
+ bool inplace)
+{
+ THCUNN_assertSameGPU_generic(state, 3, input, gradInput, gradOutput);
+
+ if (inplace)
+ {
+ THC_pointwiseApply2(state, gradOutput, input,
+ ThresholdUpdateGradInputIP<real>(threshold)
+ );
+ THCTensor_(set)(state, gradInput, gradOutput);
+ }
+ else
+ {
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THC_pointwiseApply3(state, gradInput, input, gradOutput,
+ ThresholdUpdateGradInput<real>(threshold)
+ );
+ }
+
+ THCudaCheck(cudaGetLastError());
+}
+
+#endif
diff --git a/lib/THCUNN/generic/VolumetricAveragePooling.cu b/lib/THCUNN/generic/VolumetricAveragePooling.cu
new file mode 100644
index 0000000..62823a8
--- /dev/null
+++ b/lib/THCUNN/generic/VolumetricAveragePooling.cu
@@ -0,0 +1,237 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/VolumetricAveragePooling.cu"
+#else
+
+void THNN_(VolumetricAveragePooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH)
+{
+ int batchSize;
+ int inputSlices;
+ int inputTime;
+ int inputHeight;
+ int inputWidth;
+
+ if (THCTensor_(nDimension)(state, input) == 4)
+ {
+ THArgCheck(
+ THCTensor_(size)(state, input, 1) >= kT &&
+ THCTensor_(size)(state, input, 2) >= kH &&
+ THCTensor_(size)(state, input, 3) >= kW, 2,
+ "input image smaller than kernel size"
+ );
+
+ /* sizes */
+ batchSize = 1;
+ inputSlices = THCTensor_(size)(state, input, 0);
+ inputTime = THCTensor_(size)(state, input, 1);
+ inputHeight = THCTensor_(size)(state, input, 2);
+ inputWidth = THCTensor_(size)(state, input, 3);
+ }
+ else if (THCTensor_(nDimension)(state, input) == 5)
+ {
+ THArgCheck(
+ THCTensor_(size)(state, input, 2) >= kT &&
+ THCTensor_(size)(state, input, 3) >= kH &&
+ THCTensor_(size)(state, input, 4) >= kW, 2,
+ "input image smaller than kernel size"
+ );
+ /* sizes */
+ batchSize = THCTensor_(size)(state, input, 0);
+ inputSlices = THCTensor_(size)(state, input, 1);
+ inputTime = THCTensor_(size)(state, input, 2);
+ inputHeight = THCTensor_(size)(state, input, 3);
+ inputWidth = THCTensor_(size)(state, input, 4);
+ }
+ else
+ {
+ THArgCheck(false, 2, "4D or 5D tensor expected");
+ }
+
+ int outputTime = (inputTime - kT) / dT + 1;
+ int outputHeight = (inputHeight - kH) / dH + 1;
+ int outputWidth = (inputWidth - kW) / dW + 1;
+
+ if (input->nDimension == 4) /* 4D */
+ {
+ /* resize output */
+ THCTensor_(resize4d)(state, output, inputSlices,
+ outputTime, outputHeight, outputWidth);
+ }
+ else /* 5D */
+ {
+ THCTensor_(resize5d)(state, output, batchSize, inputSlices,
+ outputTime, outputHeight, outputWidth);
+ }
+
+ input = THCTensor_(newContiguous)(state, input);
+
+ // Collapse batch and feature dimensions
+ THCDeviceTensor<real, 4> cudaInput;
+ THCDeviceTensor<real, 4> cudaOutput;
+ if (THCTensor_(nDimension)(state, input) == 4)
+ {
+ cudaInput = toDeviceTensor<real, 4>(state, input);
+ cudaOutput = toDeviceTensor<real, 4>(state, output);
+ }
+ else
+ {
+ cudaInput = toDeviceTensor<real, 5>(state, input).downcastOuter<4>();
+ cudaOutput = toDeviceTensor<real, 5>(state, output).downcastOuter<4>();
+ }
+
+ int totalZ = outputTime * inputSlices * batchSize;
+ int offsetZ = 0;
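+ // grid.z can address at most 65535 planes, so the (time x slice x batch) work
+ // is processed in chunks of up to 65535, with offsetZ telling each launch
+ // which planes it owns.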
+ dim3 block(32, 8);
+ while (totalZ > 0) {
+ dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)),
+ THCCeilDiv(outputHeight, static_cast<int>(block.y)),
+ totalZ > 65535 ? 65535 : totalZ);
+
+ accreal normFactor = ScalarConvert<int, accreal>::to(1) / static_cast<accreal>(kT * kH * kW);
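+ // normFactor (1 / pooling volume) turns the window sum into an average and is
+ // computed in accreal so half inputs are still averaged in float. The switch
+ // specializes the kernel on common pooling widths so the inner width loop can
+ // be unrolled; other widths fall back to the generic kernel.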
+ switch (kW)
+ {
+ LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(1);
+ LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(2);
+ LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(3);
+ LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(4);
+ LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(5);
+ LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(6);
+ LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(7);
+ default:
+ cuda_VolumetricAveragePooling_updateOutput<real, accreal><<<grid, block>>>(
+ cudaInput,
+ cudaOutput,
+ kT, kH, kW,
+ dT, dH, dW,
+ normFactor,
+ offsetZ
+ );
+ break;
+ }
+ totalZ -= 65535;
+ offsetZ += 65535;
+ THCudaCheck(cudaGetLastError());
+ }
+ THCTensor_(free)(state, input);
+}
+
+void THNN_(VolumetricAveragePooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH)
+{
+ bool kernelsOverlap = (dT < kT) || (dH < kH) || (dW < kW);
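+ // When pooling windows overlap (stride < kernel in any dimension) several
+ // output cells contribute to the same gradInput cell, so the backward kernel
+ // must accumulate with atomicAdd; otherwise the plain kernel suffices.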
+
+ // Resize and initialize result tensor.
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THCTensor_(zero)(state, gradInput);
+
+ int batchSize;
+ int inputSlices;
+ int inputTime;
+ int inputHeight;
+ int inputWidth;
+
+ int outputTime;
+ int outputHeight;
+ int outputWidth;
+
+ if (THCTensor_(nDimension)(state, input) == 4) /* 4D */
+ {
+ batchSize = 1;
+ inputSlices = THCTensor_(size)(state, input, 0);
+ inputTime = THCTensor_(size)(state, input, 1);
+ inputHeight = THCTensor_(size)(state, input, 2);
+ inputWidth = THCTensor_(size)(state, input, 3);
+
+ outputTime = THCTensor_(size)(state, gradOutput, 1);
+ outputHeight = THCTensor_(size)(state, gradOutput, 2);
+ outputWidth = THCTensor_(size)(state, gradOutput, 3);
+ }
+ else
+ {
+ batchSize = THCTensor_(size)(state, input, 0);
+ inputSlices = THCTensor_(size)(state, input, 1);
+ inputTime = THCTensor_(size)(state, input, 2);
+ inputHeight = THCTensor_(size)(state, input, 3);
+ inputWidth = THCTensor_(size)(state, input, 4);
+
+ outputTime = THCTensor_(size)(state, gradOutput, 2);
+ outputHeight = THCTensor_(size)(state, gradOutput, 3);
+ outputWidth = THCTensor_(size)(state, gradOutput, 4);
+ }
+
+ gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+ // Collapse batch and feature dimensions
+ THCDeviceTensor<real, 4> cudaGradInput;
+ THCDeviceTensor<real, 4> cudaGradOutput;
+ if (THCTensor_(nDimension)(state, input) == 4)
+ {
+ cudaGradInput = toDeviceTensor<real, 4>(state, gradInput);
+ cudaGradOutput = toDeviceTensor<real, 4>(state, gradOutput);
+ }
+ else
+ {
+ cudaGradInput =
+ toDeviceTensor<real, 5>(state, gradInput).downcastOuter<4>();
+ cudaGradOutput =
+ toDeviceTensor<real, 5>(state, gradOutput).downcastOuter<4>();
+ }
+
+ dim3 block(32, 8);
+
+ // Optimizing for stride 1 is probably only of limited value, but this
+ // specialization yields 3x speedup over the atomicAdd implementation.
+ if (dT == 1 && dH == 1 && dW == 1)
+ {
+ int totalZ = inputTime * inputSlices * batchSize;
+ int offsetZ = 0;
+ while (totalZ > 0) {
+ dim3 grid(THCCeilDiv(inputWidth, static_cast<int>(block.x)),
+ THCCeilDiv(inputHeight, static_cast<int>(block.y)),
+ totalZ > 65535 ? 65535 : totalZ);
+ cuda_VolumetricAveragePooling_updateGradInput_Stride1<real, accreal><<<grid, block>>>(
+ cudaGradOutput, cudaGradInput, kT, kH, kW, 1.0f/(kT * kH * kW), offsetZ);
+ THCudaCheck(cudaGetLastError());
+ totalZ -= 65535;
+ offsetZ += 65535;
+ }
+ }
+ else
+ {
+ int totalZ = outputTime * inputSlices * batchSize;
+ int offsetZ = 0;
+ while (totalZ > 0) {
+
+ dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)),
+ THCCeilDiv(outputHeight, static_cast<int>(block.y)),
+ totalZ > 65535 ? 65535 : totalZ);
+ if (kernelsOverlap)
+ {
+ cuda_VolumetricAveragePooling_updateGradInput_atomicAdd<real, accreal><<<grid, block>>>(
+ cudaGradOutput, cudaGradInput, kT, kH, kW, dT, dH, dW, offsetZ);
+ }
+ else
+ {
+ cuda_VolumetricAveragePooling_updateGradInput<real, accreal><<<grid, block>>>(
+ cudaGradOutput, cudaGradInput, kT, kH, kW, dT, dH, dW, offsetZ);
+ }
+ THCudaCheck(cudaGetLastError());
+ totalZ -= 65535;
+ offsetZ += 65535;
+ }
+ }
+
+ THCTensor_(free)(state, gradOutput);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/VolumetricConvolution.cu b/lib/THCUNN/generic/VolumetricConvolution.cu
new file mode 100644
index 0000000..8ed0bca
--- /dev/null
+++ b/lib/THCUNN/generic/VolumetricConvolution.cu
@@ -0,0 +1,416 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/VolumetricConvolution.cu"
+#else
+
+void THNN_(VolumetricConvolution_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias,
+ THCTensor *finput,
+ THCTensor *fgradInput,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH)
+{
+ THCTensor *columns = finput;
+ THCTensor *ones = fgradInput;
+ THCUNN_assertSameGPU_generic(state, 6, input, output, weight, bias, columns, ones);
+
+ THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
+ "4D or 5D (batch mode) tensor is expected"
+ );
+
+ THArgCheck(weight->nDimension == 5, 4,
+ "5D weight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)"
+ );
+
+ int nOutputPlane = (int)weight->size[0];
+ int nInputPlane = (int)weight->size[1];
+ int kT = (int)weight->size[2];
+ int kH = (int)weight->size[3];
+ int kW = (int)weight->size[4];
+
+ int batch = 1;
+ if (input->nDimension == 4)
+ {
+ // Force batch
+ batch = 0;
+ THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1],
+ input->size[2], input->size[3]);
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long inputDepth = input->size[4];
+ long outputWidth = (inputWidth + 2*padH - kH) / dH + 1;
+ long outputHeight = (inputHeight + 2*padT - kT) / dT + 1;
+ long outputDepth = (inputDepth + 2*padW - kW) / dW + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THCTensor_(resize5d)(state, output, batchSize, nOutputPlane,
+ outputHeight, outputWidth, outputDepth);
+
+ // Resize temporary columns
+ THCTensor_(resize2d)(state, columns, nInputPlane*kW*kH*kT, outputDepth*outputHeight*outputWidth);
+
+ // Define a buffer of ones, for bias accumulation
+ // Note: this buffer can be shared with other modules, it only ever gets increased,
+ // and always contains ones.
+ if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth)
+ {
+ // Resize plane and fill with ones...
+ THCTensor_(resize3d)(state, ones, outputHeight, outputWidth, outputDepth);
+ THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+ }
+
+ // Helpers
+ THCTensor *input_n = THCTensor_(new)(state);
+ THCTensor *output_n = THCTensor_(new)(state);
+
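+ // Strategy: for every sample, im3d2col unrolls each kT x kH x kW receptive
+ // field into one column of `columns`; a single GEMM of the flattened weight
+ // against `columns` then produces every output location at once. The bias is
+ // written first as a rank-1 GEMM between the ones buffer and the bias vector.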
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++)
+ {
+ // Matrix multiply per output:
+ THCTensor_(select)(state, input_n, input, 0, elt);
+ THCTensor_(select)(state, output_n, output, 0, elt);
+
+ // Do Bias first:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m_ = nOutputPlane;
+ long n_ = outputDepth * outputHeight * outputWidth;
+ long k_ = 1;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
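+ // (Row-major C = A*B is computed as the column-major product C^T = B^T * A^T,
+ // which is why the operand order and leading dimensions look swapped here.)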
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 't', 'n',
+ n_, m_, k_,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, ones), k_,
+ THCTensor_(data)(state, bias), k_,
+ ScalarConvert<int, real>::to(0),
+ THCTensor_(data)(state, output_n), n_
+ );
+
+ // Extract columns:
+ im3d2col(
+ THCState_getCurrentStream(state),
+ THCTensor_(data)(state, input_n),
+ nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
+ THCTensor_(data)(state, columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = weight->size[0];
+ long n = columns->size[1];
+ long k = weight->size[1]*weight->size[2]*weight->size[3]*weight->size[4];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 'n', 'n',
+ n, m, k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, columns), n,
+ THCTensor_(data)(state, weight), k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, output_n), n
+ );
+ }
+
+ // Free
+ THCTensor_(free)(state, input_n);
+ THCTensor_(free)(state, output_n);
+
+ // Resize output
+ if (batch == 0)
+ {
+ THCTensor_(resize4d)(state, output, nOutputPlane, outputHeight, outputWidth, outputDepth);
+ THCTensor_(resize4d)(state, input, nInputPlane, inputHeight, inputWidth, inputDepth);
+ }
+}
+
+void THNN_(VolumetricConvolution_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ THCTensor *finput,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH)
+{
+ THArgCheck(weight->nDimension == 5, 4,
+ "5D weight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)"
+ );
+
+ int nOutputPlane = (int)weight->size[0];
+ int nInputPlane = (int)weight->size[1];
+ int kT = (int)weight->size[2];
+ int kH = (int)weight->size[3];
+ int kW = (int)weight->size[4];
+
+ THCTensor *gradColumns = finput;
+
+ THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight, gradColumns, gradInput);
+ THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
+ "4D or 5D (batch mode) tensor is expected"
+ );
+
+ int batch = 1;
+ if (input->nDimension == 4)
+ {
+ // Force batch
+ batch = 0;
+ THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long inputDepth = input->size[4];
+ long outputWidth = (inputWidth + 2*padH - kH) / dH + 1;
+ long outputHeight = (inputHeight + 2*padT - kT) / dT + 1;
+ long outputDepth = (inputDepth + 2*padW - kW) / dW + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THCTensor_(resize5d)(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth, inputDepth);
+
+ // Resize temporary columns
+ THCTensor_(resize2d)(state, gradColumns, nInputPlane*kH*kT*kW, outputDepth*outputHeight*outputWidth);
+
+ // Helpers
+ THCTensor *input_n = THCTensor_(new)(state);
+ THCTensor *gradInput_n = THCTensor_(new)(state);
+ THCTensor *gradOutput_n = THCTensor_(new)(state);
+
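+ // Backward w.r.t. the input: the transposed (flattened) weight maps gradOutput
+ // back to column space, and col2im3d scatters/accumulates those columns into
+ // gradInput, summing contributions from overlapping receptive fields.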
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++)
+ {
+ // Matrix multiply per sample:
+ THCTensor_(select)(state, input_n, input, 0, elt);
+ THCTensor_(select)(state, gradInput_n, gradInput, 0, elt);
+ THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = weight->size[1]*weight->size[2]*weight->size[3]*weight->size[4];
+ long n = gradColumns->size[1];
+ long k = weight->size[0];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 'n', 't',
+ n, m, k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradOutput_n), n,
+ THCTensor_(data)(state, weight), m,
+ ScalarConvert<int, real>::to(0),
+ THCTensor_(data)(state, gradColumns), n
+ );
+
+ // Unpack columns back into input:
+ col2im3d<real, accreal>(
+ THCState_getCurrentStream(state),
+ THCTensor_(data)(state, gradColumns),
+ nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
+ THCTensor_(data)(state, gradInput_n)
+ );
+ }
+
+ // Free
+ THCTensor_(free)(state, input_n);
+ THCTensor_(free)(state, gradInput_n);
+ THCTensor_(free)(state, gradOutput_n);
+
+ // Resize output
+ if (batch == 0)
+ {
+ THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth, outputDepth);
+ THCTensor_(resize4d)(state, input, nInputPlane, inputHeight, inputWidth, inputDepth);
+ THCTensor_(resize4d)(state, gradInput, nInputPlane, inputHeight, inputWidth, inputDepth);
+ }
+}
+
+void THNN_(VolumetricConvolution_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ THCTensor *finput,
+ THCTensor *fgradInput,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ real scale)
+{
+ THCTensor *columns = finput;
+ THCTensor *ones = fgradInput;
+ THCUNN_assertSameGPU_generic(state, 6, input, gradOutput, gradWeight, gradBias, columns, ones);
+
+ THArgCheck(gradWeight->nDimension == 5, 4,
+ "5D gradWeight tensor is expected (nOutputPlane x nInputPlane x kT x kH x kW)"
+ );
+
+ int nOutputPlane = (int)gradWeight->size[0];
+ int nInputPlane = (int)gradWeight->size[1];
+ int kT = (int)gradWeight->size[2];
+ int kH = (int)gradWeight->size[3];
+ int kW = (int)gradWeight->size[4];
+
+ THArgCheck(
+ input->nDimension == 4 || input->nDimension == 5, 2,
+ "3D or 4D (batch mode) tensor is expected"
+ );
+
+ int batch = 1;
+ if (input->nDimension == 4)
+ {
+ // Force batch
+ batch = 0;
+ THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long inputDepth = input->size[4];
+ long outputWidth = (inputWidth + 2*padH - kH) / dH + 1;
+ long outputHeight = (inputHeight + 2*padT - kT) / dT + 1;
+ long outputDepth = (inputDepth + 2*padW - kW) / dW + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Define a buffer of ones, for bias accumulation
+ if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth)
+ {
+ // Resize plane and fill with ones...
+ THCTensor_(resize3d)(state, ones, outputHeight, outputWidth, outputDepth);
+ THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+ }
+
+ // Resize temporary columns
+ THCTensor_(resize2d)(state, columns, nInputPlane*kH*kT*kW, outputDepth*outputHeight*outputWidth);
+
+ // Helpers
+ THCTensor *input_n = THCTensor_(new)(state);
+ THCTensor *gradOutput_n = THCTensor_(new)(state);
+
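+ // Backward w.r.t. the parameters: per sample, gradWeight += scale *
+ // gradOutput_n * columns^T (with columns rebuilt by im3d2col), and gradBias
+ // accumulates the per-plane sum of gradOutput_n against the ones buffer --
+ // a GEMV for float/double, a 1-column Hgemm for half.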
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++)
+ {
+ // Matrix multiply per output:
+ THCTensor_(select)(state, input_n, input, 0, elt);
+ THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ im3d2col(
+ THCState_getCurrentStream(state),
+ THCTensor_(data)(state, input_n),
+ nInputPlane, inputHeight, inputWidth, inputDepth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
+ THCTensor_(data)(state, columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = gradWeight->size[0];
+ long n = gradWeight->size[1]*gradWeight->size[2]*gradWeight->size[3]*gradWeight->size[4];
+ long k = columns->size[1];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 't', 'n',
+ n, m, k,
+ scale,
+ THCTensor_(data)(state, columns), k,
+ THCTensor_(data)(state, gradOutput_n), k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradWeight), n
+ );
+
+ // Do Bias:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m_ = nOutputPlane;
+ long k_ = outputDepth * outputHeight * outputWidth;
+
+ // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+ #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemv(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemv(
+ #endif
+ state,
+ 't',
+ k_, m_,
+ scale,
+ THCTensor_(data)(state, gradOutput_n), k_,
+ THCTensor_(data)(state, ones), 1,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradBias), 1
+ );
+ #endif
+ #ifdef THC_REAL_IS_HALF
+ THCudaBlas_Hgemm(
+ state,
+ 't', 'n',
+ m_, 1, k_,
+ scale,
+ THCTensor_(data)(state, gradOutput_n), k_,
+ THCTensor_(data)(state, ones), k_,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradBias), m_
+ );
+ #endif
+ }
+
+ // Free
+ THCTensor_(free)(state, input_n);
+ THCTensor_(free)(state, gradOutput_n);
+
+ // Resize
+ if (batch == 0)
+ {
+ THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputHeight, outputWidth, outputDepth);
+ THCTensor_(resize4d)(state, input, nInputPlane, inputHeight, inputWidth, inputDepth);
+ }
+}
+
+#endif
diff --git a/lib/THCUNN/generic/VolumetricDilatedConvolution.cu b/lib/THCUNN/generic/VolumetricDilatedConvolution.cu
new file mode 100644
index 0000000..22536fe
--- /dev/null
+++ b/lib/THCUNN/generic/VolumetricDilatedConvolution.cu
@@ -0,0 +1,411 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/VolumetricDilatedConvolution.cu"
+#else
+
+void THNN_(VolumetricDilatedConvolution_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias,
+ THCTensor *columns,
+ THCTensor *ones,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH) {
+
+ THCUNN_assertSameGPU_generic(state, 5, input, output, weight, columns, ones);
+ if (bias) {
+ THCUNN_assertSameGPU_generic(state, 2, weight, bias);
+ }
+ THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected, but got: %d", input->nDimension);
+ THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
+ THArgCheck(!bias || weight->size[0] == bias->size[0], 4, "nOutputPlane mismatch in weight and bias");
+ THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+ THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
+ THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 16, "dilation should be greater than 0");
+
+ // Params:
+ int nInputPlane = weight->size[1];
+ int nOutputPlane = weight->size[0];
+
+ int batch = 1;
+ if (input->nDimension == 4) {
+ THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane don't match");
+ // Force batch
+ batch = 0;
+ THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ } else {
+ THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane don't match");
+ }
+
+ long inputDepth = input->size[2];
+ long inputHeight = input->size[3];
+ long inputWidth = input->size[4];
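+ // Output size uses the effective (dilated) kernel extent dilation*(k-1)+1 in
+ // each dimension: out = (in + 2*pad - effectiveKernel) / stride + 1.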
+ long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+
+ if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+ nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth);
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THCTensor_(resize5d)(state, output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth);
+
+ // Resize temporary columns
+ THCTensor_(resize2d)(state, columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
+
+ // Define a buffer of ones, for bias accumulation
+ // Note: this buffer can be shared with other modules, it only ever gets increased,
+ // and always contains ones.
+ if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth);
+ THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+ }
+
+ // Helpers
+ THCTensor *input_n = THCTensor_(new)(state);
+ THCTensor *output_n = THCTensor_(new)(state);
+
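+ // Same im2col + GEMM scheme as VolumetricConvolution: vol2col honours the
+ // dilation when unrolling receptive fields, so the GEMM against the flattened
+ // weight is unchanged.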
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THCTensor_(select)(state, input_n, input, 0, elt);
+ THCTensor_(select)(state, output_n, output, 0, elt);
+
+ // Do Bias first:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m_ = nOutputPlane;
+ long n_ = outputDepth * outputHeight * outputWidth;
+ long k_ = 1;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ if (bias) {
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 't', 'n',
+ n_, m_, k_,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, ones), k_,
+ THCTensor_(data)(state, bias), k_,
+ ScalarConvert<int, real>::to(0),
+ THCTensor_(data)(state, output_n), n_
+ );
+ } else {
+ THCTensor_(zero)(state, output_n);
+ }
+
+ // Extract columns:
+ vol2col(
+ THCState_getCurrentStream(state),
+ THCTensor_(data)(state, input_n),
+ nInputPlane, inputDepth, inputHeight, inputWidth,
+ kT, kH, kW, padT, padH, padW, dT, dH, dW,
+ dilationT, dilationH, dilationW,
+ THCTensor_(data)(state, columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = nOutputPlane;
+ long n = columns->size[1];
+ long k = nInputPlane*kT*kH*kW;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 'n', 'n',
+ n, m, k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, columns), n,
+ THCTensor_(data)(state, weight), k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, output_n), n
+ );
+ }
+
+ // Free
+ THCTensor_(free)(state, input_n);
+ THCTensor_(free)(state, output_n);
+
+ // Resize output
+ if (batch == 0) {
+ THCTensor_(resize4d)(state, output, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+}
+
+void THNN_(VolumetricDilatedConvolution_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ THCTensor *gradColumns,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH) {
+
+ THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight,
+ gradColumns, gradInput);
+ THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
+ THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected");
+ THArgCheck(weight->nDimension == 5, 4, "weight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
+ THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+ THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
+
+ // Params
+ int nInputPlane = weight->size[1];
+ int nOutputPlane = weight->size[0];
+
+ int batch = 1;
+ if (input->nDimension == 4) {
+ // Force batch
+ batch = 0;
+ THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ long inputDepth = input->size[2];
+ long inputWidth = input->size[4];
+ long inputHeight = input->size[3];
+ long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THCTensor_(resize5d)(state, gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
+
+ // Resize temporary columns
+ THCTensor_(resize2d)(state, gradColumns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
+
+ // Helpers
+ THCTensor *gradInput_n = THCTensor_(new)(state);
+ THCTensor *gradOutput_n = THCTensor_(new)(state);
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per sample:
+ THCTensor_(select)(state, gradInput_n, gradInput, 0, elt);
+ THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = nInputPlane*kT*kW*kH;
+ long n = gradColumns->size[1];
+ long k = nOutputPlane;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 'n', 't',
+ n, m, k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradOutput_n), n,
+ THCTensor_(data)(state, weight), m,
+ ScalarConvert<int, real>::to(0),
+ THCTensor_(data)(state, gradColumns), n
+ );
+
+ // Unpack columns back into input:
+ col2vol<real, accreal>(
+ THCState_getCurrentStream(state),
+ THCTensor_(data)(state, gradColumns),
+ nInputPlane, inputDepth, inputHeight, inputWidth,
+ kT, kH, kW, padT, padH, padW, dT, dH, dW,
+ dilationT, dilationH, dilationW,
+ THCTensor_(data)(state, gradInput_n)
+ );
+ }
+
+ // Free
+ THCTensor_(free)(state, gradInput_n);
+ THCTensor_(free)(state, gradOutput_n);
+
+ // Resize output
+ if (batch == 0) {
+ THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ THCTensor_(resize4d)(state, gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+}
+
+void THNN_(VolumetricDilatedConvolution_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ THCTensor *columns,
+ THCTensor *ones,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH,
+ real scale) {
+
+ THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, gradWeight, columns, ones);
+ if (gradBias) {
+ THCUNN_assertSameGPU_generic(state, 2, gradWeight, gradBias);
+ }
+ THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
+ THArgCheck(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3, "4D or 5D (batch mode) tensor is expected");
+ THArgCheck(gradWeight->nDimension == 5, 4, "gradWeight tensor must be 5D (nOutputPlane,nInputPlane,kT,kH,kW)");
+ THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, "kernel size should be greater than zero");
+ THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, "stride should be greater than zero");
+
+ // Params
+ int nInputPlane = gradWeight->size[1];
+ int nOutputPlane = gradWeight->size[0];
+
+ int batch = 1;
+ if (input->nDimension == 4) {
+ // Force batch
+ batch = 0;
+ THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ long inputDepth = input->size[2];
+ long inputWidth = input->size[4];
+ long inputHeight = input->size[3];
+ long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Define a buffer of ones, for bias accumulation
+ if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth);
+ THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+ }
+
+ // Resize temporary columns
+ THCTensor_(resize2d)(state, columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
+
+ // Helpers
+ THCTensor *input_n = THCTensor_(new)(state);
+ THCTensor *gradOutput_n = THCTensor_(new)(state);
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THCTensor_(select)(state, input_n, input, 0, elt);
+ THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ vol2col(
+ THCState_getCurrentStream(state),
+ THCTensor_(data)(state, input_n),
+ nInputPlane, inputDepth, inputHeight, inputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
+ dilationT, dilationH, dilationW,
+ THCTensor_(data)(state, columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = nOutputPlane;
+ long n = nInputPlane*kT*kW*kH;
+ long k = columns->size[1];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 't', 'n',
+ n, m, k,
+ scale,
+ THCTensor_(data)(state, columns), k,
+ THCTensor_(data)(state, gradOutput_n), k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradWeight), n
+ );
+
+ // Do Bias:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m_ = nOutputPlane;
+ long k_ = outputDepth * outputHeight * outputWidth;
+
+ // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+ if (gradBias) {
+ #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemv(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemv(
+ #endif
+ state,
+ 't',
+ k_, m_,
+ scale,
+ THCTensor_(data)(state, gradOutput_n), k_,
+ THCTensor_(data)(state, ones), 1,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradBias), 1
+ );
+ #endif
+ #ifdef THC_REAL_IS_HALF
+ THCudaBlas_Hgemm(
+ state,
+ 't', 'n',
+ m_, 1, k_,
+ scale,
+ THCTensor_(data)(state, gradOutput_n), k_,
+ THCTensor_(data)(state, ones), k_,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradBias), m_
+ );
+ #endif
+ }
+ }
+
+ // Free
+ THCTensor_(free)(state, input_n);
+ THCTensor_(free)(state, gradOutput_n);
+
+ // Resize output
+ if (batch == 0) {
+ THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+}
+
+#endif
diff --git a/lib/THCUNN/generic/VolumetricDilatedMaxPooling.cu b/lib/THCUNN/generic/VolumetricDilatedMaxPooling.cu
new file mode 100644
index 0000000..0d6d084
--- /dev/null
+++ b/lib/THCUNN/generic/VolumetricDilatedMaxPooling.cu
@@ -0,0 +1,290 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/VolumetricDilatedMaxPooling.cu"
+#else
+
+#define UPDATE_OUTPUT_KERNEL_WIDTH(KW) case KW: \
+ cuda_VolumetricDilatedMaxPooling_updateOutput<KW><<<grid, block, \
+ 0, THCState_getCurrentStream(state)>>>( \
+ cudaInput, cudaIndices, cudaOutput, kT, kH, dT, dH, dW, padT, padH, padW,\
+ dilationT, dilationH, dilationW, offsetZ); \
+ break
+
+void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCIndexTensor *indices,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH,
+ bool ceilMode)
+{
+ int batchSize;
+ int inputSlices;
+ int inputTime;
+ int inputHeight;
+ int inputWidth;
+ int outputTime;
+ int outputHeight;
+ int outputWidth;
+
+ THCUNN_assertSameGPU_generic(state, 3, input, indices, output);
+
+ if (THCTensor_(nDimension)(state, input) == 4)
+ {
+ THArgCheck(
+ THCTensor_(size)(state, input, 1) >= kT &&
+ THCTensor_(size)(state, input, 2) >= kH &&
+ THCTensor_(size)(state, input, 3) >= kW, 2,
+ "input image smaller than kernel size"
+ );
+
+ /* sizes */
+ batchSize = 1;
+ inputSlices = THCTensor_(size)(state, input, 0);
+ inputTime = THCTensor_(size)(state, input, 1);
+ inputHeight = THCTensor_(size)(state, input, 2);
+ inputWidth = THCTensor_(size)(state, input, 3);
+ }
+ else if (THCTensor_(nDimension)(state, input) == 5)
+ {
+ THArgCheck(
+ THCTensor_(size)(state, input, 4) >= kW &&
+ THCTensor_(size)(state, input, 3) >= kH &&
+ THCTensor_(size)(state, input, 2) >= kT, 2,
+ "input image smaller than kernel size"
+ );
+
+ /* sizes */
+ batchSize = THCTensor_(size)(state, input, 0);
+ inputSlices = THCTensor_(size)(state, input, 1);
+ inputTime = THCTensor_(size)(state, input, 2);
+ inputHeight = THCTensor_(size)(state, input, 3);
+ inputWidth = THCTensor_(size)(state, input, 4);
+ }
+ else
+ {
+ THArgCheck(false, 2, "4D or 5D tensor expected");
+ }
+
+ THArgCheck(kT/2 >= padT && kW/2 >= padW && kH/2 >= padH, 2,
+ "pad should be smaller than half of kernel size"
+ );
+ THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 14,
+ "dilation should be greater than 0"
+ );
+
+ if (ceilMode)
+ {
+ outputTime = (int)(ceil((float)(inputTime - (dilationT * (kT - 1) + 1) + 2*padT) / dT)) + 1;
+ outputHeight = (int)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+ outputWidth = (int)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
+ }
+ else
+ {
+ outputTime = (int)(floor((float)(inputTime - (dilationT * (kT - 1) + 1) + 2*padT) / dT)) + 1;
+ outputHeight = (int)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+ outputWidth = (int)(floor((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
+ }
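+ // ceilMode rounds the output extent up so a final, partially covered window is
+ // still emitted; the adjustment below then drops any window that would start
+ // entirely inside the padding.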
+
+ if (outputTime < 1 || outputHeight < 1 || outputWidth < 1)
+ THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+ inputSlices,inputTime,inputHeight,inputWidth,inputSlices,outputTime,outputHeight,outputWidth);
+
+ if (padT || padW || padH)
+ {
+ if ((outputTime - 1)*dT >= inputTime + padT)
+ --outputTime;
+ if ((outputHeight - 1)*dH >= inputHeight + padH)
+ --outputHeight;
+ if ((outputWidth - 1)*dW >= inputWidth + padW)
+ --outputWidth;
+ }
+
+ if (input->nDimension == 4) /* 4D */
+ {
+ /* resize output */
+ THCTensor_(resize4d)(state, output, inputSlices,
+ outputTime, outputHeight, outputWidth);
+ /* indices pack ti,i,j locations for each output point as uchar into
+ each float of the tensor */
+ THCIndexTensor_(resize4d)(state, indices, inputSlices,
+ outputTime, outputHeight, outputWidth);
+ }
+ else
+ { /* 5D */
+ THCTensor_(resize5d)(state, output, batchSize, inputSlices,
+ outputTime, outputHeight, outputWidth);
+ // Index tensor packs index offsets as uchars into floats
+ THCIndexTensor_(resize5d)(state, indices, batchSize, inputSlices,
+ outputTime, outputHeight, outputWidth);
+ }
+
+ input = THCTensor_(newContiguous)(state, input);
+
+ // Collapse batch and feature dimensions
+ THCDeviceTensor<real, 4> cudaInput;
+ THCDeviceTensor<real, 4> cudaOutput;
+ if (THCTensor_(nDimension)(state, input) == 4)
+ {
+ cudaInput = toDeviceTensor<real, 4>(state, input);
+ cudaOutput = toDeviceTensor<real, 4>(state, output);
+ }
+ else
+ {
+ cudaInput = toDeviceTensor<real, 5>(state, input).downcastOuter<4>();
+ cudaOutput = toDeviceTensor<real, 5>(state, output).downcastOuter<4>();
+ }
+
+ THLongStorage *indicesSize = THLongStorage_newWithSize(4);
+ long indicesSizeRaw[4] = { batchSize * inputSlices,
+ outputTime, outputHeight, outputWidth };
+ THLongStorage_rawCopy(indicesSize, indicesSizeRaw);
+
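+ // View the indices through a flat (batch*slices, t, h, w) header that shares
+ // the original storage, matching the collapsed layout of the input/output
+ // device tensors used by the kernel.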
+ THCIndexTensor *indices1 = THCIndexTensor_(newWithStorage)(
+ state, THCIndexTensor_(storage)(state, indices),
+ THCIndexTensor_(storageOffset)(state, indices),
+ indicesSize, NULL);
+
+ THLongStorage_free(indicesSize);
+
+ THCDeviceTensor<THCIndex_t, 4> cudaIndices =
+ toDeviceTensor<THCIndex_t, 4>(state, indices1);
+
+ int totalZ = outputTime * inputSlices * batchSize;
+ int offsetZ = 0;
+ dim3 block(32, 8);
+
+ while (totalZ > 0) {
+ dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)),
+ THCCeilDiv(outputHeight, static_cast<int>(block.y)),
+ totalZ > 65535 ? 65535 : totalZ);
+
+ switch (kW)
+ {
+ UPDATE_OUTPUT_KERNEL_WIDTH(1);
+ UPDATE_OUTPUT_KERNEL_WIDTH(2);
+ UPDATE_OUTPUT_KERNEL_WIDTH(3);
+ UPDATE_OUTPUT_KERNEL_WIDTH(4);
+ UPDATE_OUTPUT_KERNEL_WIDTH(5);
+ UPDATE_OUTPUT_KERNEL_WIDTH(6);
+ UPDATE_OUTPUT_KERNEL_WIDTH(7);
+ default:
+ cuda_VolumetricDilatedMaxPooling_updateOutput<<<grid, block,
+ 0, THCState_getCurrentStream(state)>>>(
+ cudaInput, cudaIndices, cudaOutput,
+ kT, kH, kW, dT, dH, dW,
+ padT, padH, padW, dilationT, dilationH, dilationW, offsetZ);
+ }
+ THCudaCheck(cudaGetLastError());
+ totalZ -= 65535;
+ offsetZ += 65535;
+ }
+
+ THCTensor_(free)(state, input);
+ THCIndexTensor_(free)(state, indices1);
+}
+
+#undef UPDATE_OUTPUT_KERNEL_WIDTH
+
+void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCIndexTensor *indices,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH)
+{
+ // Resize and initialize result tensor.
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THCTensor_(zero)(state, gradInput);
+
+ int batchSize;
+ int inputSlices;
+
+ int outputTime;
+ int outputHeight;
+ int outputWidth;
+
+ THCUNN_assertSameGPU_generic(state, 4, input, indices, gradOutput, gradInput);
+
+ if (THCTensor_(nDimension)(state, input) == 4) /* 4D */
+ {
+ batchSize = 1;
+ inputSlices = THCTensor_(size)(state, input, 0);
+
+ outputTime = THCTensor_(size)(state, gradOutput, 1);
+ outputHeight = THCTensor_(size)(state, gradOutput, 2);
+ outputWidth = THCTensor_(size)(state, gradOutput, 3);
+ }
+ else
+ {
+ batchSize = THCTensor_(size)(state, input, 0);
+ inputSlices = THCTensor_(size)(state, input, 1);
+
+ outputTime = THCTensor_(size)(state, gradOutput, 2);
+ outputHeight = THCTensor_(size)(state, gradOutput, 3);
+ outputWidth = THCTensor_(size)(state, gradOutput, 4);
+ }
+
+ gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+
+ // Collapse batch and feature dimensions
+ THCDeviceTensor<real, 4> cudaGradInput;
+ THCDeviceTensor<real, 4> cudaGradOutput;
+ if (THCTensor_(nDimension)(state, input) == 4)
+ {
+ cudaGradInput = toDeviceTensor<real, 4>(state, gradInput);
+ cudaGradOutput = toDeviceTensor<real, 4>(state, gradOutput);
+ }
+ else
+ {
+ cudaGradInput =
+ toDeviceTensor<real, 5>(state, gradInput).downcastOuter<4>();
+ cudaGradOutput =
+ toDeviceTensor<real, 5>(state, gradOutput).downcastOuter<4>();
+ }
+
+ THLongStorage *indicesSize = THLongStorage_newWithSize(4);
+ long indicesSizeRaw[4] = { batchSize * inputSlices,
+ outputTime, outputHeight, outputWidth };
+ THLongStorage_rawCopy(indicesSize, indicesSizeRaw);
+ THCIndexTensor *indices1 = THCIndexTensor_(newWithStorage)(
+ state, THCIndexTensor_(storage)(state, indices),
+ THCIndexTensor_(storageOffset)(state, indices), indicesSize, NULL);
+ THLongStorage_free(indicesSize);
+
+ THCDeviceTensor<THCIndex_t, 4> cudaIndices =
+ toDeviceTensor<THCIndex_t, 4>(state, indices1);
+
+ int totalZ = outputTime * inputSlices * batchSize;
+ int offsetZ = 0;
+ dim3 block(32, 8);
+
+ while (totalZ > 0) {
+ dim3 grid(THCCeilDiv(outputWidth, static_cast<int>(block.x)),
+ THCCeilDiv(outputHeight, static_cast<int>(block.y)),
+ totalZ > 65535 ? 65535 : totalZ);
+
+ cuda_VolumetricDilatedMaxPooling_updateGradInput<<<grid, block,
+ 0, THCState_getCurrentStream(state)>>>(
+ cudaGradOutput,
+ cudaIndices,
+ cudaGradInput,
+ dT, dH, dW,
+ padT, padH, padW,
+ dilationT, dilationH, dilationW, offsetZ);
+ THCudaCheck(cudaGetLastError());
+ totalZ -= 65535;
+ offsetZ += 65535;
+ }
+
+ // cleanup
+ THCTensor_(free)(state, gradOutput);
+ THCIndexTensor_(free)(state, indices1);
+}
+
+#endif
diff --git a/lib/THCUNN/generic/VolumetricFullConvolution.cu b/lib/THCUNN/generic/VolumetricFullConvolution.cu
new file mode 100644
index 0000000..7a1f788
--- /dev/null
+++ b/lib/THCUNN/generic/VolumetricFullConvolution.cu
@@ -0,0 +1,399 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/VolumetricFullConvolution.cu"
+#else
+
+void THNN_(VolumetricFullConvolution_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCTensor *weight,
+ THCTensor *bias,
+ THCTensor *finput,
+ THCTensor *fgradInput,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int adjT, int adjW, int adjH)
+{
+
+ THCTensor *columns = finput;
+ THCTensor *ones = fgradInput;
+
+ int nInputPlane = THCTensor_(size)(state, weight, 0);
+ int nOutputPlane = THCTensor_(size)(state, weight, 1);
+ const int kT = (int)weight->size[2];
+ const int kH = (int)weight->size[3];
+ const int kW = (int)weight->size[4];
+
+ THCUNN_assertSameGPU_generic(state, 6, input, output, weight,
+ bias, columns, ones);
+ THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
+
+ int batch = 1;
+ if (input->nDimension == 4) {
+ THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane don't match");
+ // Force batch
+ batch = 0;
+ THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ } else {
+ THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane don't match");
+ }
+
+ long inputWidth = input->size[4];
+ long inputHeight = input->size[3];
+ long inputDepth = input->size[2];
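+ // Transposed convolution: the output grows as out = (in - 1)*stride - 2*pad +
+ // kernel + adj, where adj selects among the several output sizes that a
+ // forward convolution with these parameters would map to the same input size.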
+ long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+ long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+ long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THCTensor_(resize5d)(state, output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth);
+
+ // Resize temporary columns
+ THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
+
+ // Define a buffer of ones, for bias accumulation
+ // Note: this buffer can be shared with other modules, it only ever gets increased,
+ // and always contains ones.
+ if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth);
+ THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+ }
+
+ // Helpers
+ THCTensor *input_n = THCTensor_(new)(state);
+ THCTensor *output_n = THCTensor_(new)(state);
+
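+ // The forward pass of a transposed convolution is the data-backward of a
+ // regular one: the GEMM (weight^T x input) produces columns first, col2vol
+ // then scatters them into the enlarged output, and the bias is added last.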
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THCTensor_(select)(state, input_n, input, 0, elt);
+ THCTensor_(select)(state, output_n, output, 0, elt);
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
+ long n = columns->size[1];
+ long k = weight->size[0];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 'n', 't',
+ n, m, k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, input_n), n,
+ THCTensor_(data)(state, weight), m,
+ ScalarConvert<int, real>::to(0),
+ THCTensor_(data)(state, columns), n
+ );
+
+ // Unpack columns back into input:
+ col2vol<real, accreal>(
+ THCState_getCurrentStream(state),
+ THCTensor_(data)(state, columns),
+ nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
+ 1,1,1,
+ THCTensor_(data)(state, output_n)
+ );
+
+ // Do Bias after:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m_ = nOutputPlane;
+ long n_ = outputDepth * outputHeight * outputWidth;
+ long k_ = 1;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 't', 'n',
+ n_, m_, k_,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, ones), k_,
+ THCTensor_(data)(state, bias), k_,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, output_n), n_
+ );
+
+ }
+
+ // Free
+ THCTensor_(free)(state, input_n);
+ THCTensor_(free)(state, output_n);
+
+ // Resize output
+ if (batch == 0) {
+ THCTensor_(resize4d)(state, output, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+}
+
+void THNN_(VolumetricFullConvolution_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCTensor *weight,
+ THCTensor *finput,
+ THCTensor *fgradInput,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int adjT, int adjW, int adjH)
+{
+ THCTensor *gradColumns = finput;
+
+ int nInputPlane = THCTensor_(size)(state, weight, 0);
+ int nOutputPlane = THCTensor_(size)(state, weight, 1);
+ const int kT = (int)weight->size[2];
+ const int kH = (int)weight->size[3];
+ const int kW = (int)weight->size[4];
+
+ THCUNN_assertSameGPU_generic(state, 5, input, gradOutput, weight,
+ gradColumns, gradInput);
+ THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
+
+ int batch = 1;
+ if (input->nDimension == 4) {
+ // Force batch
+ batch = 0;
+ THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ long inputWidth = input->size[4];
+ long inputHeight = input->size[3];
+ long inputDepth = input->size[2];
+ long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+ long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+ long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THCTensor_(resize5d)(state, gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
+
+ // Resize temporary columns
+ THCTensor_(resize2d)(state, gradColumns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
+
+ // Helpers
+ THCTensor *gradInput_n = THCTensor_(new)(state);
+ THCTensor *gradOutput_n = THCTensor_(new)(state);
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per sample:
+ THCTensor_(select)(state, gradInput_n, gradInput, 0, elt);
+ THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ vol2col(
+ THCState_getCurrentStream(state),
+ THCTensor_(data)(state, gradOutput_n),
+ nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
+ 1,1,1,
+ THCTensor_(data)(state, gradColumns)
+ );
+
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = weight->size[0];
+ long n = gradColumns->size[1];
+ long k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 'n', 'n',
+ n, m, k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradColumns), n,
+ THCTensor_(data)(state, weight), k,
+ ScalarConvert<int, real>::to(0),
+ THCTensor_(data)(state, gradInput_n), n
+ );
+ }
+
+
+ // Free
+ THCTensor_(free)(state, gradInput_n);
+ THCTensor_(free)(state, gradOutput_n);
+
+ // Resize output
+ if (batch == 0) {
+ THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ THCTensor_(resize4d)(state, gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+}
+
+
+void THNN_(VolumetricFullConvolution_accGradParameters)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradWeight,
+ THCTensor *gradBias,
+ THCTensor *finput,
+ THCTensor *fgradInput,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int adjT, int adjW, int adjH,
+ real scale)
+{
+ THCTensor *columns = finput;
+ THCTensor *ones = fgradInput;
+
+ int nInputPlane = THCTensor_(size)(state, gradWeight, 0);
+ int nOutputPlane = THCTensor_(size)(state, gradWeight, 1);
+ const int kT = (int)gradWeight->size[2];
+ const int kH = (int)gradWeight->size[3];
+ const int kW = (int)gradWeight->size[4];
+
+ THCUNN_assertSameGPU_generic(state, 6, input, gradOutput, gradWeight,
+ gradBias, columns, ones);
+ THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2, "4D or 5D (batch mode) tensor is expected");
+
+ int batch = 1;
+ if (input->nDimension == 4) {
+ // Force batch
+ batch = 0;
+ THCTensor_(resize5d)(state, input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ THCTensor_(resize5d)(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ long inputWidth = input->size[4];
+ long inputHeight = input->size[3];
+ long inputDepth = input->size[2];
+ long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+ long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+ long outputDepth = (inputDepth - 1) * dT - 2*padT + kT + adjT;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Define a buffer of ones, for bias accumulation
+ if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THCTensor_(resize3d)(state, ones, outputDepth, outputHeight, outputWidth);
+ THCTensor_(fill)(state, ones, ScalarConvert<int, real>::to(1));
+ }
+
+ // Resize temporary columns
+ THCTensor_(resize2d)(state, columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
+
+ // Helpers
+ THCTensor *input_n = THCTensor_(new)(state);
+ THCTensor *gradOutput_n = THCTensor_(new)(state);
+
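+ // For the transposed convolution the roles of input and gradOutput swap
+ // relative to a regular convolution: vol2col unrolls gradOutput, and the GEMM
+ // accumulates scale * input_n * columns^T into gradWeight.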
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THCTensor_(select)(state, input_n, input, 0, elt);
+ THCTensor_(select)(state, gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ vol2col(
+ THCState_getCurrentStream(state),
+ THCTensor_(data)(state, gradOutput_n),
+ nOutputPlane, outputDepth, outputHeight, outputWidth, kT, kH, kW, padT, padH, padW, dT, dH, dW,
+ 1,1,1,
+ THCTensor_(data)(state, columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long n = columns->size[0]; // nOutputPlane * kT * kH * kW
+ long m = input_n->size[0]; // nInputPlane
+ long k = columns->size[1]; // inputDepth * inputHeight * inputWidth
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemm(
+ #elif defined(THC_REAL_IS_HALF)
+ THCudaBlas_Hgemm(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(
+ #endif
+ state,
+ 't', 'n',
+ n, m, k,
+ scale,
+ THCTensor_(data)(state, columns), k,
+ THCTensor_(data)(state, input_n), k,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradWeight), n
+ );
+
+ // Do Bias:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m_ = nOutputPlane;
+ long k_ = outputDepth * outputHeight * outputWidth;
+
+ // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+ #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ #ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemv(
+ #elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemv(
+ #endif
+ state,
+ 't',
+ k_, m_,
+ scale,
+ THCTensor_(data)(state, gradOutput_n), k_,
+ THCTensor_(data)(state, ones), 1,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradBias), 1
+ );
+ #endif
+ #ifdef THC_REAL_IS_HALF
+ THCudaBlas_Hgemm(
+ state,
+ 't', 'n',
+ m_, 1, k_,
+ scale,
+ THCTensor_(data)(state, gradOutput_n), k_,
+ THCTensor_(data)(state, ones), k_,
+ ScalarConvert<int, real>::to(1),
+ THCTensor_(data)(state, gradBias), m_
+ );
+ #endif
+ }
+
+ // Free
+ THCTensor_(free)(state, input_n);
+ THCTensor_(free)(state, gradOutput_n);
+
+ // Resize
+ if (batch == 0) {
+ THCTensor_(resize4d)(state, gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THCTensor_(resize4d)(state, input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+}
+
+#endif
diff --git a/lib/THCUNN/generic/VolumetricMaxPooling.cu b/lib/THCUNN/generic/VolumetricMaxPooling.cu
new file mode 100644
index 0000000..4a55a45
--- /dev/null
+++ b/lib/THCUNN/generic/VolumetricMaxPooling.cu
@@ -0,0 +1,36 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/VolumetricMaxPooling.cu"
+#else
+
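+// VolumetricMaxPooling is VolumetricDilatedMaxPooling with all dilations fixed
+// to 1; both entry points simply forward their arguments.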
+void THNN_(VolumetricMaxPooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCIndexTensor *indices,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ bool ceilMode)
+{
+ THNN_(VolumetricDilatedMaxPooling_updateOutput)(
+ state, input, output, indices,
+ kT, kW, kH, dT, dW, dH, padT, padW, padH, 1, 1, 1, ceilMode);
+
+}
+
+void THNN_(VolumetricMaxPooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCIndexTensor *indices,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH)
+{
+ THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
+ state, input, gradOutput, gradInput, indices,
+ dT, dW, dH, padT, padW, padH, 1, 1, 1);
+
+}
+
+#endif
diff --git a/lib/THCUNN/generic/VolumetricMaxUnpooling.cu b/lib/THCUNN/generic/VolumetricMaxUnpooling.cu
new file mode 100644
index 0000000..2a68094
--- /dev/null
+++ b/lib/THCUNN/generic/VolumetricMaxUnpooling.cu
@@ -0,0 +1,192 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/VolumetricMaxUnpooling.cu"
+#else
+
+void THNN_(VolumetricMaxUnpooling_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ THCIndexTensor *indices,
+ int outputTime, int outputWidth, int outputHeight,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH)
+{
+ int batchSize;
+ int inputSlices;
+ int inputTime;
+ int inputHeight;
+ int inputWidth;
+
+ THCUNN_assertSameGPU_generic(state, 3, input, indices, output);
+
+ if (THCTensor_(nDimension)(state, input) == 4)
+ {
+ /* sizes */
+ batchSize = 1;
+ inputSlices = THCTensor_(size)(state, input, 0);
+ inputTime = THCTensor_(size)(state, input, 1);
+ inputHeight = THCTensor_(size)(state, input, 2);
+ inputWidth = THCTensor_(size)(state, input, 3);
+ }
+ else if (THCTensor_(nDimension)(state, input) == 5)
+ {
+ /* sizes */
+ batchSize = THCTensor_(size)(state, input, 0);
+ inputSlices = THCTensor_(size)(state, input, 1);
+ inputTime = THCTensor_(size)(state, input, 2);
+ inputHeight = THCTensor_(size)(state, input, 3);
+ inputWidth = THCTensor_(size)(state, input, 4);
+ }
+ else
+ {
+ THArgCheck(false, 2, "4D or 5D tensor expected");
+ }
+
+ if (input->nDimension == 4) /* 4D */
+ {
+ /* resize output */
+ THCTensor_(resize4d)(state, output, inputSlices,
+ outputTime, outputHeight, outputWidth);
+ }
+ else
+ { /* 5D */
+ THCTensor_(resize5d)(state, output, batchSize, inputSlices,
+ outputTime, outputHeight, outputWidth);
+ }
+
+ input = THCTensor_(newContiguous)(state, input);
+ indices = THCIndexTensor_(newContiguous)(state, indices);
+ THCTensor_(zero)(state, output);
+
+ // Collapse batch and feature dimensions
+ THCDeviceTensor<real, 4> cudaInput;
+ THCDeviceTensor<real, 4> cudaOutput;
+ THCDeviceTensor<THCIndex_t, 4> cudaIndices;
+
+ if (THCTensor_(nDimension)(state, input) == 4)
+ {
+ cudaInput = toDeviceTensor<real, 4>(state, input);
+ cudaOutput = toDeviceTensor<real, 4>(state, output);
+ cudaIndices = toDeviceTensor<THCIndex_t, 4>(state, indices);
+ }
+ else
+ {
+ cudaInput = toDeviceTensor<real, 5>(state, input).downcastOuter<4>();
+ cudaOutput = toDeviceTensor<real, 5>(state, output).downcastOuter<4>();
+ cudaIndices = toDeviceTensor<THCIndex_t, 5>(state, indices).downcastOuter<4>();
+ }
+
+ int totalZ = inputTime * inputSlices * batchSize;
+ int offsetZ = 0;
+ dim3 block(32, 8);
+
+ while (totalZ > 0) {
+ dim3 grid(THCCeilDiv(inputWidth, static_cast<int>(block.x)),
+ THCCeilDiv(inputHeight, static_cast<int>(block.y)),
+ totalZ > 65535 ? 65535 : totalZ);
+
+ cuda_VolumetricMaxUnpooling_updateOutput<<<grid, block,
+ 0, THCState_getCurrentStream(state)>>>(
+ cudaInput, cudaIndices, cudaOutput,
+ dT, dH, dW,
+ padT, padH, padW, offsetZ);
+ THCudaCheck(cudaGetLastError());
+ totalZ -= 65535;
+ offsetZ += 65535;
+ }
+
+ THCTensor_(free)(state, input);
+ THCIndexTensor_(free)(state, indices);
+}
+
+void THNN_(VolumetricMaxUnpooling_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ THCIndexTensor *indices,
+ int outputTime, int outputWidth, int outputHeight,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH)
+{
+
+ int batchSize;
+ int inputSlices;
+ int inputTime;
+ int inputHeight;
+ int inputWidth;
+
+ THCUNN_assertSameGPU_generic(state, 4, input, indices, gradOutput, gradInput);
+
+ if (THCTensor_(nDimension)(state, input) == 4) /* 4D */
+ {
+ batchSize = 1;
+ inputSlices = THCTensor_(size)(state, input, 0);
+ inputTime = THCTensor_(size)(state, input, 1);
+ inputHeight = THCTensor_(size)(state, input, 2);
+ inputWidth = THCTensor_(size)(state, input, 3);
+ }
+ else
+ {
+ batchSize = THCTensor_(size)(state, input, 0);
+ inputSlices = THCTensor_(size)(state, input, 1);
+ inputTime = THCTensor_(size)(state, input, 2);
+ inputHeight = THCTensor_(size)(state, input, 3);
+ inputWidth = THCTensor_(size)(state, input, 4);
+ }
+
+ input = THCTensor_(newContiguous)(state, input);
+ indices = THCIndexTensor_(newContiguous)(state, indices);
+ gradOutput = THCTensor_(newContiguous)(state, gradOutput);
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THCTensor_(zero)(state, gradInput);
+
+ // Collapse batch and feature dimensions
+ THCDeviceTensor<real, 4> cudaGradInput;
+ THCDeviceTensor<real, 4> cudaGradOutput;
+ THCDeviceTensor<THCIndex_t, 4> cudaIndices;
+
+ if (THCTensor_(nDimension)(state, input) == 4)
+ {
+ cudaGradInput = toDeviceTensor<real, 4>(state, gradInput);
+ cudaGradOutput = toDeviceTensor<real, 4>(state, gradOutput);
+ cudaIndices = toDeviceTensor<THCIndex_t, 4>(state, indices);
+ }
+ else
+ {
+ cudaGradInput =
+ toDeviceTensor<real, 5>(state, gradInput).downcastOuter<4>();
+ cudaGradOutput =
+ toDeviceTensor<real, 5>(state, gradOutput).downcastOuter<4>();
+ cudaIndices =
+ toDeviceTensor<THCIndex_t, 5>(state, indices).downcastOuter<4>();
+ }
+
+ int totalZ = inputTime * inputSlices * batchSize;
+ int offsetZ = 0;
+ dim3 block(32, 8);
+
+ while (totalZ > 0) {
+ dim3 grid(THCCeilDiv(inputWidth, static_cast<int>(block.x)),
+ THCCeilDiv(inputHeight, static_cast<int>(block.y)),
+ totalZ > 65535 ? 65535 : totalZ);
+
+ cuda_VolumetricMaxUnpooling_updateGradInput<<<grid, block,
+ 0, THCState_getCurrentStream(state)>>>(
+ cudaGradOutput,
+ cudaIndices,
+ cudaGradInput,
+ dT, dH, dW,
+ padT, padH, padW, offsetZ);
+ THCudaCheck(cudaGetLastError());
+ totalZ -= 65535;
+ offsetZ += 65535;
+ }
+
+ // cleanup
+ THCTensor_(free)(state, input);
+ THCTensor_(free)(state, gradOutput);
+ THCIndexTensor_(free)(state, indices);
+}
+
+#endif
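The while (totalZ > 0) launch loops above exist because gridDim.z is capped at 65535: the collapsed (time x slices x batch) dimension is processed in slabs of at most 65535 z-slices, and offsetZ tells each launch where its slab starts. A host-side sketch of the same bookkeeping, illustrative only and with hypothetical names:

    #include <algorithm>
    #include <cstdio>

    // Enumerate the (grid.z, offsetZ) pairs the slab loop would launch with.
    void enumerateSlabLaunches(int totalZ) {
      const int kMaxGridZ = 65535;   // hardware limit on gridDim.z
      int offsetZ = 0;
      while (totalZ > 0) {
        int gridZ = std::min(totalZ, kMaxGridZ);
        std::printf("launch: grid.z = %d, offsetZ = %d\n", gridZ, offsetZ);
        totalZ -= kMaxGridZ;         // may go negative, which terminates the loop
        offsetZ += kMaxGridZ;
      }
    }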
diff --git a/lib/THCUNN/generic/VolumetricReplicationPadding.cu b/lib/THCUNN/generic/VolumetricReplicationPadding.cu
new file mode 100644
index 0000000..60bf36c
--- /dev/null
+++ b/lib/THCUNN/generic/VolumetricReplicationPadding.cu
@@ -0,0 +1,120 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/VolumetricReplicationPadding.cu"
+#else
+
+void THNN_(VolumetricReplicationPadding_updateOutput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *output,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback) {
+ THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2,
+ "input tensor must fit into 32-bit index math");
+
+ int planeDim = 0;
+ int dimd = 1;
+ int dimh = 2;
+ int dimw = 3;
+ int numBatch = 1;
+
+ int numInputDims = THCTensor_(nDimension)(state, input);
+ THArgCheck(numInputDims == 4 || numInputDims == 5, 2,
+ "input must be 4 or 5-dimensional");
+
+ if (numInputDims == 5) {
+ numBatch = THCTensor_(size)(state, input, 0);
+ planeDim++;
+ dimd++;
+ dimh++;
+ dimw++;
+ }
+
+ int numPlanes = THCTensor_(size)(state, input, planeDim);
+ int inputD = THCTensor_(size)(state, input, dimd);
+ int inputH = THCTensor_(size)(state, input, dimh);
+ int inputW = THCTensor_(size)(state, input, dimw);
+ int outputD = inputD + pfront + pback;
+ int outputH = inputH + ptop + pbottom;
+ int outputW = inputW + pleft + pright;
+
+ THCDeviceTensor<real, 5> devInput;
+ THCDeviceTensor<real, 5> devOutput;
+
+ if (numInputDims == 4) {
+ THCTensor_(resize4d)(state, output, numPlanes, outputD, outputH, outputW);
+
+ devInput = toDeviceTensor<real, 4>(state, input).upcastOuter<5>();
+ devOutput = toDeviceTensor<real, 4>(state, output).upcastOuter<5>();
+ } else {
+ THCTensor_(resize5d)(state, output, numBatch, numPlanes, outputD, outputH,
+ outputW);
+
+ devInput = toDeviceTensor<real, 5>(state, input);
+ devOutput = toDeviceTensor<real, 5>(state, output);
+ }
+
+ int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3) *
+ devOutput.getSize(4);
+ dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
+ devOutput.getSize(1),
+ devOutput.getSize(0));
+ dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
+
+ VolumetricReplicationPadding_updateOutput<real><<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
+ devInput, devOutput, pfront, pback, ptop, pbottom, pleft, pright);
+}
+
+void THNN_(VolumetricReplicationPadding_updateGradInput)(
+ THCState *state,
+ THCTensor *input,
+ THCTensor *gradOutput,
+ THCTensor *gradInput,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback) {
+ THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2,
+ "input tensor must fit into 32-bit index math");
+ THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, gradOutput),
+ 3, "output gradient tensor must fit into 32-bit index math");
+
+ int planeDim = 0;
+ int dimd = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ int numInputDims = THCTensor_(nDimension)(state, input);
+ if (numInputDims == 5) {
+ planeDim++;
+ dimd++;
+ dimh++;
+ dimw++;
+ }
+
+ THCTensor_(resizeAs)(state, gradInput, input);
+ THCTensor_(zero)(state, gradInput);
+
+ THCDeviceTensor<real, 5> devGradInput;
+ THCDeviceTensor<real, 5> devGradOutput;
+
+ if (numInputDims == 4) {
+ devGradInput = toDeviceTensor<real, 4>(state, gradInput).upcastOuter<5>();
+ devGradOutput =
+ toDeviceTensor<real, 4>(state, gradOutput).upcastOuter<5>();
+ } else {
+ devGradInput = toDeviceTensor<real, 5>(state, gradInput);
+ devGradOutput = toDeviceTensor<real, 5>(state, gradOutput);
+ }
+
+ int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3) *
+ devGradOutput.getSize(4);
+ dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
+ devGradOutput.getSize(1),
+ devGradOutput.getSize(0));
+ dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
+
+ VolumetricReplicationPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
+ devGradInput, devGradOutput, pfront, pback, ptop, pbottom, pleft, pright);
+}
+
+#endif
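The padding kernels above assign one thread per element of an output plane, capping the block at 256 threads and letting grid.x cover the remaining elements while grid.y indexes planes and grid.z the batch. A small sketch of that geometry, using a hypothetical helper rather than anything from the patch:

    // Mirror of the gridSize/blockSize computation used by the padding kernels.
    struct LaunchConfig { unsigned gridX, gridY, gridZ, blockX; };

    LaunchConfig paddingLaunchConfig(int outputPlaneSize, int numPlanes, int numBatch) {
      LaunchConfig c;
      c.gridX  = (outputPlaneSize + 256 - 1) / 256;             // THCCeilDiv(outputPlaneSize, 256)
      c.gridY  = numPlanes;                                      // devOutput.getSize(1)
      c.gridZ  = numBatch;                                       // devOutput.getSize(0)
      c.blockX = outputPlaneSize > 256 ? 256 : outputPlaneSize;
      return c;
    }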
diff --git a/lib/THCUNN/im2col.h b/lib/THCUNN/im2col.h
index a81dbd2..ba57263 100644
--- a/lib/THCUNN/im2col.h
+++ b/lib/THCUNN/im2col.h
@@ -2,6 +2,7 @@
#define THCUNN_IM2COL_H
#include "common.h"
+#include "THCNumerics.cuh"
// Kernel for fast unfold+copy
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
@@ -29,7 +30,7 @@ __global__ void im2col_kernel(const int n, const Dtype* data_im,
int h = h_in + i * dilation_h;
int w = w_in + j * dilation_w;
*data_col = (h >= 0 && w >= 0 && h < height && w < width) ?
- data_im[i * dilation_h * width + j * dilation_w] : 0;
+ data_im[i * dilation_h * width + j * dilation_w] : ScalarConvert<int, Dtype>::to(0);
data_col += height_col * width_col;
}
}
@@ -59,7 +60,7 @@ void im2col(cudaStream_t stream, const Dtype* data_im, const int channels,
THCudaCheck(cudaGetLastError());
}
-template <typename Dtype>
+template <typename Dtype, typename Acctype>
__global__ void col2im_kernel(const int n, const Dtype* data_col,
const int height, const int width, const int channels,
const int kernel_h, const int kernel_w,
@@ -69,7 +70,7 @@ __global__ void col2im_kernel(const int n, const Dtype* data_col,
const int height_col, const int width_col,
Dtype* data_im) {
CUDA_KERNEL_LOOP(index, n) {
- Dtype val = 0;
+ Acctype val = Acctype(0);
const int w_im = index % width + pad_w;
const int h_im = (index / width) % height + pad_h;
const int c_im = index / (width * height);
@@ -96,11 +97,11 @@ __global__ void col2im_kernel(const int n, const Dtype* data_col,
}
}
}
- data_im[index] = val;
+ data_im[index] = ScalarConvert<Acctype, Dtype>::to(val);
}
}
-template <typename Dtype>
+template <typename Dtype, typename Acctype>
void col2im(cudaStream_t stream, const Dtype* data_col, const int channels,
const int height, const int width,
const int patch_h, const int patch_w, const int pad_h,
@@ -113,7 +114,7 @@ void col2im(cudaStream_t stream, const Dtype* data_col, const int channels,
int num_kernels = channels * height * width;
// To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions.
- col2im_kernel <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>> (
+ col2im_kernel<Dtype, Acctype> <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>> (
num_kernels, data_col, height, width, channels,
patch_h, patch_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w,
diff --git a/lib/THCUNN/vol2col.h b/lib/THCUNN/vol2col.h
index add88e1..15b110e 100644
--- a/lib/THCUNN/vol2col.h
+++ b/lib/THCUNN/vol2col.h
@@ -2,6 +2,7 @@
#define THCUNN_VOL2COL_H
#include "common.h"
+#include "THCNumerics.cuh"
// Kernel for fast unfold+copy on volumes
template <typename Dtype>
@@ -33,7 +34,7 @@ CUDA_KERNEL_LOOP(index, n) {
int h = h_in + j * dilation_h;
int w = w_in + k * dilation_w;
*data_col = (t >= 0 && h >= 0 && w >= 0 && t < depth && h < height && w < width) ?
- data_vol[i * dilation_t * height * width + j * dilation_h * width + k * dilation_w] : 0;
+ data_vol[i * dilation_t * height * width + j * dilation_h * width + k * dilation_w] : ScalarConvert<int, Dtype>::to(0);
data_col += depth_col * height_col * width_col;
}
}
@@ -65,7 +66,7 @@ void vol2col(cudaStream_t stream, const Dtype* data_vol, const int channels,
THCudaCheck(cudaGetLastError());
}
-template <typename Dtype>
+template <typename Dtype, typename Acctype>
__global__ void vol2im_kernel(const int n, const Dtype* data_col,
const int depth, const int height, const int width, const int channels,
const int kernel_t, const int kernel_h, const int kernel_w,
@@ -75,7 +76,7 @@ __global__ void vol2im_kernel(const int n, const Dtype* data_col,
const int depth_col, const int height_col, const int width_col,
Dtype* data_vol) {
CUDA_KERNEL_LOOP(index, n) {
- Dtype val = 0;
+ Acctype val = Acctype(0);
const int w_im = index % width + pad_w;
const int h_im = (index / width) % height + pad_h;
const int t_im = (index / width / height) % depth + pad_t;
@@ -112,11 +113,11 @@ __global__ void vol2im_kernel(const int n, const Dtype* data_col,
}
}
}
- data_vol[index] = val;
+ data_vol[index] = ScalarConvert<Acctype, Dtype>::to(val);
}
}
-template <typename Dtype>
+template <typename Dtype, typename Acctype>
void col2vol(cudaStream_t stream, const Dtype* data_col, const int channels,
const int depth, const int height, const int width,
const int patch_t, const int patch_h, const int patch_w,
@@ -130,7 +131,7 @@ void col2vol(cudaStream_t stream, const Dtype* data_col, const int channels,
int num_kernels = channels * depth * height * width;
// To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions.
- vol2im_kernel <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>> (
+ vol2im_kernel<Dtype, Acctype> <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>> (
num_kernels, data_col, depth, height, width, channels,
patch_t, patch_h, patch_w, pad_t, pad_h, pad_w, stride_t, stride_h, stride_w,
dilation_t, dilation_h, dilation_w,
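The col2im_kernel and vol2im_kernel changes above thread a second template parameter, Acctype, through the kernels so that half tensors can accumulate their per-pixel sums in a wider type and convert back only once; summing many half terms directly would compound rounding error. A standalone sketch of the idea (illustrative only, e.g. Dtype = float with Acctype = double on the host, or half with float on the device):

    // Accumulate in a wider type, then narrow once, as ScalarConvert<Acctype, Dtype>::to does.
    template <typename Dtype, typename Acctype>
    Dtype widenedSum(const Dtype* vals, int n) {
      Acctype acc = Acctype(0);
      for (int i = 0; i < n; ++i)
        acc += Acctype(vals[i]);         // widen each term before adding
      return static_cast<Dtype>(acc);    // single narrowing conversion at the end
    }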
diff --git a/test.lua b/test.lua
index 48beced..7cb2854 100644
--- a/test.lua
+++ b/test.lua
@@ -1,4 +1,5 @@
local cunntest = torch.TestSuite()
+local ffi = require 'ffi'
local precision_forward = 1e-4
local precision_backward = 1e-2
local nloop = 1
@@ -6,134 +7,183 @@ local times = {}
--e.g.: th -lcunn -e "nn.testcuda{'Sigmoid_forward'}"
-local function pointwise_forward(proto_module, name, max_error)
- local size = math.random(1,100)
+local typenames = {
+ 'torch.CudaTensor',
+ 'torch.CudaDoubleTensor',
+}
- local tm = {}
- local title = string.format(name..'.forward %d -> %d', size, size)
- times[title] = tm
+local t2cpu = {
+ ['torch.CudaTensor'] = 'torch.FloatTensor',
+ ['torch.CudaDoubleTensor'] = 'torch.DoubleTensor',
- local input = torch.randn(size)
- if name == 'Sqrt' then input:abs() end
- local sconv = proto_module
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
+}
+
+local function checkHalf()
+ if cutorch.hasHalf then
+ table.insert(typenames, 'torch.CudaHalfTensor')
+ t2cpu['torch.CudaHalfTensor'] = 'torch.FloatTensor'
end
- tm.cpu = a:time().real
+end
- input = input:cuda()
- local gconv = proto_module:clone():cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
+-- workarounds for non-existent functions
+function torch.CudaHalfTensor:mean()
+ return self:cuda():mean()
+end
+
+function torch.CudaDoubleTensor:mean()
+ return self:cuda():mean()
+end
+
+local function half_max_error(maxabs)
+ -- arbitrarily double the precision limit
+ return 2 * ((maxabs and (2^(math.floor(math.log(maxabs) / math.log(2)))) * (2^(-10))) or 0)
+end
+
+-- half has additional error on top of double/float
+local function precision_forward_type(precision_f, tensor_type, maxabs)
+ if (tensor_type == 'torch.CudaHalfTensor') then
+ return 1e-2 + precision_f + half_max_error(maxabs)
+ else
+ return precision_f
end
- cutorch.synchronize()
- tm.gpu = a:time().real
+end
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), max_error, 'error on state (forward) ')
+local function precision_backward_type(precision_b, tensor_type, maxabs)
+ if (tensor_type == 'torch.CudaHalfTensor') then
+ return 1e-1 + precision_b + half_max_error(maxabs)
+ else
+ return precision_b
+ end
+end
+
+local function precision_backward_conv_weightbias(precision_b, tensor_type, maxabs)
+ if (tensor_type == 'torch.CudaHalfTensor') then
+ -- cudnn uses 8 here
+ return 2 + precision_b + half_max_error(maxabs)
+ else
+ return precision_b
+ end
+end
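The helpers above widen the test tolerances for torch.CudaHalfTensor: half carries roughly 10 mantissa bits, so a value of magnitude maxabs has a unit-in-last-place of about 2^floor(log2(maxabs)) * 2^-10, and half_max_error doubles that before the forward/backward helpers add their fixed 1e-2 / 1e-1 slack. A C++ transcription of the bound, for illustration only (the Lua version returns 0 when maxabs is nil):

    #include <cmath>

    double halfMaxError(double maxabs) {
      if (maxabs <= 0.0) return 0.0;   // guard stands in for the Lua "or 0" branch
      return 2.0 * std::exp2(std::floor(std::log2(maxabs))) * std::exp2(-10.0);
    }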
+
+local function pointwise_forward(proto_module, name, max_error)
+ local size = math.random(1,100)
+
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(size):type(typename)
+ local ctype = t2cpu[typename]
+ local input = input:type(ctype)
+ if name == 'Sqrt' then input:abs() end
+ local sconv = proto_module:type(ctype)
+ local groundtruth = sconv:forward(input)
+
+ input = input:type(typename)
+ local gconv = proto_module:clone():type(typename)
+ local rescuda = gconv:forward(input)
+
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(max_error, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
local function pointwise_backward(proto_module, name, max_error)
local size = math.random(1,100)
- local tm = {}
- local title = string.format(name..'.backward %d -> %d', size, size)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(size):type(typename)
+ local gradOutput = torch.randn(size):type(typename)
- local input = torch.randn(size)
- if name == 'Sqrt' then input:abs() end
- local gradOutput = torch.randn(size)
- local sconv = proto_module
- sconv:forward(input)
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
- groundgrad = sconv:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ if name == 'Sqrt' then input:abs() end
+ local sconv = proto_module:type(ctype)
+ sconv:forward(input)
+ local groundgrad = sconv:backward(input, gradOutput)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = proto_module:clone():cuda()
- gconv:forward(input)
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = proto_module:clone():type(typename)
+ gconv:forward(input)
+ local rescuda = gconv:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), max_error, 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(max_error, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
local function pointwise_backward_inplace(proto_module, name)
local size = math.random(1,100)
- local tm = {}
- local title = string.format(name..'.backward_inplace %d -> %d', size, size)
- times[title] = tm
-
- local input = torch.randn(size)
- if name == 'Sqrt' then input:abs() end
- local gradOutput = torch.randn(size)
- local sconv = proto_module
- local groundgrad = sconv:backward(input, gradOutput)
- mytester:assertTensorEq(groundgrad:float(),
- gradOutput:float(),
- 0.000001, "inplace not respected")
-
- local input = torch.randn(size):cuda()
- if name == 'Sqrt' then input:abs() end
- local gradOutput = torch.randn(size):cuda()
- local sconv = proto_module:clone():cuda()
- local groundgrad = sconv:backward(input, gradOutput)
- mytester:assertTensorEq(groundgrad:float(),
- gradOutput:float(),
- 0.000001, "cuda inplace not respected")
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(size):type(typename)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ if name == 'Sqrt' then input:abs() end
+ local gradOutput = torch.randn(size)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = proto_module:type(ctype)
+ local groundgrad = sconv:backward(input, gradOutput)
+ mytester:assertTensorEq(groundgrad:double(),
+ gradOutput:double(),
+ 0.000001,
+ string.format("inplace not respected for %s", ctype))
+
+ input = torch.randn(size)
+ input = input:type(typename)
+ if name == 'Sqrt' then input:abs() end
+ gradOutput = torch.randn(size)
+ gradOutput = gradOutput:type(typename)
+ local sconv = proto_module:clone():type(typename)
+ local groundgrad = sconv:backward(input, gradOutput)
+ mytester:assertTensorEq(groundgrad:double(),
+ gradOutput:double(),
+ 0.000001,
+ string.format("cuda inplace not respected for %s", typename))
+ end
end
local function pointwise_transposed(proto_module, name, max_error)
max_error = max_error or 1e-7
- local tm = {}
- local title = name .. '.transposed'
- times[title] = tm
- local input = torch.Tensor(11, 19):uniform(-1, 1)
- if name == 'Sqrt' then
- input:uniform(0.1, 1)
- end
- local inputCUDA = input:clone():cuda()
+ for k, typename in ipairs(typenames) do
+ local ctype = t2cpu[typename]
+ local input = torch.Tensor(11, 19):uniform(-1, 1):type(typename)
+ input = input:type(ctype)
+ local proto_module = proto_module:type(ctype)
+ if name == 'Sqrt' then
+ input:uniform(0.1, 1)
+ end
+ local inputCUDA = input:clone():type(typename)
- local cuda_module = proto_module:clone():cuda()
+ local cuda_module = proto_module:clone():type(typename)
- -- transpose the inputs and DON'T make contiguous
- input = input:transpose(1, 2)
- inputCUDA = inputCUDA:transpose(1, 2)
+ -- transpose the inputs and DON'T make contiguous
+ input = input:transpose(1, 2)
+ inputCUDA = inputCUDA:transpose(1, 2)
- local output = proto_module:forward(input)
- local outputCUDA = cuda_module:forward(inputCUDA)
+ local output = proto_module:forward(input)
+ local outputCUDA = cuda_module:forward(inputCUDA)
- local error = outputCUDA:float() - output
- mytester:assertlt(error:abs():max(), max_error, 'error on state (forward) ')
+ local error = outputCUDA:double() - output:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(max_error, typename),
+ string.format('error on state (forward) for %s', typename))
- local gradOutput = torch.Tensor(11, 19):uniform(-1, 1)
- local gradOutputCUDA = gradOutput:clone():cuda()
+ local gradOutput = torch.Tensor(11, 19):uniform(-1, 1):type(ctype)
+ local gradOutputCUDA = gradOutput:clone():type(typename)
- gradOutput = gradOutput:transpose(1, 2)
- gradOutputCUDA = gradOutputCUDA:transpose(1, 2)
+ gradOutput = gradOutput:transpose(1, 2)
+ gradOutputCUDA = gradOutputCUDA:transpose(1, 2)
- local gradInput = proto_module:backward(input, gradOutput)
- local gradInputCUDA = cuda_module:backward(inputCUDA, gradOutputCUDA)
+ local gradInput = proto_module:backward(input, gradOutput)
+ local gradInputCUDA = cuda_module:backward(inputCUDA, gradOutputCUDA)
- local error = gradInputCUDA:float() - gradInput
- mytester:assertlt(error:abs():max(), max_error, 'error on state (backward) ')
+ local error = gradInputCUDA:double() - gradInput:double()
+ mytester:assertlt(error:abs():max(), precision_backward_type(max_error, typename),
+ string.format('error on state (backward) for %s', typename))
+ end
end
function cunntest.Tanh_forward()
@@ -253,29 +303,32 @@ end
function cunntest.Sqrt_zero()
local size = math.random(1, 100)
- -- Test zero inputs; we will avoid a div-by-zero by setting to zero
- local module_gpu = nn.Sqrt():cuda()
- local input_gpu = torch.CudaTensor(size, size):zero()
- module_gpu:forward(input_gpu)
+ for k, typename in ipairs(typenames) do
+ -- Test zero inputs; we will avoid a div-by-zero by setting to zero
+ local module_gpu = nn.Sqrt():type(typename)
+ local input_gpu = torch.CudaTensor(size, size):zero():type(typename)
+ module_gpu:forward(input_gpu)
- local gradOutput_gpu = torch.CudaTensor(size, size):fill(1)
- local gradInput_gpu = module_gpu:backward(input_gpu, gradOutput_gpu)
+ local gradOutput_gpu = torch.CudaTensor(size, size):fill(1):type(typename)
+ local gradInput_gpu = module_gpu:backward(input_gpu, gradOutput_gpu)
- mytester:assertTensorEq(gradInput_gpu:float(),
- torch.FloatTensor(size, size):zero(),
- 0.000001, "error in sqrt backward singularity")
+ mytester:assertTensorEq(gradInput_gpu:double(),
+ torch.DoubleTensor(size, size):zero(),
+ 0.000001, "error in sqrt backward singularity")
- -- Verify CPU and GPU zero behavior equivalency
- local module_cpu = nn.Sqrt()
- local input_cpu = input_gpu:float()
- module_cpu:forward(input_cpu)
+ -- Verify CPU and GPU zero behavior equivalency
+ local ctype = t2cpu[typename]
+ local module_cpu = nn.Sqrt():type(ctype)
+ local input_cpu = input_gpu:type(ctype)
+ module_cpu:forward(input_cpu)
- local gradOutput_cpu = gradOutput_gpu:float()
- local gradInput_cpu = module_cpu:backward(input_cpu, gradOutput_cpu)
+ local gradOutput_cpu = gradOutput_gpu:type(ctype)
+ local gradInput_cpu = module_cpu:backward(input_cpu, gradOutput_cpu)
- mytester:assertTensorEq(gradInput_gpu:float(),
- gradInput_cpu:float(),
- 0.000001, "Sqrt_zero CPU and GPU not equivalent")
+ mytester:assertTensorEq(gradInput_gpu:double(),
+ gradInput_cpu:double(),
+ 0.000001, "Sqrt_zero CPU and GPU not equivalent")
+ end
end
function cunntest.Sqrt_transposed()
@@ -295,15 +348,18 @@ function cunntest.Square_transposed()
end
function cunntest.SoftShrink_forward()
- pointwise_forward(nn.SoftShrink(math.random()), 'SoftShrink', precision_forward)
+ local r = ffi.C.THC_half2float(ffi.C.THC_float2half(math.random()))
+ pointwise_forward(nn.SoftShrink(r), 'SoftShrink', precision_forward)
end
function cunntest.SoftShrink_backward()
- pointwise_backward(nn.SoftShrink(math.random()), 'SoftShrink', precision_backward)
+ local r = ffi.C.THC_half2float(ffi.C.THC_float2half(math.random()))
+ pointwise_backward(nn.SoftShrink(r), 'SoftShrink', precision_backward)
end
function cunntest.SoftShrink_transposed()
- pointwise_transposed(nn.SoftShrink(math.random()), 'SoftShrink', precision_backward)
+ local r = ffi.C.THC_half2float(ffi.C.THC_float2half(math.random()))
+ pointwise_transposed(nn.SoftShrink(r), 'SoftShrink', precision_backward)
end
function cunntest.ELU_forward()
@@ -363,67 +419,48 @@ function cunntest.LogSoftMax_forward_batch()
local size = math.random(1,256)
local bs = math.random(32,256)
- local tm = {}
- local title = string.format('LogSoftMax forward batch %d x %d -> %d x %d', bs, size, bs, size)
- times[title] = tm
-
- local input = torch.randn(bs, size)
- local sconv = nn.LogSoftMax()
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs, size):type(typename)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.LogSoftMax():type(ctype)
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = nn.LogSoftMax():cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.LogSoftMax():type(typename)
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward*10, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward*10, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.LogSoftMax_backward_batch()
local size = math.random(1,256)
local bs = math.random(32,256)
- local tm = {}
- local title = string.format('LogSoftMax.backward batch %d x %d -> %d x %d', bs, size, bs, size)
- times[title] = tm
-
- local input = torch.randn(bs, size)
- local gradOutput = torch.randn(bs, size)
- local sconv = nn.LogSoftMax()
- sconv:forward(input)
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
- groundgrad = sconv:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs, size):type(typename)
+ local gradOutput = torch.randn(bs, size):type(typename)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.LogSoftMax():type(ctype)
+ sconv:forward(input)
+ local groundgrad = sconv:backward(input, gradOutput)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = sconv:clone():cuda()
- gconv:forward(input)
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = sconv:clone():type(typename)
+ gconv:forward(input)
+ local rescuda = gconv:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.SpatialLogSoftMax_forward()
@@ -431,31 +468,22 @@ function cunntest.SpatialLogSoftMax_forward()
local ini = math.random(8,32)
local inj = math.random(8,32)
- local tm = {}
- local title = string.format('SpatialLogSoftMax forward %d x %d x %d', size, inj, ini)
- times[title] = tm
-
- local input = torch.randn(size, inj, ini)
- local sconv = nn.SpatialLogSoftMax()
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(size, inj, ini):type(typename)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialLogSoftMax():type(ctype)
+ local groundtruth = sconv:forward(input):type(ctype)
- input = input:cuda()
- local gconv = nn.SpatialLogSoftMax():cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.SpatialLogSoftMax():type(typename)
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward*25, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(),
+ precision_forward_type(precision_forward*25, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.SpatialLogSoftMax_backward()
@@ -463,36 +491,27 @@ function cunntest.SpatialLogSoftMax_backward()
local ini = math.random(8,32)
local inj = math.random(8,32)
- local tm = {}
- local title = string.format('SpatialLogSoftMax.backward %d x %d x %d', size, inj, ini)
- times[title] = tm
-
- local input = torch.randn(size, inj, ini)
- local gradOutput = torch.randn(size, inj, ini)
- local sconv = nn.SpatialLogSoftMax()
- sconv:forward(input)
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
- groundgrad = sconv:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(size, inj, ini):type(typename)
+ local gradOutput = torch.randn(size, inj, ini):type(typename)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SpatialLogSoftMax():type(ctype)
+ sconv:forward(input)
+ local groundgrad = sconv:backward(input, gradOutput)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = sconv:clone():cuda()
- gconv:forward(input)
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = sconv:clone():type(typename)
+ gconv:forward(input)
+ local rescuda = gconv:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.SpatialLogSoftMax_forward_batch()
@@ -501,31 +520,22 @@ function cunntest.SpatialLogSoftMax_forward_batch()
local ini = math.random(8,32)
local inj = math.random(8,32)
- local tm = {}
- local title = string.format('SpatialLogSoftMax forward batch %d x %d x %d x %d', bs, size, inj, ini)
- times[title] = tm
-
- local input = torch.randn(bs, size, inj, ini)
- local sconv = nn.SpatialLogSoftMax()
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs, size, inj, ini):type(typename)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialLogSoftMax():type(ctype)
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = nn.SpatialLogSoftMax():cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.SpatialLogSoftMax():type(typename)
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward*25, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(),
+ precision_forward_type(precision_forward*25, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.SpatialLogSoftMax_backward_batch()
@@ -534,36 +544,27 @@ function cunntest.SpatialLogSoftMax_backward_batch()
local ini = math.random(8,32)
local inj = math.random(8,32)
- local tm = {}
- local title = string.format('SpatialLogSoftMax.backward batch %d x %d x %d x %d', bs, size, inj, ini)
- times[title] = tm
-
- local input = torch.randn(bs, size, inj, ini)
- local gradOutput = torch.randn(bs, size, inj, ini)
- local sconv = nn.SpatialLogSoftMax()
- sconv:forward(input)
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
- groundgrad = sconv:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs, size, inj, ini):type(typename)
+ local gradOutput = torch.randn(bs, size, inj, ini):type(typename)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SpatialLogSoftMax():type(ctype)
+ sconv:forward(input)
+ local groundgrad = sconv:backward(input, gradOutput)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = sconv:clone():cuda()
- gconv:forward(input)
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = sconv:clone():type(typename)
+ gconv:forward(input)
+ local rescuda = gconv:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
@@ -732,44 +733,35 @@ function cunntest.SparseLinear_forward()
local ini = math.random(50,100)
local inj = math.random(5,10)
- local module = nn.SparseLinear(ini,inj)
- local sslin = module
- local gslin = module:clone():cuda()
-
- -- Create a random sparse vector
- local input = {}
- for i=1,inb do
- local nnz = math.random(5, 10)
- local inds = torch.randperm(ini)[{{1,nnz}}]
- input[i] = torch.Tensor(nnz, 2)
- input[i]:select(2,1):copy(inds)
- input[i]:select(2,2):copy(torch.rand(nnz))
- end
+ for k, typename in ipairs(typenames) do
+ if typename ~= "torch.CudaHalfTensor" then
+ local ctype = t2cpu[typename]
+ local module = nn.SparseLinear(ini,inj):type(ctype)
+ local sslin = module
+ local gslin = module:clone():type(typename)
+
+ -- Create a random sparse vector
+ local input = {}
+ for i=1,inb do
+ local nnz = math.random(5, 10)
+ local inds = torch.randperm(ini)[{{1,nnz}}]
+ input[i] = torch.Tensor(nnz, 2):type(ctype)
+ input[i]:select(2,1):copy(inds)
+ input[i]:select(2,2):copy(torch.rand(nnz):type(typename):type(ctype))
+ end
- local tm = {}
- local title = string.format('SparseLinear forward %d -> %d', ini, inj)
- times[title] = tm
+ local groundtruth = sslin:forward(input)
+ sslin:zeroGradParameters()
- local groundtruth = sslin:forward(input)
- sslin:zeroGradParameters()
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sslin:forward(input)
- end
- tm.cpu = a:time().real
-
- for i,v in ipairs(input) do input[i] = input[i]:cuda() end
- local rescuda = gslin:forward(input)
- gslin:zeroGradParameters()
- a:reset()
- for i = 1,nloop do
- rescuda = gslin:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ for i,v in ipairs(input) do input[i] = input[i]:type(typename) end
+ local rescuda = gslin:forward(input)
+ gslin:zeroGradParameters()
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
+ end
end
function cunntest.SparseLinear_backward()
@@ -777,201 +769,184 @@ function cunntest.SparseLinear_backward()
local ini = math.random(50,100)
local inj = math.random(5,10)
- local gslin = nn.SparseLinear(ini,inj):cuda()
- local sslin = nn.Linear(ini,inj)
- gslin.weight = sslin.weight:clone():cuda()
- gslin.bias = sslin.bias:clone():cuda()
-
- -- Create a random sparse vector
- local input = {}
- local nonsparse = torch.zeros(inb, ini)
- for i=1,inb do
- local nnz = math.random(3, 5)
- local inds = torch.randperm(ini)[{{1,nnz}}]
- input[i] = torch.Tensor(nnz, 2)
- input[i]:select(2,1):copy(inds)
- input[i]:select(2,2):copy(torch.rand(nnz))
- nonsparse[i]:scatter(1, input[i]:select(2,1):long(), input[i]:select(2,2))
- end
+ for k, typename in ipairs(typenames) do
+ if typename ~= "torch.CudaHalfTensor" then
+ local ctype = t2cpu[typename]
+ local gslin = nn.SparseLinear(ini,inj):type(typename)
+ local sslin = nn.Linear(ini,inj):type(ctype)
+ gslin.weight = sslin.weight:clone():type(typename)
+ gslin.bias = sslin.bias:clone():type(typename)
+
+ -- Create a random sparse vector
+ local input = {}
+ local nonsparse = torch.zeros(inb, ini):type(ctype)
+ for i=1,inb do
+ local nnz = math.random(3, 5)
+ local inds = torch.randperm(ini)[{{1,nnz}}]
+ input[i] = torch.Tensor(nnz, 2):type(ctype)
+ input[i]:select(2,1):copy(inds)
+ input[i]:select(2,2):copy(torch.rand(nnz):type(typename):type(ctype))
+ nonsparse[i]:scatter(1, input[i]:select(2,1):long(), input[i]:select(2,2))
+ end
- local tm = {}
- local title = string.format('SparseLinear backward %d <- %d', ini, inj)
- times[title] = tm
-
- local gradOutput = torch.randn(inb, inj)
- sslin:forward(nonsparse)
- local groundgrad = sslin:backward(nonsparse, gradOutput)
- sslin:zeroGradParameters()
- local a = torch.Timer()
- for i = 1,nloop do
- sslin:backward(nonsparse, gradOutput)
- end
- tm.cpu = a:time().real
- local groundweight = sslin.gradWeight
- local groundbias = sslin.gradBias
-
- for i,v in ipairs(input) do input[i] = input[i]:cuda() end
- gradOutput = gradOutput:cuda()
- gslin:forward(input)
- local rescuda = gslin:backward(input, gradOutput)
- gslin:zeroGradParameters()
- a:reset()
- for i = 1,nloop do
- gslin:backward(input, gradOutput)
+ local gradOutput = torch.randn(inb, inj):type(typename):type(ctype)
+ sslin:forward(nonsparse)
+ local groundgrad = sslin:backward(nonsparse, gradOutput)
+ sslin:zeroGradParameters()
+ local groundweight = sslin.gradWeight
+ local groundbias = sslin.gradBias
+
+ for i,v in ipairs(input) do input[i] = input[i]:type(typename) end
+ gradOutput = gradOutput:type(typename)
+ gslin:forward(input)
+ local rescuda = gslin:backward(input, gradOutput)
+ gslin:zeroGradParameters()
+ local weightcuda = gslin.gradWeight
+ local biascuda = gslin.gradBias
+
+ local werror = weightcuda:double() - groundweight:double()
+ local berror = biascuda:double() - groundbias:double()
+
+ mytester:assertlt(werror:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on weight (backward) with %s', typename))
+ mytester:assertlt(berror:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on bias (backward) with %s', typename))
+
+ gslin:updateParameters(.1)
+ sslin:updateParameters(.1)
+ werror = gslin.weight:double() - sslin.weight:double()
+ berror = gslin.bias:double() - sslin.bias:double()
+
+ mytester:assertlt(werror:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on weight (update) with %s', typename))
+ mytester:assertlt(berror:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on bias (update) with %s', typename))
+
+ gslin:zeroGradParameters()
+ end
end
- local weightcuda = gslin.gradWeight
- local biascuda = gslin.gradBias
- cutorch.synchronize()
- tm.gpu = a:time().real
-
- local werror = weightcuda:float() - groundweight
- local berror = biascuda:float() - groundbias
-
- mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
- mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
-
- gslin:updateParameters(.1)
- sslin:updateParameters(.1)
- werror = gslin.weight:float() - sslin.weight
- berror = gslin.bias:float() - sslin.bias
-
- mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (update) ')
- mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (update) ')
-
- gslin:zeroGradParameters()
end
local function BatchNormalization_forward(moduleName, inputSize)
local planes = inputSize[2]
- local tm = {}
- local title = moduleName .. '.forward ' .. table.concat(inputSize, 'x')
- times[title] = tm
- local input = torch.randn(table.unpack(inputSize))
- local sbnorm = nn[moduleName](planes)
- local groundtruth = sbnorm:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sbnorm:forward(input)
- end
- tm.cpu = a:time().real
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(table.unpack(inputSize)):type(typename)
- input = input:cuda()
- local gbnorm = nn[moduleName](planes):cuda()
- gbnorm.weight = sbnorm.weight:cuda()
- gbnorm.bias = sbnorm.bias:cuda()
- local rescuda = gbnorm:forward(input)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sbnorm = nn[moduleName](planes):type(ctype)
+ local groundtruth = sbnorm:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gbnorm:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gbnorm = nn[moduleName](planes):type(typename)
+ gbnorm.weight = sbnorm.weight:type(typename)
+ gbnorm.bias = sbnorm.bias:type(typename)
+ local rescuda = gbnorm:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward)')
- mytester:assertlt((gbnorm.running_mean:float() - sbnorm.running_mean):abs():max(),
- precision_forward, 'error on running_mean (forward)')
- mytester:assertlt((gbnorm.running_var:float() - sbnorm.running_var):abs():max(),
- precision_forward, 'error on running_var (forward)')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename, rescuda:abs():max()),
+ string.format('error on state (forward) with %s', typename))
+ mytester:assertlt((gbnorm.running_mean:double() - sbnorm.running_mean:double()):abs():max(),
+ precision_forward_type(precision_forward, typename, gbnorm.running_mean:abs():max()),
+ string.format('error on running_mean (forward) with %s', typename))
+ mytester:assertlt((gbnorm.running_var:double() - sbnorm.running_var:double()):abs():max(),
+ precision_forward_type(precision_forward, typename, gbnorm.running_var:abs():max()),
+ string.format('error on running_var (forward) with %s', typename))
+ end
end
local function BatchNormalization_forward_inference(moduleName, inputSize)
local planes = inputSize[2]
- local tm = {}
- local title = moduleName .. '.forward (evaluate) ' .. table.concat(inputSize, 'x')
- times[title] = tm
- local input = torch.randn(table.unpack(inputSize))
- local sbnorm = nn[moduleName](planes)
- sbnorm.running_mean:normal(1, 2)
- sbnorm.running_var:uniform(1e-3, 2)
- sbnorm:evaluate()
- local groundtruth = sbnorm:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sbnorm:forward(input)
- end
- tm.cpu = a:time().real
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(table.unpack(inputSize)):type(typename)
- input = input:cuda()
- local gbnorm = nn[moduleName](planes):cuda()
- gbnorm:evaluate()
- gbnorm.weight = sbnorm.weight:cuda()
- gbnorm.bias = sbnorm.bias:cuda()
- gbnorm.running_mean = sbnorm.running_mean:cuda()
- gbnorm.running_var = sbnorm.running_var:cuda()
- local rescuda = gbnorm:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gbnorm:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sbnorm = nn[moduleName](planes):type(ctype)
+ sbnorm.running_mean:normal(1, 2)
+ sbnorm.running_var:uniform(1e-3, 2)
+ sbnorm.running_var = sbnorm.running_var:type(typename):type(ctype)
+ sbnorm.running_mean = sbnorm.running_mean:type(typename):type(ctype)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward evaluate)')
+ sbnorm:evaluate()
+ local groundtruth = sbnorm:forward(input)
+
+ input = input:type(typename)
+ local gbnorm = nn[moduleName](planes):type(typename)
+ gbnorm:evaluate()
+ gbnorm.weight = sbnorm.weight:type(typename)
+ gbnorm.bias = sbnorm.bias:type(typename)
+ gbnorm.running_mean = sbnorm.running_mean:type(typename)
+ gbnorm.running_var = sbnorm.running_var:type(typename)
+ local rescuda = gbnorm:forward(input)
+
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename, rescuda:abs():max()),
+ string.format('error on state (forward evaluate) with %s', typename))
+ end
end
local function BatchNormalization_backward(moduleName, mode, inputSize, backwardFn)
assert(mode == 'training' or mode == 'evaluation', 'invalid mode')
local planes = inputSize[2]
- local tm = {}
- local title = moduleName .. '.backward ' .. table.concat(inputSize, 'x')
- times[title] = tm
- local input = torch.randn(table.unpack(inputSize))
- local gradOutput = torch.randn(table.unpack(inputSize))
- local sbnorm = nn[moduleName](planes)
- if mode == 'training' then
- sbnorm:training()
- else
- sbnorm:evaluate()
- end
- sbnorm:forward(input)
- sbnorm:zeroGradParameters()
- local groundgrad = backwardFn(sbnorm, input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
- sbnorm:zeroGradParameters()
- groundgrad = backwardFn(sbnorm, input, gradOutput)
- end
- local groundweight = sbnorm.gradWeight
- local groundbias = sbnorm.gradBias
- tm.cpu = a:time().real
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(table.unpack(inputSize)):type(typename)
+ local gradOutput = torch.randn(table.unpack(inputSize)):type(typename)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gbnorm = nn[moduleName](planes):cuda()
- if mode == 'training' then
- gbnorm:training()
- else
- gbnorm:evaluate()
- end
- gbnorm.weight = sbnorm.weight:cuda()
- gbnorm.bias = sbnorm.bias:cuda()
- gbnorm:forward(input)
- gbnorm:zeroGradParameters()
- local rescuda = backwardFn(gbnorm, input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sbnorm = nn[moduleName](planes):type(ctype)
+ if mode == 'training' then
+ sbnorm:training()
+ else
+ sbnorm:evaluate()
+ end
+ sbnorm:forward(input)
+ sbnorm:zeroGradParameters()
+ local groundgrad = backwardFn(sbnorm, input, gradOutput)
+ local groundweight = sbnorm.gradWeight
+ local groundbias = sbnorm.gradBias
+
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gbnorm = nn[moduleName](planes):type(typename)
+ if mode == 'training' then
+ gbnorm:training()
+ else
+ gbnorm:evaluate()
+ end
+ gbnorm.weight = sbnorm.weight:type(typename)
+ gbnorm.bias = sbnorm.bias:type(typename)
+ gbnorm:forward(input)
gbnorm:zeroGradParameters()
- rescuda = backwardFn(gbnorm, input, gradOutput)
- end
- local weightcuda = gbnorm.gradWeight
- local biascuda = gbnorm.gradBias
- cutorch.synchronize()
- tm.gpu = a:time().real
-
- local error = rescuda:float() - groundgrad
- local werror = weightcuda:float() - groundweight
- local berror = biascuda:float() - groundbias
-
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
- mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
- mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+ local rescuda = backwardFn(gbnorm, input, gradOutput)
+ local weightcuda = gbnorm.gradWeight
+ local biascuda = gbnorm.gradBias
+
+ local error = rescuda:double() - groundgrad:double()
+ local werror = weightcuda:double() - groundweight:double()
+ local berror = biascuda:double() - groundbias:double()
+
+ local backerror = precision_backward_type(precision_backward, typename, rescuda:abs():max())
+ if typename == 'torch.CudaHalfTensor' and (mode == 'training') then
+ -- this correction is empirical; mean can be off by roughly 4e-4, multiplied by roughly stdval^2.
+ backerror = backerror + (sbnorm.save_std:max())^2 * 4e-4
+ end
+ mytester:assertlt(error:abs():max(),
+ backerror,
+ string.format('error on state (backward) with %s', typename))
+ mytester:assertlt(werror:abs():max(),
+ precision_backward_type(precision_backward, typename, weightcuda:abs():max()),
+ string.format('error on weight (backward) with %s', typename))
+ mytester:assertlt(berror:abs():max(),
+ precision_backward_type(precision_backward, typename, biascuda:abs():max()),
+ string.format('error on bias (backward) with %s', typename))
+ end
end
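For half tensors in training mode the backward tolerance above gets an extra empirical term, (save_std:max())^2 * 4e-4, reflecting the test's note that the mean can be off by roughly 4e-4 scaled by roughly stdval^2. A hypothetical numeric check of how much slack that adds, with values assumed purely for illustration:

    #include <cstdio>

    int main() {
      double base   = 1e-1 + 1e-2;   // half slack plus precision_backward (ulp term omitted)
      double stdMax = 5.0;           // assumed save_std:max() for the sake of the example
      double tol    = base + stdMax * stdMax * 4e-4;
      std::printf("tolerance ~= %g\n", tol);   // ~0.12
    }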
local function testBatchNormalization(name, dim, k)
@@ -1027,44 +1002,34 @@ function cunntest.SpatialConvolutionMM_forward_single()
local ini = (outi-1)*si+ki-padW*2
local inj = (outj-1)*sj+kj-padH*2
- local tm = {}
- local title = string.format('SpatialConvolutionMM.forward %dx%dx%d o %dx%d -> %dx%dx%d [s: %dx%d] [p: %dx%d]',
- from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW)
- times[title] = tm
-
local function jacTests(noBias)
noBias = noBias or false
- local input = torch.randn(from,inj,ini)
- local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH)
- if noBias then
- sconv:noBias()
- end
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
-
- input = input:cuda()
- local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):cuda()
- if noBias then
- gconv:noBias()
- end
- gconv.weight = sconv.weight:cuda()
- if gconv.bias then
- gconv.bias = sconv.bias:cuda()
- end
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(from,inj,ini):type(typename)
+
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(ctype)
+ if noBias then
+ sconv:noBias()
+ end
+ local groundtruth = sconv:forward(input)
+
+ input = input:type(typename)
+ local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(typename)
+ if noBias then
+ gconv:noBias()
+ end
+ gconv.weight = sconv.weight:type(typename)
+ if gconv.bias then
+ gconv.bias = sconv.bias:type(typename)
+ end
+ local rescuda = gconv:forward(input)
+
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
end
- cutorch.synchronize()
- tm.gpu = a:time().real
-
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
end
jacTests(false)
@@ -1086,44 +1051,34 @@ function cunntest.SpatialConvolutionMM_forward_batch()
local ini = (outi-1)*si+ki-padW*2
local inj = (outj-1)*sj+kj-padH*2
- local tm = {}
- local title = string.format('SpatialConvolutionMM.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d [s: %dx%d] [p: %dx%d]',
- bs, from, inj, ini, kj, ki, bs, to, outj, outi, sj, si, padH, padW)
- times[title] = tm
-
local function jacTests(noBias)
noBias = noBias or false
- local input = torch.randn(bs,from,inj,ini)
- local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH)
- if noBias then
- sconv:noBias()
- end
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,from,inj,ini):type(typename)
+
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(ctype)
+ if noBias then
+ sconv:noBias()
+ end
+ local groundtruth = sconv:forward(input)
+
+ input = input:type(typename)
+ local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(typename)
+ if noBias then
+ gconv:noBias()
+ end
+ gconv.weight = sconv.weight:type(typename)
+ if gconv.bias then
+ gconv.bias = sconv.bias:type(typename)
+ end
+ local rescuda = gconv:forward(input)
+
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
end
- tm.cpu = a:time().real
-
- input = input:cuda()
- local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):cuda()
- if noBias then
- gconv:noBias()
- end
- gconv.weight = sconv.weight:cuda()
- if gconv.bias then
- gconv.bias = sconv.bias:cuda()
- end
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
-
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
end
@@ -1143,62 +1098,56 @@ function cunntest.SpatialConvolutionMM_backward_single()
local ini = (outi-1)*si+ki-padW*2
local inj = (outj-1)*sj+kj-padH*2
- local tm = {}
- local title = string.format('SpatialConvolutionMM.backward %dx%dx%d o %dx%d -> %dx%dx%d [s: %dx%d] [p: %dx%d]',
- from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW)
- times[title] = tm
-
local function jacTests(noBias)
noBias = noBias or false
- local input = torch.randn(from,inj,ini)
- local gradOutput = torch.randn(to,outj,outi)
- local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH)
- if noBias then
- sconv:noBias()
- end
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
- sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- local groundweight = sconv.gradWeight
- local groundbias = sconv.gradBias
- tm.cpu = a:time().real
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):cuda()
- if noBias then
- gconv:noBias()
- end
- gconv.weight = sconv.weight:cuda()
- if gconv.bias then
- gconv.bias = sconv.bias:cuda()
- end
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(from,inj,ini):type(typename)
+ local gradOutput = torch.randn(to,outj,outi):type(typename)
+
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(ctype)
+ if noBias then
+ sconv:noBias()
+ end
+ sconv:forward(input)
+ sconv:zeroGradParameters()
+ local groundgrad = sconv:backward(input, gradOutput)
+ local groundweight = sconv.gradWeight
+ local groundbias = sconv.gradBias
+
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(typename)
+ if noBias then
+ gconv:noBias()
+ end
+ gconv.weight = sconv.weight:type(typename)
+ if gconv.bias then
+ gconv.bias = sconv.bias:type(typename)
+ end
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- local weightcuda = gconv.gradWeight
- cutorch.synchronize()
- tm.gpu = a:time().real
-
- local error = rescuda:float() - groundgrad
- local werror = weightcuda:float() - groundweight
-
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
- mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
-
- if gconv.bias then
- local berror = gconv.gradBias:float() - groundbias
- mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+ local rescuda = gconv:backward(input, gradOutput)
+ local weightcuda = gconv.gradWeight
+
+ local error = rescuda:double() - groundgrad:double()
+ local werror = weightcuda:double() - groundweight:double()
+
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ mytester:assertlt(werror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+ string.format('error on weight (backward) with %s', typename))
+
+ if gconv.bias then
+ local berror = gconv.gradBias:double() - groundbias:double()
+ mytester:assertlt(berror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, gconv.gradBias:abs():max()),
+ string.format('error on bias (backward) with %s', typename))
+ end
end
end
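In the backward tests the state gradient keeps the plain precision_backward_type bound, but the weight and bias gradients are now checked against precision_backward_conv_weightbias(precision_backward, typename, max magnitude). The extra argument matters for half precision: gradWeight and gradBias accumulate many contributions, so their absolute error grows with their magnitude and a fixed tolerance would spuriously fail. The helper itself is defined elsewhere in test.lua; the sketch below shows one plausible shape, only to illustrate why the call sites pass the maximum absolute value (the real constants may differ):

    -- Sketch only, not the actual definition from test.lua.
    local function precision_backward_conv_weightbias(precision, typename, max_abs)
       if typename == 'torch.CudaHalfTensor' then
          -- assumed scaling: widen the bound with the magnitude of the accumulated gradient
          return 2e-2 * math.max(max_abs, 1)
       end
       return precision
    end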
@@ -1221,61 +1170,55 @@ function cunntest.SpatialConvolutionMM_backward_batch()
local ini = (outi-1)*si+ki-padW*2
local inj = (outj-1)*sj+kj-padH*2
- local tm = {}
- local title = string.format('SpatialConvolutionMM.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d [s: %dx%d] [p: %dx%d]',
- bs, from, inj, ini, kj, ki, bs, to, outj, outi, sj, si, padH, padW)
- times[title] = tm
-
local function jacTests(noBias)
noBias = noBias or false
- local input = torch.randn(bs,from,inj,ini)
- local gradOutput = torch.randn(bs,to,outj,outi)
- local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH)
- if noBias then
- sconv:noBias()
- end
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
- sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- local groundweight = sconv.gradWeight
- local groundbias = sconv.gradBias
- tm.cpu = a:time().real
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):cuda()
- if noBias then
- gconv:noBias()
- end
- gconv.weight = sconv.weight:cuda()
- if gconv.bias then
- gconv.bias = sconv.bias:cuda()
- end
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,from,inj,ini):type(typename)
+ local gradOutput = torch.randn(bs,to,outj,outi):type(typename)
+
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(ctype)
+ if noBias then
+ sconv:noBias()
+ end
+ sconv:forward(input)
+ sconv:zeroGradParameters()
+ local groundgrad = sconv:backward(input, gradOutput)
+ local groundweight = sconv.gradWeight
+ local groundbias = sconv.gradBias
+
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.SpatialConvolutionMM(from,to,ki,kj,si,sj,padW,padH):type(typename)
+ if noBias then
+ gconv:noBias()
+ end
+ gconv.weight = sconv.weight:type(typename)
+ if gconv.bias then
+ gconv.bias = sconv.bias:type(typename)
+ end
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- local weightcuda = gconv.gradWeight
- cutorch.synchronize()
- tm.gpu = a:time().real
-
- local error = rescuda:float() - groundgrad
- local werror = weightcuda:float() - groundweight
-
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
- mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
- if gconv.bias then
- local berror = gconv.gradBias:float() - groundbias
- mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+ local rescuda = gconv:backward(input, gradOutput)
+ local weightcuda = gconv.gradWeight
+
+ local error = rescuda:double() - groundgrad:double()
+ local werror = weightcuda:double() - groundweight:double()
+
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ mytester:assertlt(werror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+ string.format('error on weight (backward) with %s', typename))
+ if gconv.bias then
+ local berror = gconv.gradBias:double() - groundbias:double()
+ mytester:assertlt(berror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, gconv.gradBias:abs():max()),
+ string.format('error on bias (backward) with %s', typename))
+ end
end
end
@@ -1297,34 +1240,24 @@ function cunntest.SpatialConvolutionLocal_forward_single()
local ini = (outi-1)*si+ki-padW*2
local inj = (outj-1)*sj+kj-padH*2
- local tm = {}
- local title = string.format('SpatialConvolutionLocal.forward %dx%dx%d o %dx%d -> %dx%dx%d [s: %dx%d] [p: %dx%d]',
- from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(from,inj,ini):type(typename)
- local input = torch.randn(from,inj,ini)
- local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH)
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(ctype)
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):cuda()
- gconv.weight = sconv.weight:cuda()
- gconv.bias = sconv.bias:cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(typename)
+ gconv.weight = sconv.weight:type(typename)
+ gconv.bias = sconv.bias:type(typename)
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.SpatialConvolutionLocal_forward_batch()
@@ -1342,34 +1275,24 @@ function cunntest.SpatialConvolutionLocal_forward_batch()
local ini = (outi-1)*si+ki-padW*2
local inj = (outj-1)*sj+kj-padH*2
- local tm = {}
- local title = string.format('SpatialConvolutionLocal.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d [s: %dx%d] [p: %dx%d]',
- bs, from, inj, ini, kj, ki, bs, to, outj, outi, sj, si, padH, padW)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,from,inj,ini):type(typename)
- local input = torch.randn(bs,from,inj,ini)
- local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH)
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(ctype)
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):cuda()
- gconv.weight = sconv.weight:cuda()
- gconv.bias = sconv.bias:cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(typename)
+ gconv.weight = sconv.weight:type(typename)
+ gconv.bias = sconv.bias:type(typename)
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.SpatialConvolutionLocal_backward_single()
@@ -1386,51 +1309,44 @@ function cunntest.SpatialConvolutionLocal_backward_single()
local ini = (outi-1)*si+ki-padW*2
local inj = (outj-1)*sj+kj-padH*2
- local tm = {}
- local title = string.format('SpatialConvolutionLocal.backward %dx%dx%d o %dx%d -> %dx%dx%d [s: %dx%d] [p: %dx%d]',
- from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(from,inj,ini):type(typename)
+ local gradOutput = torch.randn(to,outj,outi):type(typename)
- local input = torch.randn(from,inj,ini)
- local gradOutput = torch.randn(to,outj,outi)
- local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH)
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(ctype)
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- local groundweight = sconv.gradWeight
- local groundbias = sconv.gradBias
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
+ local groundweight = sconv.gradWeight
+ local groundbias = sconv.gradBias
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):cuda()
- gconv.weight = sconv.weight:cuda()
- gconv.bias = sconv.bias:cuda()
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(typename)
+ gconv.weight = sconv.weight:type(typename)
+ gconv.bias = sconv.bias:type(typename)
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- local weightcuda = gconv.gradWeight
- local biascuda = gconv.gradBias
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
+ local weightcuda = gconv.gradWeight
+ local biascuda = gconv.gradBias
- local error = rescuda:float() - groundgrad
- local werror = weightcuda:float() - groundweight
- local berror = biascuda:float() - groundbias
+ local error = rescuda:double() - groundgrad:double()
+ local werror = weightcuda:double() - groundweight:double()
+ local berror = biascuda:double() - groundbias:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
- mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
- mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ mytester:assertlt(werror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+ string.format('error on weight (backward) with %s', typename))
+ mytester:assertlt(berror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()),
+ string.format('error on bias (backward) with %s', typename))
+ end
end
function cunntest.SpatialConvolutionLocal_backward_batch()
@@ -1448,51 +1364,44 @@ function cunntest.SpatialConvolutionLocal_backward_batch()
local ini = (outi-1)*si+ki-padW*2
local inj = (outj-1)*sj+kj-padH*2
- local tm = {}
- local title = string.format('SpatialConvolutionLocal.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d [s: %dx%d] [p: %dx%d]',
- bs, from, inj, ini, kj, ki, bs, to, outj, outi, sj, si, padH, padW)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,from,inj,ini):type(typename)
+ local gradOutput = torch.randn(bs,to,outj,outi):type(typename)
- local input = torch.randn(bs,from,inj,ini)
- local gradOutput = torch.randn(bs,to,outj,outi)
- local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH)
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(ctype)
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- local groundweight = sconv.gradWeight
- local groundbias = sconv.gradBias
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
+ local groundweight = sconv.gradWeight
+ local groundbias = sconv.gradBias
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):cuda()
- gconv.weight = sconv.weight:cuda()
- gconv.bias = sconv.bias:cuda()
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.SpatialConvolutionLocal(from,to,ini,inj,ki,kj,si,sj,padW,padH):type(typename)
+ gconv.weight = sconv.weight:type(typename)
+ gconv.bias = sconv.bias:type(typename)
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- local weightcuda = gconv.gradWeight
- local biascuda = gconv.gradBias
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
+ local weightcuda = gconv.gradWeight
+ local biascuda = gconv.gradBias
- local error = rescuda:float() - groundgrad
- local werror = weightcuda:float() - groundweight
- local berror = biascuda:float() - groundbias
+ local error = rescuda:double() - groundgrad:double()
+ local werror = weightcuda:double() - groundweight:double()
+ local berror = biascuda:double() - groundbias:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
- mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
- mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ mytester:assertlt(werror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+ string.format('error on weight (backward) with %s', typename))
+ mytester:assertlt(berror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()),
+ string.format('error on bias (backward) with %s', typename))
+ end
end
function cunntest.SpatialFullConvolution_forward_single()
@@ -1511,46 +1420,34 @@ function cunntest.SpatialFullConvolution_forward_single()
local ini = math.floor((outi + 2 * padW - ki) / si + 1)
local inj = math.floor((outj + 2 * padH - kj) / sj + 1)
- local tm = {}
- local title =
- string.format('SpatialFullConvolution.forward %dx%dx%d o %dx%d '
- .. '-> %dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]',
- from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW, adjH, adjW)
- times[title] = tm
-
local function jacTests(noBias)
noBias = noBias or false
- local input = torch.randn(from,inj,ini)
- local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH)
- if noBias then
- sconv:noBias()
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(from,inj,ini):type(typename)
+
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(ctype)
+ if noBias then
+ sconv:noBias()
+ end
+ local groundtruth = sconv:forward(input)
+
+ input = input:type(typename)
+ local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(typename)
+ if noBias then
+ gconv:noBias()
+ end
+ gconv.weight = sconv.weight:type(typename)
+ if gconv.bias then
+ gconv.bias = sconv.bias:type(typename)
+ end
+ local rescuda = gconv:forward(input)
+
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
end
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
-
- input = input:cuda()
- local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):cuda()
- if noBias then
- gconv:noBias()
- end
- gconv.weight = sconv.weight:cuda()
- if gconv.bias then
- gconv.bias = sconv.bias:cuda()
- end
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
-
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
end
jacTests(false)
@@ -1574,44 +1471,34 @@ function cunntest.SpatialFullConvolution_forward_batch()
local ini = math.floor((outi + 2 * padW - ki) / si + 1)
local inj = math.floor((outj + 2 * padH - kj) / sj + 1)
- local tm = {}
- local title = string.format('SpatialFullConvolution.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]',
- bs, from, inj, ini, kj, ki, bs, to, outj, outi, sj, si, padH, padW, adjH, adjW)
- times[title] = tm
-
local function jacTests(noBias)
noBias = noBias or false
- local input = torch.randn(bs,from,inj,ini)
- local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH)
- if noBias then
- sconv:noBias()
- end
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
-
- input = input:cuda()
- local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):cuda()
- if noBias then
- gconv:noBias()
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,from,inj,ini):type(typename)
+
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(ctype)
+ if noBias then
+ sconv:noBias()
+ end
+ local groundtruth = sconv:forward(input)
+
+ input = input:type(typename)
+ local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(typename)
+ if noBias then
+ gconv:noBias()
+ end
+ gconv.weight = sconv.weight:type(typename)
+ if gconv.bias then
+ gconv.bias = sconv.bias:type(typename)
+ end
+ local rescuda = gconv:forward(input)
+
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
end
- gconv.weight = sconv.weight:cuda()
- if gconv.bias then
- gconv.bias = sconv.bias:cuda()
- end
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
-
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
end
jacTests(false)
@@ -1634,62 +1521,54 @@ function cunntest.SpatialFullConvolution_backward_single()
local ini = math.floor((outi + 2 * padW - ki) / si + 1)
local inj = math.floor((outj + 2 * padH - kj) / sj + 1)
- local tm = {}
- local title = string.format('SpatialFullConvolution.backward %dx%dx%d o %dx%d -> %dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]',
- from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW, adjH, adjW)
- times[title] = tm
-
local function jacTests(noBias)
noBias = noBias or false
- local input = torch.randn(from,inj,ini)
- local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH)
- if noBias then
- sconv:noBias()
- end
- local output = sconv:forward(input)
- local gradOutput = output:clone():normal()
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(from,inj,ini):type(typename)
+
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(ctype)
+ if noBias then
+ sconv:noBias()
+ end
+ local output = sconv:forward(input)
+ local gradOutput = output:clone():normal()
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- local groundweight = sconv.gradWeight
- local groundbias = sconv.gradBias
- tm.cpu = a:time().real
-
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):cuda()
- if noBias then
- gconv:noBias()
- end
- gconv.weight = sconv.weight:cuda()
- if gconv.bias then
- gconv.bias = sconv.bias:cuda()
- end
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ local groundgrad = sconv:backward(input, gradOutput)
+ local groundweight = sconv.gradWeight
+ local groundbias = sconv.gradBias
+
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(typename)
+ if noBias then
+ gconv:noBias()
+ end
+ gconv.weight = sconv.weight:type(typename)
+ if gconv.bias then
+ gconv.bias = sconv.bias:type(typename)
+ end
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- local weightcuda = gconv.gradWeight
- cutorch.synchronize()
- tm.gpu = a:time().real
-
- local error = rescuda:float() - groundgrad
- local werror = weightcuda:float() - groundweight
-
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
- mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
-
- if gconv.bias then
- local berror = gconv.gradBias:float() - groundbias
- mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+ local rescuda = gconv:backward(input, gradOutput)
+ local weightcuda = gconv.gradWeight
+
+ local error = rescuda:double() - groundgrad:double()
+ local werror = weightcuda:double() - groundweight:double()
+
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ mytester:assertlt(werror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+ string.format('error on weight (backward) with %s', typename))
+
+ if gconv.bias then
+ local berror = gconv.gradBias:double() - groundbias:double()
+ mytester:assertlt(berror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, gconv.gradBias:abs():max()),
+ string.format('error on bias (backward) with %s', typename))
+ end
end
end
@@ -1714,63 +1593,54 @@ function cunntest.SpatialFullConvolution_backward_batch()
local ini = math.floor((outi + 2 * padW - ki) / si + 1)
local inj = math.floor((outj + 2 * padH - kj) / sj + 1)
- local tm = {}
- local title = string.format('SpatialFullConvolution.backward %dx%dx%dx%d o %dx%d '
- .. '-> %dx%dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]',
- bs, from, inj, ini, kj, ki,
- bs, to, outj, outi, sj, si, padH, padW, adjH, adjW)
- times[title] = tm
-
local function jacTests(noBias)
noBias = noBias or false
- local input = torch.randn(bs,from,inj,ini)
- local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH)
- if noBias then
- sconv:noBias()
- end
- local output = sconv:forward(input)
- local gradOutput = output:clone():normal()
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
- sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- local groundweight = sconv.gradWeight
- local groundbias = sconv.gradBias
- tm.cpu = a:time().real
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):cuda()
- if noBias then
- gconv:noBias()
- end
- gconv.weight = sconv.weight:cuda()
- if gconv.bias then
- gconv.bias = sconv.bias:cuda()
- end
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,from,inj,ini):type(typename)
+
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(ctype)
+ if noBias then
+ sconv:noBias()
+ end
+ local output = sconv:forward(input)
+ local gradOutput = output:clone():normal()
+ sconv:zeroGradParameters()
+ local groundgrad = sconv:backward(input, gradOutput)
+ local groundweight = sconv.gradWeight
+ local groundbias = sconv.gradBias
+
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.SpatialFullConvolution(from,to,ki,kj,si,sj,padW,padH,adjW,adjH):type(typename)
+ if noBias then
+ gconv:noBias()
+ end
+ gconv.weight = sconv.weight:type(typename)
+ if gconv.bias then
+ gconv.bias = sconv.bias:type(typename)
+ end
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- local weightcuda = gconv.gradWeight
- cutorch.synchronize()
- tm.gpu = a:time().real
-
- local error = rescuda:float() - groundgrad
- local werror = weightcuda:float() - groundweight
-
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
- mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
- if gconv.bias then
- local berror = gconv.gradBias:float() - groundbias
- mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+ local rescuda = gconv:backward(input, gradOutput)
+ local weightcuda = gconv.gradWeight
+
+ local error = rescuda:double() - groundgrad:double()
+ local werror = weightcuda:double() - groundweight:double()
+
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ mytester:assertlt(werror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+ string.format('error on weight (backward) with %s', typename))
+ if gconv.bias then
+ local berror = gconv.gradBias:double() - groundbias:double()
+ mytester:assertlt(berror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, gconv.gradBias:abs():max()),
+ string.format('error on bias (backward) with %s', typename))
+ end
end
end
@@ -1794,36 +1664,24 @@ function cunntest.SpatialDilatedConvolution_forward_single()
local ini = (outi - 1) * si - 2 * padW + dilationW * (ki-1) + 1
local inj = (outj - 1) * sj - 2 * padH + dilationH * (kj-1) + 1
- local tm = {}
- local title =
- string.format('SpatialDilatedConvolution.forward %dx%dx%d o %dx%d '
- .. '-> %dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]',
- from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW, dilationH, dilationW)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(from,inj,ini):type(typename)
- local input = torch.randn(from,inj,ini)
- local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH)
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(ctype)
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):cuda()
- gconv.weight = sconv.weight:cuda()
- gconv.bias = sconv.bias:cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(typename)
+ gconv.weight = sconv.weight:type(typename)
+ gconv.bias = sconv.bias:type(typename)
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.SpatialDilatedConvolution_forward_batch()
@@ -1838,39 +1696,29 @@ function cunntest.SpatialDilatedConvolution_forward_batch()
local padH = math.random(0,1)
local outi = math.random(ki, 64)
local outj = math.random(kj, 64)
- local dilationW = math.random(0,10)
- local dilationH = math.random(0,10)
+ local dilationW = math.random(1,10)
+ local dilationH = math.random(1,10)
local ini = (outi - 1) * si - 2 * padW + dilationW * (ki-1) + 1
local inj = (outj - 1) * sj - 2 * padH + dilationH * (kj-1) + 1
- local tm = {}
- local title = string.format('SpatialDilatedConvolution.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]',
- bs, from, inj, ini, kj, ki, bs, to, outj, outi, sj, si, padH, padW, dilationH, dilationW)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,from,inj,ini):type(typename)
- local input = torch.randn(bs,from,inj,ini)
- local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH)
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(ctype)
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):cuda()
- gconv.weight = sconv.weight:cuda()
- gconv.bias = sconv.bias:cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(typename)
+ gconv.weight = sconv.weight:type(typename)
+ gconv.bias = sconv.bias:type(typename)
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
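The switch from math.random(0,10) to math.random(1,10) for dilationW and dilationH in the SpatialDilatedConvolution tests is a real fix rather than part of the generic-type conversion: the module requires a dilation of at least 1, and the input-size formula used in these tests, ini = (outi - 1)*si - 2*padW + dilationW*(ki - 1) + 1, degenerates when the dilation is 0. For example, with outi = 4, si = 1, padW = 0 and ki = 3, dilation 1 gives ini = 6 while dilation 0 gives ini = 4, i.e. all three kernel taps would read the same column and the test would exercise an invalid configuration.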
function cunntest.SpatialDilatedConvolution_backward_single()
@@ -1884,56 +1732,48 @@ function cunntest.SpatialDilatedConvolution_backward_single()
local padH = math.random(0,1)
local outi = math.random(ki, 64)
local outj = math.random(kj, 64)
- local dilationW = math.random(0,10)
- local dilationH = math.random(0,10)
+ local dilationW = math.random(1,10)
+ local dilationH = math.random(1,10)
local ini = (outi - 1) * si - 2 * padW + dilationW * (ki-1) + 1
local inj = (outj - 1) * sj - 2 * padH + dilationH * (kj-1) + 1
- local tm = {}
- local title = string.format('SpatialDilatedConvolution.backward %dx%dx%d o %dx%d -> %dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]',
- from, inj, ini, kj, ki, to, outj, outi, sj, si, padH, padW, dilationH, dilationW)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(from,inj,ini):type(typename)
- local input = torch.randn(from,inj,ini)
- local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH)
- local output = sconv:forward(input)
- local gradOutput = output:clone():normal()
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(ctype)
+ local output = sconv:forward(input)
+ local gradOutput = output:clone():normal()
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- local groundweight = sconv.gradWeight
- local groundbias = sconv.gradBias
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
+ local groundweight = sconv.gradWeight
+ local groundbias = sconv.gradBias
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):cuda()
- gconv.weight = sconv.weight:cuda()
- gconv.bias = sconv.bias:cuda()
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(typename)
+ gconv.weight = sconv.weight:type(typename)
+ gconv.bias = sconv.bias:type(typename)
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- local weightcuda = gconv.gradWeight
- local biascuda = gconv.gradBias
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
+ local weightcuda = gconv.gradWeight
+ local biascuda = gconv.gradBias
- local error = rescuda:float() - groundgrad
- local werror = weightcuda:float() - groundweight
- local berror = biascuda:float() - groundbias
+ local error = rescuda:double() - groundgrad:double()
+ local werror = weightcuda:double() - groundweight:double()
+ local berror = biascuda:double() - groundbias:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
- mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
- mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ mytester:assertlt(werror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+ string.format('error on weight (backward) with %s', typename))
+ mytester:assertlt(berror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()),
+ string.format('error on bias (backward) with %s', typename))
+ end
end
function cunntest.SpatialDilatedConvolution_backward_batch()
@@ -1948,58 +1788,48 @@ function cunntest.SpatialDilatedConvolution_backward_batch()
local padH = math.random(0,1)
local outi = math.random(ki, 64)
local outj = math.random(kj, 64)
- local dilationW = math.random(0,10)
- local dilationH = math.random(0,10)
+ local dilationW = math.random(1,10)
+ local dilationH = math.random(1,10)
local ini = (outi - 1) * si - 2 * padW + dilationW * (ki-1) + 1
local inj = (outj - 1) * sj - 2 * padH + dilationH * (kj-1) + 1
- local tm = {}
- local title = string.format('SpatialDilatedConvolution.backward %dx%dx%dx%d o %dx%d '
- .. '-> %dx%dx%dx%d [s: %dx%d] [p: %dx%d] [a: %dx%d]',
- bs, from, inj, ini, kj, ki,
- bs, to, outj, outi, sj, si, padH, padW, dilationH, dilationW)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,from,inj,ini):type(typename)
- local input = torch.randn(bs,from,inj,ini)
- local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH)
- local output = sconv:forward(input)
- local gradOutput = output:clone():normal()
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(ctype)
+ local output = sconv:forward(input)
+ local gradOutput = output:clone():normal()
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- local groundweight = sconv.gradWeight
- local groundbias = sconv.gradBias
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
+ local groundweight = sconv.gradWeight
+ local groundbias = sconv.gradBias
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):cuda()
- gconv.weight = sconv.weight:cuda()
- gconv.bias = sconv.bias:cuda()
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.SpatialDilatedConvolution(from,to,ki,kj,si,sj,padW,padH,dilationW,dilationH):type(typename)
+ gconv.weight = sconv.weight:type(typename)
+ gconv.bias = sconv.bias:type(typename)
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- local weightcuda = gconv.gradWeight
- local biascuda = gconv.gradBias
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
+ local weightcuda = gconv.gradWeight
+ local biascuda = gconv.gradBias
- local error = rescuda:float() - groundgrad
- local werror = weightcuda:float() - groundweight
- local berror = biascuda:float() - groundbias
+ local error = rescuda:double() - groundgrad:double()
+ local werror = weightcuda:double() - groundweight:double()
+ local berror = biascuda:double() - groundbias:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
- mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
- mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ mytester:assertlt(werror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+ string.format('error on weight (backward) with %s', typename))
+ mytester:assertlt(berror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()),
+ string.format('error on bias (backward) with %s', typename))
+ end
end
function cunntest.SpatialSubSampling_forward()
@@ -2014,37 +1844,27 @@ function cunntest.SpatialSubSampling_forward()
local ini = (outi-1)*si+ki
local inj = (outj-1)*sj+kj
- local tm = {}
- local title = string.format('SpatialSubSampling.forward %dx%dx%d o %dx%d -> %dx%dx%d',
- from, inj, ini, kj, ki, to, outj, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(from,inj,ini):type(typename)
- local input = torch.randn(from,inj,ini)
- local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj)
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(ctype)
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):cuda()
- gconv.weight = sconv.weight:cuda()
- gconv.bias = sconv.bias:cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(typename)
+ gconv.weight = sconv.weight:type(typename)
+ gconv.bias = sconv.bias:type(typename)
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.SpatialSubSampling_forward_batch()
local bs = math.random(4,10)
local from = math.random(1,64)
local to = from
@@ -2057,34 +1877,24 @@ function cunntest.SpatialSubSampling_forward_batch()
local ini = (outi-1)*si+ki
local inj = (outj-1)*sj+kj
- local tm = {}
- local title = string.format('SpatialSubSampling.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
- bs, from, inj, ini, kj, ki, bs, to, outj, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,from,inj,ini):type(typename)
- local input = torch.randn(bs,from,inj,ini)
- local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj)
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(ctype)
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):cuda()
- gconv.weight = sconv.weight:cuda()
- gconv.bias = sconv.bias:cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(typename)
+ gconv.weight = sconv.weight:type(typename)
+ gconv.bias = sconv.bias:type(typename)
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.SpatialSubSampling_backward()
@@ -2099,51 +1909,50 @@ function cunntest.SpatialSubSampling_backward()
local ini = (outi-1)*si+ki
local inj = (outj-1)*sj+kj
- local tm = {}
- local title = string.format('SpatialSubSampling.backward %dx%dx%d o %dx%d -> %dx%dx%d',
- from, inj, ini, kj, ki, to, outj, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ -- FIXME: SpatialSubSampling accumulates directly to real, causes
+ -- precision issues with half
+ local precision_backward_old = precision_backward
+ if typename == 'torch.CudaHalfTensor' then
+ precision_backward = 0.4
+ end
+ local input = torch.randn(from,inj,ini):type(typename)
+ local gradOutput = torch.randn(to,outj,outi):type(typename)
- local input = torch.randn(from,inj,ini)
- local gradOutput = torch.randn(to,outj,outi)
- local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj)
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(ctype)
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- local groundweight = sconv.gradWeight
- local groundbias = sconv.gradBias
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
+ local groundweight = sconv.gradWeight
+ local groundbias = sconv.gradBias
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):cuda()
- gconv.weight = sconv.weight:cuda()
- gconv.bias = sconv.bias:cuda()
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(typename)
+ gconv.weight = sconv.weight:type(typename)
+ gconv.bias = sconv.bias:type(typename)
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- local weightcuda = gconv.gradWeight
- local biascuda = gconv.gradBias
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
+ local weightcuda = gconv.gradWeight
+ local biascuda = gconv.gradBias
- local error = rescuda:float() - groundgrad
- local werror = weightcuda:float() - groundweight
- local berror = biascuda:float() - groundbias
+ local error = rescuda:double() - groundgrad:double()
+ local werror = weightcuda:double() - groundweight:double()
+ local berror = biascuda:double() - groundbias:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
- mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
- mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ mytester:assertlt(werror:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on weight (backward) with %s', typename))
+ mytester:assertlt(berror:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on bias (backward) with %s', typename))
+
+ precision_backward = precision_backward_old
+ end
end
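The two SpatialSubSampling backward hunks loosen the tolerance for half precision in different ways, both prompted by the FIXME above (the kernel accumulates directly into real): the single-sample test temporarily overrides the base precision_backward for torch.CudaHalfTensor and restores it at the end of each iteration, while the batch test below keeps the base value and instead doubles the per-type bound returned by precision_backward_type. A minimal sketch of the override pattern, assuming precision_backward is the file-level base tolerance used throughout test.lua:

    -- Sketch only: temporarily widening the base tolerance for half precision.
    local saved = precision_backward
    if typename == 'torch.CudaHalfTensor' then
       precision_backward = 0.4        -- value taken from the hunk above
    end
    -- ... run the backward comparison with the (possibly widened) tolerance ...
    precision_backward = saved          -- always restore before the next type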
function cunntest.SpatialSubSampling_backward_batch()
@@ -2159,51 +1968,47 @@ function cunntest.SpatialSubSampling_backward_batch()
local ini = (outi-1)*si+ki
local inj = (outj-1)*sj+kj
- local tm = {}
- local title = string.format('SpatialSubSampling.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
- bs, from, inj, ini, kj, ki, bs, to, outj, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,from,inj,ini):type(typename)
+ local gradOutput = torch.randn(bs,to,outj,outi):type(typename)
- local input = torch.randn(bs,from,inj,ini)
- local gradOutput = torch.randn(bs,to,outj,outi)
- local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj)
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(ctype)
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- local groundweight = sconv.gradWeight
- local groundbias = sconv.gradBias
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
+ local groundweight = sconv.gradWeight
+ local groundbias = sconv.gradBias
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):cuda()
- gconv.weight = sconv.weight:cuda()
- gconv.bias = sconv.bias:cuda()
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.SpatialSubSampling(from,ki,kj,si,sj):type(typename)
+ gconv.weight = sconv.weight:type(typename)
+ gconv.bias = sconv.bias:type(typename)
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- local weightcuda = gconv.gradWeight
- local biascuda = gconv.gradBias
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
+ local weightcuda = gconv.gradWeight
+ local biascuda = gconv.gradBias
- local error = rescuda:float() - groundgrad
- local werror = weightcuda:float() - groundweight
- local berror = biascuda:float() - groundbias
+ local error = rescuda:double() - groundgrad:double()
+ local werror = weightcuda:double() - groundweight:double()
+ local berror = biascuda:double() - groundbias:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
- mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
- mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+ -- FIXME: SpatialSubSampling accumulates directly to real, causes
+ -- precision issues with half, so we double the error tolerance
+ mytester:assertlt(error:abs():max(),
+ 2*precision_backward_type(precision_backward, typename, rescuda:abs():max()),
+ string.format('error on state (backward) with %s', typename))
+ mytester:assertlt(werror:abs():max(),
+ 2*precision_backward_type(precision_backward, typename, weightcuda:abs():max()),
+ string.format('error on weight (backward) with %s', typename))
+ mytester:assertlt(berror:abs():max(),
+ 2*precision_backward_type(precision_backward, typename, biascuda:abs():max()),
+ string.format('error on bias (backward) with %s', typename))
+ end
end
function cunntest.SpatialMaxPooling_forward()
@@ -2221,36 +2026,26 @@ function cunntest.SpatialMaxPooling_forward()
local inj = (outj-1)*sj+kj - padj*2
local ceil_mode = math.random(0,1) == 1
- local tm = {}
- local title = string.format('SpatialMaxPooling.forward %dx%dx%d o %dx%d -> %dx%dx%d',
- from, inj, ini, kj, ki, to, outj, outi)
- times[title] = tm
-
- local input = torch.randn(from,inj,ini)
- local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj)
- if ceil_mode then sconv:ceil() end
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(from,inj,ini):type(typename)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(ctype)
+ if ceil_mode then sconv:ceil() end
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):cuda()
- if ceil_mode then gconv:ceil() end
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(typename)
+ if ceil_mode then gconv:ceil() end
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
- local error_ind = gconv.indices:float() - sconv.indices
- mytester:asserteq(error_ind:max(), 0, 'error on indices (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ local error_ind = gconv.indices:long() - sconv.indices
+ mytester:asserteq(error_ind:max(), 0,
+ string.format('error on indices (forward) with %s', typename))
+ end
end
function cunntest.SpatialMaxPooling_forward_batch()
@@ -2269,34 +2064,23 @@ function cunntest.SpatialMaxPooling_forward_batch()
local inj = (outj-1)*sj+kj - padj*2
local ceil_mode = math.random(0,1) == 1
- local tm = {}
- local title = string.format('SpatialMaxPooling.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
- bs, from, inj, ini, kj, ki, bs, to, outj, outi)
- times[title] = tm
-
- local input = torch.randn(bs,from,inj,ini)
- local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj)
- if ceil_mode then sconv:ceil() end
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,from,inj,ini):type(typename)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(ctype)
+ if ceil_mode then sconv:ceil() end
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):cuda()
- if ceil_mode then gconv:ceil() end
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(typename)
+ if ceil_mode then gconv:ceil() end
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.SpatialMaxUnpooling_forward_batch()
@@ -2316,40 +2100,29 @@ function cunntest.SpatialMaxUnpooling_forward_batch()
local ini = fun((outi + padi*2 - ki)/si) +1
local inj = fun((outj + padj*2 - kj)/sj) +1
- local tm = {}
- local title = string.format('SpatialMaxUnpooling.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
- bs, from, inj, ini, kj, ki, bs, to, outj, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local ctype = t2cpu[typename]
+ local pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(ctype)
+ if ceil_mode then pooler:ceil() end
+ local sunpool = nn.SpatialMaxUnpooling(pooler):type(ctype)
- local pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj)
- if ceil_mode then pooler:ceil() end
- local sunpool = nn.SpatialMaxUnpooling(pooler)
-
- local original = torch.randn(bs,from,outj,outi)
- local input = pooler:forward(original)
- local groundtruth = sunpool:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sunpool:forward(input)
- end
- tm.cpu = a:time().real
+ local original = torch.randn(bs,from,outj,outi):type(typename)
+ original = original:type(ctype)
+ local input = pooler:forward(original)
+ local groundtruth = sunpool:forward(input)
- original = original:cuda()
- pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):cuda()
- if ceil_mode then pooler:ceil() end
- local gunpool = nn.SpatialMaxUnpooling(pooler):cuda()
+ original = original:type(typename)
+ pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(typename)
+ if ceil_mode then pooler:ceil() end
+ local gunpool = nn.SpatialMaxUnpooling(pooler):type(typename)
- input = pooler:forward(original)
- local rescuda = gunpool:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gunpool:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = pooler:forward(original)
+ local rescuda = gunpool:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.SpatialMaxPooling_backward()
@@ -2365,45 +2138,34 @@ function cunntest.SpatialMaxPooling_backward()
local padj = math.random(0,kj/2-1)
local ini = (outi-1)*si+ki - padi*2
local inj = (outj-1)*sj+kj - padj*2
local ceil_mode = math.random(0,1) == 1
- local tm = {}
- local title = string.format('SpatialMaxPooling.backward %dx%dx%d o %dx%d -> %dx%dx%d',
- from, inj, ini, kj, ki, to, outj, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(from,inj,ini):type(typename)
+ local gradOutput = torch.randn(to,outj,outi):type(typename)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
- local input = torch.randn(from,inj,ini)
- local gradOutput = torch.randn(to,outj,outi)
- local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj)
- if ceil_mode then sconv:ceil() end
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(ctype)
+ if ceil_mode then sconv:ceil() end
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):cuda()
- if ceil_mode then gconv:ceil() end
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(typename)
+ if ceil_mode then gconv:ceil() end
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.SpatialMaxPooling_backward_batch()
@@ -2422,43 +2184,31 @@ function cunntest.SpatialMaxPooling_backward_batch()
local inj = (outj-1)*sj+kj - padj*2
local ceil_mode = math.random(0,1) == 1
- local tm = {}
- local title = string.format('SpatialMaxPooling.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
- bs, from, inj, ini, kj, ki, bs, to, outj, outi)
- times[title] = tm
-
- local input = torch.randn(bs,from,inj,ini)
- local gradOutput = torch.randn(bs,to,outj,outi)
- local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj)
- if ceil_mode then sconv:ceil() end
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,from,inj,ini):type(typename)
+ local gradOutput = torch.randn(bs,to,outj,outi):type(typename)
+ local ctype = t2cpu[typename]
+ local input = input:type(ctype)
+ local gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(ctype)
+ if ceil_mode then sconv:ceil() end
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):cuda()
- if ceil_mode then gconv:ceil() end
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(typename)
+ if ceil_mode then gconv:ceil() end
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.SpatialMaxUnpooling_backward_batch()
@@ -2478,50 +2228,38 @@ function cunntest.SpatialMaxUnpooling_backward_batch()
local ini = fun((outi + padi*2 - ki)/si) +1
local inj = fun((outj + padj*2 - kj)/sj) +1
- local tm = {}
- local title = string.format('SpatialMaxUnpooling.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
- bs, from, inj, ini, kj, ki, bs, to, outj, outi)
- times[title] = tm
-
- local pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj)
- if ceil_mode then pooler:ceil() end
- local sunpool = nn.SpatialMaxUnpooling(pooler)
-
- local original = torch.randn(bs,from,outj,outi)
- local input = pooler:forward(original)
- local gradOutput = torch.randn(original:size())
- sunpool:forward(input)
- sunpool:zeroGradParameters()
- local groundgrad = sunpool:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ for k, typename in ipairs(typenames) do
+ local ctype = t2cpu[typename]
+ local pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(ctype)
+ if ceil_mode then pooler:ceil() end
+ local sunpool = nn.SpatialMaxUnpooling(pooler):type(ctype)
+
+ local original = torch.randn(bs,from,outj,outi):type(typename)
+ original = original:type(ctype)
+ local input = pooler:forward(original)
+ local gradOutput = torch.randn(original:size()):type(typename)
+ gradOutput = gradOutput:type(ctype)
+ sunpool:forward(input)
sunpool:zeroGradParameters()
- groundgrad = sunpool:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ local groundgrad = sunpool:backward(input, gradOutput)
- pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):cuda()
- if ceil_mode then pooler:ceil() end
- local gunpool = nn.SpatialMaxUnpooling(pooler):cuda()
+ pooler = nn.SpatialMaxPooling(ki,kj,si,sj,padi,padj):type(typename)
+ if ceil_mode then pooler:ceil() end
+ local gunpool = nn.SpatialMaxUnpooling(pooler):type(typename)
- original = original:cuda()
- input = pooler:forward(original)
- gunpool:forward(input)
+ original = original:type(typename)
+ input = pooler:forward(original)
+ gunpool:forward(input)
- gradOutput = gradOutput:cuda()
- gunpool:zeroGradParameters()
- local rescuda = gunpool:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ gradOutput = gradOutput:type(typename)
gunpool:zeroGradParameters()
- rescuda = gunpool:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gunpool:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.SpatialDilatedMaxPooling_forward()
@@ -2541,36 +2279,26 @@ function cunntest.SpatialDilatedMaxPooling_forward()
local inj = (outj-1)*sj+(dilationj*(kj-1)+1)-2*padj
local ceil_mode = math.random(0,1) == 1
- local tm = {}
- local title = string.format('SpatialDilatedMaxPooling.forward %dx%dx%d o %dx%d -> %dx%dx%d',
- from, inj, ini, kj, ki, to, outj, outi)
- times[title] = tm
-
- local input = torch.randn(from,inj,ini)
- local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj)
- if ceil_mode then sconv:ceil() end
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(from,inj,ini):type(typename)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(ctype)
+ if ceil_mode then sconv:ceil() end
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):cuda()
- if ceil_mode then gconv:ceil() end
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(typename)
+ if ceil_mode then gconv:ceil() end
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
- local error_ind = gconv.indices:float() - sconv.indices
- mytester:asserteq(error_ind:max(), 0, 'error on indices (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ local error_ind = gconv.indices:long() - sconv.indices
+ mytester:asserteq(error_ind:max(), 0,
+ string.format('error on indices (forward) with %s', typename))
+ end
end
function cunntest.SpatialDilatedMaxPooling_forward_batch()
@@ -2591,34 +2319,23 @@ function cunntest.SpatialDilatedMaxPooling_forward_batch()
local inj = (outj-1)*sj+(dilationj*(kj-1)+1)-2*padj
local ceil_mode = math.random(0,1) == 1
- local tm = {}
- local title = string.format('SpatialDilatedMaxPooling.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
- bs, from, inj, ini, kj, ki, bs, to, outj, outi)
- times[title] = tm
-
- local input = torch.randn(bs,from,inj,ini)
- local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj)
- if ceil_mode then sconv:ceil() end
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,from,inj,ini):type(typename)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(ctype)
+ if ceil_mode then sconv:ceil() end
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):cuda()
- if ceil_mode then gconv:ceil() end
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(typename)
+ if ceil_mode then gconv:ceil() end
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.SpatialDilatedMaxPooling_backward()
@@ -2638,43 +2355,31 @@ function cunntest.SpatialDilatedMaxPooling_backward()
local inj = (outj-1)*sj+(dilationj*(kj-1)+1)-2*padj
local ceil_mode = math.random(0,1) == 1
- local tm = {}
- local title = string.format('SpatialDilatedMaxPooling.backward %dx%dx%d o %dx%d -> %dx%dx%d',
- from, inj, ini, kj, ki, to, outj, outi)
- times[title] = tm
-
- local input = torch.randn(from,inj,ini)
- local gradOutput = torch.randn(to,outj,outi)
- local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj)
- if ceil_mode then sconv:ceil() end
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(from,inj,ini):type(typename)
+ local gradOutput = torch.randn(to,outj,outi):type(typename)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(ctype)
+ if ceil_mode then sconv:ceil() end
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):cuda()
- if ceil_mode then gconv:ceil() end
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(typename)
+ if ceil_mode then gconv:ceil() end
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.SpatialDilatedMaxPooling_backward_batch()
@@ -2695,43 +2400,31 @@ function cunntest.SpatialDilatedMaxPooling_backward_batch()
local inj = (outj-1)*sj+(dilationj*(kj-1)+1)-2*padj
local ceil_mode = math.random(0,1) == 1
- local tm = {}
- local title = string.format('SpatialDilatedMaxPooling.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
- bs, from, inj, ini, kj, ki, bs, to, outj, outi)
- times[title] = tm
-
- local input = torch.randn(bs,from,inj,ini)
- local gradOutput = torch.randn(bs,to,outj,outi)
- local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj)
- if ceil_mode then sconv:ceil() end
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,from,inj,ini):type(typename)
+ local gradOutput = torch.randn(bs,to,outj,outi):type(typename)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(ctype)
+ if ceil_mode then sconv:ceil() end
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):cuda()
- if ceil_mode then gconv:ceil() end
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.SpatialDilatedMaxPooling(ki,kj,si,sj,padi,padj,dilationi,dilationj):type(typename)
+ if ceil_mode then gconv:ceil() end
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.SpatialFractionalMaxPooling_forward()
@@ -2752,66 +2445,55 @@ function cunntest.SpatialFractionalMaxPooling_forward()
local ratioW = outW / inW
local ratioH = outH / inH
- local tm = {}
- local title =
- string.format('SpatialFractionalMaxPooling.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
- batch, plane, inH, inW, poolSizeH, poolSizeW, batch, plane, outH, outW)
- times[title] = tm
-
- local input = nil
- if batch == 1 then
- input = torch.Tensor(plane, inH, inW):uniform()
- else
- input = torch.Tensor(batch, plane, inH, inW):uniform()
- end
+ for k, typename in ipairs(typenames) do
+ local input = nil
+ if batch == 1 then
+ input = torch.Tensor(plane, inH, inW):uniform():type(typename)
+ else
+ input = torch.Tensor(batch, plane, inH, inW):uniform():type(typename)
+ end
- local module = nil
- if useRatio then
- module =
- nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, ratioW, ratioH)
- else
- module =
- nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH)
- end
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local module = nil
+ if useRatio then
+ module =
+ nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, ratioW, ratioH):type(ctype)
+ else
+ module =
+ nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH):type(ctype)
+ end
- module:fixPoolingRegions()
+ module:fixPoolingRegions()
- local groundtruth = module:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = module:forward(input)
- end
- tm.cpu = a:time().real
+ local groundtruth = module:forward(input)
- input = input:cuda()
+ input = input:type(typename)
- local gmodule = nil
- if useRatio then
- gmodule =
- nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, ratioW, ratioH)
- else
- gmodule =
- nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH)
- end
+ local gmodule = nil
+ if useRatio then
+ gmodule =
+ nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, ratioW, ratioH)
+ else
+ gmodule =
+ nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH)
+ end
- gmodule = gmodule:fixPoolingRegions():cuda()
+ gmodule = gmodule:fixPoolingRegions():type(typename)
- -- For comparison purposes, make sure we are using the same random pooling regions
- -- as the CPU
- gmodule.randomSamples = module.randomSamples:cuda()
+ -- For comparison purposes, make sure we are using the same random pooling regions
+ -- as the CPU
+ gmodule.randomSamples = module.randomSamples:type(typename)
- local rescuda = gmodule:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gmodule:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gmodule:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
- local error_ind = gmodule.indices:float() - module.indices
- mytester:asserteq(error_ind:abs():max(), 0, 'error on indices (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ local error_ind = gmodule.indices:long() - module.indices
+ mytester:asserteq(error_ind:abs():max(), 0,
+ string.format('error on indices (forward) with %s', typename))
+ end
end
function cunntest.SpatialFractionalMaxPooling_backward()
@@ -2828,78 +2510,71 @@ function cunntest.SpatialFractionalMaxPooling_backward()
local inW = math.random(minInW, minInW + 6)
local inH = math.random(minInH, minInH + 6)
- local tm = {}
- local title =
- string.format('SpatialFractionalMaxPooling.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
- batch, plane, inH, inW, poolSizeH, poolSizeW, batch, plane, outH, outW)
- times[title] = tm
-
- local input = nil
- local gradOutput = nil
- if batch == 1 then
- input = torch.Tensor(plane, inH, inW):uniform()
- gradOutput = torch.Tensor(plane, outH, outW):uniform()
- else
- input = torch.Tensor(batch, plane, inH, inW):uniform()
- gradOutput = torch.Tensor(batch, plane, outH, outW):uniform()
- end
+ for k, typename in ipairs(typenames) do
+ local input = nil
+ local gradOutput = nil
+ if batch == 1 then
+ input = torch.Tensor(plane, inH, inW):uniform():type(typename)
+ gradOutput = torch.Tensor(plane, outH, outW):uniform():type(typename)
+ else
+ input = torch.Tensor(batch, plane, inH, inW):uniform():type(typename)
+ gradOutput = torch.Tensor(batch, plane, outH, outW):uniform():type(typename)
+ end
- local module =
- nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH)
- :fixPoolingRegions()
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local module =
+ nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH)
+ :fixPoolingRegions():type(ctype)
- module:forward(input)
- module:zeroGradParameters()
- local groundgrad = module:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ -- convert type of randomSamples and ensure we don't resample
+ module:initSampleBuffer_(input)
+ module:fixPoolingRegions()
+ module.randomSamples = module.randomSamples:type(typename):type(ctype)
+ module:forward(input)
module:zeroGradParameters()
- groundgrad = module:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
-
- input = input:cuda()
- gradOutput = gradOutput:cuda()
-
- local gmodule =
- nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH)
- :fixPoolingRegions():cuda()
- -- For comparison purposes, make sure we are using the same random pooling regions
- -- as the CPU
- gmodule.randomSamples = module.randomSamples:cuda()
-
- gmodule:forward(input)
- gmodule:zeroGradParameters()
- local rescuda = gmodule:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ local groundgrad = module:backward(input, gradOutput)
+
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+
+ local gmodule =
+ nn.SpatialFractionalMaxPooling(poolSizeW, poolSizeH, outW, outH)
+ :fixPoolingRegions():type(typename)
+ -- For comparison purposes, make sure we are using the same random pooling regions
+ -- as the CPU
+ gmodule.randomSamples = module.randomSamples:type(typename)
+
+ gmodule:forward(input)
gmodule:zeroGradParameters()
- rescuda = gmodule:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gmodule:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ local error = rescuda:double() - groundgrad:double()
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
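All of these assertions route their tolerance through `precision_forward_type` and `precision_backward_type` rather than the flat `precision_forward`/`precision_backward` constants. Their definitions are not part of this diff; a plausible sketch, assuming only that `torch.CudaHalfTensor` needs a looser absolute bound than the float/double baseline, would be:

   -- Hypothetical sketch of the tolerance helpers; the actual definitions live elsewhere in test.lua.
   local function precision_forward_type(precision_f, tensor_type)
      if tensor_type == 'torch.CudaHalfTensor' then
         return 1e-2 + precision_f    -- half precision: allow extra absolute error
      end
      return precision_f
   end

   local function precision_backward_type(precision_b, tensor_type)
      if tensor_type == 'torch.CudaHalfTensor' then
         return 1e-1 + precision_b
      end
      return precision_b
   end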
function cunntest.SpatialAveragePooling_includepad()
- local net = nn.SpatialAveragePooling(2, 2, 1, 1, 1, 1):cuda()
- local net_no_include_pad = net:clone()
- net_no_include_pad:setCountExcludePad()
- local net_include_pad = net:clone()
- net_include_pad:setCountIncludePad()
+ for k, typename in ipairs(typenames) do
+ local net = nn.SpatialAveragePooling(2, 2, 1, 1, 1, 1):type(typename)
+ local net_no_include_pad = net:clone()
+ net_no_include_pad:setCountExcludePad()
+ local net_include_pad = net:clone()
+ net_include_pad:setCountIncludePad()
- local input = torch.FloatTensor(1, 1, 1, 1):cuda()
- input[1][1][1][1] = 3
- local out_noinclude = net_no_include_pad:forward(input)
- local out_include = net_include_pad:forward(input)
+ local input = torch.FloatTensor(1, 1, 1, 1):type(typename)
+ input[1][1][1][1] = 3
+ local out_noinclude = net_no_include_pad:forward(input)
+ local out_include = net_include_pad:forward(input)
- local noinc_out = out_noinclude[1][1][1][1]
- local inc_out = out_include[1][1][1][1]
- mytester:assertne(noinc_out, inc_out)
- mytester:asserteq(3, noinc_out)
- mytester:asserteq(3/4, inc_out)
+ local noinc_out = out_noinclude[1][1][1][1]
+ local inc_out = out_include[1][1][1][1]
+ mytester:assertne(noinc_out, inc_out)
+ mytester:asserteq(3, noinc_out)
+ mytester:asserteq(3/4, inc_out)
+ end
end
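The expected values asserted above follow directly from the pooling arithmetic: with a 2x2 window, stride 1 and padding 1 around a single-element input holding 3, every window that covers that element sees one real value and three padded zeros, so excluding padding averages 3/1 = 3 while including it averages 3/4. A trivial standalone check of the same arithmetic (plain Lua, no CUDA required):

   -- Sanity check of the two expected averages used in the test above.
   local real_sum, real_count, window_size = 3, 1, 2 * 2
   assert(real_sum / real_count == 3)       -- setCountExcludePad(): divide by real elements only
   assert(real_sum / window_size == 3 / 4)  -- setCountIncludePad(): divide by the full window size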
function cunntest.SpatialAveragePooling_forward()
@@ -2918,36 +2593,26 @@ function cunntest.SpatialAveragePooling_forward()
local ceil_mode = math.random(0,1) == 1
local count_exclude_pad = math.random(0,1) == 1
- local tm = {}
- local title = string.format('SpatialAveragePooling.forward %dx%dx%d o %dx%d -> %dx%dx%d',
- from, inj, ini, kj, ki, to, outj, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(from,inj,ini):type(typename)
- local input = torch.randn(from,inj,ini)
- local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj)
- if ceil_mode then sconv:ceil() end
- if count_exclude_pad then sconv:setCountExcludePad() end
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(ctype)
+ if ceil_mode then sconv:ceil() end
+ if count_exclude_pad then sconv:setCountExcludePad() end
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):cuda()
- if ceil_mode then gconv:ceil() end
- if count_exclude_pad then gconv:setCountExcludePad() end
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(typename)
+ if ceil_mode then gconv:ceil() end
+ if count_exclude_pad then gconv:setCountExcludePad() end
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.SpatialAveragePooling_forward_batch()
@@ -2967,36 +2632,26 @@ function cunntest.SpatialAveragePooling_forward_batch()
local ceil_mode = math.random(0,1) == 1
local count_exclude_pad = math.random(0,1) == 1
- local tm = {}
- local title = string.format('SpatialAveragePooling.forward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
- bs, from, inj, ini, kj, ki, bs, to, outj, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,from,inj,ini):type(typename)
+ local ctype = t2cpu[typename]
- local input = torch.randn(bs,from,inj,ini)
- local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj)
- if ceil_mode then sconv:ceil() end
- if count_exclude_pad then sconv:setCountExcludePad() end
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ input = input:type(ctype)
+ local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(ctype)
+ if ceil_mode then sconv:ceil() end
+ if count_exclude_pad then sconv:setCountExcludePad() end
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):cuda()
- if ceil_mode then gconv:ceil() end
- if count_exclude_pad then gconv:setCountExcludePad() end
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(typename)
+ if ceil_mode then gconv:ceil() end
+ if count_exclude_pad then gconv:setCountExcludePad() end
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.SpatialAveragePooling_backward()
@@ -3015,45 +2670,34 @@ function cunntest.SpatialAveragePooling_backward()
local ceil_mode = math.random(0,1) == 1
local count_exclude_pad = math.random(0,1) == 1
- local tm = {}
- local title = string.format('SpatialAveragePooling.backward %dx%dx%d o %dx%d -> %dx%dx%d',
- from, inj, ini, kj, ki, to, outj, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(from,inj,ini):type(typename)
+ local gradOutput = torch.randn(to,outj,outi):type(typename)
- local input = torch.randn(from,inj,ini)
- local gradOutput = torch.randn(to,outj,outi)
- local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj)
- if ceil_mode then sconv:ceil() end
- if count_exclude_pad then sconv:setCountExcludePad() end
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(ctype)
+ if ceil_mode then sconv:ceil() end
+ if count_exclude_pad then sconv:setCountExcludePad() end
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):cuda()
- if ceil_mode then gconv:ceil() end
- if count_exclude_pad then gconv:setCountExcludePad() end
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(typename)
+ if ceil_mode then gconv:ceil() end
+ if count_exclude_pad then gconv:setCountExcludePad() end
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.SpatialAveragePooling_backward_batch()
@@ -3073,45 +2717,34 @@ function cunntest.SpatialAveragePooling_backward_batch()
local ceil_mode = math.random(0,1) == 1
local count_exclude_pad = math.random(0,1) == 1
- local tm = {}
- local title = string.format('SpatialAveragePooling.backward %dx%dx%dx%d o %dx%d -> %dx%dx%dx%d',
- bs, from, inj, ini, kj, ki, bs, to, outj, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,from,inj,ini):type(typename)
+ local gradOutput = torch.randn(bs,to,outj,outi):type(typename)
- local input = torch.randn(bs,from,inj,ini)
- local gradOutput = torch.randn(bs,to,outj,outi)
- local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj)
- if ceil_mode then sconv:ceil() end
- if count_exclude_pad then sconv:setCountExcludePad() end
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(ctype)
+ if ceil_mode then sconv:ceil() end
+ if count_exclude_pad then sconv:setCountExcludePad() end
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):cuda()
- if ceil_mode then gconv:ceil() end
- if count_exclude_pad then gconv:setCountExcludePad() end
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.SpatialAveragePooling(ki,kj,si,sj,padi,padj):type(typename)
+ if ceil_mode then gconv:ceil() end
+ if count_exclude_pad then gconv:setCountExcludePad() end
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.SpatialAdaptiveMaxPooling_forward()
@@ -3122,34 +2755,24 @@ function cunntest.SpatialAdaptiveMaxPooling_forward()
local ini = math.random(10,256)
local inj = math.random(10,256)
- local tm = {}
- local title = string.format('SpatialAdaptiveMaxPooling.forward %dx%dx%d -> %dx%dx%d',
- from, inj, ini, to, outj, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(from,inj,ini):type(typename)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(ctype)
+ local groundtruth = sconv:forward(input):type(ctype)
- local input = torch.randn(from,inj,ini)
- local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj)
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(typename)
+ local rescuda = gconv:forward(input)
- input = input:cuda()
- local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ local error_ind = gconv.indices:long() - sconv.indices
+ mytester:asserteq(error_ind:max(), 0,
+ string.format('error on indices (forward) with %s', typename))
end
- cutorch.synchronize()
- tm.gpu = a:time().real
-
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
- local error_ind = gconv.indices:float() - sconv.indices
- mytester:asserteq(error_ind:max(), 0, 'error on indices (forward) ')
end
function cunntest.SpatialAdaptiveMaxPooling_forward_noncontig()
@@ -3160,35 +2783,24 @@ function cunntest.SpatialAdaptiveMaxPooling_forward_noncontig()
local ini = math.random(10,256)
local inj = math.random(10,256)
- local tm = {}
- local title = string.format('SpatialAdaptiveMaxPooling.forward %s %dx%dx%d -> %dx%dx%d',
- 'non-contiguous',from, inj, ini, to, outj, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input0 = torch.randn(from,ini,inj):type(typename)
+ local ctype = t2cpu[typename]
+ local input = input0:type(ctype):transpose(2,3)
+ local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(ctype)
+ local groundtruth = sconv:forward(input)
- local input0 = torch.randn(from,ini,inj)
- local input = input0:transpose(2,3)
- local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj)
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ input = input0:type(typename):transpose(2,3)
+ local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(typename)
+ local rescuda = gconv:forward(input)
- input = input0:cuda():transpose(2,3)
- local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ local error_ind = gconv.indices:long() - sconv.indices
+ mytester:asserteq(error_ind:max(), 0,
+ string.format('error on indices (forward) with %s', typename))
end
- cutorch.synchronize()
- tm.gpu = a:time().real
-
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
- local error_ind = gconv.indices:float() - sconv.indices
- mytester:asserteq(error_ind:max(), 0, 'error on indices (forward) ')
end
function cunntest.SpatialAdaptiveMaxPooling_forward_batch()
@@ -3200,32 +2812,21 @@ function cunntest.SpatialAdaptiveMaxPooling_forward_batch()
local ini = math.random(10,256)
local inj = math.random(10,256)
- local tm = {}
- local title = string.format('SpatialAdaptiveMaxPooling.forward %dx%dx%dx%d -> %dx%dx%dx%d',
- bs, from, inj, ini, bs, to, outj, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,from,inj,ini):type(typename)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(ctype)
+ local groundtruth = sconv:forward(input)
- local input = torch.randn(bs,from,inj,ini)
- local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj)
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(typename)
+ local rescuda = gconv:forward(input)
- input = input:cuda()
- local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
end
- cutorch.synchronize()
- tm.gpu = a:time().real
-
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
end
function cunntest.SpatialAdaptiveMaxPooling_backward()
@@ -3236,41 +2837,29 @@ function cunntest.SpatialAdaptiveMaxPooling_backward()
local ini = math.random(10,256)
local inj = math.random(10,256)
- local tm = {}
- local title = string.format('SpatialAdaptiveMaxPooling.backward %dx%dx%d -> %dx%dx%d',
- from, inj, ini, to, outj, outi)
- times[title] = tm
-
- local input = torch.randn(from,inj,ini)
- local gradOutput = torch.randn(to,outj,outi)
- local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj)
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(from,inj,ini):type(typename)
+ local gradOutput = torch.randn(to,outj,outi):type(typename)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(ctype)
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):cuda()
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(typename)
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.SpatialAdaptiveMaxPooling_backward_noncontig()
@@ -3281,42 +2870,29 @@ function cunntest.SpatialAdaptiveMaxPooling_backward_noncontig()
local ini = math.random(10,256)
local inj = math.random(10,256)
- local tm = {}
- local title = string.format('SpatialAdaptiveMaxPooling.backward %s %dx%dx%d -> %dx%dx%d',
- 'non-contiguous', from, inj, ini, to, outj, outi)
- times[title] = tm
-
- local input0 = torch.randn(from,ini,inj)
- local input = input0:transpose(2,3)
- local gradOutput = torch.randn(to,outj,outi)
- local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj)
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ for k, typename in ipairs(typenames) do
+ local input0 = torch.randn(from,ini,inj):type(typename)
+ local gradOutput = torch.randn(to,outj,outi):type(typename)
+ local ctype = t2cpu[typename]
+ local input = input0:type(ctype):transpose(2,3)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(ctype)
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
- input = input0:cuda():transpose(2,3)
- gradOutput = gradOutput:cuda()
- local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):cuda()
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input0:type(typename):transpose(2,3)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(typename)
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.SpatialAdaptiveMaxPooling_backward_batch()
@@ -3328,41 +2904,29 @@ function cunntest.SpatialAdaptiveMaxPooling_backward_batch()
local ini = math.random(10,256)
local inj = math.random(10,256)
- local tm = {}
- local title = string.format('SpatialAdaptiveMaxPooling.backward %dx%dx%dx%d -> %dx%dx%dx%d',
- bs, from, inj, ini, bs, to, outj, outi)
- times[title] = tm
-
- local input = torch.randn(bs,from,inj,ini)
- local gradOutput = torch.randn(bs,to,outj,outi)
- local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj)
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,from,inj,ini):type(typename)
+ local gradOutput = torch.randn(bs,to,outj,outi):type(typename)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(ctype)
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):cuda()
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.SpatialAdaptiveMaxPooling(outi,outj):type(typename)
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.SpatialLPPooling_forward()
@@ -3495,164 +3059,154 @@ end
function cunntest.BCECriterion_forward()
local size = math.random(1,100)
- local input = torch.Tensor(size):uniform()
- local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input))
-
- local tm = {}
- local title = string.format('BCECriterion.forward, Size: %d', size)
- times[title] = tm
-
- local crit = nn.BCECriterion()
- local rescpu = crit:forward(input, target)
- local a = torch.Timer()
- for i = 1,nloop do
- rescpu = crit:forward(input, target)
- end
- tm.cpu = a:time().real
-
- input = input:cuda()
- target = target:cuda()
- local g_crit = nn.BCECriterion():cuda()
- local rescuda = g_crit:forward(input, target)
- a:reset()
- for i = 1,nloop do
- rescuda = g_crit:forward(input, target)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
- local errorVal = rescuda - rescpu
- mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ')
-
- -- test vs lua implementation
- buffer = input.new()
- local restruth = BCECriterion_forward_truth(buffer, input, target, nil, true)
- for i = 1,nloop do
- local restruth = BCECriterion_forward_truth(buffer, input, target, nil, true)
+
+ for k, typename in ipairs(typenames) do
+ local input = torch.Tensor(size):uniform():type(typename)
+ local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input))
+
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ target = target:type(ctype)
+ local crit = nn.BCECriterion():type(ctype)
+ local rescpu = crit:forward(input, target)
+
+ input = input:type(typename)
+ target = target:type(typename)
+ local g_crit = nn.BCECriterion():type(typename)
+ local rescuda = g_crit:forward(input, target)
+ local errorVal = rescuda - rescpu
+ mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+
+ -- test vs lua implementation
+ input = input:type(ctype)
+ target = target:type(ctype)
+ buffer = input.new()
+ local restruth = BCECriterion_forward_truth(buffer, input, target, nil, true)
+ errorVal = rescpu - restruth
+ mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ errorVal = rescuda - restruth
+ mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
end
- errorVal = rescpu - restruth
- mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ')
- errorVal = rescuda - restruth
- mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ')
end
-
function cunntest.BCECriterionWeights_forward()
local size = math.random(1,100)
- local input = torch.Tensor(size):uniform()
- local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input))
- local weights = torch.Tensor(size):uniform()
-
- local tm = {}
- local title = string.format('BCECriterionWeights.forward, Size: %d', size)
- times[title] = tm
-
- local crit = nn.BCECriterion(weights)
- local rescpu = crit:forward(input, target)
- local a = torch.Timer()
- for i = 1,nloop do
- rescpu = crit:forward(input, target)
- end
- tm.cpu = a:time().real
-
- input = input:cuda()
- target = target:cuda()
- weights = weights:cuda()
- local g_crit = nn.BCECriterion(weights):cuda()
- local rescuda = g_crit:forward(input, target)
- a:reset()
- for i = 1,nloop do
- rescuda = g_crit:forward(input, target)
+ for k, typename in ipairs(typenames) do
+ local input = torch.Tensor(size):uniform():type(typename)
+ local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input))
+ local weights = torch.Tensor(size):uniform():type(typename)
+
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ target = target:type(ctype)
+ weights = weights:type(ctype)
+ local crit = nn.BCECriterion(weights):type(ctype)
+ local rescpu = crit:forward(input, target)
+
+ input = input:type(typename)
+ target = target:type(typename)
+ weights = weights:type(typename)
+ local g_crit = nn.BCECriterion(weights):type(typename)
+ local rescuda = g_crit:forward(input, target)
+
+ local errorVal = rescuda - rescpu
+ mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+
+ -- test vs lua implementation
+ -- FIXME: half does not support dot without CUDA 8.0, so can't compare to lua implementation.
+ if typename ~= 'torch.CudaHalfTensor' then
+ buffer = input.new()
+ restruth = BCECriterion_forward_truth(buffer, input, target, weights, true)
+ errorVal = rescpu - restruth
+ mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ errorVal = rescuda - restruth
+ mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
- cutorch.synchronize()
- tm.gpu = a:time().real
- local errorVal = rescuda - rescpu
- mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ')
-
- -- test vs lua implementation
- buffer = input.new()
- local restruth = BCECriterion_forward_truth(buffer, input, target, weights, true)
- for i = 1,nloop do
- local restruth = BCECriterion_forward_truth(buffer, input, target, weights, true)
- end
- errorVal = rescpu - restruth
- mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ')
- errorVal = rescuda - restruth
- mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ')
end
-
function cunntest.MarginCriterion_forward()
local size = math.random(1,100)
- local input = (torch.rand(size)-0.5) * 2 -- data spread from -1 to 1
- local target = (torch.round(torch.rand(size))*2)-1 -- generate random labels -1, 1
-
- local tm = {}
- local title = string.format('MarginCriterion.forward, Size: %d', size)
- times[title] = tm
-
- local crit = nn.MarginCriterion()
- local groundtruth= crit:forward(input, target)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = crit:forward(input, target)
- end
- tm.cpu = a:time().real
-
- input = input:cuda()
- target = target:cuda()
- local g_crit = nn.MarginCriterion():cuda()
- local rescuda = g_crit:forward(input, target)
- a:reset()
- for i = 1,nloop do
- rescuda = g_crit:forward(input, target)
+
+ for k, typename in ipairs(typenames) do
+ local input = ((torch.rand(size)-0.5) * 2):type(typename) -- data spread from -1 to 1
+ local target = ((torch.round(torch.rand(size))*2)-1):type(typename) -- generate random labels -1, 1
+
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ target = target:type(ctype)
+ local crit = nn.MarginCriterion():type(ctype)
+ local groundtruth = crit:forward(input, target)
+
+ input = input:type(typename)
+ target = target:type(typename)
+ local g_crit = nn.MarginCriterion():type(typename)
+ local rescuda = g_crit:forward(input, target)
+ local errorVal = rescuda - groundtruth
+ mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
end
- cutorch.synchronize()
- tm.gpu = a:time().real
- local errorVal = rescuda - groundtruth
- mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ')
end
function cunntest.MultiLabelMarginCriterion_forward()
local size = math.random(1,100)
- local input = (torch.rand(size)-0.5) * 2 -- data spread from -1 to 1
- local target = torch.round(torch.rand(size)*(size-1)):add(1) -- generate random labels > 0
- local zero = math.random(0,size) -- turn some labels into 0 targets
- if zero > 0 then
- target:sub(size-zero+1,size):zero()
- end
- local crit = nn.MultiLabelMarginCriterion()
- local groundtruth= crit:forward(input, target)
- input = input:cuda()
- target = target:cuda()
- local g_crit = nn.MultiLabelMarginCriterion():cuda()
- local rescuda = g_crit:forward(input, target)
- local errorVal = rescuda - groundtruth
- mytester:assertlt(errorVal, precision_forward, 'error on state (forward) ')
+ for k, typename in ipairs(typenames) do
+ local input = ((torch.rand(size)-0.5) * 2):type(typename) -- data spread from -1 to 1
+ local target = torch.round(torch.rand(size)*(size-1)):add(1) -- generate random labels > 0
+ local zero = math.random(0,size) -- turn some labels into 0 targets
+ if zero > 0 then
+ target:sub(size-zero+1,size):zero()
+ end
+
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local crit = nn.MultiLabelMarginCriterion():type(ctype)
+ local groundtruth= crit:forward(input, target)
+ input = input:type(typename)
+ target = target:type(typename)
+ local g_crit = nn.MultiLabelMarginCriterion():type(typename)
+ local rescuda = g_crit:forward(input, target)
+ local errorVal = rescuda - groundtruth
+ mytester:assertlt(errorVal, precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.MultiLabelMarginCriterion_backward()
local size = math.random(1,100)
- local input = (torch.rand(size)-0.5) * 2 -- data spread from -1 to 1
- local target = torch.round(torch.rand(size)*(size-1)):add(1) -- generate random labels > 0
- local zero = math.random(0,size) -- turn some labels into 0 targets
- if zero > 0 then
- target:sub(size-zero+1,size):zero()
- end
- local crit = nn.MultiLabelMarginCriterion()
- local pred = crit:forward(input, target)
- local groundgrad = crit:backward(input, target)
+ for k, typename in ipairs(typenames) do
+ local input = ((torch.rand(size)-0.5) * 2):type(typename) -- data spread from -1 to 1
+ local target = torch.round(torch.rand(size)*(size-1)):add(1) -- generate random labels > 0
+ local zero = math.random(0,size) -- turn some labels into 0 targets
+ if zero > 0 then
+ target:sub(size-zero+1,size):zero()
+ end
- input = input:cuda()
- target = target:cuda()
- local g_crit = nn.MultiLabelMarginCriterion():cuda()
- g_crit:forward(input, target)
- local rescuda = g_crit:backward(input, target)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local crit = nn.MultiLabelMarginCriterion():type(ctype)
+ local pred = crit:forward(input, target)
+ local groundgrad = crit:backward(input, target)
- local error = rescuda:float() - groundgrad
+ input = input:type(typename)
+ target = target:type(typename)
+ local g_crit = nn.MultiLabelMarginCriterion():type(typename)
+ g_crit:forward(input, target)
+ local rescuda = g_crit:backward(input, target)
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ local error = rescuda:double() - groundgrad:double()
+
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.SpatialCrossMapLRN_forward_batch()
@@ -3664,31 +3218,22 @@ function cunntest.SpatialCrossMapLRN_forward_batch()
local beta = math.random(0,100)/100
local k = math.random(1,3)
- local tm = {}
- local title = string.format('SpatialCrossMapLRN.forward')
- times[title] = tm
+ for _, typename in ipairs(typenames) do
+ local input = torch.rand(bs, nbfeatures, inputSize, inputSize):type(typename)
- local input = torch.rand(bs, nbfeatures, inputSize, inputSize)
- local sconv = nn.SpatialCrossMapLRN(size, alpha, beta, k)
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialCrossMapLRN(size, alpha, beta, k):type(ctype)
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = nn.SpatialCrossMapLRN(size, alpha, beta, k):cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.SpatialCrossMapLRN(size, alpha, beta, k):type(typename)
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.SpatialCrossMapLRN_backward_batch()
@@ -3700,242 +3245,208 @@ function cunntest.SpatialCrossMapLRN_backward_batch()
local beta = math.random(0,100)/100
local k = math.random(1,3)
- local tm = {}
- local title = string.format('SpatialCrossMapLRN.backward')
- times[title] = tm
+ for _, typename in ipairs(typenames) do
+ local input = torch.rand(bs, nbfeatures, inputSize, inputSize):type(typename)
+ local gradOutput = torch.rand(input:size()):type(typename)
- local input = torch.rand(bs, nbfeatures, inputSize, inputSize)
- local gradOutput = torch.rand(input:size())
- local sconv = nn.SpatialCrossMapLRN(size, alpha, beta, k)
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SpatialCrossMapLRN(size, alpha, beta, k):type(ctype)
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.SpatialCrossMapLRN(size, alpha, beta, k):cuda()
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.SpatialCrossMapLRN(size, alpha, beta, k):type(typename)
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.MarginCriterion_backward()
local size = math.random(1,100)
- local tm = {}
- local title = string.format('MarginCriterion.backward, Size %d', size)
- times[title] = tm
-
- local input = (torch.rand(size)-0.5) * 2 -- data spread from -1 to 1
- local target = (torch.round(torch.rand(size))*2)-1 -- generate random labels -1, 1
+ for k, typename in ipairs(typenames) do
+ local input = ((torch.rand(size)-0.5) * 2):type(typename) -- data spread from -1 to 1
+ local target = ((torch.round(torch.rand(size))*2)-1):type(typename) -- generate random labels -1, 1
- local crit = nn.MarginCriterion()
- crit:forward(input, target)
- local groundgrad = crit:backward(input, target)
- local a = torch.Timer()
- for i = 1,nloop do
- groundgrad = crit:backward(input, target)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ target = target:type(ctype)
+ local crit = nn.MarginCriterion():type(ctype)
+ crit:forward(input, target)
+ local groundgrad = crit:backward(input, target)
- input = input:cuda()
- target = target:cuda()
- local g_crit = nn.MarginCriterion():cuda()
- g_crit:forward(input, target)
- local rescuda = g_crit:backward(input, target)
- a:reset()
- for i = 1,nloop do
- rescuda = g_crit:backward(input, target)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ target = target:type(typename)
+ local g_crit = nn.MarginCriterion():type(typename)
+ g_crit:forward(input, target)
+ local rescuda = g_crit:backward(input, target)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.BCECriterion_backward()
local size = math.random(1,100)
- local tm = {}
- local title = string.format('BCECriterion.backward, Size %d', size)
- times[title] = tm
-
- local input = torch.Tensor(size):uniform()
- local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input))
+ for k, typename in ipairs(typenames) do
+ local input = torch.Tensor(size):uniform():type(typename)
+ local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input))
- local crit = nn.BCECriterion()
- crit:forward(input, target)
- local groundgrad = crit:backward(input, target)
- local a = torch.Timer()
- for i = 1,nloop do
- groundgrad = crit:backward(input, target)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ target = target:type(ctype)
+ local crit = nn.BCECriterion():type(ctype)
+ crit:forward(input, target)
+ local groundgrad = crit:backward(input, target)
- input = input:cuda()
- target = target:cuda()
- local g_crit = nn.BCECriterion():cuda()
- g_crit:forward(input, target)
- local rescuda = g_crit:backward(input, target)
- a:reset()
- for i = 1,nloop do
- rescuda = g_crit:backward(input, target)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ target = target:type(typename)
+ local g_crit = nn.BCECriterion():type(typename)
+ g_crit:forward(input, target)
+ local rescuda = g_crit:backward(input, target)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.BCECriterionWeights_backward()
local size = math.random(1,100)
- local tm = {}
- local title = string.format('BCECriterionWeights.backward, Size %d', size)
- times[title] = tm
-
- local input = torch.Tensor(size):uniform()
- local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input))
- local weights = torch.Tensor(size):uniform()
-
- local crit = nn.BCECriterion(weights)
- crit:forward(input, target)
- local groundgrad = crit:backward(input, target)
- local a = torch.Timer()
- for i = 1,nloop do
- groundgrad = crit:backward(input, target)
- end
- tm.cpu = a:time().real
-
- input = input:cuda()
- target = target:cuda()
- weights = weights:cuda()
- local g_crit = nn.BCECriterion(weights):cuda()
- g_crit:forward(input, target)
- local rescuda = g_crit:backward(input, target)
- a:reset()
- for i = 1,nloop do
- rescuda = g_crit:backward(input, target)
+ for k, typename in ipairs(typenames) do
+ local input = torch.Tensor(size):uniform():type(typename)
+ local target = torch.Tensor(size):uniform():gt(0.5):type(torch.type(input))
+ local weights = torch.Tensor(size):uniform():type(typename)
+
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ target = target:type(ctype)
+ weights = weights:type(ctype)
+ local crit = nn.BCECriterion(weights):type(ctype)
+ crit:forward(input, target)
+ local groundgrad = crit:backward(input, target)
+
+ input = input:type(typename)
+ target = target:type(typename)
+ weights = weights:type(typename)
+ local g_crit = nn.BCECriterion(weights):type(typename)
+ g_crit:forward(input, target)
+ local rescuda = g_crit:backward(input, target)
+
+ local error = rescuda:double() - groundgrad:double()
+
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
end
- cutorch.synchronize()
- tm.gpu = a:time().real
-
- local error = rescuda:float() - groundgrad
-
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
end
function cunntest.mse()
for sizeAverage = 0, 1 do
- local size = math.random(3000,5000)
- local input = torch.randn(size,1,1)
- local target = torch.randn(size)
- local mod = nn.MSECriterion(sizeAverage == 1)
-
- local tm = {}
- local title = string.format('MSECriterion sizeAverage %d, %d ', sizeAverage, size)
- times[title] = tm
-
- local a = torch.Timer()
- local fout = mod:forward(input,target)
- local fgin = mod:backward(input,target):clone()
- tm.cpu = a:time().real
-
- local cinput = input:cuda()
- local ctarget = target:cuda()
- local cmod = nn.MSECriterion(sizeAverage == 1):cuda()
- a:reset()
- local cout = cmod:forward(cinput,ctarget)
- local cgin = cmod:backward(cinput,ctarget)
- cutorch.synchronize()
- tm.gpu = a:time().real
-
- mytester:assertlt(math.abs(fout-cout), 0.02, 'error on output')
- local gerr = cgin:float() - fgin
- mytester:assertlt(gerr:abs():max(), precision_forward, 'error on gradInput')
+ for k, typename in ipairs(typenames) do
+ local size = math.random(3000,5000)
+ local input = torch.randn(size,1,1):type(typename)
+ local target = torch.randn(size):type(typename)
+
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ target = target:type(ctype)
+ local mod = nn.MSECriterion(sizeAverage == 1):type(ctype)
+
+ local fout = mod:forward(input,target)
+ local fgin = mod:backward(input,target):clone()
+
+ local cinput = input:type(typename)
+ local ctarget = target:type(typename)
+ local cmod = nn.MSECriterion(sizeAverage == 1):type(typename)
+ local cout = cmod:forward(cinput,ctarget)
+ local cgin = cmod:backward(cinput,ctarget)
+
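+ -- For half, round the CPU reference through half precision (float -> half -> float)
+ -- so the comparison is made at the precision the CUDA half result can represent.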
+ if (typename == 'torch.CudaHalfTensor') then
+ fout = ffi.C.THC_half2float(ffi.C.THC_float2half(fout))
+ end
+ mytester:assertlt(math.abs(fout-cout), precision_forward_type(0.02, typename),
+ string.format('error on output with %s', typename))
+ local gerr = cgin:double() - fgin:double()
+ mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on gradInput with %s', typename))
+ end
end
end
function cunntest.SmoothL1()
for sizeAverage = 0, 1 do
local size = math.random(3000,5000)
- local input = torch.randn(size,1,1)
- local target = torch.randn(size)
- local mod = nn.SmoothL1Criterion(sizeAverage == 1)
-
- local tm = {}
- local title = string.format('SmoothL1Criterion sizeAverage %d, %d ', sizeAverage, size)
- times[title] = tm
-
- local a = torch.Timer()
- local fout = mod:forward(input,target)
- local fgin = mod:backward(input,target):clone()
- tm.cpu = a:time().real
- local cinput = input:cuda()
- local ctarget = target:cuda()
- local cmod = nn.SmoothL1Criterion(sizeAverage == 1):cuda()
- a:reset()
- local cout = cmod:forward(cinput,ctarget)
- local cgin = cmod:backward(cinput,ctarget)
- cutorch.synchronize()
- tm.gpu = a:time().real
-
- mytester:assertlt(math.abs(fout-cout), 0.01, 'error on output')
- local gerr = cgin:float() - fgin
- mytester:assertlt(gerr:abs():max(), precision_forward, 'error on gradInput')
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(size,1,1):type(typename)
+ local target = torch.randn(size):type(typename)
+
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ target = target:type(ctype)
+ local mod = nn.SmoothL1Criterion(sizeAverage == 1):type(ctype)
+
+ local fout = mod:forward(input,target)
+ local fgin = mod:backward(input,target):clone()
+
+ local cinput = input:type(typename)
+ local ctarget = target:type(typename)
+ local cmod = nn.SmoothL1Criterion(sizeAverage == 1):type(typename)
+ local cout = cmod:forward(cinput,ctarget)
+ local cgin = cmod:backward(cinput,ctarget)
+
+ if (typename == 'torch.CudaHalfTensor') then
+ fout = ffi.C.THC_half2float(ffi.C.THC_float2half(fout))
+ end
+ mytester:assertlt(math.abs(fout-cout), precision_forward_type(0.01, typename), string.format('error on output with %s', typename))
+ local gerr = cgin:double() - fgin:double()
+ mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on gradInput with %s', typename))
+ end
end
end
function cunntest.SoftMarginCriterion()
for sizeAverage = 0, 1 do
- local size = math.random(3000,5000)
- local input = torch.randn(size,1,1)
- local target = torch.randn(size)
- local mod = nn.SoftMarginCriterion(sizeAverage == 1)
-
- local tm = {}
- local title = string.format('SoftMarginCriterion sizeAverage %d, %d ', sizeAverage, size)
- times[title] = tm
-
- local a = torch.Timer()
- local fout = mod:forward(input,target)
- local fgin = mod:backward(input,target):clone()
- tm.cpu = a:time().real
-
- local cinput = input:cuda()
- local ctarget = target:cuda()
- local cmod = nn.SoftMarginCriterion(sizeAverage == 1):cuda()
- a:reset()
- local cout = cmod:forward(cinput,ctarget)
- local cgin = cmod:backward(cinput,ctarget)
- cutorch.synchronize()
- tm.gpu = a:time().real
-
- mytester:assertlt(math.abs(fout-cout), 0.01, 'error on output')
- local gerr = cgin:float() - fgin
- mytester:assertlt(gerr:abs():max(), precision_forward, 'error on gradInput')
+ for k, typename in ipairs(typenames) do
+ local size = math.random(3000,5000)
+ local input = torch.randn(size,1,1):type(typename)
+ local target = torch.randn(size):type(typename)
+
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ target = target:type(ctype)
+ local mod = nn.SoftMarginCriterion(sizeAverage == 1):type(ctype)
+
+ local fout = mod:forward(input,target)
+ local fgin = mod:backward(input,target):clone()
+
+ local cinput = input:type(typename)
+ local ctarget = target:type(typename)
+ local cmod = nn.SoftMarginCriterion(sizeAverage == 1):type(typename)
+ local cout = cmod:forward(cinput,ctarget)
+ local cgin = cmod:backward(cinput,ctarget)
+
+ mytester:assertlt(math.abs(fout-cout), precision_forward_type(0.01, typename), string.format('error on output with %s', typename))
+ local gerr = cgin:double() - fgin:double()
+ mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on gradInput with %s', typename))
+ end
end
end
@@ -3943,31 +3454,31 @@ end
function cunntest.distkldiv()
for sizeAverage = 0, 1 do
local size = math.random(3000,5000)
- local input = torch.randn(size) -- TODO, make it back to (size, 1, 1), see https://github.com/torch/cunn/issues/245#issuecomment-209260954
- local target = torch.randn(size)
- local mod = nn.DistKLDivCriterion(sizeAverage == 1)
- local tm = {}
- local title = string.format('DistKLDivCriterion sizeAverage %d, %d ',sizeAverage,size)
- times[title] = tm
-
- local a = torch.Timer()
- local fout = mod:forward(input,target)
- local fgin = mod:backward(input,target):clone()
- tm.cpu = a:time().real
-
- local cinput = input:cuda()
- local ctarget = target:cuda()
- local cmod = nn.DistKLDivCriterion(sizeAverage == 1):cuda()
- a:reset()
- local cout = cmod:forward(cinput,ctarget)
- local cgin = cmod:backward(cinput,ctarget)
- cutorch.synchronize()
- tm.gpu = a:time().real
-
- mytester:assertlt(math.abs(fout-cout), precision_forward, 'error on output')
- local gerr = cgin:float() - fgin
- mytester:assertlt(gerr:abs():max(), precision_backward, 'error on gradInput')
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(size):type(typename) -- TODO, make it back to (size, 1, 1), see https://github.com/torch/cunn/issues/245#issuecomment-209260954
+ local target = torch.randn(size):type(typename)
+
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ target = target:type(ctype)
+ local mod = nn.DistKLDivCriterion(sizeAverage == 1):type(ctype)
+
+ local fout = mod:forward(input,target)
+ local fgin = mod:backward(input,target):clone()
+
+ local cinput = input:type(typename)
+ local ctarget = target:type(typename)
+ local cmod = nn.DistKLDivCriterion(sizeAverage == 1):type(typename)
+ local cout = cmod:forward(cinput,ctarget)
+ local cgin = cmod:backward(cinput,ctarget)
+
+ mytester:assertlt(math.abs(fout-cout), precision_forward_type(precision_forward, typename),
+ string.format('error on output with %s', typename))
+ local gerr = cgin:double() - fgin:double()
+ mytester:assertlt(gerr:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on gradInput with %s', typename))
+ end
end
end
@@ -3979,34 +3490,24 @@ function cunntest.TemporalConvolution_forward()
local outi = math.random(1,256) -- nOutputFrame
local ini = (outi-1)*si+ki -- nInputFrame
- local tm = {}
- local title = string.format('TemporalConvolution.forward %dx%d o %d -> %dx%d [s: %d]',
- from, ini, ki, to, outi, si)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(ini,from):type(typename)
- local input = torch.randn(ini,from)
- local sconv = nn.TemporalConvolution(from,to,ki,si)
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.TemporalConvolution(from,to,ki,si):type(ctype)
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = nn.TemporalConvolution(from,to,ki,si):cuda()
- gconv.weight = sconv.weight:cuda()
- gconv.bias = sconv.bias:cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.TemporalConvolution(from,to,ki,si):type(typename)
+ gconv.weight = sconv.weight:type(typename)
+ gconv.bias = sconv.bias:type(typename)
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.TemporalConvolution_forward_batch()
@@ -4018,34 +3519,24 @@ function cunntest.TemporalConvolution_forward_batch()
local outi = math.random(1,256)
local ini = (outi-1)*si+ki
- local tm = {}
- local title = string.format('TemporalConvolution.forward %dx%dx%d o %d -> %dx%dx%d [s: %d]',
- bs, from, ini, ki, bs, to, outi, si)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,ini,from):type(typename)
- local input = torch.randn(bs,ini,from)
- local sconv = nn.TemporalConvolution(from,to,ki,si)
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.TemporalConvolution(from,to,ki,si):type(ctype)
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = nn.TemporalConvolution(from,to,ki,si):cuda()
- gconv.weight = sconv.weight:cuda()
- gconv.bias = sconv.bias:cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.TemporalConvolution(from,to,ki,si):type(typename)
+ gconv.weight = sconv.weight:type(typename)
+ gconv.bias = sconv.bias:type(typename)
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.TemporalConvolution_backward()
@@ -4056,52 +3547,44 @@ function cunntest.TemporalConvolution_backward()
local outi = math.random(1,256)
local ini = (outi-1)*si+ki
- local tm = {}
- local title = string.format('TemporalConvolution.backward %dx%d o %d -> %dx%d',
- from, ini, ki, to, outi)
-
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(ini,from):type(typename)
+ local gradOutput = torch.randn(outi,to):type(typename)
- local input = torch.randn(ini,from)
- local gradOutput = torch.randn(outi,to)
- local sconv = nn.TemporalConvolution(from,to,ki,si)
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.TemporalConvolution(from,to,ki,si):type(ctype)
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- local groundweight = sconv.gradWeight
- local groundbias = sconv.gradBias
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
+ local groundweight = sconv.gradWeight
+ local groundbias = sconv.gradBias
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.TemporalConvolution(from,to,ki,si):cuda()
- gconv.weight = sconv.weight:cuda()
- gconv.bias = sconv.bias:cuda()
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.TemporalConvolution(from,to,ki,si):type(typename)
+ gconv.weight = sconv.weight:type(typename)
+ gconv.bias = sconv.bias:type(typename)
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- local weightcuda = gconv.gradWeight
- local biascuda = gconv.gradBias
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
+ local weightcuda = gconv.gradWeight
+ local biascuda = gconv.gradBias
- local error = rescuda:float() - groundgrad
- local werror = weightcuda:float() - groundweight
- local berror = biascuda:float() - groundbias
+ local error = rescuda:double() - groundgrad:double()
+ local werror = weightcuda:double() - groundweight:double()
+ local berror = biascuda:double() - groundbias:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
- mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
- mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
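+ -- Weight/bias gradients also pass their largest magnitude to
+ -- precision_backward_conv_weightbias, presumably so the tolerance can scale
+ -- with gradient size for reduced-precision types.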
+ mytester:assertlt(werror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+ string.format('error on weight (backward) with %s', typename))
+ mytester:assertlt(berror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()),
+ string.format('error on bias (backward) with %s', typename))
+ end
end
function cunntest.TemporalConvolution_backward_batch()
@@ -4113,51 +3596,44 @@ function cunntest.TemporalConvolution_backward_batch()
local outi = math.random(1,256)
local ini = (outi-1)*si+ki
- local tm = {}
- local title = string.format('TemporalConvolution.backward %dx%dx%d o %d -> %dx%dx%d',
- bs, from, ini, ki, bs, to, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,ini,from):type(typename)
+ local gradOutput = torch.randn(bs,outi,to):type(typename)
- local input = torch.randn(bs,ini,from)
- local gradOutput = torch.randn(bs,outi,to)
- local sconv = nn.TemporalConvolution(from,to,ki,si)
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.TemporalConvolution(from,to,ki,si):type(ctype)
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- local groundweight = sconv.gradWeight
- local groundbias = sconv.gradBias
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
+ local groundweight = sconv.gradWeight
+ local groundbias = sconv.gradBias
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.TemporalConvolution(from,to,ki,si):cuda()
- gconv.weight = sconv.weight:cuda()
- gconv.bias = sconv.bias:cuda()
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.TemporalConvolution(from,to,ki,si):type(typename)
+ gconv.weight = sconv.weight:type(typename)
+ gconv.bias = sconv.bias:type(typename)
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- local weightcuda = gconv.gradWeight
- local biascuda = gconv.gradBias
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
+ local weightcuda = gconv.gradWeight
+ local biascuda = gconv.gradBias
- local error = rescuda:float() - groundgrad
- local werror = weightcuda:float() - groundweight
- local berror = biascuda:float() - groundbias
+ local error = rescuda:double() - groundgrad:double()
+ local werror = weightcuda:double() - groundweight:double()
+ local berror = biascuda:double() - groundbias:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
- mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
- mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ mytester:assertlt(werror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+ string.format('error on weight (backward) with %s', typename))
+ mytester:assertlt(berror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()),
+ string.format('error on bias (backward) with %s', typename))
+ end
end
function cunntest.Dropout()
@@ -4211,66 +3687,46 @@ end
function cunntest.SoftPlus_forward()
local size = math.random(1,100)
- local tm = {}
- local title = string.format('SoftPlus forward %d -> %d', size, size)
- times[title] = tm
-
- local input = torch.randn(size)
- local sconv = nn.SoftPlus()
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(size):type(typename)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SoftPlus():type(ctype)
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = nn.SoftPlus():cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.SoftPlus():type(typename)
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.SoftPlus_backward()
local size = math.random(1,100)
- local tm = {}
- local title = string.format('SoftPlus.backward %d -> %d', size, size)
- times[title] = tm
-
- local input = torch.randn(size)
- local gradOutput = torch.randn(size)
- local sconv = nn.SoftPlus()
- sconv:forward(input)
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
- groundgrad = sconv:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
-
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = sconv:clone():cuda()
- gconv:forward(input)
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(size):type(typename)
+ local gradOutput = torch.randn(size):type(typename)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SoftPlus():type(ctype)
+ sconv:forward(input)
+ local groundgrad = sconv:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = sconv:clone():type(typename)
+ gconv:forward(input)
+ local rescuda = gconv:backward(input, gradOutput)
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ local error = rescuda:double() - groundgrad:double()
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.SpatialUpSamplingNearest_forward()
@@ -4279,32 +3735,22 @@ function cunntest.SpatialUpSamplingNearest_forward()
local w = torch.random(3, 15)
local scale = torch.random(2,5)
- local tm = {}
- local title = string.format('SpatialUpSamplingNearest.forward %dx%dx%d -> %dx%dx%d',
- f, h, w, f, h*scale, w*scale)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(f, h, w):type(typename)
- local input = torch.randn(f, h, w)
- local sconv = nn.SpatialUpSamplingNearest(scale)
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialUpSamplingNearest(scale):type(ctype)
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = sconv:clone():cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = sconv:clone():type(typename)
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.SpatialUpSamplingNearest_forward_batch()
@@ -4314,33 +3760,22 @@ function cunntest.SpatialUpSamplingNearest_forward_batch()
local w = torch.random(3, 15)
local scale = torch.random(2,5)
- local tm = {}
- local title = string.format('SpatialUpSamplingNearest.forward %dx%dx%dx%d -> %dx%dx%dx%d',
- nbatch, f, h, w, nbatch, f, h*scale, w*scale)
- times[title] = tm
-
- local input = torch.randn(nbatch, f, h, w)
- local sconv = nn.SpatialUpSamplingNearest(scale)
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(nbatch, f, h, w):type(typename)
- input = input:cuda()
- local gconv = sconv:clone():cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialUpSamplingNearest(scale):type(ctype)
+ local groundtruth = sconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ input = input:type(typename)
+ local gconv = sconv:clone():type(typename)
+ local rescuda = gconv:forward(input)
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.SpatialUpSamplingNearest_backward()
@@ -4349,41 +3784,30 @@ function cunntest.SpatialUpSamplingNearest_backward()
local w = torch.random(3, 15)
local scale = torch.random(2,5)
- local tm = {}
- local title = string.format('SpatialUpSamplingNearest.backward %dx%dx%d -> %dx%dx%d',
- f, h, w, f, h*scale, w*scale)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(f, h, w):type(typename)
+ local gradOutput = torch.randn(f, h*scale, w*scale):type(typename)
- local input = torch.randn(f, h, w)
- local gradOutput = torch.randn(f, h*scale, w*scale)
- local sconv = nn.SpatialUpSamplingNearest(scale)
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SpatialUpSamplingNearest(scale):type(ctype)
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = sconv:clone():cuda()
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = sconv:clone():type(typename)
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.SpatialUpSamplingNearest_backward_batch()
@@ -4393,41 +3817,30 @@ function cunntest.SpatialUpSamplingNearest_backward_batch()
local w = torch.random(3, 15)
local scale = torch.random(2,5)
- local tm = {}
- local title = string.format('SpatialUpSamplingNearest.backward %dx%dx%dx%d -> %dx%dx%dx%d',
- nbatch, f, h, w, nbatch, f, h*scale, w*scale)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(nbatch, f, h, w):type(typename)
+ local gradOutput = torch.randn(nbatch, f, h*scale, w*scale):type(typename)
- local input = torch.randn(nbatch, f, h, w)
- local gradOutput = torch.randn(nbatch, f, h*scale, w*scale)
- local sconv = nn.SpatialUpSamplingNearest(scale)
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SpatialUpSamplingNearest(scale):type(ctype)
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = sconv:clone():cuda()
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = sconv:clone():type(typename)
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.SpatialUpSamplingBilinear_forward()
@@ -4436,34 +3849,22 @@ function cunntest.SpatialUpSamplingBilinear_forward()
local w = torch.random(3, 15)
local scale = torch.random(2,5)
- local tm = {}
- local title =
- string.format('SpatialUpSamplingBilinear.forward %dx%dx%d -> %dx%dx%d',
- f, h, w, f, h*scale, w*scale)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(f, h, w):type(typename)
- local input = torch.randn(f, h, w)
- local sconv = nn.SpatialUpSamplingBilinear(scale)
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialUpSamplingBilinear(scale):type(ctype)
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = sconv:clone():cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = sconv:clone():type(typename)
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward,
- 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.SpatialUpSamplingBilinear_forward_batch()
@@ -4473,35 +3874,22 @@ function cunntest.SpatialUpSamplingBilinear_forward_batch()
local w = torch.random(3, 15)
local scale = torch.random(2,5)
- local tm = {}
- local title =
- string.format('SpatialUpSamplingBilinear.forward %dx%dx%dx%d -> %dx%dx%dx%d',
- nbatch, f, h, w, nbatch, f, h*scale, w*scale)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(nbatch, f, h, w):type(typename)
- local input = torch.randn(nbatch, f, h, w)
- local sconv = nn.SpatialUpSamplingBilinear(scale)
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
-
- input = input:cuda()
- local gconv = sconv:clone():cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.SpatialUpSamplingBilinear(scale):type(ctype)
+ local groundtruth = sconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward,
- 'error on state (forward) ')
+ input = input:type(typename)
+ local gconv = sconv:clone():type(typename)
+ local rescuda = gconv:forward(input)
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.SpatialUpSamplingBilinear_backward()
@@ -4510,43 +3898,30 @@ function cunntest.SpatialUpSamplingBilinear_backward()
local w = torch.random(3, 15)
local scale = torch.random(2,5)
- local tm = {}
- local title =
- string.format('SpatialUpSamplingBilinear.backward %dx%dx%d -> %dx%dx%d',
- f, h, w, f, h*scale, w*scale)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(f, h, w):type(typename)
+ local gradOutput = torch.randn(f, h*scale, w*scale):type(typename)
- local input = torch.randn(f, h, w)
- local gradOutput = torch.randn(f, h*scale, w*scale)
- local sconv = nn.SpatialUpSamplingBilinear(scale)
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SpatialUpSamplingBilinear(scale):type(ctype)
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = sconv:clone():cuda()
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = sconv:clone():type(typename)
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), precision_backward,
- 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.SpatialUpSamplingBilinear_backward_batch()
@@ -4556,146 +3931,152 @@ function cunntest.SpatialUpSamplingBilinear_backward_batch()
local w = torch.random(3, 15)
local scale = torch.random(2,5)
- local input = torch.randn(nbatch, f, h, w)
- local gradOutput = torch.randn(nbatch, f, h*scale, w*scale)
- local sconv = nn.SpatialUpSamplingBilinear(scale)
- local output = sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(nbatch, f, h, w):type(typename)
+ local gradOutput = torch.randn(nbatch, f, h*scale, w*scale):type(typename)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = sconv:clone():cuda()
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.SpatialUpSamplingBilinear(scale):type(ctype)
+ local output = sconv:forward(input)
+ sconv:zeroGradParameters()
+ local groundgrad = sconv:backward(input, gradOutput)
+
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = sconv:clone():type(typename)
+ gconv:forward(input)
+ gconv:zeroGradParameters()
+ local rescuda = gconv:backward(input, gradOutput)
- local err = rescuda:float() - groundgrad
+ local err = rescuda:double() - groundgrad:double()
- mytester:assertlt(err:abs():max(), precision_backward,
- 'error on state (backward) ')
+ mytester:assertlt(err:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.l1cost()
local size = math.random(300,500)
- local input = torch.randn(size)
- local mod = nn.L1Cost()
- local tm = {}
- local title = string.format('L1Cost %d ',size)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(size):type(typename)
- local a = torch.Timer()
- local fout = mod:forward(input)
- local fgin = mod:backward(input):clone()
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local mod = nn.L1Cost():type(ctype)
- local cinput = input:cuda()
- local cmod = nn.L1Cost():cuda()
- a:reset()
- local cout = cmod:forward(cinput)
- local cgin = cmod:backward(cinput)
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local fout = mod:forward(input)
+ local fgin = mod:backward(input):clone()
+
+ local cinput = input:type(typename)
+ local cmod = nn.L1Cost():type(typename)
+ local cout = cmod:forward(cinput)
+ local cgin = cmod:backward(cinput)
- mytester:assertlt(math.abs(fout-cout), precision_forward, 'error on output')
- local gerr = cgin:float() - fgin
- mytester:assertlt(gerr:abs():max(), precision_forward, 'error on gradInput')
+ if (typename == 'torch.CudaHalfTensor') then
+ fout = ffi.C.THC_half2float(ffi.C.THC_float2half(fout))
+ end
+ mytester:assertlt(math.abs(fout-cout), precision_forward_type(precision_forward, typename),
+ string.format('error on output with %s', typename))
+ local gerr = cgin:double() - fgin:double()
+ mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on gradInput with %s', typename))
+ end
end
function cunntest.ClassNLLCriterionSingleTarget()
local size = math.random(3000,5000)
- local input = torch.randn(size)
- local target = 1
- local mod = nn.ClassNLLCriterion()
- local tm = {}
- local title = string.format('ClassNLLCriterionSingleTarget %d ',size)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(size):type(typename)
+ local target = 1
- local a = torch.Timer()
- local fout = mod:forward(input, target)
- local fgin = mod:backward(input, target):clone()
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local mod = nn.ClassNLLCriterion():type(ctype)
- local cinput = input:cuda()
- local ctarget = torch.CudaTensor(1):fill(target)
- local cmod = nn.ClassNLLCriterion():cuda()
- a:reset()
- local cout = cmod:forward(cinput,ctarget)
- local cgin = cmod:backward(cinput,ctarget)
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local fout = mod:forward(input, target)
+ local fgin = mod:backward(input, target):clone()
+
+ local cinput = input:type(typename)
+ local ctarget = torch.CudaTensor(1):fill(target)
+ local cmod = nn.ClassNLLCriterion():type(typename)
+ local cout = cmod:forward(cinput,ctarget)
+ local cgin = cmod:backward(cinput,ctarget)
- mytester:assertlt(
- math.abs(fout-cout), precision_forward, 'error on output')
- local gerr = cgin:float() - fgin
- mytester:assertlt(gerr:abs():max(), precision_forward, 'error on gradInput')
+ mytester:assertlt(
+ math.abs(fout-cout), precision_forward_type(precision_forward, typename),
+ string.format('error on output with %s', typename))
+ local gerr = cgin:double() - fgin:double()
+ mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on gradInput with %s', typename))
+ end
end
function cunntest.ClassNLLCriterionSingleTargetWeights()
local size = math.random(3000,5000)
- local input = torch.randn(size)
- local target = 1
- local weights = torch.rand(size)
- local mod = nn.ClassNLLCriterion(weights)
- local tm = {}
- local title = string.format('ClassNLLCriterionSingleTargetWeights %d ',size)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(size):type(typename)
+ local target = 1
+ local weights = torch.rand(size):type(typename)
- local a = torch.Timer()
- local fout = mod:forward(input, target)
- local fgin = mod:backward(input, target):clone()
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ weights = weights:type(ctype)
+ local mod = nn.ClassNLLCriterion(weights):type(ctype)
- local cinput = input:cuda()
- local cweights = weights:cuda()
- local ctarget = torch.CudaTensor(1):fill(target)
- local cmod = nn.ClassNLLCriterion(cweights):cuda()
- a:reset()
- local cout = cmod:forward(cinput,ctarget)
- local cgin = cmod:backward(cinput,ctarget)
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local fout = mod:forward(input, target)
+ local fgin = mod:backward(input, target):clone()
- mytester:assertlt(
- math.abs(fout-cout), precision_forward, 'error on output')
- local gerr = cgin:float() - fgin
- mytester:assertlt(gerr:abs():max(), precision_forward, 'error on gradInput')
+ local cinput = input:type(typename)
+ local cweights = weights:type(typename)
+ local ctarget = torch.CudaTensor(1):fill(target)
+ local cmod = nn.ClassNLLCriterion(cweights):type(typename)
+ local cout = cmod:forward(cinput,ctarget)
+ local cgin = cmod:backward(cinput,ctarget)
+
+ mytester:assertlt(
+ math.abs(fout-cout), precision_forward_type(precision_forward, typename),
+ string.format('error on output with %s', typename))
+ local gerr = cgin:double() - fgin:double()
+ mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on gradInput with %s', typename))
+ end
end
function cunntest.ClassNLLCriterionMultipleTarget()
local size = math.random(3000,5000)
- local input = torch.randn(size, size)
- local target = torch.randperm(size)
- local mod = nn.ClassNLLCriterion()
- local tm = {}
- local title = string.format('ClassNLLCriterionMultiTarget %d ',size)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(size, size):type(typename)
+ local target = torch.randperm(size)
- local a = torch.Timer()
- local fout = mod:forward(input, target)
- local fgin = mod:backward(input, target):clone()
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local mod = nn.ClassNLLCriterion():type(ctype)
- local cinput = input:cuda()
- local ctarget = target:cuda()
+ local fout = mod:forward(input, target)
+ local fgin = mod:backward(input, target):clone()
- local cmod = nn.ClassNLLCriterion():cuda()
- a:reset()
- local cout = cmod:forward(cinput,ctarget)
- local cgin = cmod:backward(cinput,ctarget)
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local cinput = input:type(typename)
+ local ctarget = target:cuda()
+
+ local cmod = nn.ClassNLLCriterion():type(typename)
+ local cout = cmod:forward(cinput,ctarget)
+ local cgin = cmod:backward(cinput,ctarget)
- mytester:assertlt(
- math.abs(fout-cout), precision_forward, 'error on output')
+ mytester:assertlt(
+ math.abs(fout-cout), precision_forward_type(precision_forward, typename),
+ string.format('error on output with %s', typename))
- local gerr = cgin:float() - fgin
- mytester:assertlt(gerr:abs():max(), precision_forward, 'error on gradInput')
+ local gerr = cgin:double() - fgin:double()
+ mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on gradInput with %s', typename))
+ end
end
function cunntest.SpatialClassNLLCriterion()
@@ -4703,103 +4084,98 @@ function cunntest.SpatialClassNLLCriterion()
local h = math.random(300, 500)
local w = math.random(300, 800)
local classes = math.random(10,30)
- local input = torch.randn(batchSize, classes, h, w)
- local target = torch.Tensor(batchSize, h, w)
- target:apply(function() return math.random(1, classes) end)
- local mod = nn.SpatialClassNLLCriterion()
- local tm = {}
- local title = string.format('SpatialClassNLLCriterion %dx%dx%dx%d ',
- batchSize, classes, h, w)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(batchSize, classes, h, w):type(typename)
+ local target = torch.Tensor(batchSize, h, w)
+ target:apply(function() return math.random(1, classes) end)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local mod = nn.SpatialClassNLLCriterion():type(ctype)
+ local fout = mod:forward(input, target)
+ local fgin = mod:backward(input, target):clone()
- local a = torch.Timer()
- local fout = mod:forward(input, target)
- local fgin = mod:backward(input, target):clone()
- tm.cpu = a:time().real
+ local cinput = input:type(typename)
+ local ctarget = target:type(typename)
- local cinput = input:cuda()
- local ctarget = target:cuda()
-
- local cmod = nn.SpatialClassNLLCriterion():cuda()
- a:reset()
- local cout = cmod:forward(cinput,ctarget)
- local cgin = cmod:backward(cinput,ctarget)
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local cmod = nn.SpatialClassNLLCriterion():type(typename)
+ local cout = cmod:forward(cinput,ctarget)
+ local cgin = cmod:backward(cinput,ctarget)
+ cutorch.synchronize()
- mytester:assertlt(
- math.abs(fout-cout), precision_forward, 'error on output')
+ mytester:assertlt(
+ math.abs(fout-cout), precision_forward_type(precision_forward, typename),
+ string.format('error on output with %s', typename))
- local gerr = cgin:float() - fgin
- mytester:assertlt(gerr:abs():max(), precision_forward, 'error on gradInput')
+ local gerr = cgin:double() - fgin:double()
+ mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on gradInput with %s', typename))
+ end
end
function cunntest.ClassNLLCriterionMultipleTargetWeights()
local size = math.random(3000,5000)
- local input = torch.randn(size, size)
- local target = torch.randperm(size)
- local weights = torch.rand(size)
- local mod = nn.ClassNLLCriterion(weights)
- local tm = {}
- local title = string.format('ClassNLLCriterionMultiTargetWeights %d ',size)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(size, size):type(typename)
+ local target = torch.randperm(size)
+ local weights = torch.rand(size):type(typename)
- local a = torch.Timer()
- local fout = mod:forward(input, target)
- local fgin = mod:backward(input, target):clone()
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ weights = weights:type(ctype)
+ local mod = nn.ClassNLLCriterion(weights):type(ctype)
- local cinput = input:cuda()
- local ctarget = target:cuda()
- local cweights = weights:cuda()
+ local fout = mod:forward(input, target)
+ local fgin = mod:backward(input, target):clone()
- local cmod = nn.ClassNLLCriterion(cweights):cuda()
- a:reset()
- local cout = cmod:forward(cinput,ctarget)
- local cgin = cmod:backward(cinput,ctarget)
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local cinput = input:type(typename)
+ local ctarget = target:cuda()
+ local cweights = weights:type(typename)
- mytester:assertlt(
- math.abs(fout-cout), precision_forward, 'error on output')
+ local cmod = nn.ClassNLLCriterion(cweights):type(typename)
+ local cout = cmod:forward(cinput,ctarget)
+ local cgin = cmod:backward(cinput,ctarget)
+
+ mytester:assertlt(
+ math.abs(fout-cout), precision_forward_type(precision_forward, typename),
+ string.format('error on output with %s', typename))
- local gerr = cgin:float() - fgin
- mytester:assertlt(gerr:abs():max(), precision_forward, 'error on gradInput')
+ local gerr = cgin:double() - fgin:double()
+ mytester:assertlt(gerr:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on gradInput with %s', typename))
+ end
end
function cunntest.TemporalMaxPooling()
- local input = torch.rand(16, 18, 3)
local settings = {{2, 2}, {3, 3}, {4, 2}, {2, 4}, {3, 5}}
for i, setting in ipairs(settings) do
- local mod = nn.TemporalMaxPooling(setting[1], setting[2])
-
- local tm = {}
- local title = 'TemporalMaxPooling '..setting[1]..' '..setting[2]
- times[title] = tm
-
- local a = torch.Timer()
- local fout = mod:forward(input)
- local fgout = torch.rand(fout:size())
- local fgin = mod:backward(input, fgout):clone()
- tm.cpu = a:time().real
-
- local cinput = input:cuda()
- local cgout = fgout:cuda()
- local cmod = nn.TemporalMaxPooling(setting[1], setting[2]):cuda()
- a:reset()
- local cout = cmod:forward(cinput)
- local cgin = cmod:backward(cinput, cgout)
- cutorch.synchronize()
- tm.gpu = a:time().real
-
- local outerror = cout:float() - fout
- mytester:assertlt(outerror:abs():max(), precision_forward, 'error on output')
-
- local ginerror = cgin:float() - fgin
- mytester:assertlt(ginerror:abs():max(), precision_backward, 'error on gradInput')
+ for k, typename in ipairs(typenames) do
+ local input = torch.rand(16, 18, 3):type(typename)
+
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local mod = nn.TemporalMaxPooling(setting[1], setting[2]):type(ctype)
+
+ local fout = mod:forward(input)
+ local fgout = torch.rand(fout:size()):type(typename):type(ctype)
+ local fgin = mod:backward(input, fgout):clone()
+
+ local cinput = input:type(typename)
+ local cgout = fgout:type(typename)
+ local cmod = nn.TemporalMaxPooling(setting[1], setting[2]):type(typename)
+ local cout = cmod:forward(cinput)
+ local cgin = cmod:backward(cinput, cgout)
+
+ local outerror = cout:double() - fout:double()
+ mytester:assertlt(outerror:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on output with %s', typename))
+
+ local ginerror = cgin:double() - fgin:double()
+ mytester:assertlt(ginerror:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on gradInput with %s', typename))
+ end
end
end
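The torch.rand(...):type(typename):type(ctype) idiom used here (and in several of the volumetric tests below) looks redundant but is deliberate: converting to the GPU type first and then to the CPU reference type quantizes the random data, so for half precision both paths start from identical, already-representable values and the comparison measures only the kernels themselves. A small sketch of the effect, under that reading of the idiom:

    -- Why test data is built as :type(typename):type(ctype).
    -- For typename == 'torch.CudaHalfTensor', t2cpu maps to 'torch.FloatTensor';
    -- the round trip quantizes the values to half before the CPU reference sees them.
    local x = torch.rand(4):float()                                  -- full-precision values
    local q = x:type('torch.CudaHalfTensor'):type('torch.FloatTensor')
    -- q holds the numbers the half kernels will actually operate on,
    -- so CPU and GPU results differ only by the operations, not the inputs.
    print((x - q):abs():max())                                       -- small but usually non-zero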
@@ -4819,35 +4195,26 @@ function cunntest.VolumetricConvolution_forward_single()
local inj = (outj-1)*sj+kj
local ink = (outk-1)*sk+kk
- local tm = {}
- local title = string.format('VolumetricConvolution.forward %dx%dx%dx%d o %dx%dx%d -> %dx%dx%dx%d',
- from, ink, inj, ini, kk, kj, ki, to, outk, outj, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(from,ini,inj,ink):type(typename)
- local input = torch.randn(from,ini,inj,ink)
- local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj)
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):type(ctype)
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):cuda()
- gconv.weight = sconv.weight:cuda()
- gconv.bias = sconv.bias:cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):type(typename)
+ gconv.weight = sconv.weight:type(typename)
+ gconv.bias = sconv.bias:type(typename)
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
- mytester:assert(groundtruth:isSize(rescuda:size()), 'size mismatch on state (forward)')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ mytester:assert(groundtruth:isSize(rescuda:size()),
+ string.format('size mismatch on state (forward) with %s', typename))
+ end
end
function cunntest.VolumetricConvolution_forward_batch()
@@ -4867,35 +4234,26 @@ function cunntest.VolumetricConvolution_forward_batch()
local inj = (outj-1)*sj+kj
local ink = (outk-1)*sk+kk
- local tm = {}
- local title = string.format('VolumetricConvolution.forward %dx%dx%dx%dx%d o %dx%dx%d -> %dx%dx%dx%dx%d',
- bs, from, ink, inj, ini, kk, kj, ki, bs, to, outk, outj, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,from,ini,inj, ink):type(typename)
- local input = torch.randn(bs,from,ini,inj, ink)
- local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sj,sk)
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input, sconv)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sj,sk):type(ctype)
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sj,sk):cuda()
- gconv.weight = sconv.weight:cuda()
- gconv.bias = sconv.bias:cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sj,sk):type(typename)
+ gconv.weight = sconv.weight:type(typename)
+ gconv.bias = sconv.bias:type(typename)
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
- mytester:assert(groundtruth:isSize(rescuda:size()), 'size mismatch on state (forward)')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ mytester:assert(groundtruth:isSize(rescuda:size()),
+ string.format('size mismatch on state (forward) with %s', typename))
+ end
end
function cunntest.VolumetricConvolution_backward_single()
@@ -4914,50 +4272,44 @@ function cunntest.VolumetricConvolution_backward_single()
local inj = (outj-1)*sj+kj
local ink = (outk-1)*sk+kk
- local tm = {}
- local title = string.format('VolumetricConvolution.backward %dx%dx%dx%d o %dx%dx%d -> %dx%dx%dx%d',
- from, ink, inj, ini, kk, kj, ki, to, outk, outj, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(from, ini, inj, ink):type(typename)
+ local gradOutput = torch.randn(to, outi, outj, outk):type(typename)
- local input = torch.randn(from, ini, inj, ink)
- local gradOutput = torch.randn(to, outi, outj, outk)
- local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj)
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):type(ctype)
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- local groundweight = sconv.gradWeight
- local groundbias = sconv.gradBias
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
+ local groundweight = sconv.gradWeight
+ local groundbias = sconv.gradBias
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):cuda()
- gconv.weight = sconv.weight:cuda()
- gconv.bias = sconv.bias:cuda()
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):type(typename)
+ gconv.weight = sconv.weight:type(typename)
+ gconv.bias = sconv.bias:type(typename)
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
+ local rescuda = gconv:backward(input, gradOutput)
+ local weightcuda = gconv.gradWeight
+ local biascuda = gconv.gradBias
+ local error = rescuda:double() - groundgrad:double()
+ local werror = weightcuda:double() - groundweight:double()
+ local berror = biascuda:double() - groundbias:double()
+ mytester:assert(groundgrad:isSize(rescuda:size()),
+ string.format('size mismatch on state (backward) with %s', typename))
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ mytester:assertlt(werror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+ string.format('error on weight (backward) with %s', typename))
+ mytester:assertlt(berror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()),
+ string.format('error on bias (backward) with %s', typename))
end
- local weightcuda = gconv.gradWeight
- local biascuda = gconv.gradBias
- cutorch.synchronize()
- tm.gpu = a:time().real
- local error = rescuda:float() - groundgrad
- local werror = weightcuda:float() - groundweight
- local berror = biascuda:float() - groundbias
- mytester:assert(groundgrad:isSize(rescuda:size()), 'size mismatch on state (forward)')
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
- mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
- mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
end
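For the weight and bias gradients the diff switches from the flat precision_backward threshold to precision_backward_conv_weightbias, which also receives the magnitude of the gradient being checked. That helper is defined earlier in test.lua and is not part of this hunk; the sketch below is only a guess at its shape, to make the purpose of the extra magnitude argument clear:

    -- Hypothetical sketch only; the real helper lives earlier in test.lua.
    -- Accumulated convolution weight/bias gradients can be large, so in half
    -- precision the tolerance is scaled with the value under test rather than
    -- being an absolute constant.
    local function precision_backward_conv_weightbias_sketch(precision, typename, max_abs)
       if typename == 'torch.CudaHalfTensor' then
          return math.max(precision, 1e-2 * math.max(1, max_abs))
       end
       return precision
    end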
function cunntest.VolumetricConvolution_backward_batch()
@@ -4977,50 +4329,44 @@ function cunntest.VolumetricConvolution_backward_batch()
local inj = (outj-1)*sj+kj
local ink = (outk-1)*sk+kk
- local tm = {}
- local title = string.format('VolumetricConvolution.backward %dx%dx%dx%dx%d o %dx%dx%d -> %dx%dx%dx%dx%d',
- bs, from, ink, inj, ini, kk, kj, ki, bs, to, outk, outj, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs, from, ini, inj, ink):type(typename)
+ local gradOutput = torch.randn(bs, to, outi, outj, outk):type(typename)
- local input = torch.randn(bs, from, ini, inj, ink)
- local gradOutput = torch.randn(bs, to, outi, outj, outk)
- local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj)
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):type(ctype)
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- local groundweight = sconv.gradWeight
- local groundbias = sconv.gradBias
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
+ local groundweight = sconv.gradWeight
+ local groundbias = sconv.gradBias
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):cuda()
- gconv.weight = sconv.weight:cuda()
- gconv.bias = sconv.bias:cuda()
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.VolumetricConvolution(from,to,ki,kk,kj,si,sk,sj):type(typename)
+ gconv.weight = sconv.weight:type(typename)
+ gconv.bias = sconv.bias:type(typename)
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
+ local rescuda = gconv:backward(input, gradOutput)
+ local weightcuda = gconv.gradWeight
+ local biascuda = gconv.gradBias
+ local error = rescuda:double() - groundgrad:double()
+ local werror = weightcuda:double() - groundweight:double()
+ local berror = biascuda:double() - groundbias:double()
+ mytester:assert(groundgrad:isSize(rescuda:size()),
+ string.format('size mismatch on state (backward) with %s', typename))
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ mytester:assertlt(werror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+ string.format('error on weight (backward) with %s', typename))
+ mytester:assertlt(berror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()),
+ string.format('error on bias (backward) with %s', typename))
end
- local weightcuda = gconv.gradWeight
- local biascuda = gconv.gradBias
- cutorch.synchronize()
- tm.gpu = a:time().real
- local error = rescuda:float() - groundgrad
- local werror = weightcuda:float() - groundweight
- local berror = biascuda:float() - groundbias
- mytester:assert(groundgrad:isSize(rescuda:size()), 'size mismatch on state (forward)')
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
- mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
- mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
end
function cunntest.VolumetricMaxPooling_forward()
@@ -5041,32 +4387,22 @@ function cunntest.VolumetricMaxPooling_forward()
local oH = math.floor((iH - kH + 2*padH) / dH + 1)
local oW = math.floor((iW - kW + 2*padW) / dW + 1)
- local tm = {}
- local title = string.format('VolumetricMaxPooling.forward %dx%dx%dx%d o %dx%dx%d (%dx%dx%d)-> %dx%dx%dx%d',
- iF, iT, iH, iW, kT, kH, kW, dT, dH, dW, iF, oT, oH, oW)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1):type(typename)
- local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1)
- local layer = nn.VolumetricMaxPooling(kT, kW, kH, dT, dW, dH, padT, padW, padH):float()
- local output = layer:forward(input)
- local timer = torch.Timer()
- for i = 1,nloop do
- output = layer:forward(input)
- end
- tm.cpu = timer:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local layer = nn.VolumetricMaxPooling(kT, kW, kH, dT, dW, dH, padT, padW, padH):type(ctype)
+ local output = layer:forward(input)
- local inputCUDA = input:cuda()
- local layerCUDA = layer:clone():cuda()
- local outputCUDA = layerCUDA:forward(inputCUDA)
- timer:reset()
- for i = 1,nloop do
- outputCUDA = layerCUDA:forward(inputCUDA)
- end
- cutorch.synchronize()
- tm.gpu = timer:time().real
+ local inputCUDA = input:type(typename)
+ local layerCUDA = layer:clone():type(typename)
+ local outputCUDA = layerCUDA:forward(inputCUDA)
- local error = outputCUDA:float() - output
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ local error = outputCUDA:double() - output:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.VolumetricMaxPooling_backward()
@@ -5087,38 +4423,27 @@ function cunntest.VolumetricMaxPooling_backward()
local oH = math.floor((iH - kH + 2*padH) / dH + 1)
local oW = math.floor((iW - kW + 2*padW) / dW + 1)
- local tm = {}
- local title = string.format('VolumetricMaxPooling.backward %dx%dx%dx%d o %dx%dx%d (%dx%dx%d) -> %dx%dx%dx%d',
- iF, iT, iH, iW, kT, kH, kW, dT, dH, dW, iF, oT, oH, oW)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1):type(typename)
- local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1)
- local layer = nn.VolumetricMaxPooling(kT, kW, kH, dT, dW, dH, padT, padW, padH):float()
- local output = layer:forward(input)
- local gradOutput = output:clone():uniform(-1, 1)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local layer = nn.VolumetricMaxPooling(kT, kW, kH, dT, dW, dH, padT, padW, padH):type(ctype)
+ local output = layer:forward(input)
+ local gradOutput = output:clone():uniform(-1, 1)
- local gradInput = layer:backward(input, gradOutput)
- local timer = torch.Timer()
- for i = 1,nloop do
- gradInput = layer:backward(input, gradOutput)
- end
- tm.cpu = timer:time().real
+ local gradInput = layer:backward(input, gradOutput)
- local inputCUDA = input:cuda()
- local layerCUDA = layer:clone():cuda()
- local outputCUDA = layerCUDA:forward(inputCUDA)
- local gradOutputCUDA = gradOutput:cuda()
- local gradInputCUDA = layerCUDA:backward(inputCUDA, gradOutputCUDA)
+ local inputCUDA = input:type(typename)
+ local layerCUDA = layer:clone():type(typename)
+ local outputCUDA = layerCUDA:forward(inputCUDA)
+ local gradOutputCUDA = gradOutput:type(typename)
+ local gradInputCUDA = layerCUDA:backward(inputCUDA, gradOutputCUDA)
- timer:reset()
- for i = 1,nloop do
- gradInputCUDA = layerCUDA:backward(inputCUDA, gradOutputCUDA)
+ local error = gradInputCUDA:double() - gradInput:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (backward) with %s', typename))
end
- cutorch.synchronize()
- tm.gpu = timer:time().real
-
- local error = gradInputCUDA:float() - gradInput
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (backward) ')
end
function cunntest.VolumetricDilatedMaxPooling_forward_batch()
@@ -5145,34 +4470,24 @@ function cunntest.VolumetricDilatedMaxPooling_forward_batch()
local inj = (outj-1)*sj+(dilationj*(kj-1)+1)-2*padj
local ceil_mode = math.random(0,1) == 1
- local tm = {}
- local title = string.format('VolumetricDilatedMaxPooling.forward %dx%dx%dx%dx%d o %dx%dx%d -> %d%dx%dx%dx%d',
- bs, from, int, inj, ini, kt, kj, ki, bs, to, outt, outj, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,from,int,inj,ini):type(typename)
- local input = torch.randn(bs,from,int,inj,ini)
- local sconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj)
- if ceil_mode then sconv:ceil() end
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj):type(ctype)
+ if ceil_mode then sconv:ceil() end
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj):cuda()
- if ceil_mode then gconv:ceil() end
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj):type(typename)
+ if ceil_mode then gconv:ceil() end
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.VolumetricDilatedMaxPooling_backward_batch()
@@ -5199,43 +4514,32 @@ function cunntest.VolumetricDilatedMaxPooling_backward_batch()
local inj = (outj-1)*sj+(dilationj*(kj-1)+1)-2*padj
local ceil_mode = math.random(0,1) == 1
- local tm = {}
- local title = string.format('VolumetricDilatedMaxPooling.forward %dx%dx%dx%dx%d o %dx%dx%d -> %d%dx%dx%dx%d',
- bs, from, int, inj, ini, kt, kj, ki, bs, to, outt, outj, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(bs,from,int,inj,ini):type(typename)
+ local gradOutput = torch.randn(bs,to,outt,outj,outi):type(typename)
- local input = torch.randn(bs,from,int,inj,ini)
- local gradOutput = torch.randn(bs,to,outt,outj,outi)
- local sconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj)
- if ceil_mode then sconv:ceil() end
- sconv:forward(input)
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj):type(ctype)
+ if ceil_mode then sconv:ceil() end
+ sconv:forward(input)
sconv:zeroGradParameters()
- groundgrad = sconv:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ local groundgrad = sconv:backward(input, gradOutput)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj):cuda()
- if ceil_mode then gconv:ceil() end
- gconv:forward(input)
- gconv:zeroGradParameters()
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.VolumetricDilatedMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj,dilationt,dilationi,dilationj):type(typename)
+ if ceil_mode then gconv:ceil() end
+ gconv:forward(input)
gconv:zeroGradParameters()
- rescuda = gconv:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gconv:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.VolumetricMaxUnpooling_forward_batch()
@@ -5252,42 +4556,31 @@ function cunntest.VolumetricMaxUnpooling_forward_batch()
local padt = math.random(0,kt/2-1)
local padi = math.random(0,ki/2-1)
local padj = math.random(0,kj/2-1)
- local it = ((outt + padt*2 - kt)/st) +1
- local ii = ((outi + padi*2 - ki)/si) +1
- local ij = ((outj + padj*2 - kj)/sj) +1
+ local it = math.max(((outt + padt*2 - kt)/st) +1, kt)
+ local ii = math.max(((outi + padi*2 - ki)/si) +1, ki)
+ local ij = math.max(((outj + padj*2 - kj)/sj) +1, kj)
- local tm = {}
- local title = string.format('VolumetricMaxUnpooling.forward %dx%dx%dx%dx%d o %dx%dx%d -> %d%dx%dx%dx%d',
- bs, from, it, ij, ii, kt, kj, ki, bs, to, outt, outj, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local ctype = t2cpu[typename]
- local pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj)
- local sunpool = nn.VolumetricMaxUnpooling(pooler)
+ local pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj):type(ctype)
+ local sunpool = nn.VolumetricMaxUnpooling(pooler):type(ctype)
- local original = torch.randn(bs,from,it,ij,ii)
- local input = pooler:forward(original)
- local groundtruth = sunpool:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sunpool:forward(input)
- end
- tm.cpu = a:time().real
+ local original = torch.randn(bs,from,it,ij,ii):type(typename):type(ctype)
+ local input = pooler:forward(original)
+ local groundtruth = sunpool:forward(input)
- original = original:cuda()
- pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj):cuda()
- local gunpool = nn.VolumetricMaxUnpooling(pooler):cuda()
+ original = original:type(typename)
+ pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj):type(typename)
+ local gunpool = nn.VolumetricMaxUnpooling(pooler):type(typename)
- input = pooler:forward(original)
- local rescuda = gunpool:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gunpool:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = pooler:forward(original)
+ local rescuda = gunpool:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.VolumetricMaxUnpooling_backward_batch()
@@ -5304,52 +4597,39 @@ function cunntest.VolumetricMaxUnpooling_backward_batch()
local padt = math.random(0,kt/2-1)
local padi = math.random(0,ki/2-1)
local padj = math.random(0,kj/2-1)
- local it = ((outt + padt*2 - kt)/st) +1
- local ii = ((outi + padi*2 - ki)/si) +1
- local ij = ((outj + padj*2 - kj)/sj) +1
+ local it = math.max(((outt + padt*2 - kt)/st) +1, kt)
+ local ii = math.max(((outi + padi*2 - ki)/si) +1, ki)
+ local ij = math.max(((outj + padj*2 - kj)/sj) +1, kj)
- local tm = {}
- local title = string.format('VolumetricMaxUnpooling.backward %dx%dx%dx%dx%d o %dx%dx%d -> %d%dx%dx%dx%d',
- bs, from, it, ij, ii, kt, kj, ki, bs, to, outt, outj, outi)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local ctype = t2cpu[typename]
- local pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj)
- local sunpool = nn.VolumetricMaxUnpooling(pooler)
+ local pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj):type(ctype)
+ local sunpool = nn.VolumetricMaxUnpooling(pooler):type(ctype)
- local original = torch.randn(bs,from,it,ij,ii)
- local input = pooler:forward(original)
- local gradOutput = torch.randn(original:size())
- sunpool:forward(input)
- sunpool:zeroGradParameters()
- local groundgrad = sunpool:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
+ local original = torch.randn(bs,from,it,ij,ii):type(typename):type(ctype)
+ local input = pooler:forward(original)
+ local gradOutput = torch.randn(original:size()):type(typename):type(ctype)
+ sunpool:forward(input)
sunpool:zeroGradParameters()
- groundgrad = sunpool:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ local groundgrad = sunpool:backward(input, gradOutput)
- pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj):cuda()
- local gunpool = nn.VolumetricMaxUnpooling(pooler):cuda()
+ pooler = nn.VolumetricMaxPooling(kt,ki,kj,st,si,sj,padt,padi,padj):type(typename)
+ local gunpool = nn.VolumetricMaxUnpooling(pooler):type(typename)
- original = original:cuda()
- input = pooler:forward(original)
- gunpool:forward(input)
+ original = original:type(typename)
+ input = pooler:forward(original)
+ gunpool:forward(input)
- gradOutput = gradOutput:cuda()
- gunpool:zeroGradParameters()
- local rescuda = gunpool:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
+ gradOutput = gradOutput:type(typename)
gunpool:zeroGradParameters()
- rescuda = gunpool:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gunpool:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
+ local error = rescuda:double() - groundgrad:double()
- mytester:assertlt(error:abs():max(), precision_backward, 'error on state (backward) ')
+ mytester:assertlt(error:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.VolumetricAveragePooling_forward()
@@ -5367,32 +4647,22 @@ function cunntest.VolumetricAveragePooling_forward()
local iH = (oH - 1) * dH + kH
local iW = (oW - 1) * dW + kW
- local tm = {}
- local title = string.format('VolumetricAveragePooling.forward %dx%dx%dx%d o %dx%dx%d (%dx%dx%d) -> %dx%dx%dx%d',
- iF, iT, iH, iW, kT, kH, kW, dT, dH, dW, iF, oT, oH, oW)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1):type(typename)
- local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1)
- local layer = nn.VolumetricAveragePooling(kT, kW, kH, dT, dW, dH):float()
- local output = layer:forward(input)
- local timer = torch.Timer()
- for i = 1,nloop do
- output = layer:forward(input)
- end
- tm.cpu = timer:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local layer = nn.VolumetricAveragePooling(kT, kW, kH, dT, dW, dH):type(ctype)
+ local output = layer:forward(input)
- local inputCUDA = input:cuda()
- local layerCUDA = layer:clone():cuda()
- local outputCUDA = layerCUDA:forward(inputCUDA)
- timer:reset()
- for i = 1,nloop do
- outputCUDA = layerCUDA:forward(inputCUDA)
- end
- cutorch.synchronize()
- tm.gpu = timer:time().real
+ local inputCUDA = input:type(typename)
+ local layerCUDA = layer:clone():type(typename)
+ local outputCUDA = layerCUDA:forward(inputCUDA)
- local error = outputCUDA:float() - output
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
+ local error = outputCUDA:double() - output:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.VolumetricAveragePooling_backward()
@@ -5410,36 +4680,25 @@ function cunntest.VolumetricAveragePooling_backward()
local iH = (oH - 1) * dH + kH
local iW = (oW - 1) * dW + kW
- local tm = {}
- local title = string.format('VolumetricAveragePooling.backward %dx%dx%dx%d o %dx%dx%d (%dx%dx%d) -> %dx%dx%dx%d',
- iF, iT, iH, iW, kT, kH, kW, dT, dH, dW, iF, oT, oH, oW)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1):type(typename)
- local input = torch.Tensor(iF, iT, iH, iW):float():uniform(-1, 1)
- local layer = nn.VolumetricAveragePooling(kT, kW, kH, dT, dW, dH):float()
- local output = layer:forward(input)
- local gradOutput = output:clone():uniform(-1, 1)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local layer = nn.VolumetricAveragePooling(kT, kW, kH, dT, dW, dH):type(ctype)
+ local output = layer:forward(input)
+ local gradOutput = output:clone():uniform(-1, 1)
- local gradInput = layer:backward(input, gradOutput)
- local timer = torch.Timer()
- for i = 1,nloop do
- gradInput = layer:backward(input, gradOutput)
- end
- tm.cpu = timer:time().real
+ local gradInput = layer:backward(input, gradOutput)
- local inputCUDA = input:cuda() local layerCUDA = layer:clone():cuda()
- local outputCUDA = layerCUDA:forward(inputCUDA) local gradOutputCUDA = gradOutput:cuda()
- local gradInputCUDA = layerCUDA:backward(inputCUDA, gradOutputCUDA)
+ local inputCUDA = input:type(typename) local layerCUDA = layer:clone():type(typename)
+ local outputCUDA = layerCUDA:forward(inputCUDA) local gradOutputCUDA = gradOutput:type(typename)
+ local gradInputCUDA = layerCUDA:backward(inputCUDA, gradOutputCUDA)
- timer:reset()
- for i = 1,nloop do
- gradInputCUDA = layerCUDA:backward(inputCUDA, gradOutputCUDA)
+ local error = gradInputCUDA:double() - gradInput:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (backward) with %s', typename))
end
- cutorch.synchronize()
- tm.gpu = timer:time().real
-
- local error = gradInputCUDA:float() - gradInput
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (backward) ')
end
function cunntest.CMul_forward_batch()
@@ -5526,32 +4785,22 @@ function cunntest.PReLU_forward()
local nOutputPlane = 8
local w = math.random(1,100)
local h = math.random(1,100)
+ local input = torch.randn(nOutputPlane,h,w)
- local tm = {}
- local title = string.format('PReLU forward %d x %d', w, h)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local ctype = t2cpu[typename]
+ local input = input:type(ctype)
+ local sconv = nn.PReLU(nOutputPlane):type(ctype)
+ local groundtruth = sconv:forward(input)
- local input = torch.randn(nOutputPlane,h,w)
- local sconv = nn.PReLU(nOutputPlane)
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
-
- input = input:cuda()
- local gconv = sconv:cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gconv = sconv:type(typename)
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state with %s', typename))
+ end
end
function cunntest.PReLU_backward()
@@ -5559,39 +4808,33 @@ function cunntest.PReLU_backward()
local w = math.random(1,10)
local h = math.random(1,10)
- local tm = {}
- local title = string.format('PReLU backward %d x %d', w, h)
- times[title] = tm
-
- local input = torch.randn(nOutputPlane, h, w)
- local gradOutput = torch.randn(#input)
- local sconv = nn.PReLU(nOutputPlane)
- local gconv = sconv:clone():cuda()
-
- sconv:forward(input)
- local groundgrad = sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
- groundgrad = sconv:backward(input, gradOutput)
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(nOutputPlane, h, w):type(typename)
+ local gradOutput = torch.randn(#input):type(typename)
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.PReLU(nOutputPlane):type(ctype)
+ local gconv = sconv:clone():type(typename)
+
+ sconv:forward(input)
+ sconv:zeroGradParameters()
+ local groundgrad = sconv:backward(input, gradOutput)
+
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ gconv:forward(input)
+ gconv:zeroGradParameters()
+ local rescuda = gconv:backward(input, gradOutput)
+
+ local err = rescuda:double() - groundgrad:double()
+ local weightGradError = gconv.gradWeight:double() - sconv.gradWeight:double()
+
+ mytester:assertlt(err:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state %s', typename))
+ mytester:assertlt(weightGradError:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on weight %s', typename))
end
- tm.cpu = a:time().real
-
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- gconv:forward(input)
- local rescuda = gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
-
- local err = rescuda:float() - groundgrad
- local weightGradError = gconv.gradWeight:float() - sconv.gradWeight
-
- mytester:assertlt(err:abs():max(), precision_backward, 'error on state')
- mytester:assertlt(weightGradError:abs():max(), precision_backward, 'error on weight')
end
@@ -5600,40 +4843,29 @@ function cunntest.RReLU_forward()
local w = math.random(1,100)
local h = math.random(1,100)
- for _,train in ipairs({true,false}) do
- for _,inplace in ipairs({false,true}) do
- local tm = {}
- local title = string.format('RReLU forward %d x %d (inplace: %s, train: %s)',
- w, h, tostring(inplace), tostring(train))
- times[title] = tm
-
- local input = torch.randn(nOutputPlane, h, w) - 0.5
- local sconv = nn.RReLU(1/8, 1/3, inplace)
- if not train then
- sconv:evaluate()
- end
- local groundtruth = sconv:forward(input:clone())
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input:clone())
- end
- tm.cpu = a:time().real
-
- input = input:cuda()
- local gconv = sconv:cuda()
- local rescuda = gconv:forward(input:clone())
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input:clone())
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
-
- if not train then
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state')
+ for k, typename in ipairs(typenames) do
+ for _,train in ipairs({true,false}) do
+ for _,inplace in ipairs({false,true}) do
+ local input = torch.randn(nOutputPlane, h, w):type(typename) - 0.5
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.RReLU(1/8, 1/3, inplace):type(ctype)
+ if not train then
+ sconv:evaluate()
+ end
+ local groundtruth = sconv:forward(input:clone())
+
+ input = input:type(typename)
+ local gconv = sconv:type(typename)
+ local rescuda = gconv:forward(input:clone())
+
+ if not train then
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state %s', typename))
+ end
end
- end
+ end
end
end
@@ -5642,50 +4874,40 @@ function cunntest.RReLU_backward()
local w = math.random(1,10)
local h = math.random(1,10)
- for _,train in ipairs({true,false}) do
- for _,inplace in ipairs({false,true}) do
- local tm = {}
- local title = string.format('RReLU backward %d x %d (inplace: %s, train: %s)',
- w, h, tostring(inplace), tostring(train))
- times[title] = tm
-
- local input = torch.randn(nOutputPlane, h, w)
- local gradOutput = torch.randn(#input) - 0.5
- local sconv = nn.RReLU(1/8, 1/3, inplace)
- if not train then
- sconv:evaluate()
- end
+ for k, typename in ipairs(typenames) do
+ for _,train in ipairs({true,false}) do
+ for _,inplace in ipairs({false,true}) do
+ local ctype = t2cpu[typename]
+ local input = torch.randn(nOutputPlane, h, w):type(typename)
+ local gradOutput = torch.randn(#input):type(typename) - 0.5
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local sconv = nn.RReLU(1/8, 1/3, inplace):type(ctype)
+ if not train then
+ sconv:evaluate()
+ end
- sconv:forward(input:clone())
- local groundgrad = sconv:backward(input, gradOutput:clone())
- local a = torch.Timer()
- for i = 1,nloop do
- groundgrad = sconv:backward(input, gradOutput:clone())
- end
- tm.cpu = a:time().real
+ sconv:forward(input:clone())
+ local groundgrad = sconv:backward(input, gradOutput:clone())
- local gconv = sconv:clone():cuda()
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- gconv:forward(input:clone())
- local rescuda = gconv:backward(input, gradOutput:clone())
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:backward(input, gradOutput:clone())
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local gconv = sconv:clone():type(typename)
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ gconv:forward(input:clone())
+ local rescuda = gconv:backward(input, gradOutput:clone())
- if not train then
- local err = rescuda:float() - groundgrad
- mytester:assertlt(err:abs():max(), precision_backward, 'error on state')
- end
+ if not train then
+ local err = rescuda:double() - groundgrad:double()
+ mytester:assertlt(err:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state with %s', typename))
+ end
- input = -torch.rand(1000):cuda()
- gconv:forward(input) -- fill internal noise tensor
- local g = gconv:backward(input, torch.ones(1000):cuda())
- local err = math.abs(g[input:le(0)]:mean()-(gconv.lower+gconv.upper)/2)
- mytester:assertlt(err, 0.05, 'mean deviation of gradient for negative inputs')
+ input = -torch.rand(1000):type(typename)
+ gconv:forward(input) -- fill internal noise tensor
+ local g = gconv:backward(input, torch.ones(1000):type(typename))
+ local err = math.abs(g[input:le(0)]:mean()-(gconv.lower+gconv.upper)/2)
+ mytester:assertlt(err, 0.05, 'mean deviation of gradient for negative inputs')
+ end
end
end
end
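The last RReLU assertion checks a statistical property rather than an exact value: for negative inputs the backward slope is the sampled factor a ~ U(lower, upper), so its mean over 1000 samples should land near (lower + upper)/2 = (1/8 + 1/3)/2 ≈ 0.229, well within the 0.05 tolerance. A worked restatement of that check:

    -- Worked restatement of the negative-input mean-gradient check above.
    local lower, upper = 1/8, 1/3
    local expected_mean = (lower + upper) / 2           -- ~0.2292
    local x = -torch.rand(1000):cuda()                  -- strictly negative inputs
    local m = nn.RReLU(lower, upper):cuda()             -- training mode by default
    m:forward(x)                                        -- samples the noise tensor
    local g = m:backward(x, torch.ones(1000):cuda())
    assert(math.abs(g:mean() - expected_mean) < 0.05)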
@@ -5705,79 +4927,91 @@ function cunntest.VolumetricFullConvolution_pair_test()
local inChan = math.random(1,32)
local outChan = math.random(1,32)
- local module = nn.VolumetricFullConvolution(inChan, outChan, kT, kH, kW,
- dT, dH, dW, pT, pH, pW);
- module.weight:fill(1);
- module.bias:fill(0.1);
-
- local bs = math.random(8,32)
- local inD = math.random(8,32)
- local inH = math.random(8,32)
- local inW = math.random(8,32)
- local outD = (inD - 1) * dT - 2 * pT + kT
- local outH = (inH - 1) * dH - 2 * pH + kH
- local outW = (inW - 1) * dW - 2 * pW + kW
- local input = torch.Tensor(bs, inChan, inD, inH, inW):fill(1);
- local gradOut = torch.randn(bs, outChan, outD, outH, outW)
-
- local outcpu = module:forward(input)
- local gradcpu = module:backward(input, gradOut)
- module:cuda()
- local outgpu = module:forward(input:cuda())
- local gradgpu = module:backward(input:cuda(), gradOut:cuda())
-
- local error = outgpu:float() - outcpu
- mytester:assertlt(error:abs():max(), precision_forward,
- 'error on state (forward) ')
-
- local error = gradgpu:float() - gradcpu
- mytester:assertlt(error:abs():max(), precision_backward,
- 'error on state (backward) ')
+ for k, typename in ipairs(typenames) do
+ local ctype = t2cpu[typename]
+ local module = nn.VolumetricFullConvolution(inChan, outChan, kT, kH, kW,
+ dT, dH, dW, pT, pH, pW):type(ctype);
+ module.weight:fill(1);
+ module.bias:fill(0.1);
+ module.weight = module.weight:type(typename):type(ctype)
+ module.bias = module.bias:type(typename):type(ctype)
+
+ local bs = math.random(8,32)
+ local inD = math.random(8,32)
+ local inH = math.random(8,32)
+ local inW = math.random(8,32)
+ local outD = (inD - 1) * dT - 2 * pT + kT
+ local outH = (inH - 1) * dH - 2 * pH + kH
+ local outW = (inW - 1) * dW - 2 * pW + kW
+ local input = torch.Tensor(bs, inChan, inD, inH, inW):fill(1):type(typename):type(ctype)
+ local gradOut = torch.randn(bs, outChan, outD, outH, outW):type(typename):type(ctype)
+
+ local outcpu = module:forward(input)
+ local gradcpu = module:backward(input, gradOut)
+ module:type(typename)
+ local outgpu = module:forward(input:type(typename))
+ local gradgpu = module:backward(input:type(typename), gradOut:type(typename))
+
+ local error = outgpu:type(typename) - outcpu:type(typename)
+ mytester:assertlt(error:abs():max(),
+ precision_forward_type(precision_forward, typename, outgpu:abs():max()),
+ string.format('error on state (forward) with %s', typename))
+
+ local error = gradgpu:type(typename) - gradcpu:type(typename)
+ mytester:assertlt(error:abs():max(),
+ precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.VolumetricFullConvolution()
- local module = nn.VolumetricFullConvolution(3, 1, 3, 3, 3, 3, 3, 3);
- module.weight:fill(1);
- module.bias:fill(0.1);
- module:cuda();
-
- local input = torch.Tensor(1, 3, 2, 2, 2):zero();
- for c = 1,3 do
- input[1][c][1][1][1] = 1
- end
- local output = module:forward(input:cuda())
- for t = 1,6 do
- for h = 1,6 do
- for w = 1,6 do
- if t <= 3 and h <= 3 and w <= 3 then
- mytester:assertlt(output[1][1][t][h][w] - 3.1, precision_forward, 'error on forward ')
- else
- mytester:assertlt(output[1][1][t][h][w] - 0.1, precision_forward, 'error on forward ')
+ for k, typename in ipairs(typenames) do
+ local ctype = t2cpu[typename]
+ local module = nn.VolumetricFullConvolution(3, 1, 3, 3, 3, 3, 3, 3):type(ctype);
+ module.weight:fill(1);
+ module.bias:fill(0.1);
+ module:type(typename);
+
+ local input = torch.Tensor(1, 3, 2, 2, 2):zero();
+ for c = 1,3 do
+ input[1][c][1][1][1] = 1
+ end
+ local output = module:forward(input:type(typename))
+ for t = 1,6 do
+ for h = 1,6 do
+ for w = 1,6 do
+ if t <= 3 and h <= 3 and w <= 3 then
+ mytester:assertlt(output[1][1][t][h][w] - 3.1, precision_forward_type(precision_forward, typename),
+ string.format('error on forward with %s', typename))
+ else
+ mytester:assertlt(output[1][1][t][h][w] - 0.1, precision_forward_type(precision_forward, typename),
+ string.format('error on forward with %s', typename))
+ end
end
end
end
- end
- module:zeroGradParameters()
- local gradOut = torch.Tensor(1, 1, 6, 6, 6):fill(0.1);
- local gradIn = module:backward(input:cuda(), gradOut:cuda())
- for t = 1,2 do
- for h = 1,2 do
- for w = 1,2 do
- mytester:assertlt(gradIn[1][1][t][h][w] - 2.7, precision_backward,
- 'error on backward input gradients ')
+ module:zeroGradParameters()
+ local gradOut = torch.Tensor(1, 1, 6, 6, 6):fill(0.1);
+ local gradIn = module:backward(input:type(typename), gradOut:type(typename))
+ for t = 1,2 do
+ for h = 1,2 do
+ for w = 1,2 do
+ mytester:assertlt(gradIn[1][1][t][h][w] - 2.7, precision_backward_type(precision_backward, typename),
+ string.format('error on backward input gradients with %s', typename))
+ end
end
end
- end
- mytester:assertlt(module.gradBias[1] - 21.6, precision_backward,
- 'error on backward gradBias ')
- for c = 1,3 do
- for t = 1,3 do
- for h = 1,3 do
- for w = 1,3 do
- mytester:assertlt(module.gradWeight[c][1][t][h][w] - 0.1, precision_backward,
- 'error on backward weight gradients ')
+ mytester:assertlt(module.gradBias[1] - 21.6, precision_backward_type(precision_backward, typename),
+ string.format('error on backward gradBias with %s', typename))
+ for c = 1,3 do
+ for t = 1,3 do
+ for h = 1,3 do
+ for w = 1,3 do
+ mytester:assertlt(module.gradWeight[c][1][t][h][w] - 0.1, precision_backward_type(precision_backward, typename),
+ string.format('error on backward weight gradients with %s', typename))
+ end
end
end
end
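The constants asserted in this test follow directly from its setup (all weights 1, bias 0.1, stride-3 3x3x3 kernel, and a single 1 at position (1,1,1) of each of the 3 input channels):

    -- Where the asserted constants come from:
    local forward_inside   = 3 * 1 + 0.1       -- 3 input channels * weight 1 + bias   = 3.1
    local forward_outside  = 0.1               -- bias only                            = 0.1
    local grad_input       = 27 * 0.1          -- 3x3x3 gradOut values * weight 1      = 2.7
    local grad_bias        = 6 * 6 * 6 * 0.1   -- sum of gradOut over the 6x6x6 output = 21.6
    local grad_weight_each = 1 * 0.1           -- the single 1 input * gradOut of 0.1  = 0.1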
@@ -5806,35 +5040,46 @@ function cunntest.VolumetricDilatedConvolution()
local inj = (outj - 1) * sj - 2 * padH + dilationH * (kj-1) + 1
local ink = (outk - 1) * sk - 2 * padT + dilationT * (kk-1) + 1
- local input = torch.randn(from,ink,inj,ini)
- local sconv = nn.VolumetricDilatedConvolution(from,to,kk,ki,kj,sk,si,sj,padT,padW,padH,dilationT,dilationW,dilationH)
- local output = sconv:forward(input)
- local gradOutput = output:clone():normal()
- sconv:zeroGradParameters()
- local groundgrad = sconv:backward(input, gradOutput)
- local groundweight = sconv.gradWeight
- local groundbias = sconv.gradBias
+ for k, typename in ipairs(typenames) do
+ local input = torch.randn(from,ink,inj,ini):type(typename)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gconv = nn.VolumetricDilatedConvolution(from,to,kk,ki,kj,sk,si,sj,padT,padW,padH,dilationT,dilationW,dilationH):cuda()
- gconv.weight = sconv.weight:cuda()
- gconv.bias = sconv.bias:cuda()
- local rescuda = gconv:forward(input)
- gconv:zeroGradParameters()
- local gradcuda = gconv:backward(input, gradOutput)
- local weightcuda = gconv.gradWeight
- local biascuda = gconv.gradBias
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local sconv = nn.VolumetricDilatedConvolution(from,to,kk,ki,kj,sk,si,sj,padT,padW,padH,dilationT,dilationW,dilationH):type(ctype)
+ local output = sconv:forward(input)
+ local gradOutput = output:clone():normal()
+ sconv:zeroGradParameters()
+ local groundgrad = sconv:backward(input, gradOutput)
+ local groundweight = sconv.gradWeight
+ local groundbias = sconv.gradBias
- local error = rescuda:float() - output
- local gerror = gradcuda:float() - groundgrad
- local werror = weightcuda:float() - groundweight
- local berror = biascuda:float() - groundbias
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gconv = nn.VolumetricDilatedConvolution(from,to,kk,ki,kj,sk,si,sj,padT,padW,padH,dilationT,dilationW,dilationH):type(typename)
+ gconv.weight = sconv.weight:type(typename)
+ gconv.bias = sconv.bias:type(typename)
+ local rescuda = gconv:forward(input)
+ gconv:zeroGradParameters()
+ local gradcuda = gconv:backward(input, gradOutput)
+ local weightcuda = gconv.gradWeight
+ local biascuda = gconv.gradBias
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state (forward) ')
- mytester:assertlt(gerror:abs():max(), precision_backward, 'error on state (backward) ')
- mytester:assertlt(werror:abs():max(), precision_backward, 'error on weight (backward) ')
- mytester:assertlt(berror:abs():max(), precision_backward, 'error on bias (backward) ')
+ local error = rescuda:double() - output:double()
+ local gerror = gradcuda:double() - groundgrad:double()
+ local werror = weightcuda:double() - groundweight:double()
+ local berror = biascuda:double() - groundbias:double()
+
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ mytester:assertlt(gerror:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ mytester:assertlt(werror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, weightcuda:abs():max()),
+ string.format('error on weight (backward) with %s', typename))
+ mytester:assertlt(berror:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, biascuda:abs():max()),
+ string.format('error on bias (backward) with %s', typename))
+ end
end
function cunntest.LookupTable_forward()
@@ -5842,31 +5087,21 @@ function cunntest.LookupTable_forward()
local nDim = 100
local nInput = 1000
- local tm = {}
- local title = string.format('LookupTable forward %d x %d', nVocab, nDim)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.LongTensor(nInput):random(nVocab)
- local input = torch.LongTensor(nInput):random(nVocab)
- local sconv = nn.LookupTable(nVocab, nDim)
- local groundtruth = sconv:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = sconv:forward(input)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ local sconv = nn.LookupTable(nVocab, nDim):type(ctype)
+ local groundtruth = sconv:forward(input)
- input = input:cuda()
- local gconv = sconv:cuda()
- local rescuda = gconv:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gconv:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:cuda()
+ local gconv = sconv:type(typename)
+ local rescuda = gconv:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(), precision_forward, 'error on state')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(), precision_forward_type(precision_forward, typename),
+ string.format('error on state with %s', typename))
+ end
end
function cunntest.LookupTable_backward()
@@ -5886,73 +5121,69 @@ function cunntest.LookupTable_backward()
s[k] = v[torch.random(#v)]
end
- local input, gradOutput
- if s.batch then
- input = torch.LongTensor(s.nInput, 5):random(s.nVocab)
- gradOutput = torch.randn(s.nInput, 5, s.nDim)
- else
- input = torch.LongTensor(s.nInput):random(s.nVocab)
- gradOutput = torch.randn(s.nInput, s.nDim)
- end
-
- local sconv = nn.LookupTable(s.nVocab, s.nDim, s.paddingValue)
- local gconv = sconv:clone():cuda()
- if s.scaleGradByFreq then
- sconv = sconv:scaleGradByFreq()
- gconv = gconv:scaleGradByFreq()
- end
+ for k, typename in ipairs(typenames) do
+ local ctype = t2cpu[typename]
+ local input, gradOutput
+ if s.batch then
+ input = torch.LongTensor(s.nInput, 5):random(s.nVocab)
+ gradOutput = torch.randn(s.nInput, 5, s.nDim):type(typename):type(ctype)
+ else
+ input = torch.LongTensor(s.nInput):random(s.nVocab)
+ gradOutput = torch.randn(s.nInput, s.nDim):type(typename):type(ctype)
+ end
- sconv:forward(input)
- sconv:backward(input, gradOutput)
+ local sconv = nn.LookupTable(s.nVocab, s.nDim, s.paddingValue):type(ctype)
+ local gconv = sconv:clone():type(typename)
+ if s.scaleGradByFreq then
+ sconv = sconv:scaleGradByFreq()
+ gconv = gconv:scaleGradByFreq()
+ end
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- gconv:forward(input)
- gconv:backward(input, gradOutput)
+ sconv:forward(input)
+ sconv:backward(input, gradOutput)
- local weightGradError = gconv.gradWeight:float() - sconv.gradWeight
- mytester:assertlt(weightGradError:abs():max(), precision_backward,
- 'error on weight for size ' .. tostring(s.nInput) ..
- ' nVocab: ' .. tostring(s.nVocab) ..
- ' nDim ' .. tostring(s.nDim) ..
- ' scaleGradByFreq: ' .. tostring(s.scaleGradByFreq) ..
- ' batch: ' .. tostring(s.batch) ..
- ' paddingValue: ' .. tostring(s.paddingValue))
+ input = input:cuda()
+ gradOutput = gradOutput:type(typename)
+ gconv:forward(input)
+ gconv:backward(input, gradOutput)
+
+ local weightGradError = gconv.gradWeight:double() - sconv.gradWeight:double()
+ mytester:assertlt(weightGradError:abs():max(),
+ precision_backward_conv_weightbias(precision_backward, typename, gconv.gradWeight:abs():max()),
+ 'error on weight for size ' .. tostring(s.nInput) ..
+ ' nVocab: ' .. tostring(s.nVocab) ..
+ ' nDim ' .. tostring(s.nDim) ..
+ ' scaleGradByFreq: ' .. tostring(s.scaleGradByFreq) ..
+ ' batch: ' .. tostring(s.batch) ..
+ ' paddingValue: ' .. tostring(s.paddingValue) ..
+ ' type:' .. typename)
+ end
end
local nVocab = 10000
local nDim = 128
local nInput = 1000
- local tm = {}
- local title = string.format('LookupTable backward %d x %d', nVocab, nDim, nInput)
- times[title] = tm
- local input = torch.LongTensor(nInput):random(nVocab)
- local gradOutput = torch.randn(nInput, nDim)
- local sconv = nn.LookupTable(nVocab, nDim)
- local gconv = sconv:clone():cuda()
+ for k, typename in ipairs(typenames) do
+ local input = torch.LongTensor(nInput):random(nVocab)
- sconv:forward(input)
- sconv:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
- sconv:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ local gradOutput = torch.randn(nInput, nDim):type(ctype)
+ local sconv = nn.LookupTable(nVocab, nDim):type(ctype)
+ local gconv = sconv:clone():type(typename)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- gconv:forward(input)
- gconv:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
- gconv:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ sconv:forward(input)
+ sconv:backward(input, gradOutput)
+
+ input = input:cuda()
+ gradOutput = gradOutput:type(typename)
+ gconv:forward(input)
+ gconv:backward(input, gradOutput)
- local weightGradError = gconv.gradWeight:float() - sconv.gradWeight
- mytester:assertlt(weightGradError:abs():max(), precision_backward, 'error on weight')
+ local weightGradError = gconv.gradWeight:double() - sconv.gradWeight:double()
+ mytester:assertlt(weightGradError:abs():max(), precision_backward_type(precision_backward, typename),
+ string.format('error on weight with %s', typename))
+ end
end
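The half-aware comparisons above go through precision_backward_type and precision_backward_conv_weightbias rather than the flat precision_backward constant. A minimal sketch of what such tolerance helpers can look like, assuming half-precision results simply get a wider bound that, for accumulated weight gradients, scales with the largest magnitude being compared; the constants below are illustrative only, not taken from this test file:

-- Illustrative sketch only; the real helpers are defined elsewhere in test.lua.
local function sketch_precision_type(default_tol, typename)
   if typename == 'torch.CudaHalfTensor' then
      return 1e-2              -- half carries roughly 3 decimal digits
   end
   return default_tol
end

local function sketch_precision_weightbias(default_tol, typename, max_abs)
   if typename == 'torch.CudaHalfTensor' then
      -- accumulated gradients grow with the input size, so scale the bound
      return 2e-3 * math.max(1, max_abs)
   end
   return default_tol
end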
function cunntest.getParameters()
@@ -5980,36 +5211,23 @@ function cunntest.SpatialReflectionPadding_forward()
local padT = math.random(-3,3)
local padB = math.random(-3,3)
- local tm = {}
- local title =
- string.format(
- 'SpatialReflectionPadding.forward %dx%dx%dx%d -> %dx%dx%dx%d',
- batch, plane, sizeY, sizeX,
- batch, plane, sizeY + padT + padB, sizeX + padL + padR)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.rand(batch, plane, sizeY, sizeX):type(typename)
- local input = torch.rand(batch, plane, sizeY, sizeX)
- local module = nn.SpatialReflectionPadding(padL, padR, padT, padB)
- local groundtruth = module:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = module:forward(input)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local module = nn.SpatialReflectionPadding(padL, padR, padT, padB):type(ctype)
+ local groundtruth = module:forward(input)
- input = input:cuda()
- local gmodule = nn.SpatialReflectionPadding(padL, padR, padT, padB):cuda()
- local rescuda = gmodule:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gmodule:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gmodule = nn.SpatialReflectionPadding(padL, padR, padT, padB):type(typename)
+ local rescuda = gmodule:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(),
- precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(),
+ precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.SpatialReflectionPadding_backward()
@@ -6022,46 +5240,32 @@ function cunntest.SpatialReflectionPadding_backward()
local padT = math.random(-3,3)
local padB = math.random(-3,3)
- local tm = {}
- local title =
- string.format(
- 'SpatialReflectionPadding.backward %dx%dx%dx%d -> %dx%dx%dx%d',
- batch, plane, sizeY, sizeX,
- batch, plane, sizeY + padT + padB, sizeX + padL + padR)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.rand(batch, plane, sizeY, sizeX):type(typename)
+ local gradOutput = torch.rand(
+ batch, plane, sizeY + padT + padB, sizeX + padL + padR
+ ):type(typename)
- local input = torch.rand(batch, plane, sizeY, sizeX)
- local gradOutput = torch.rand(
- batch, plane, sizeY + padT + padB, sizeX + padL + padR
- )
- local module = nn.SpatialReflectionPadding(padL, padR, padT, padB)
- module:forward(input)
- module:zeroGradParameters()
- local groundgrad = module:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
- module:zeroGradParameters()
- groundgrad = module:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local module = nn.SpatialReflectionPadding(padL, padR, padT, padB):type(ctype)
+ module:forward(input)
+ module:zeroGradParameters()
+ local groundgrad = module:backward(input, gradOutput)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gmodule = nn.SpatialReflectionPadding(padL, padR, padT, padB):cuda()
- gmodule:forward(input)
- gmodule:zeroGradParameters()
- local rescuda = gmodule:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
- gmodule:zeroGradParameters()
- rescuda = gmodule:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gmodule = nn.SpatialReflectionPadding(padL, padR, padT, padB):type(typename)
+ gmodule:forward(input)
+ gmodule:zeroGradParameters()
+ local rescuda = gmodule:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
- mytester:assertlt(error:abs():max(),
- precision_backward, 'error on state (backward) ')
+ local error = rescuda:double() - groundgrad:double()
+ mytester:assertlt(error:abs():max(),
+ precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.SpatialReplicationPadding_forward()
@@ -6074,36 +5278,23 @@ function cunntest.SpatialReplicationPadding_forward()
local padT = math.random(-3,3)
local padB = math.random(-3,3)
- local tm = {}
- local title =
- string.format(
- 'SpatialReplicationPadding.forward %dx%dx%dx%d -> %dx%dx%dx%d',
- batch, plane, sizeY, sizeX,
- batch, plane, sizeY + padT + padB, sizeX + padL + padR)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.rand(batch, plane, sizeY, sizeX):type(typename)
- local input = torch.rand(batch, plane, sizeY, sizeX)
- local module = nn.SpatialReplicationPadding(padL, padR, padT, padB)
- local groundtruth = module:forward(input)
- local a = torch.Timer()
- for i = 1,nloop do
- groundtruth = module:forward(input)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local module = nn.SpatialReplicationPadding(padL, padR, padT, padB):type(ctype)
+ local groundtruth = module:forward(input)
- input = input:cuda()
- local gmodule = nn.SpatialReplicationPadding(padL, padR, padT, padB):cuda()
- local rescuda = gmodule:forward(input)
- a:reset()
- for i = 1,nloop do
- rescuda = gmodule:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gmodule = nn.SpatialReplicationPadding(padL, padR, padT, padB):type(typename)
+ local rescuda = gmodule:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(),
- precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(),
+ precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.SpatialReplicationPadding_backward()
@@ -6116,46 +5307,32 @@ function cunntest.SpatialReplicationPadding_backward()
local padT = math.random(-3,3)
local padB = math.random(-3,3)
- local tm = {}
- local title =
- string.format(
- 'SpatialReplicationPadding.backward %dx%dx%dx%d -> %dx%dx%dx%d',
- batch, plane, sizeY, sizeX,
- batch, plane, sizeY + padT + padB, sizeX + padL + padR)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.rand(batch, plane, sizeY, sizeX):type(typename)
+ local gradOutput = torch.rand(
+ batch, plane, sizeY + padT + padB, sizeX + padL + padR
+ ):type(typename)
- local input = torch.rand(batch, plane, sizeY, sizeX)
- local gradOutput = torch.rand(
- batch, plane, sizeY + padT + padB, sizeX + padL + padR
- )
- local module = nn.SpatialReplicationPadding(padL, padR, padT, padB)
- module:forward(input)
- module:zeroGradParameters()
- local groundgrad = module:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1,nloop do
- module:zeroGradParameters()
- groundgrad = module:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local module = nn.SpatialReplicationPadding(padL, padR, padT, padB):type(ctype)
+ module:forward(input)
+ module:zeroGradParameters()
+ local groundgrad = module:backward(input, gradOutput)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gmodule = nn.SpatialReplicationPadding(padL, padR, padT, padB):cuda()
- gmodule:forward(input)
- gmodule:zeroGradParameters()
- local rescuda = gmodule:backward(input, gradOutput)
- a:reset()
- for i = 1,nloop do
- gmodule:zeroGradParameters()
- rescuda = gmodule:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gmodule = nn.SpatialReplicationPadding(padL, padR, padT, padB):type(typename)
+ gmodule:forward(input)
+ gmodule:zeroGradParameters()
+ local rescuda = gmodule:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
- mytester:assertlt(error:abs():max(),
- precision_backward, 'error on state (backward) ')
+ local error = rescuda:double() - groundgrad:double()
+ mytester:assertlt(error:abs():max(),
+ precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
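The reflection- and replication-padding tests above all follow the same shape: draw the input, down-convert it to the matching CPU type from t2cpu for the reference module, run both CPU and CUDA modules, and compare in double. A hypothetical helper capturing the forward-path pattern (compareForward and makeModule are illustrative names, not part of this test file; the actual tests keep these steps inline):

-- Hypothetical refactoring sketch of the repeated forward-comparison pattern.
local function compareForward(makeModule, input, typename, tol)
   local ctype = t2cpu[typename]
   -- round-trip through the CUDA type first so any half-precision truncation
   -- also reaches the CPU reference input
   input = input:type(typename):type(ctype)
   local cpuMod = makeModule():type(ctype)
   local groundtruth = cpuMod:forward(input)

   input = input:type(typename)
   local gpuMod = makeModule():type(typename)
   local rescuda = gpuMod:forward(input)

   local err = rescuda:double() - groundtruth:double()
   mytester:assertlt(err:abs():max(), tol,
      string.format('error on state (forward) with %s', typename))
end

-- e.g. compareForward(
--        function() return nn.SpatialReflectionPadding(padL, padR, padT, padB) end,
--        torch.rand(batch, plane, sizeY, sizeX), typename,
--        precision_forward_type(precision_forward, typename))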
function cunntest.VolumetricReplicationPadding_forward()
@@ -6171,40 +5348,25 @@ function cunntest.VolumetricReplicationPadding_forward()
local pfront = math.random(-3,3)
local pback = math.random(-3,3)
- local tm = {}
- local title =
- string.format(
- 'VolumetricReplicationPadding.forward %dx%dx%dx%dx%d -> ' ..
- '%dx%dx%dx%dx%d',
- batch, plane, sizeZ, sizeY, sizeX,
- batch, plane, sizeZ + pfront + pback, sizeY + ptop + pbottom,
- sizeX + pleft + pright)
- times[title] = tm
+ for k, typename in ipairs(typenames) do
+ local input = torch.rand(batch, plane, sizeZ, sizeY, sizeX):type(typename)
- local input = torch.rand(batch, plane, sizeZ, sizeY, sizeX)
- local module = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom,
- pfront, pback)
- local groundtruth = module:forward(input)
- local a = torch.Timer()
- for i = 1, nloop do
- groundtruth = module:forward(input)
- end
- tm.cpu = a:time().real
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ local module = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom,
+ pfront, pback):type(ctype)
+ local groundtruth = module:forward(input)
- input = input:cuda()
- local gmodule = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom,
- pfront, pback):cuda()
- local rescuda = gmodule:forward(input)
- a:reset()
- for i = 1, nloop do
- rescuda = gmodule:forward(input)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ input = input:type(typename)
+ local gmodule = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom,
+ pfront, pback):type(typename)
+ local rescuda = gmodule:forward(input)
- local error = rescuda:float() - groundtruth
- mytester:assertlt(error:abs():max(),
- precision_forward, 'error on state (forward) ')
+ local error = rescuda:double() - groundtruth:double()
+ mytester:assertlt(error:abs():max(),
+ precision_forward_type(precision_forward, typename),
+ string.format('error on state (forward) with %s', typename))
+ end
end
function cunntest.VolumetricReplicationPadding_backward()
@@ -6220,51 +5382,35 @@ function cunntest.VolumetricReplicationPadding_backward()
local pfront = math.random(-3,3)
local pback = math.random(-3,3)
- local tm = {}
- local title =
- string.format(
- 'VolumetricReplicationPadding.backward %dx%dx%dx%dx%d -> ' ..
- '%dx%dx%dx%dx%d',
- batch, plane, sizeZ, sizeY, sizeX,
- batch, plane, sizeZ + pfront + pback, sizeY + ptop + pbottom,
- sizeX + pleft + pright)
- times[title] = tm
-
- local input = torch.rand(batch, plane, sizeZ, sizeY, sizeX)
- local gradOutput = torch.rand(
- batch, plane, sizeZ + pfront + pback, sizeY + ptop + pbottom,
- sizeX + pleft + pright
- )
- local module = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom,
- pfront, pback)
- module:forward(input)
- module:zeroGradParameters()
- local groundgrad = module:backward(input, gradOutput)
- local a = torch.Timer()
- for i = 1, nloop do
+ for k, typename in ipairs(typenames) do
+ local input = torch.rand(batch, plane, sizeZ, sizeY, sizeX):type(typename)
+ local gradOutput = torch.rand(
+ batch, plane, sizeZ + pfront + pback, sizeY + ptop + pbottom,
+ sizeX + pleft + pright
+ ):type(typename)
+
+ local ctype = t2cpu[typename]
+ input = input:type(ctype)
+ gradOutput = gradOutput:type(ctype)
+ local module = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom,
+ pfront, pback):type(ctype)
+ module:forward(input)
module:zeroGradParameters()
- groundgrad = module:backward(input, gradOutput)
- end
- tm.cpu = a:time().real
+ local groundgrad = module:backward(input, gradOutput)
- input = input:cuda()
- gradOutput = gradOutput:cuda()
- local gmodule = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom,
- pfront, pback):cuda()
- gmodule:forward(input)
- gmodule:zeroGradParameters()
- local rescuda = gmodule:backward(input, gradOutput)
- a:reset()
- for i = 1, nloop do
+ input = input:type(typename)
+ gradOutput = gradOutput:type(typename)
+ local gmodule = nn.VolumetricReplicationPadding(pleft, pright, ptop, pbottom,
+ pfront, pback):type(typename)
+ gmodule:forward(input)
gmodule:zeroGradParameters()
- rescuda = gmodule:backward(input, gradOutput)
- end
- cutorch.synchronize()
- tm.gpu = a:time().real
+ local rescuda = gmodule:backward(input, gradOutput)
- local error = rescuda:float() - groundgrad
- mytester:assertlt(error:abs():max(),
- precision_backward, 'error on state (backward) ')
+ local error = rescuda:double() - groundgrad:double()
+ mytester:assertlt(error:abs():max(),
+ precision_backward_type(precision_backward, typename),
+ string.format('error on state (backward) with %s', typename))
+ end
end
function cunntest.GPU()
@@ -6508,6 +5654,7 @@ function nn.testcuda(tests, print_timing, n_loop, seed)
nloop = n_loop or nloop
local oldtype = torch.getdefaulttensortype()
torch.setdefaulttensortype('torch.FloatTensor')
+ checkHalf()
initSeed(seed)
mytester = torch.Tester()
mytester:add(cunntest)
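The new checkHalf() call is what pulls half precision into all of the typed loops above. A plausible sketch of such a gate, assuming it only registers 'torch.CudaHalfTensor' when the installed cutorch build reports half support; the actual definition lives earlier in test.lua:

-- Plausible sketch; the real implementation may differ.
local function checkHalf()
   if cutorch.hasHalf then
      table.insert(typenames, 'torch.CudaHalfTensor')
      t2cpu['torch.CudaHalfTensor'] = 'torch.FloatTensor'
   end
end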