40 files changed, 2981 insertions, 1669 deletions
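Most of this change genericizes cutorch over all CUDA tensor types (byte, char, short, int, long, float, double, half) using TH's preprocessor convention: a single generic/ source body is re-included once per type with real/Real/CReal redefined, and concatenation macros such as cutorch_TensorOperator_(NAME) paste the type name into every symbol (see TensorOperator.c below). The following is a minimal self-contained sketch of that mechanism, assuming hypothetical demo_* names in place of the real TH_CONCAT_4 machinery:

    /* generic_demo.c: sketch of the TH "generic" include pattern.
       All demo_* names are illustrative, not actual THC macros. */
    #include <stdio.h>

    /* Two-level paste so `Real` is macro-expanded before ##,
       the same trick TH_CONCAT_4 relies on. */
    #define PASTE3_(x, y, z) x##y##z
    #define PASTE3(x, y, z)  PASTE3_(x, y, z)
    #define demo_(NAME)      PASTE3(demo_, Real, NAME)

    /* The "generic" body; in THC this lives in generic/TensorOperator.c
       and is pulled in once per type via THCGenerateAllTypes.h. */
    #define real float
    #define Real Float
    static real demo_(add)(real a, real b) { return a + b; } /* demo_Floatadd */
    #undef real
    #undef Real

    #define real double
    #define Real Double
    static real demo_(add)(real a, real b) { return a + b; } /* demo_Doubleadd */
    #undef real
    #undef Real

    int main(void) {
      printf("%g %g\n", demo_Floatadd(1.0f, 2.0f), demo_Doubleadd(1.0, 2.0));
      return 0;
    }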
diff --git a/TensorMath.lua b/TensorMath.lua index 17d7547..9cc8305 100644 --- a/TensorMath.lua +++ b/TensorMath.lua @@ -29,13 +29,13 @@ local unpack = unpack or table.unpack -- specific to CUDA local typenames = {'CudaByteTensor', - 'CudaCharTensor', - 'CudaShortTensor', - 'CudaIntTensor', - 'CudaLongTensor', - 'CudaTensor', - 'CudaDoubleTensor', - 'CudaHalfTensor' + 'CudaCharTensor', + 'CudaShortTensor', + 'CudaIntTensor', + 'CudaLongTensor', + 'CudaTensor', + 'CudaDoubleTensor', + 'CudaHalfTensor' } for _, typename in ipairs(typenames) do @@ -430,12 +430,21 @@ local handledTypereals = {'unsigned char', 'double', 'half' } +local handledTypeaccreals = {'long', + 'long', + 'long', + 'long', + 'long', + 'double', + 'float' +} for k, Tensor in pairs(handledTypenames) do if Tensor == 'CudaHalfTensor' then interface:print("#ifdef CUDA_HALF_TENSOR") end local real = handledTypereals[k] + local accreal = handledTypeaccreals[k] function interface.luaname2wrapname(self, name) return string.format('cutorch_%s_%s', Tensor, name) @@ -515,6 +524,18 @@ for k, Tensor in pairs(handledTypenames) do {name=real, default=1}, {name=Tensor}}) + wrap("mul", + cname("mul"), + {{name=Tensor, default=true, returned=true, method={default='nil'}}, + {name=Tensor, method={default=1}}, + {name=real}}) + + wrap("div", + cname("div"), + {{name=Tensor, default=true, returned=true, method={default='nil'}}, + {name=Tensor, method={default=1}}, + {name=real}}) + for _, name in ipairs({"cmul", "cpow", "cdiv"}) do wrap(name, cname(name), @@ -523,6 +544,224 @@ for k, Tensor in pairs(handledTypenames) do {name=Tensor}}) end + for _,name in ipairs({"min", "max"}) do + wrap(name, + cname(name .. "all"), + {{name=Tensor}, + {name=real, creturned=true}}, + cname(name), + {{name=Tensor, default=true, returned=true}, + {name='CudaLongTensor', default=true, returned=true}, + {name=Tensor}, + {name="index"}}) + end + + if Tensor == 'CudaByteTensor' then + for _,name in pairs({'all', 'any'}) do + wrap(name, + cname('logical' .. name), + {{name=Tensor}, + {name="boolean", creturned=true}}) + end + end + + for _,name in pairs({'lt','gt','le','ge','eq','ne'}) do + wrap(name, + cname(name .. 'Value'), + {{name='CudaByteTensor',default=true, returned=true}, + {name=Tensor}, + {name=real}}, + cname(name .. 'ValueT'), + {{name=Tensor, returned=true}, + {name=Tensor}, + {name=real}}, + cname(name .. 'Tensor'), + {{name='CudaByteTensor',default=true, returned=true}, + {name=Tensor}, + {name=Tensor}}, + cname(name .. 
'TensorT'), + {{name=Tensor, returned=true}, + {name=Tensor}, + {name=Tensor}}) + end + + wrap("sum", + cname("sumall"), + {{name=Tensor}, + {name=accreal, creturned=true}}, + cname("sum"), + {{name=Tensor, default=true, returned=true}, + {name=Tensor}, + {name="index"}}) + + wrap("prod", + cname("prodall"), + {{name=Tensor}, + {name=accreal, creturned=true}}, + cname("prod"), + {{name=Tensor, default=true, returned=true}, + {name=Tensor}, + {name="index"}}) + + wrap("maskedFill", + cname("maskedFill"), + {{name=Tensor, returned=true, method={default='nil'}}, + {name='CudaByteTensor'}, + {name=real}}) + + wrap("maskedCopy", + cname("maskedCopy"), + {{name=Tensor, returned=true, method={default='nil'}}, + {name='CudaByteTensor'}, + {name=Tensor}}) + + wrap("maskedSelect", + cname("maskedSelect"), + {{name=Tensor, returned=true, default=true}, + {name=Tensor}, + {name='CudaByteTensor'}}) + + -- BLAS functions + if real == 'float' or real == 'double' or real == 'half' then + wrap("mv", + cname("addmv"), + {{name=Tensor, default=true, returned=true, method={default='nil'}, + init=function(arg) + return table.concat( + { + arg.__metatable.init(arg), + string.format("TH%s_checkGPU(cutorch_getstate(L), 1, %s);", + Tensor, arg.args[5]:carg()), + string.format("TH%s_resize1d(cutorch_getstate(L), %s, %s->size[0]);", Tensor, arg:carg(), arg.args[5]:carg()) + }, '\n') + end, + precall=function(arg) + return table.concat( + { + string.format("TH%s_zero(cutorch_getstate(L), %s);", Tensor, arg:carg()), + arg.__metatable.precall(arg) + }, '\n') + end + }, + {name=real, default=1, invisible=true}, + {name=Tensor, default=1, invisible=true}, + {name=real, default=1, invisible=true}, + {name=Tensor, dim=2}, + {name=Tensor, dim=1}} + ) + + wrap("mm", + cname("addmm"), + {{name=Tensor, default=true, returned=true, method={default='nil'}, + init=function(arg) + return table.concat( + { + arg.__metatable.init(arg), + string.format("TH%s_checkGPU(cutorch_getstate(L), 2, %s, %s);", + Tensor, arg.args[5]:carg(), arg.args[6]:carg()), + string.format("TH%s_resize2d(cutorch_getstate(L), %s, %s->size[0], %s->size[1]);", + Tensor, arg:carg(), arg.args[5]:carg(), arg.args[6]:carg()) + }, '\n') + end, + }, + {name=real, default=0, invisible=true}, + {name=Tensor, default=1, invisible=true}, + {name=real, default=1, invisible=true}, + {name=Tensor, dim=2}, + {name=Tensor, dim=2}} + ) + + wrap("bmm", + cname("baddbmm"), + {{name=Tensor, default=true, returned=true, method={default='nil'}, + init=function(arg) + return table.concat( + { + arg.__metatable.init(arg), + string.format("TH%s_checkGPU(cutorch_getstate(L), 2, %s, %s);", + Tensor, arg.args[5]:carg(), arg.args[6]:carg()), + string.format("TH%s_resize3d(cutorch_getstate(L), %s, %s->size[0], %s->size[1], %s->size[2]);", + Tensor, arg:carg(), arg.args[5]:carg(), arg.args[5]:carg(), arg.args[6]:carg()) + }, '\n') + end, + }, + {name=real, default=0, invisible=true}, + {name=Tensor, default=1, invisible=true}, + {name=real, default=1, invisible=true}, + {name=Tensor, dim=3}, + {name=Tensor, dim=3}} + ) + + wrap("ger", + cname("addr"), + {{name=Tensor, default=true, returned=true, method={default='nil'}, + init=function(arg) + return table.concat( + { + arg.__metatable.init(arg), + string.format("TH%s_checkGPU(cutorch_getstate(L), 2, %s, %s);", + Tensor, arg.args[5]:carg(), arg.args[6]:carg()), + string.format("TH%s_resize2d(cutorch_getstate(L), %s, %s->size[0], %s->size[0]);", Tensor, arg:carg(), arg.args[5]:carg(), arg.args[6]:carg()) + }, '\n') + end, + 
precall=function(arg) + return table.concat( + { + string.format("TH%s_zero(cutorch_getstate(L), %s);", Tensor, arg:carg()), + arg.__metatable.precall(arg) + }, '\n') + end + }, + {name=real, default=1, invisible=true}, + {name=Tensor, default=1, invisible=true}, + {name=real, default=1, invisible=true}, + {name=Tensor, dim=1}, + {name=Tensor, dim=1}} + ) + + for _,f in ipairs({ + {name="addmv", dim1=1, dim2=2, dim3=1}, + {name="addmm", dim1=2, dim2=2, dim3=2}, + {name="addr", dim1=2, dim2=1, dim3=1}, + {name="baddbmm", dim1=3, dim2=3, dim3=3}, + {name="addbmm", dim1=2, dim2=3, dim3=3}, + } + ) do + + interface:wrap(f.name, + cname(f.name), + {{name=Tensor, default=true, returned=true}, + {name=real, default=1}, + {name=Tensor, dim=f.dim1}, + {name=real, default=1}, + {name=Tensor, dim=f.dim2}, + {name=Tensor, dim=f.dim3}}) + + -- there is an ambiguity here, hence the more complicated setup + method:wrap(f.name, + cname(f.name), + {{name=Tensor, returned=true, dim=f.dim1}, + {name=real, default=1, invisible=true}, + {name=Tensor, default=1, dim=f.dim1}, + {name=real, default=1}, + {name=Tensor, dim=f.dim2}, + {name=Tensor, dim=f.dim3}}, + cname(f.name), + {{name=Tensor, returned=true, dim=f.dim1}, + {name=real}, + {name=Tensor, default=1, dim=f.dim1}, + {name=real}, + {name=Tensor, dim=f.dim2}, + {name=Tensor, dim=f.dim3}}) + end + end + + wrap("dot", + cname("dot"), + {{name=Tensor}, + {name=Tensor}, + {name=real, creturned=true}}) + method:register("m_cutorch_" .. Tensor .. "Math__") interface:print(method:tostring()) method:clearhistory() @@ -677,20 +916,20 @@ wrap("addcdiv", wrap("maskedFill", cname("maskedFill"), {{name=Tensor, returned=true, method={default='nil'}}, - {name=Tensor}, + {name='CudaByteTensor'}, {name=real}}) wrap("maskedCopy", cname("maskedCopy"), {{name=Tensor, returned=true, method={default='nil'}}, - {name=Tensor}, + {name='CudaByteTensor'}, {name=Tensor}}) wrap("maskedSelect", cname("maskedSelect"), {{name=Tensor, returned=true, default=true}, {name=Tensor}, - {name=Tensor}}) + {name='CudaByteTensor'}}) wrap("gather", cname("gather"), @@ -914,7 +1153,7 @@ for _,name in ipairs({"min", "max"}) do {name=real, creturned=true}}, cname(name), {{name=Tensor, default=true, returned=true}, - {name=Tensor, default=true, returned=true}, + {name='CudaLongTensor', default=true, returned=true}, {name=Tensor}, {name="index"}}) end @@ -1018,20 +1257,21 @@ wrap("clamp", for _,name in pairs({'lt','gt','le','ge','eq','ne'}) do wrap(name, cname(name .. 'Value'), - {{name=Tensor, default=true, returned=true}, - {name=Tensor}, - {name=real}}, + {{name='CudaByteTensor',default=true, returned=true}, + {name=Tensor}, + {name=real}}, + cname(name .. 'ValueT'), + {{name=Tensor, returned=true}, + {name=Tensor}, + {name=real}}, cname(name .. 'Tensor'), - {{name=Tensor, default=true, returned=true}, - {name=Tensor}, - {name=Tensor}}) -end - -for _,name in pairs({'all', 'any'}) do - wrap(name, - cname('logical' .. name), - {{name=Tensor}, - {name="boolean", creturned=true}}) + {{name='CudaByteTensor',default=true, returned=true}, + {name=Tensor}, + {name=Tensor}}, + cname(name .. 
'TensorT'), + {{name=Tensor, returned=true}, + {name=Tensor}, + {name=Tensor}}) end wrap("cat", diff --git a/TensorOperator.c b/TensorOperator.c index bbd33b5..ae7c2b3 100644 --- a/TensorOperator.c +++ b/TensorOperator.c @@ -2,180 +2,12 @@ #include "luaT.h" #include "THC.h" -static int cutorch_CudaTensorOperator___add__(lua_State *L) -{ - THCudaTensor *tensor1 = luaT_toudata(L, 1, "torch.CudaTensor"); - THCudaTensor *tensor2 = luaT_toudata(L, 2, "torch.CudaTensor"); - THCudaTensor *r; - THCState *state = cutorch_getstate(L); - THAssert(THCudaTensor_checkGPU(state, 2, tensor1, tensor2)); +#include "THCTensorMath.h" - if(!tensor1 && !tensor2) - luaL_error(L, "expecting two Tensors or one Tensor and one number"); - else - { - r = THCudaTensor_new(state); - luaT_pushudata(L, r, "torch.CudaTensor"); +#define cutorch_TensorOperator_(NAME) TH_CONCAT_4(cutorch_,CReal,TensorOperator_,NAME) +#define torch_Tensor_(NAME) TH_CONCAT_4(torch_,CReal,Tensor_,NAME) +#define torch_Tensor TH_CONCAT_STRING_3(torch.,CReal,Tensor) +#define cutorch_Tensor_(NAME) TH_CONCAT_4(cutorch_,CReal,Tensor_,NAME) - if(!tensor1 && tensor2) - { - THCudaTensor_resizeAs(state, r, tensor2); - THCudaTensor_copy(state, r, tensor2); - THCudaTensor_add(state, r, r, luaL_checknumber(L, 1)); - } - else if(tensor1 && !tensor2) - { - THCudaTensor_resizeAs(state, r, tensor1); - THCudaTensor_copy(state, r, tensor1); - THCudaTensor_add(state, r, r, luaL_checknumber(L, 2)); - } - else - { - THCudaTensor_resizeAs(state, r, tensor1); - THCudaTensor_copy(state, r, tensor1); - THCudaTensor_cadd(state, r, r, 1, tensor2); - } - } - return 1; -} - -static int cutorch_CudaTensorOperator___sub__(lua_State *L) -{ - THCudaTensor *tensor1 = luaT_toudata(L, 1, "torch.CudaTensor"); - THCudaTensor *tensor2 = luaT_toudata(L, 2, "torch.CudaTensor"); - THCudaTensor *r; - THCState *state = cutorch_getstate(L); - THAssert(THCudaTensor_checkGPU(state, 2, tensor1, tensor2)); - - if(!tensor1 && !tensor2) - luaL_error(L, "expecting two Tensors or one Tensor and one number"); - else - { - r = THCudaTensor_new(state); - luaT_pushudata(L, r, "torch.CudaTensor"); - - if(!tensor1 && tensor2) - { - THCudaTensor_resizeAs(state, r, tensor2); - THCudaTensor_fill(state, r, luaL_checknumber(L, 1)); - THCudaTensor_cadd(state, r, r, -1, tensor2); - } - else if(tensor1 && !tensor2) - { - THCudaTensor_resizeAs(state, r, tensor1); - THCudaTensor_copy(state, r, tensor1); - THCudaTensor_add(state, r, r, -luaL_checknumber(L, 2)); - } - else - { - THCudaTensor_resizeAs(state, r, tensor1); - THCudaTensor_copy(state, r, tensor1); - THCudaTensor_cadd(state, r, r, -1, tensor2); - } - } - return 1; -} - -static int cutorch_CudaTensorOperator___unm__(lua_State *L) -{ - THCudaTensor *tensor = luaT_checkudata(L, 1, "torch.CudaTensor"); - THCudaTensor *r; - THCState *state = cutorch_getstate(L); - THAssert(THCudaTensor_checkGPU(state, 1, tensor)); - - r = THCudaTensor_new(state); - luaT_pushudata(L, r, "torch.CudaTensor"); - THCudaTensor_resizeAs(state, r, tensor); - THCudaTensor_copy(state, r, tensor); - THCudaTensor_mul(state, r, r, -1); - - return 1; -} - -static int cutorch_CudaTensorOperator___mul__(lua_State *L) -{ - THCudaTensor *tensor1 = luaT_toudata(L, 1, "torch.CudaTensor"); - THCudaTensor *tensor2 = luaT_toudata(L, 2, "torch.CudaTensor"); - THCudaTensor *r; - THCState *state = cutorch_getstate(L); - THAssert(THCudaTensor_checkGPU(state, 2, tensor1, tensor2)); - - if(!tensor1 && !tensor2) - luaL_error(L, "expecting two Tensors or one Tensor and one number"); - else - { - r = 
THCudaTensor_new(state); - luaT_pushudata(L, r, "torch.CudaTensor"); - - if(!tensor1 && tensor2) - { - THCudaTensor_resizeAs(state, r, tensor2); - THCudaTensor_copy(state, r, tensor2); - THCudaTensor_mul(state, r, r, luaL_checknumber(L, 1)); - } - else if(tensor1 && !tensor2) - { - THCudaTensor_resizeAs(state, r, tensor1); - THCudaTensor_copy(state, r, tensor1); - THCudaTensor_mul(state, r, r, luaL_checknumber(L, 2)); - } - else - { - int dimt = tensor1->nDimension; - int dims = tensor2->nDimension; - - if(dimt == 1 && dims == 1) - lua_pushnumber(L, THCudaTensor_dot(state, tensor1, tensor2)); /* ok, we wasted r, but who cares */ - else if(dimt == 2 && dims == 1) - { - THCudaTensor_resize1d(state, r, tensor1->size[0]); - THCudaTensor_zero(state, r); - THCudaTensor_addmv(state, r, 1, r, 1, tensor1, tensor2); - } - else if(dimt == 2 && dims == 2) - { - THCudaTensor_resize2d(state, r, tensor1->size[0], tensor2->size[1]); - THCudaTensor_zero(state, r); - THCudaTensor_addmm(state, r, 1, r, 1, tensor1, tensor2); - } - else - luaL_error(L, "multiplication between %dD and %dD tensors not yet supported", tensor1->nDimension, tensor2->nDimension); - } - } - return 1; -} - -static int cutorch_CudaTensorOperator___div__(lua_State *L) -{ - THCudaTensor *tensor = luaT_checkudata(L, 1, "torch.CudaTensor"); - THCudaTensor *r; - THCState *state = cutorch_getstate(L); - THAssert(THCudaTensor_checkGPU(state, 1, tensor)); - - luaL_argcheck(L, lua_isnumber(L,2), 2, "number expected"); - - r = THCudaTensor_new(state); - luaT_pushudata(L, r, "torch.CudaTensor"); - - THCudaTensor_resizeAs(state, r, tensor); - THCudaTensor_copy(state, r, tensor); - THCudaTensor_mul(state, r, r, 1/lua_tonumber(L, 2)); - - return 1; -} - -static const struct luaL_Reg cutorch_CudaTensorOperator__ [] = { - {"__add__", cutorch_CudaTensorOperator___add__}, - {"__sub__", cutorch_CudaTensorOperator___sub__}, - {"__unm__", cutorch_CudaTensorOperator___unm__}, - {"__mul__", cutorch_CudaTensorOperator___mul__}, - {"__div__", cutorch_CudaTensorOperator___div__}, - {NULL, NULL} -}; - -void cutorch_CudaTensorOperator_init(lua_State *L) -{ - luaT_pushmetatable(L, "torch.CudaTensor"); - luaL_setfuncs(L, cutorch_CudaTensorOperator__, 0); - lua_pop(L, 1); -} +#include "generic/TensorOperator.c" +#include "THCGenerateAllTypes.h" diff --git a/generic/TensorOperator.c b/generic/TensorOperator.c new file mode 100644 index 0000000..5844a9d --- /dev/null +++ b/generic/TensorOperator.c @@ -0,0 +1,263 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/TensorOperator.c" +#else + +static int cutorch_TensorOperator_(__add__)(lua_State *L) +{ + THCTensor *tensor1 = luaT_toudata(L, 1, torch_Tensor); + THCTensor *tensor2 = luaT_toudata(L, 2, torch_Tensor); + THCTensor *r; + THCState *state = cutorch_getstate(L); + THAssert(THCTensor_(checkGPU)(state, 2, tensor1, tensor2)); + + if(!tensor1 && !tensor2) + luaL_error(L, "expecting two Tensors or one Tensor and one number"); + else + { + r = THCTensor_(new)(state); + luaT_pushudata(L, r, torch_Tensor); + + if(!tensor1 && tensor2) + { + THCTensor_(resizeAs)(state, r, tensor2); + THCTensor_(copy)(state, r, tensor2); + double v = luaL_checknumber(L, 1); +#ifdef THC_REAL_IS_HALF + half value = THC_float2half((float) v); +#else + real value = (real) v; +#endif + + THCTensor_(add)(state, r, r, value); + } + else if(tensor1 && !tensor2) + { + THCTensor_(resizeAs)(state, r, tensor1); + THCTensor_(copy)(state, r, tensor1); + + double v = luaL_checknumber(L, 2); +#ifdef THC_REAL_IS_HALF + half value = 
THC_float2half((float) v); +#else + real value = (real) v; +#endif + + THCTensor_(add)(state, r, r, value); + } + else + { + THCTensor_(resizeAs)(state, r, tensor1); + THCTensor_(copy)(state, r, tensor1); + +#ifdef THC_REAL_IS_HALF + half one = THC_float2half(1.0f); +#else + real one = (real) 1; +#endif + + THCTensor_(cadd)(state, r, r, one, tensor2); + } + } + return 1; +} + +static int cutorch_TensorOperator_(__sub__)(lua_State *L) +{ + THCTensor *tensor1 = luaT_toudata(L, 1, torch_Tensor); + THCTensor *tensor2 = luaT_toudata(L, 2, torch_Tensor); + THCTensor *r; + THCState *state = cutorch_getstate(L); + THAssert(THCTensor_(checkGPU)(state, 2, tensor1, tensor2)); + + if(!tensor1 && !tensor2) + luaL_error(L, "expecting two Tensors or one Tensor and one number"); + else + { + r = THCTensor_(new)(state); + luaT_pushudata(L, r, torch_Tensor); + +#ifdef THC_REAL_IS_HALF + half neg = THC_float2half(-1.0f); +#else + real neg = (real) -1; +#endif + + if(!tensor1 && tensor2) + { + THCTensor_(resizeAs)(state, r, tensor2); + + double v = luaL_checknumber(L, 1); +#ifdef THC_REAL_IS_HALF + half value = THC_float2half((float) v); +#else + real value = (real) v; +#endif + + THCTensor_(fill)(state, r, value); + THCTensor_(cadd)(state, r, r, neg, tensor2); + } + else if(tensor1 && !tensor2) + { + THCTensor_(resizeAs)(state, r, tensor1); + THCTensor_(copy)(state, r, tensor1); + + double v = -luaL_checknumber(L, 2); +#ifdef THC_REAL_IS_HALF + half value = THC_float2half((float) v); +#else + real value = (real) v; +#endif + + THCTensor_(add)(state, r, r, value); + } + else + { + THCTensor_(resizeAs)(state, r, tensor1); + THCTensor_(copy)(state, r, tensor1); + THCTensor_(cadd)(state, r, r, neg, tensor2); + } + } + return 1; +} + +static int cutorch_TensorOperator_(__unm__)(lua_State *L) +{ + THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor); + THCTensor *r; + THCState *state = cutorch_getstate(L); + THAssert(THCTensor_(checkGPU)(state, 1, tensor)); + + r = THCTensor_(new)(state); + luaT_pushudata(L, r, torch_Tensor); + THCTensor_(resizeAs)(state, r, tensor); + THCTensor_(copy)(state, r, tensor); + +#ifdef THC_REAL_IS_HALF + half neg = THC_float2half(-1.0f); +#else + real neg = (real) -1; +#endif + + THCTensor_(mul)(state, r, r, neg); + + return 1; +} + +static int cutorch_TensorOperator_(__mul__)(lua_State *L) +{ + // FIXME: implement +#ifdef THC_REAL_IS_FLOAT + THCTensor *tensor1 = luaT_toudata(L, 1, torch_Tensor); + THCTensor *tensor2 = luaT_toudata(L, 2, torch_Tensor); + THCTensor *r; + THCState *state = cutorch_getstate(L); + THAssert(THCTensor_(checkGPU)(state, 2, tensor1, tensor2)); + + if(!tensor1 && !tensor2) + luaL_error(L, "expecting two Tensors or one Tensor and one number"); + else + { + r = THCTensor_(new)(state); + luaT_pushudata(L, r, torch_Tensor); + + if(!tensor1 && tensor2) + { + THCTensor_(resizeAs)(state, r, tensor2); + THCTensor_(copy)(state, r, tensor2); + + double v = luaL_checknumber(L, 1); +#ifdef THC_REAL_IS_HALF + half value = THC_float2half((float) v); +#else + real value = (real) v; +#endif + + THCTensor_(mul)(state, r, r, value); + } + else if(tensor1 && !tensor2) + { + THCTensor_(resizeAs)(state, r, tensor1); + THCTensor_(copy)(state, r, tensor1); + + double v = luaL_checknumber(L, 2); +#ifdef THC_REAL_IS_HALF + half value = THC_float2half((float) v); +#else + real value = (real) v; +#endif + + THCTensor_(mul)(state, r, r, value); + } + else + { + int dimt = tensor1->nDimension; + int dims = tensor2->nDimension; + + if(dimt == 1 && dims == 1) + lua_pushnumber(L, 
THCTensor_(dot)(state, tensor1, tensor2)); /* ok, we wasted r, but who cares */ + else if(dimt == 2 && dims == 1) + { + THCTensor_(resize1d)(state, r, tensor1->size[0]); + THCTensor_(zero)(state, r); + THCTensor_(addmv)(state, r, 1, r, 1, tensor1, tensor2); + } + else if(dimt == 2 && dims == 2) + { + THCTensor_(resize2d)(state, r, tensor1->size[0], tensor2->size[1]); + THCTensor_(zero)(state, r); + THCTensor_(addmm)(state, r, 1, r, 1, tensor1, tensor2); + } + else + luaL_error(L, "multiplication between %dD and %dD tensors not yet supported", tensor1->nDimension, tensor2->nDimension); + } + } + return 1; +#else + THError("unimplemented"); +#endif +} + +static int cutorch_TensorOperator_(__div__)(lua_State *L) +{ + THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor); + THCTensor *r; + THCState *state = cutorch_getstate(L); + THAssert(THCTensor_(checkGPU)(state, 1, tensor)); + + luaL_argcheck(L, lua_isnumber(L,2), 2, "number expected"); + + r = THCTensor_(new)(state); + luaT_pushudata(L, r, torch_Tensor); + + THCTensor_(resizeAs)(state, r, tensor); + THCTensor_(copy)(state, r, tensor); + + double v = luaL_checknumber(L, 2); +#ifdef THC_REAL_IS_HALF + half value = THC_float2half(1.0f / (float) v); +#else + real value = (real) 1 / (real) v; +#endif + + THCTensor_(mul)(state, r, r, value); + + return 1; +} + +static const struct luaL_Reg cutorch_TensorOperator_(_) [] = { + {"__add__", cutorch_TensorOperator_(__add__)}, + {"__sub__", cutorch_TensorOperator_(__sub__)}, + {"__unm__", cutorch_TensorOperator_(__unm__)}, + {"__mul__", cutorch_TensorOperator_(__mul__)}, + {"__div__", cutorch_TensorOperator_(__div__)}, + {NULL, NULL} +}; + +void cutorch_TensorOperator_(init)(lua_State *L) +{ + luaT_pushmetatable(L, torch_Tensor); + luaT_setfuncs(L, cutorch_TensorOperator_(_), 0); + lua_pop(L, 1); +} + +#endif @@ -26,7 +26,16 @@ extern void cutorch_CudaDoubleTensor_init(lua_State* L); extern void cutorch_CudaHalfTensor_init(lua_State* L); #endif +extern void cutorch_CudaByteTensorOperator_init(lua_State* L); +extern void cutorch_CudaCharTensorOperator_init(lua_State* L); +extern void cutorch_CudaShortTensorOperator_init(lua_State* L); +extern void cutorch_CudaIntTensorOperator_init(lua_State* L); +extern void cutorch_CudaLongTensorOperator_init(lua_State* L); extern void cutorch_CudaTensorOperator_init(lua_State* L); +extern void cutorch_CudaDoubleTensorOperator_init(lua_State* L); +#ifdef CUDA_HALF_TENSOR +extern void cutorch_CudaHalfTensorOperator_init(lua_State* L); +#endif extern void cutorch_CudaByteTensorMath_init(lua_State* L); extern void cutorch_CudaCharTensorMath_init(lua_State* L); @@ -980,7 +989,16 @@ int luaopen_libcutorch(lua_State *L) cutorch_CudaHalfTensor_init(L); #endif + cutorch_CudaByteTensorOperator_init(L); + cutorch_CudaCharTensorOperator_init(L); + cutorch_CudaShortTensorOperator_init(L); + cutorch_CudaIntTensorOperator_init(L); + cutorch_CudaLongTensorOperator_init(L); cutorch_CudaTensorOperator_init(L); + cutorch_CudaDoubleTensorOperator_init(L); +#ifdef CUDA_HALF_TENSOR + cutorch_CudaHalfTensorOperator_init(L); +#endif cutorch_CudaByteTensorMath_init(L); cutorch_CudaCharTensorMath_init(L); diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt index dfc7ec3..d11364c 100644 --- a/lib/THC/CMakeLists.txt +++ b/lib/THC/CMakeLists.txt @@ -117,8 +117,8 @@ SET(src-cuda THCTensorMathMagma.cu THCTensorMathPairwise.cu THCTensorMathPointwise.cu + THCTensorMathReduce.cu THCTensorMathScan.cu - THCTensorMathTransformReduce.cu THCTensorMasked.cu THCTensorIndex.cu THCTensorConv.cu 
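Note how generic/TensorOperator.c above has to treat `half` scalars specially: Lua numbers arrive as double, and since CUDA's `half` is an opaque 16-bit struct with no constructor or implicit conversions, every scalar is routed through THC_float2half() under THC_REAL_IS_HALF. For intuition about what such a conversion involves, here is a deliberately simplified float-to-fp16 bit converter (a hypothetical helper that truncates the mantissa and flushes subnormals to zero; the real THC_float2half additionally handles rounding and NaN):

    /* f2h_demo.c: illustrative float -> IEEE fp16 conversion. */
    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    static uint16_t f2h_bits(float f) {
      uint32_t x;
      memcpy(&x, &f, sizeof x);                               /* fp32 bit pattern */
      uint32_t sign = (x >> 16) & 0x8000u;                    /* relocate sign bit */
      int32_t  e    = (int32_t)((x >> 23) & 0xffu) - 127 + 15; /* rebias exponent */
      uint32_t man  = (x >> 13) & 0x3ffu;                     /* top 10 mantissa bits */
      if (e <= 0)  return (uint16_t)sign;                     /* underflow: flush to 0 */
      if (e >= 31) return (uint16_t)(sign | 0x7c00u);         /* overflow: +/-inf */
      return (uint16_t)(sign | ((uint32_t)e << 10) | man);
    }

    int main(void) {
      printf("0x%04x\n", f2h_bits(3.0f)); /* 0x4200, the fp16 encoding of 3.0 */
      return 0;
    }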
@@ -181,6 +181,7 @@ INSTALL(FILES THCDeviceTensorUtils-inl.cuh THCGenerateAllTypes.h THCHalf.h + THCNumerics.cuh THCTensorInfo.cuh THCTensorTypeUtils.cuh DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC") @@ -198,10 +199,20 @@ INSTALL(FILES generic/THCTensorCopy.c generic/THCTensorCopy.cu generic/THCTensorCopy.h + generic/THCTensorMasked.h + generic/THCTensorMasked.cu generic/THCTensorMath.h generic/THCTensorMath.cu + generic/THCTensorMathBlas.cu + generic/THCTensorMathBlas.h + generic/THCTensorMathCompare.h + generic/THCTensorMathCompare.cu + generic/THCTensorMathCompareT.h + generic/THCTensorMathCompareT.cu generic/THCTensorMathPairwise.h generic/THCTensorMathPairwise.cu generic/THCTensorMathPointwise.h generic/THCTensorMathPointwise.cu + generic/THCTensorMathReduce.h + generic/THCTensorMathReduce.cu DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC/generic") diff --git a/lib/THC/THCBlas.cu b/lib/THC/THCBlas.cu index 1edbcb0..5b99506 100644 --- a/lib/THC/THCBlas.cu +++ b/lib/THC/THCBlas.cu @@ -1,109 +1,79 @@ #include "THCBlas.h" #include "THCGeneral.h" +#include "THCHalf.h" -void THCudaBlas_swap(THCState *state, long n, float *x, long incx, float *y, long incy) +float THCudaBlas_Sdot(THCState *state, long n, float *x, long incx, float *y, long incy) { - if(n == 1) - { - incx = 1; - incy = 1; - } - - if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) - { - int i_n = (int)n; - int i_incx = (int)incx; - int i_incy = (int)incy; - THCublasCheck(cublasSswap(THCState_getCurrentBlasHandle(state), i_n, x, i_incx, y, i_incy)); - return; - } - THError("Cublas_swap only supports n, incx and" - " incy upto signed integer limits: %d", INT_MAX); -} - -void THCudaBlas_scal(THCState *state, long n, float a, float *x, long incx) -{ - if(n == 1) - incx = 1; - - if( (n <= INT_MAX) && (incx <= INT_MAX) ) - { - int i_n = (int)n; - int i_incx = (int)incx; - THCublasCheck(cublasSscal(THCState_getCurrentBlasHandle(state), i_n, &a, x, i_incx)); - return; - } - THError("Cublas_scal only supports n and incx " - "upto signed integer limits: %d", INT_MAX); -} - -void THCudaBlas_copy(THCState *state, long n, float *x, long incx, float *y, long incy) -{ - if(n == 1) - { + if (n == 1) { incx = 1; incy = 1; } - if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) - { + if ((n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX)) { int i_n = (int)n; int i_incx = (int)incx; int i_incy = (int)incy; - THCublasCheck(cublasScopy(THCState_getCurrentBlasHandle(state), i_n, x, i_incx, y, i_incy)); - return; + float result; + THCublasCheck(cublasSdot(THCState_getCurrentBlasHandle(state), i_n, x, i_incx, y, i_incy, &result)); + return result; } - THError("Cublas_copy only supports n, incx and incy " - "upto signed integer limits: %d", INT_MAX); + THError("Cublas_Sdot only supports n, incx and incy " + "up to signed integer limits: %d", INT_MAX); + return 0; } -void THCudaBlas_axpy(THCState *state, long n, float a, float *x, long incx, float *y, long incy) +double THCudaBlas_Ddot(THCState *state, long n, double *x, long incx, double *y, long incy) { - if(n == 1) - { + if (n == 1) { incx = 1; incy = 1; } - if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) - { + if ((n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX)) { int i_n = (int)n; int i_incx = (int)incx; int i_incy = (int)incy; - THCublasCheck(cublasSaxpy(THCState_getCurrentBlasHandle(state), i_n, &a, x, i_incx, y, i_incy)); - return; + double result; + THCublasCheck(cublasDdot(THCState_getCurrentBlasHandle(state), i_n, x, i_incx, y, i_incy, 
&result)); + return result; } - THError("Cublas_axpy only supports n, incx and incy " - "upto signed integer limits: %d", INT_MAX); + THError("Cublas_Ddot only supports n, incx and incy " + "up to signed integer limits: %d", INT_MAX); + return 0; } -float THCudaBlas_dot(THCState *state, long n, float *x, long incx, float *y, long incy) +/* Level 2 */ +void THCudaBlas_Sgemv(THCState *state, char trans, long m, long n, float alpha, float *a, long lda, float *x, long incx, float beta, float *y, long incy) { if(n == 1) - { - incx = 1; - incy = 1; - } + lda = m; - if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) + cublasOperation_t op; + if (trans == 't') op = CUBLAS_OP_T; + else if (trans == 'n') op = CUBLAS_OP_N; + else if (trans == 'c') op = CUBLAS_OP_C; + + if( (m <= INT_MAX) && (n <= INT_MAX) && + (lda > 0) && (lda <= INT_MAX) && + (incx > 0) && (incx <= INT_MAX) && + (incy > 0) && (incy <= INT_MAX) ) { + int i_m = (int)m; int i_n = (int)n; + int i_lda = (int)lda; int i_incx = (int)incx; int i_incy = (int)incy; - float result; - THCublasCheck(cublasSdot(THCState_getCurrentBlasHandle(state), i_n, x, i_incx, y, i_incy, &result)); - cudaDeviceSynchronize(); - return result; + + THCublasCheck(cublasSgemv(THCState_getCurrentBlasHandle(state), op, i_m, i_n, &alpha, a, i_lda, x, i_incx, &beta, y, i_incy)); + return; } - THError("Cublas_dot only supports n, incx and incy " - "upto signed integer limits: %d", INT_MAX); - return -1; + THError("Cublas_Sgemv only supports m, n, lda, incx, incy" + "in the range 0 < [val] <= %d", INT_MAX); } -/* Level 2 */ -void THCudaBlas_gemv(THCState *state, char trans, long m, long n, float alpha, float *a, long lda, float *x, long incx, float beta, float *y, long incy) +void THCudaBlas_Dgemv(THCState *state, char trans, long m, long n, double alpha, double *a, long lda, double *x, long incx, double beta, double *y, long incy) { if(n == 1) lda = m; @@ -124,14 +94,14 @@ void THCudaBlas_gemv(THCState *state, char trans, long m, long n, float alpha, f int i_incx = (int)incx; int i_incy = (int)incy; - THCublasCheck(cublasSgemv(THCState_getCurrentBlasHandle(state), op, i_m, i_n, &alpha, a, i_lda, x, i_incx, &beta, y, i_incy)); + THCublasCheck(cublasDgemv(THCState_getCurrentBlasHandle(state), op, i_m, i_n, &alpha, a, i_lda, x, i_incx, &beta, y, i_incy)); return; } - THError("Cublas_gemv only supports m, n, lda, incx, incy" + THError("Cublas_Dgemv only supports m, n, lda, incx, incy" "in the range 0 < [val] <= %d", INT_MAX); } -void THCudaBlas_ger(THCState *state, long m, long n, float alpha, float *x, long incx, float *y, long incy, float *a, long lda) +void THCudaBlas_Sger(THCState *state, long m, long n, float alpha, float *x, long incx, float *y, long incy, float *a, long lda) { if(n == 1) lda = m; @@ -147,10 +117,31 @@ void THCudaBlas_ger(THCState *state, long m, long n, float alpha, float *x, long THCublasCheck(cublasSger(THCState_getCurrentBlasHandle(state), i_m, i_n, &alpha, x, i_incx, y, i_incy, a, i_lda)); return; } - THError("Cublas_ger only supports m, n, lda, incx, incy" + THError("Cublas_Sger only supports m, n, lda, incx, incy" "with the bound [val] <= %d", INT_MAX); } +void THCudaBlas_Dger(THCState *state, long m, long n, double alpha, double *x, long incx, double *y, long incy, double *a, long lda) +{ + if(n == 1) + lda = m; + + if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) + { + int i_m = (int)m; + int i_n = (int)n; + int i_lda = (int)lda; + int i_incx = (int)incx; + int i_incy = 
(int)incy; + + THCublasCheck(cublasDger(THCState_getCurrentBlasHandle(state), i_m, i_n, &alpha, x, i_incx, y, i_incy, a, i_lda)); + return; + } + THError("Cublas_Dger only supports m, n, lda, incx, incy" + "with the bound [val] <= %d", INT_MAX); +} + + cublasOperation_t convertTransToCublasOperation(char trans) { if (trans == 't') return CUBLAS_OP_T; else if (trans == 'n') return CUBLAS_OP_N; @@ -193,7 +184,7 @@ void adjustLd(char transa, char transb, long m, long n, long k, long *lda, long } /* Level 3 */ -void THCudaBlas_gemm(THCState *state, char transa, char transb, long m, long n, long k, float alpha, float *a, long lda, float *b, long ldb, float beta, float *c, long ldc) +void THCudaBlas_Sgemm(THCState *state, char transa, char transb, long m, long n, long k, float alpha, float *a, long lda, float *b, long ldb, float beta, float *c, long ldc) { adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); @@ -211,17 +202,84 @@ void THCudaBlas_gemm(THCState *state, char transa, char transb, long m, long n, THCublasCheck(cublasSgemm(THCState_getCurrentBlasHandle(state), opa, opb, i_m, i_n, i_k, &alpha, a, i_lda, b, i_ldb, &beta, c, i_ldc)); return; } - THError("Cublas_gemm only supports m, n, k, lda, ldb, ldc" + THError("Cublas_Sgemm only supports m, n, k, lda, ldb, ldc" "with the bound [val] <= %d", INT_MAX); } -void THCudaBlas_gemmBatched(THCState *state, char transa, char transb, long m, long n, long k, - float alpha, const float *a[], long lda, const float *b[], long ldb, - float beta, float *c[], long ldc, long batchCount) +#ifdef CUDA_HALF_TENSOR +// In CUDA 8.0, definition of data types for sgemmex changed +#if CUDA_VERSION < 8000 +# define CUDA_R_16F CUBLAS_DATA_HALF +#endif + +void THCudaBlas_Hgemm(THCState *state, char transa, char transb, long m, long n, long k, half alpha, half *a, long lda, half *b, long ldb, half beta, half *c, long ldc) +{ + adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + cublasOperation_t opa = convertTransToCublasOperation(transa); + cublasOperation_t opb = convertTransToCublasOperation(transb); + + if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX) ) + { + int i_m = (int)m; + int i_n = (int)n; + int i_k = (int)k; + int i_lda = (int)lda; + int i_ldb = (int)ldb; + int i_ldc = (int)ldc; + + // Check for native Hgemm support + if (THC_nativeHalfInstructions(state)) { + THCublasCheck(cublasHgemm(THCState_getCurrentBlasHandle(state), opa, opb, + i_m, i_n, i_k, &alpha, a, i_lda, b, i_ldb, + &beta, c, i_ldc)); + } else { + // Simulated Hgemm + float fAlpha = THC_half2float(alpha); + float fBeta = THC_half2float(beta); + + THCublasCheck(cublasSgemmEx(THCState_getCurrentBlasHandle(state), opa, opb, + i_m, i_n, i_k, &fAlpha, + a, CUDA_R_16F, i_lda, b, CUDA_R_16F, + i_ldb, &fBeta, c, CUDA_R_16F, i_ldc)); + } + + return; + } + THError("Cublas_Hgemm only supports m, n, k, lda, ldb, ldc" + "with th bound [val] <= %d", INT_MAX); +} +#endif + +void THCudaBlas_Dgemm(THCState *state, char transa, char transb, long m, long n, long k, double alpha, double *a, long lda, double *b, long ldb, double beta, double *c, long ldc) +{ + adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + cublasOperation_t opa = convertTransToCublasOperation(transa); + cublasOperation_t opb = convertTransToCublasOperation(transb); + + if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX) ) + { + int i_m = (int)m; 
+ int i_n = (int)n; + int i_k = (int)k; + int i_lda = (int)lda; + int i_ldb = (int)ldb; + int i_ldc = (int)ldc; + + THCublasCheck(cublasDgemm(THCState_getCurrentBlasHandle(state), opa, opb, i_m, i_n, i_k, &alpha, a, i_lda, b, i_ldb, &beta, c, i_ldc)); + return; + } + THError("Cublas_Dgemm only supports m, n, k, lda, ldb, ldc" + "with the bound [val] <= %d", INT_MAX); +} + + +void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, long m, long n, long k, + float alpha, const float *a[], long lda, const float *b[], long ldb, + float beta, float *c[], long ldc, long batchCount) { if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) ) { - THError("Cublas_gemm only supports m, n, k, lda, ldb, ldc, batchCount" + THError("Cublas_SgemmBatched only supports m, n, k, lda, ldb, ldc, batchCount" "with the bound [val] <= %d", INT_MAX); } @@ -235,22 +293,61 @@ void THCudaBlas_gemmBatched(THCState *state, char transa, char transb, long m, l (int)batchCount)); } +void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, long m, long n, long k, + double alpha, const double *a[], long lda, const double *b[], long ldb, + double beta, double *c[], long ldc, long batchCount) +{ + if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) ) + { + THError("Cublas_DgemmBatched only supports m, n, k, lda, ldb, ldc, batchCount" + "with the bound [val] <= %d", INT_MAX); + } + + adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); + cublasOperation_t opa = convertTransToCublasOperation(transa); + cublasOperation_t opb = convertTransToCublasOperation(transb); + + THCublasCheck(cublasDgemmBatched(THCState_getCurrentBlasHandle(state), + opa, opb, (int)m, (int)n, (int)k, + &alpha, a, (int)lda, b, (int)ldb, &beta, c, (int)ldc, + (int)batchCount)); +} + /* Inverse */ -void THCudaBlas_getrf(THCState *state, int n, float **a, int lda, int *pivot, int *info, int batchSize) { +void THCudaBlas_Sgetrf(THCState *state, int n, float **a, int lda, int *pivot, int *info, int batchSize) { if( (n >= INT_MAX) || (lda >= INT_MAX) || (batchSize >= INT_MAX) ) { - THError("Cublas_getrf only supports n, lda, batchSize" + THError("Cublas_Sgetrf only supports n, lda, batchSize" "with the bound [val] <= %d", INT_MAX); } THCublasCheck(cublasSgetrfBatched(THCState_getCurrentBlasHandle(state), n, a, lda, pivot, info, batchSize)); } -void THCudaBlas_getri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize) { +void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot, int *info, int batchSize) { + if( (n >= INT_MAX) || (lda >= INT_MAX) || (batchSize >= INT_MAX) ) + { + THError("Cublas_Dgetrf only supports n, lda, batchSize" + "with the bound [val] <= %d", INT_MAX); + } + THCublasCheck(cublasDgetrfBatched(THCState_getCurrentBlasHandle(state), n, a, lda, pivot, info, batchSize)); +} + +void THCudaBlas_Sgetri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize) { if( (n >= INT_MAX) || (lda >= INT_MAX)|| (ldc >= INT_MAX) || (batchSize >= INT_MAX) ) { - THError("Cublas_getrf only supports n, lda, ldc, batchSize" + THError("Cublas_Sgetri only supports n, lda, ldc, batchSize" "with the bound [val] <= %d", INT_MAX); } THCublasCheck(cublasSgetriBatched(THCState_getCurrentBlasHandle(state), n, a, lda, pivot, c, ldc, info, 
batchSize)); } + +void THCudaBlas_Dgetri(THCState *state, int n, const double **a, int lda, int *pivot, double **c, int ldc, int *info, int batchSize) { + + if( (n >= INT_MAX) || (lda >= INT_MAX)|| (ldc >= INT_MAX) || (batchSize >= INT_MAX) ) + { + THError("Cublas_Dgetri only supports n, lda, ldc, batchSize" + "with the bound [val] <= %d", INT_MAX); + } + THCublasCheck(cublasDgetriBatched(THCState_getCurrentBlasHandle(state), n, a, lda, pivot, c, ldc, info, batchSize)); +} diff --git a/lib/THC/THCBlas.h b/lib/THC/THCBlas.h index 613b78e..45f58eb 100644 --- a/lib/THC/THCBlas.h +++ b/lib/THC/THCBlas.h @@ -2,26 +2,37 @@ #define THC_BLAS_INC #include "THCGeneral.h" +#include "THCHalf.h" /* Level 1 */ -THC_API void THCudaBlas_swap(THCState *state, long n, float *x, long incx, float *y, long incy); -THC_API void THCudaBlas_scal(THCState *state, long n, float a, float *x, long incx); -THC_API void THCudaBlas_copy(THCState *state, long n, float *x, long incx, float *y, long incy); -THC_API void THCudaBlas_axpy(THCState *state, long n, float a, float *x, long incx, float *y, long incy); -THC_API float THCudaBlas_dot(THCState *state, long n, float *x, long incx, float *y, long incy); +THC_API float THCudaBlas_Sdot(THCState *state, long n, float *x, long incx, float *y, long incy); +THC_API double THCudaBlas_Ddot(THCState *state, long n, double *x, long incx, double *y, long incy); /* Level 2 */ -THC_API void THCudaBlas_gemv(THCState *state, char trans, long m, long n, float alpha, float *a, long lda, float *x, long incx, float beta, float *y, long incy); -THC_API void THCudaBlas_ger(THCState *state, long m, long n, float alpha, float *x, long incx, float *y, long incy, float *a, long lda); +THC_API void THCudaBlas_Sgemv(THCState *state, char trans, long m, long n, float alpha, float *a, long lda, float *x, long incx, float beta, float *y, long incy); +THC_API void THCudaBlas_Dgemv(THCState *state, char trans, long m, long n, double alpha, double *a, long lda, double *x, long incx, double beta, double *y, long incy); +THC_API void THCudaBlas_Sger(THCState *state, long m, long n, float alpha, float *x, long incx, float *y, long incy, float *a, long lda); +THC_API void THCudaBlas_Dger(THCState *state, long m, long n, double alpha, double *x, long incx, double *y, long incy, double *a, long lda); /* Level 3 */ -THC_API void THCudaBlas_gemm(THCState *state, char transa, char transb, long m, long n, long k, float alpha, float *a, long lda, float *b, long ldb, float beta, float *c, long ldc); -THC_API void THCudaBlas_gemmBatched(THCState *state, char transa, char transb, long m, long n, long k, - float alpha, const float *a[], long lda, const float *b[], long ldb, - float beta, float *c[], long ldc, long batchCount); +THC_API void THCudaBlas_Sgemm(THCState *state, char transa, char transb, long m, long n, long k, float alpha, float *a, long lda, float *b, long ldb, float beta, float *c, long ldc); +THC_API void THCudaBlas_Dgemm(THCState *state, char transa, char transb, long m, long n, long k, double alpha, double *a, long lda, double *b, long ldb, double beta, double *c, long ldc); + +#ifdef CUDA_HALF_TENSOR +THC_API void THCudaBlas_Hgemm(THCState *state, char transa, char transb, long m, long n, long k, half alpha, half *a, long lda, half *b, long ldb, half beta, half *c, long ldc); +#endif + +THC_API void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, long m, long n, long k, + float alpha, const float *a[], long lda, const float *b[], long ldb, + float beta, float *c[], long ldc, 
long batchCount); +THC_API void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, long m, long n, long k, + double alpha, const double *a[], long lda, const double *b[], long ldb, + double beta, double *c[], long ldc, long batchCount); /* Inverse */ -THC_API void THCudaBlas_getrf(THCState *state, int n, float **a, int lda, int *pivot, int *info, int batchSize); -THC_API void THCudaBlas_getri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize); +THC_API void THCudaBlas_Sgetrf(THCState *state, int n, float **a, int lda, int *pivot, int *info, int batchSize); +THC_API void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot, int *info, int batchSize); +THC_API void THCudaBlas_Sgetri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize); +THC_API void THCudaBlas_Dgetri(THCState *state, int n, const double **a, int lda, int *pivot, double **c, int ldc, int *info, int batchSize); #endif diff --git a/lib/THC/THCGenerateAllTypes.h b/lib/THC/THCGenerateAllTypes.h index 28037bd..3900ca5 100644 --- a/lib/THC/THCGenerateAllTypes.h +++ b/lib/THC/THCGenerateAllTypes.h @@ -81,7 +81,9 @@ #undef THC_REAL_IS_LONG #define real float -#define accreal double +/* FIXME: fp64 has bad performance on some platforms; avoid using it unless + we opt into it? */ +#define accreal float #define Real Float #define CReal Cuda #define THC_REAL_IS_FLOAT @@ -109,7 +111,9 @@ #ifdef CUDA_HALF_TENSOR #define real half -#define accreal half +/* FIXME: fp64 has bad performance on some platforms; avoid using it unless + we opt into it? */ +#define accreal float #define Real Half #define CReal CudaHalf #define THC_REAL_IS_HALF diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu index 7847ef6..7777bf7 100644 --- a/lib/THC/THCHalf.cu +++ b/lib/THC/THCHalf.cu @@ -119,3 +119,12 @@ half THC_float2half(float a) memcpy(&ret, &ir, sizeof(half)); return ret; } + +THC_EXTERNC int THC_nativeHalfInstructions(THCState *state) { + cudaDeviceProp* prop = + THCState_getCurrentDeviceProperties(state); + + // CC 5.3+ + return (prop->major > 5 || + (prop->major == 5 && prop->minor == 3)); +} diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h index d87647b..7440de4 100644 --- a/lib/THC/THCHalf.h +++ b/lib/THC/THCHalf.h @@ -3,12 +3,12 @@ #include "THCGeneral.h" -// We compile with CudaHalfTensor support if we have this: +/* We compile with CudaHalfTensor support if we have this: */ #if CUDA_VERSION >= 7050 || CUDA_HAS_FP16 #define CUDA_HALF_TENSOR 1 #endif -// Native fp16 ALU instructions are available if we have this: +/* Kernel side: Native fp16 ALU instructions are available if we have this: */ #if defined(CUDA_HALF_TENSOR) && (__CUDA_ARCH__ >= 530) #define CUDA_HALF_INSTRUCTIONS 1 #endif @@ -23,6 +23,9 @@ THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, long len); THC_EXTERNC half THC_float2half(float a); THC_EXTERNC float THC_half2float(half a); -#endif // CUDA_HALF_TENSOR +/* Check for native fp16 support on the current device (CC 5.3+) */ +THC_EXTERNC int THC_nativeHalfInstructions(THCState *state); + +#endif /* CUDA_HALF_TENSOR */ #endif diff --git a/lib/THC/THCNumerics.cuh b/lib/THC/THCNumerics.cuh new file mode 100644 index 0000000..bc299d4 --- /dev/null +++ b/lib/THC/THCNumerics.cuh @@ -0,0 +1,239 @@ +#ifndef THC_NUMERICS_INC +#define THC_NUMERICS_INC + +#include <cuda.h> +#include <limits.h> +#include "THCHalf.h" + +/// Class for numeric limits of the particular data type, 
which +/// includes support for `half`. +/// Unfortunately since `half` does not have a constructor, these have +/// to be expressed as functions (either that or non-const statics). +template <typename T> +struct THCNumerics { +}; + +template <> +struct THCNumerics<unsigned char> { + static inline __host__ __device__ unsigned char min() { return 0; } + static inline __host__ __device__ unsigned char max() { return UCHAR_MAX; } + + static inline __host__ __device__ bool lt(unsigned char a, unsigned char b) { return a < b; } + static inline __host__ __device__ bool le(unsigned char a, unsigned char b) { return a <= b; } + static inline __host__ __device__ bool gt(unsigned char a, unsigned char b) { return a > b; } + static inline __host__ __device__ bool ge(unsigned char a, unsigned char b) { return a >= b; } + static inline __host__ __device__ bool eq(unsigned char a, unsigned char b) { return a == b; } + static inline __host__ __device__ bool ne(unsigned char a, unsigned char b) { return a != b; } +}; + +template <> +struct THCNumerics<char> { + static inline __host__ __device__ char min() { return CHAR_MIN; } + static inline __host__ __device__ char max() { return CHAR_MAX; } + + static inline __host__ __device__ bool lt(char a, char b) { return a < b; } + static inline __host__ __device__ bool le(char a, char b) { return a <= b; } + static inline __host__ __device__ bool gt(char a, char b) { return a > b; } + static inline __host__ __device__ bool ge(char a, char b) { return a >= b; } + static inline __host__ __device__ bool eq(char a, char b) { return a == b; } + static inline __host__ __device__ bool ne(char a, char b) { return a != b; } +}; + +template <> +struct THCNumerics<short> { + static inline __host__ __device__ short min() { return SHRT_MIN; } + static inline __host__ __device__ short max() { return SHRT_MAX; } + + static inline __host__ __device__ bool lt(short a, short b) { return a < b; } + static inline __host__ __device__ bool le(short a, short b) { return a <= b; } + static inline __host__ __device__ bool gt(short a, short b) { return a > b; } + static inline __host__ __device__ bool ge(short a, short b) { return a >= b; } + static inline __host__ __device__ bool eq(short a, short b) { return a == b; } + static inline __host__ __device__ bool ne(short a, short b) { return a != b; } +}; + +template <> +struct THCNumerics<int> { + static inline __host__ __device__ int min() { return INT_MIN; } + static inline __host__ __device__ int max() { return INT_MAX; } + + static inline __host__ __device__ bool lt(int a, int b) { return a < b; } + static inline __host__ __device__ bool le(int a, int b) { return a <= b; } + static inline __host__ __device__ bool gt(int a, int b) { return a > b; } + static inline __host__ __device__ bool ge(int a, int b) { return a >= b; } + static inline __host__ __device__ bool eq(int a, int b) { return a == b; } + static inline __host__ __device__ bool ne(int a, int b) { return a != b; } +}; + +template <> +struct THCNumerics<long> { + static inline __host__ __device__ long min() { return LONG_MIN; } + static inline __host__ __device__ long max() { return LONG_MAX; } + + static inline __host__ __device__ bool lt(long a, long b) { return a < b; } + static inline __host__ __device__ bool le(long a, long b) { return a <= b; } + static inline __host__ __device__ bool gt(long a, long b) { return a > b; } + static inline __host__ __device__ bool ge(long a, long b) { return a >= b; } + static inline __host__ __device__ bool eq(long a, long b) { return a == 
b; } + static inline __host__ __device__ bool ne(long a, long b) { return a != b; } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct THCNumerics<half> { + static inline __host__ __device__ half min() { half h; h.x = 0xfbff; return h; } + static inline __host__ __device__ half max() { half h; h.x = 0x7bff; return h; } + + static inline __host__ __device__ bool lt(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hlt(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return fa < fb; +#endif +#else // __CUDA_ARCH__ + return THC_half2float(a) < THC_half2float(b); +#endif + } + + static inline __host__ __device__ bool le(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hle(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return fa <= fb; +#endif +#else // __CUDA_ARCH__ + return THC_half2float(a) <= THC_half2float(b); +#endif + } + + static inline __host__ __device__ bool gt(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hgt(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return fa > fb; +#endif +#else // __CUDA_ARCH__ + return THC_half2float(a) > THC_half2float(b); +#endif + } + + static inline __host__ __device__ bool ge(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hge(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return fa >= fb; +#endif +#else // __CUDA_ARCH__ + return THC_half2float(a) >= THC_half2float(b); +#endif + } + + static inline __host__ __device__ bool eq(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __heq(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return fa == fb; +#endif +#else // __CUDA_ARCH__ + return THC_half2float(a) == THC_half2float(b); +#endif + } + + static inline __host__ __device__ bool ne(half a, half b) { +#ifdef __CUDA_ARCH__ +#ifdef CUDA_HALF_INSTRUCTIONS + return __hne(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return fa != fb; +#endif +#else // __CUDA_ARCH__ + return THC_half2float(a) != THC_half2float(b); +#endif + } +}; +#endif + +template <> +struct THCNumerics<float> { + static inline __host__ __device__ float min() { return -FLT_MAX; } + static inline __host__ __device__ float max() { return FLT_MAX; } + + static inline __host__ __device__ bool lt(float a, float b) { return a < b; } + static inline __host__ __device__ bool le(float a, float b) { return a <= b; } + static inline __host__ __device__ bool gt(float a, float b) { return a > b; } + static inline __host__ __device__ bool ge(float a, float b) { return a >= b; } + static inline __host__ __device__ bool eq(float a, float b) { return a == b; } + static inline __host__ __device__ bool ne(float a, float b) { return a != b; } +}; + +template <> +struct THCNumerics<double> { + static inline __host__ __device__ double min() { return -DBL_MAX; } + static inline __host__ __device__ double max() { return DBL_MAX; } + + static inline __host__ __device__ bool lt(double a, double b) { return a < b; } + static inline __host__ __device__ bool le(double a, double b) { return a <= b; } + static inline __host__ __device__ bool gt(double a, double b) { return a > b; } + static inline __host__ __device__ bool ge(double a, double b) { return a >= b; } + static inline __host__ __device__ bool eq(double a, double b) { return a == b; } + static inline __host__ __device__ bool 
ne(double a, double b) { return a != b; } +}; + +/// `half` has some type conversion issues associated with it, since it +/// is a struct without a constructor/implicit conversion constructor. +/// We use this to convert scalar values to the given type that the +/// tensor expects. +template <typename In, typename Out> +struct ScalarConvert { + static __host__ __device__ Out to(const In v) { return (Out) v; } +}; + +#ifdef CUDA_HALF_TENSOR +template <typename Out> +struct ScalarConvert<half, Out> { + static __host__ __device__ Out to(const half v) { +#ifdef __CUDA_ARCH__ + return (Out) __half2float(v); +#else + return (Out) THC_half2float(v); +#endif + } +}; + +template <typename In> +struct ScalarConvert<In, half> { + static __host__ __device__ half to(const In v) { +#ifdef __CUDA_ARCH__ + return __float2half((float) v); +#else + return THC_float2half((float) v); +#endif + } +}; + +template <> +struct ScalarConvert<half, half> { + static __host__ __device__ half to(const half v) { + return v; + } +}; +#endif + +#endif // THC_TENSOR_TYPE_UTILS_INC diff --git a/lib/THC/THCReduce.cuh b/lib/THC/THCReduce.cuh index 89f599f..a7135de 100644 --- a/lib/THC/THCReduce.cuh +++ b/lib/THC/THCReduce.cuh @@ -8,6 +8,7 @@ // arguments without copying or temporary storage. // +#include "THCTensorTypeUtils.cuh" #include "THCReduceApplyUtils.cuh" // Threads per thread block @@ -29,14 +30,14 @@ template <typename ModifyOp, __launch_bounds__(32 * 16, 4) #endif __global__ void -THCudaTensor_reduceNoncontigDim(TensorInfo<T, IndexType> out, - TensorInfo<T, IndexType> in, - IndexType reductionStride, - IndexType reductionSize, - IndexType totalSlices, - T init, - ModifyOp modifyOp, - ReduceOp reduceOp) { +kernelReduceNoncontigDim(TensorInfo<T, IndexType> out, + TensorInfo<T, IndexType> in, + IndexType reductionStride, + IndexType reductionSize, + IndexType totalSlices, + T init, + ModifyOp modifyOp, + ReduceOp reduceOp) { const IndexType sliceIndex = getReduceNoncontigDimSliceIndex<IndexType>(); if (sliceIndex >= totalSlices) { @@ -77,13 +78,13 @@ template <typename ModifyOp, typename IndexType, int ADims, int BDims> __global__ void -THCudaTensor_reduceContigDim(TensorInfo<T, IndexType> out, - TensorInfo<T, IndexType> in, - IndexType reductionSize, - IndexType totalSlices, - T init, - ModifyOp modifyOp, - ReduceOp reduceOp) { +kernelReduceContigDim(TensorInfo<T, IndexType> out, + TensorInfo<T, IndexType> in, + IndexType reductionSize, + IndexType totalSlices, + T init, + ModifyOp modifyOp, + ReduceOp reduceOp) { const IndexType sliceIndex = getReduceContigDimSliceIndex<IndexType>(); if (sliceIndex >= totalSlices) { @@ -107,7 +108,9 @@ THCudaTensor_reduceContigDim(TensorInfo<T, IndexType> out, } // Reduce within the block - extern __shared__ T smem[]; + // FIXME: extern name + extern __shared__ char smemChar[]; + T* smem = (T*) smemChar; r = reduceBlock<T, ReduceOp>(smem, blockDim.x, r, reduceOp, init); if (threadIdx.x == 0) { @@ -139,14 +142,16 @@ inline dim3 getContigReduceBlock(long numSlices, long reductionSize) { // Scale up block size based on the reduction dimension size long warpsInReductionSize = THCCeilDiv(reductionSize, 32L); - int numWarps = - warpsInReductionSize > (long) maxWarps ? maxWarps : (int) warpsInReductionSize; + int numWarps = warpsInReductionSize > (long) maxWarps ? 
+ maxWarps : (int) warpsInReductionSize; + return dim3(numWarps * 32); } inline bool getNoncontigReduceGrid(long elements, dim3& grid) { // One output point per thread - return THC_getGridFromTiles(THCCeilDiv(elements, (long) THC_NONCONTIG_REDUCE_BLOCK_SIZE), grid); + return THC_getGridFromTiles(THCCeilDiv(elements, + (long) THC_NONCONTIG_REDUCE_BLOCK_SIZE), grid); } inline bool getContigReduceGrid(long elements, dim3& grid) { @@ -156,26 +161,26 @@ inline bool getContigReduceGrid(long elements, dim3& grid) { // Performs a reduction out[..., 0, ...] = reduce_i(modify(in[..., i, ...])) for // all in where i and the out's 0 are indexed at dimension `dim` -template <typename ModifyOp, typename ReduceOp> -bool THCudaTensor_reduceDim(THCState* state, - THCudaTensor* out, - THCudaTensor* in, - const ModifyOp& modifyOp, - const ReduceOp& reduceOp, - float init, - int dim) { - long inElements = THCudaTensor_nElement(state, in); - - long reductionSize = THCudaTensor_size(state, in, dim); - long reductionStride = THCudaTensor_stride(state, in, dim); +template <typename TensorType, typename ModifyOp, typename ReduceOp> +bool THC_reduceDim(THCState* state, + TensorType* out, + TensorType* in, + const ModifyOp& modifyOp, + const ReduceOp& reduceOp, + typename TensorUtils<TensorType>::DataType init, + int dim) { + long inElements = TensorUtils<TensorType>::getNumElements(state, in); + + long reductionSize = TensorUtils<TensorType>::getSize(state, in, dim); + long reductionStride = TensorUtils<TensorType>::getStride(state, in, dim); long outElements = inElements / reductionSize; - if (THCudaTensor_nDimension(state, out) > MAX_CUTORCH_DIMS || - THCudaTensor_nDimension(state, in) > MAX_CUTORCH_DIMS) { + if (TensorUtils<TensorType>::getDims(state, out) > MAX_CUTORCH_DIMS || + TensorUtils<TensorType>::getDims(state, in) > MAX_CUTORCH_DIMS) { return false; } - if (THCudaTensor_nDimension(state, in) == 0) { + if (TensorUtils<TensorType>::getDims(state, in) == 0) { // Zero-dim tensor; do nothing return true; } @@ -193,7 +198,7 @@ bool THCudaTensor_reduceDim(THCState* state, } block = getContigReduceBlock(outElements, reductionSize); - smemSize = sizeof(float) * block.x; + smemSize = sizeof(typename TensorUtils<TensorType>::DataType) * block.x; } else { if (!getNoncontigReduceGrid(outElements, grid)) { return false; @@ -203,9 +208,9 @@ bool THCudaTensor_reduceDim(THCState* state, } // Resize out to correspond to the reduced size - THLongStorage* sizes = THCudaTensor_newSizeOf(state, in); + THLongStorage* sizes = TensorUtils<TensorType>::newSizeOf(state, in); THLongStorage_set(sizes, dim, 1); - THCudaTensor_resize(state, out, sizes, NULL); + TensorUtils<TensorType>::resize(state, out, sizes, NULL); THLongStorage_free(sizes); // It is possible that the tensor dimensions are able to be collapsed, @@ -216,80 +221,84 @@ bool THCudaTensor_reduceDim(THCState* state, // (or vice versa), the contiguous tensor can be collapsed to one // dimension, and the loop to translate the linear index to the array // index can be similarly collapsed. That is what this unrolling is for. 
-#define HANDLE_CASE(T, TYPE, OUT, IN) \ +#define HANDLE_CASE(TYPE, OUT, IN) \ if (contigReduction) { \ - THCudaTensor_reduceContigDim<ModifyOp, ReduceOp, T, TYPE, OUT, IN> \ + kernelReduceContigDim<ModifyOp, ReduceOp, \ + typename TensorUtils<TensorType>::DataType, \ + TYPE, OUT, IN> \ <<<grid, block, smemSize, THCState_getCurrentStream(state)>>>( \ outInfo, inInfo, reductionSize, \ (TYPE) outElements, init, modifyOp, reduceOp); \ } else { \ - THCudaTensor_reduceNoncontigDim<ModifyOp, ReduceOp, T, TYPE, OUT, IN> \ + kernelReduceNoncontigDim<ModifyOp, ReduceOp, \ + typename TensorUtils<TensorType>::DataType, \ + TYPE, OUT, IN> \ <<<grid, block, 0, THCState_getCurrentStream(state)>>>( \ outInfo, inInfo, reductionStride, reductionSize, \ (TYPE) outElements, init, modifyOp, reduceOp); \ } \ -#define HANDLE_IN_CASE(T, TYPE, OUT, IN) \ +#define HANDLE_IN_CASE(TYPE, OUT, IN) \ { \ if (inInfo.isContiguous()) { \ - HANDLE_CASE(T, TYPE, OUT, -2); \ + HANDLE_CASE(TYPE, OUT, -2); \ } else { \ switch (IN) { \ case 1: \ - HANDLE_CASE(T, TYPE, OUT, 1); \ + HANDLE_CASE(TYPE, OUT, 1); \ break; \ case 2: \ - HANDLE_CASE(T, TYPE, OUT, 2); \ + HANDLE_CASE(TYPE, OUT, 2); \ break; \ default: \ - HANDLE_CASE(T, TYPE, OUT, -1); \ + HANDLE_CASE(TYPE, OUT, -1); \ break; \ } \ } \ } -#define HANDLE_OUT_CASE(T, TYPE, OUT, IN) \ +#define HANDLE_OUT_CASE(TYPE, OUT, IN) \ { \ if (outInfo.isContiguous()) { \ - HANDLE_IN_CASE(T, TYPE, -2, IN); \ + HANDLE_IN_CASE(TYPE, -2, IN); \ } else { \ switch (OUT) { \ case 1: \ - HANDLE_IN_CASE(T, TYPE, 1, IN); \ + HANDLE_IN_CASE(TYPE, 1, IN); \ break; \ case 2: \ - HANDLE_IN_CASE(T, TYPE, 2, IN); \ - break; \ - case 3: \ - HANDLE_IN_CASE(T, TYPE, 3, IN); \ + HANDLE_IN_CASE(TYPE, 2, IN); \ break; \ default: \ - HANDLE_IN_CASE(T, TYPE, -1, IN); \ + HANDLE_IN_CASE(TYPE, -1, IN); \ break; \ } \ } \ } - if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, out) && - TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, in)) { - TensorInfo<float, unsigned int> outInfo = - getTensorInfo<THCudaTensor, unsigned int>(state, out); + if (TensorUtils<TensorType>::canUse32BitIndexMath(state, out) && + TensorUtils<TensorType>::canUse32BitIndexMath(state, in)) { + TensorInfo<typename TensorUtils<TensorType>::DataType, + unsigned int> outInfo = + getTensorInfo<TensorType, unsigned int>(state, out); outInfo.collapseDims(); - TensorInfo<float, unsigned int> inInfo = - getTensorInfo<THCudaTensor, unsigned int>(state, in); + TensorInfo<typename TensorUtils<TensorType>::DataType, + unsigned int> inInfo = + getTensorInfo<TensorType, unsigned int>(state, in); inInfo.reduceDim(dim); inInfo.collapseDims(); - HANDLE_OUT_CASE(typename TensorUtils<THCudaTensor>::DataType, - unsigned int, outInfo.dims, inInfo.dims); + HANDLE_OUT_CASE(unsigned int, outInfo.dims, inInfo.dims); } else { - TensorInfo<float, unsigned long> outInfo = - getTensorInfo<THCudaTensor, unsigned long>(state, out); + TensorInfo<typename TensorUtils<TensorType>::DataType, + unsigned long> outInfo = + getTensorInfo<TensorType, unsigned long>(state, out); outInfo.collapseDims(); - TensorInfo<float, unsigned long> inInfo = - getTensorInfo<THCudaTensor, unsigned long>(state, in); + TensorInfo<typename TensorUtils<TensorType>::DataType, + unsigned long> inInfo = + getTensorInfo<TensorType, unsigned long>(state, in); inInfo.reduceDim(dim); inInfo.collapseDims(); @@ -297,11 +306,9 @@ bool THCudaTensor_reduceDim(THCState* state, // version and the completely generic version, to reduce // compilation time. 
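// [annotation, not part of the patch] Rough instantiation arithmetic for the
// comment above, assuming OUT and IN each range over {-2, 1, 2, -1} as in
// the macros: the 32-bit index path can expand to 4 * 4 = 16 kernel variants
// per reduction op, while the 64-bit path below compiles only the (-2, -2)
// and (-1, -1) pairs, i.e. 2 variants, which is where the compile-time
// saving comes from.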
if (outInfo.isContiguous() && inInfo.isContiguous()) { - HANDLE_CASE(typename TensorUtils<THCudaTensor>::DataType, - unsigned long, -2, -2); + HANDLE_CASE(unsigned long, -2, -2); } else { - HANDLE_CASE(typename TensorUtils<THCudaTensor>::DataType, - unsigned long, -1, -1); + HANDLE_CASE(unsigned long, -1, -1); } } #undef HANDLE_CASE diff --git a/lib/THC/THCReduceAll.cuh b/lib/THC/THCReduceAll.cuh index 3fe81a9..498fb53 100644 --- a/lib/THC/THCReduceAll.cuh +++ b/lib/THC/THCReduceAll.cuh @@ -20,26 +20,30 @@ // Kernel that handles an entire reduction of a tensor in one pass template <typename ModifyOp, typename ReduceOp, - typename T, + typename ReduceAccOp, + typename InT, + typename AccT, typename IndexType, int ADims> __global__ void -THCudaTensor_reduceAll(TensorInfo<T, IndexType> in, - IndexType totalElements, - T init, - ModifyOp modifyOp, - ReduceOp reduceOp, - T* out) { +kernelReduceAll(TensorInfo<InT, IndexType> in, + IndexType totalElements, + AccT init, + ModifyOp modifyOp, + ReduceOp reduceOp, + ReduceAccOp reduceAccOp, + AccT* out) { // With a block-wide stride, have each thread perform its own reduction. - T r = init; + AccT r = init; for (IndexType i = threadIdx.x; i < totalElements; i += blockDim.x) { - const IndexType inOffset = IndexToOffset<T, IndexType, ADims>::get(i, in); + const IndexType inOffset = IndexToOffset<InT, IndexType, ADims>::get(i, in); r = reduceOp(r, modifyOp(in.data[inOffset])); } // Reduce within the block - extern __shared__ T smem[]; - r = reduceBlock<T, ReduceOp>(smem, blockDim.x, r, reduceOp, init); + extern __shared__ char smemChar[]; + AccT* smem = (AccT*) smemChar; + r = reduceBlock<AccT, ReduceAccOp>(smem, blockDim.x, r, reduceAccOp, init); if (threadIdx.x == 0) { // Write out reduced value @@ -62,29 +66,33 @@ __device__ __forceinline__ IndexType getEndIndex(IndexType totalSize) { // Kernel that handles an entire reduction of a tensor in two passes template <typename ModifyOp, typename ReduceOp, - typename T, + typename ReduceAccOp, + typename InT, + typename AccT, typename IndexType, int ADims> __global__ void -THCudaTensor_reduceAllPass1(TensorInfo<T, IndexType> in, - IndexType totalElements, - T init, - ModifyOp modifyOp, - ReduceOp reduceOp, - T* scratchSpace) { +kernelReduceAllPass1(TensorInfo<InT, IndexType> in, + IndexType totalElements, + AccT init, + ModifyOp modifyOp, + ReduceOp reduceOp, + ReduceAccOp reduceAccOp, + AccT* scratchSpace) { const IndexType startIndex = getStartIndex<IndexType>(totalElements); const IndexType endIndex = getEndIndex<IndexType>(totalElements); // With a block-wide stride, have each thread perform its own reduction. 
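// [annotation, not part of the patch] The accumulate-then-tree pattern used
// by this kernel and kernelReduceAll above, shown standalone; the float
// type, the plus op, and the power-of-two blockDim.x are illustrative
// assumptions:
//
//   __global__ void blockReduceSketch(const float* data, int n, float* out) {
//     extern __shared__ char smemChar[];   // char alias, retyped as above
//     float* smem = (float*) smemChar;
//     float r = 0.f;                       // the reduction's init value
//     for (int i = threadIdx.x; i < n; i += blockDim.x)
//       r += data[i];                      // block-wide strided accumulate
//     smem[threadIdx.x] = r;
//     __syncthreads();
//     for (int s = blockDim.x / 2; s > 0; s >>= 1) {  // tree reduce in smem
//       if (threadIdx.x < s) smem[threadIdx.x] += smem[threadIdx.x + s];
//       __syncthreads();
//     }
//     if (threadIdx.x == 0) *out = smem[0];
//   }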
- T r = init; + AccT r = init; for (IndexType i = startIndex + threadIdx.x; i < endIndex; i += blockDim.x) { - const IndexType inOffset = IndexToOffset<T, IndexType, ADims>::get(i, in); + const IndexType inOffset = IndexToOffset<InT, IndexType, ADims>::get(i, in); r = reduceOp(r, modifyOp(in.data[inOffset])); } // Reduce within the block - extern __shared__ T smem[]; - r = reduceBlock<T, ReduceOp>(smem, blockDim.x, r, reduceOp, init); + extern __shared__ char smemChar[]; + AccT* smem = (AccT*) smemChar; + r = reduceBlock<AccT, ReduceAccOp>(smem, blockDim.x, r, reduceAccOp, init); if (threadIdx.x == 0) { // Write out block-wide reduced value @@ -94,18 +102,19 @@ THCudaTensor_reduceAllPass1(TensorInfo<T, IndexType> in, template <typename ReduceOp, typename T, typename IndexType> __global__ void -THCudaTensor_reduceAllPass2(int numPass1Blocks, - T init, - ReduceOp reduceOp, - T* scratchSpace, - T* out) { +kernelReduceAllPass2(int numPass1Blocks, + T init, + ReduceOp reduceOp, + T* scratchSpace, + T* out) { T r = init; if (threadIdx.x < numPass1Blocks) { r = scratchSpace[threadIdx.x]; } // Reduce within the block - extern __shared__ T smem[]; + extern __shared__ char smemChar[]; + T* smem = (T*) smemChar; r = reduceBlock<T, ReduceOp>(smem, numPass1Blocks, r, reduceOp, init); if (threadIdx.x == 0) { @@ -119,13 +128,13 @@ inline bool isTwoPassReductionSize(long elements) { return (elements > THC_TWO_PASS_REDUCTION_SIZE); } -template <typename T> +template <typename InT, typename AccT> inline long getTwoPassBlocks(THCState* state, long elements) { long numBlocks = THCCeilDiv(elements, THC_REDUCE_ALL_BLOCK_SIZE); // We can only have as many blocks as there is scratch space long scratchSpace = - THCState_getCurrentDeviceScratchSpaceSize(state) / sizeof(T); + THCState_getCurrentDeviceScratchSpaceSize(state) / sizeof(AccT); THAssert(scratchSpace > 0); if (numBlocks > scratchSpace) { @@ -136,22 +145,22 @@ inline long getTwoPassBlocks(THCState* state, long elements) { } // Get the block/grid size that we want -template <typename T> +template <typename InT, typename AccT> inline void getPass1ReduceBlockGrid(THCState* state, long elements, dim3& grid, dim3& block) { - grid = dim3(getTwoPassBlocks<T>(state, elements)); + grid = dim3(getTwoPassBlocks<InT, AccT>(state, elements)); block = dim3(THC_REDUCE_ALL_BLOCK_SIZE); } -template <typename T> +template <typename InT, typename AccT> inline void getPass2ReduceBlockGrid(THCState* state, long elements, dim3& grid, dim3& block) { grid = dim3(1); // We only need as many threads as there were blocks originally - block = dim3(getTwoPassBlocks<T>(state, elements)); + block = dim3(getTwoPassBlocks<InT, AccT>(state, elements)); } -template <typename T> +template <typename InT, typename AccT> inline void getSinglePassReduceBlockGrid(long elements, dim3& grid, dim3& block) { grid = dim3(1); @@ -160,75 +169,83 @@ inline void getSinglePassReduceBlockGrid(long elements, template <typename ModifyOp, typename ReduceOp, - typename T, + typename ReduceAccOp, + typename InT, + typename AccT, typename IndexType, int ADims> void callReduceAll(THCState* state, - const TensorInfo<T, IndexType>& in, + const TensorInfo<InT, IndexType>& in, long totalElements, - T init, + AccT init, const ModifyOp& modifyOp, const ReduceOp& reduceOp, - T* devOut) { + const ReduceAccOp& reduceAccOp, + AccT* devOut) { dim3 grid; dim3 block; if (isTwoPassReductionSize(totalElements)) { - getPass1ReduceBlockGrid<T>(state, totalElements, grid, block); - size_t smemSize = block.x * sizeof(T); + 
getPass1ReduceBlockGrid<InT, AccT>(state, totalElements, grid, block); + size_t smemSize = block.x * sizeof(AccT); - THCudaTensor_reduceAllPass1<ModifyOp, ReduceOp, T, IndexType, ADims> + kernelReduceAllPass1<ModifyOp, ReduceOp, ReduceAccOp, InT, AccT, IndexType, ADims> <<<grid, block, smemSize, THCState_getCurrentStream(state)>>>( - in, (IndexType) totalElements, init, modifyOp, reduceOp, - (T*) THCState_getCurrentDeviceScratchSpace(state)); + in, (IndexType) totalElements, init, modifyOp, reduceOp, reduceAccOp, + (AccT*) THCState_getCurrentDeviceScratchSpace(state)); int numPass1Blocks = grid.x; - getPass2ReduceBlockGrid<T>(state, totalElements, grid, block); - smemSize = block.x * sizeof(T); + getPass2ReduceBlockGrid<InT, AccT>(state, totalElements, grid, block); + smemSize = block.x * sizeof(AccT); - THCudaTensor_reduceAllPass2<ReduceOp, T, IndexType> + kernelReduceAllPass2<ReduceAccOp, AccT, IndexType> <<<grid, block, smemSize, THCState_getCurrentStream(state)>>>( - numPass1Blocks, init, reduceOp, - (T*) THCState_getCurrentDeviceScratchSpace(state), + numPass1Blocks, init, reduceAccOp, + (AccT*) THCState_getCurrentDeviceScratchSpace(state), devOut); } else { - getSinglePassReduceBlockGrid<T>(totalElements, grid, block); - size_t smemSize = block.x * sizeof(T); + getSinglePassReduceBlockGrid<InT, AccT>(totalElements, grid, block); + size_t smemSize = block.x * sizeof(AccT); - THCudaTensor_reduceAll<ModifyOp, ReduceOp, T, IndexType, ADims> + kernelReduceAll<ModifyOp, ReduceOp, ReduceAccOp, InT, AccT, IndexType, ADims> <<<grid, block, smemSize, THCState_getCurrentStream(state)>>>( - in, (IndexType) totalElements, init, modifyOp, reduceOp, devOut); + in, (IndexType) totalElements, init, modifyOp, reduceOp, reduceAccOp, devOut); } } -// Reduces the entire tensor to one floating-point value. `out` points -// to host-resident memory. -template <typename ModifyOp, typename ReduceOp> -bool THCudaTensor_reduceAll(THCState* state, - THCudaTensor* in, - const ModifyOp& modifyOp, - const ReduceOp& reduceOp, - float init, - float* out, - int outOnDevice) { - long inElements = THCudaTensor_nElement(state, in); - - if (THCudaTensor_nDimension(state, in) > MAX_CUTORCH_DIMS) { +// Reduces the entire tensor to one value. `out` points to +// host-resident memory. +template <typename TensorType, + typename ModifyOp, + typename ReduceOp, + typename ReduceAccOp, + typename AccT> +bool THC_reduceAll(THCState* state, + TensorType* in, + const ModifyOp& modifyOp, + const ReduceOp& reduceOp, + const ReduceAccOp& reduceAccOp, + AccT init, + AccT* out, + int outOnDevice) { + long inElements = TensorUtils<TensorType>::getNumElements(state, in); + + if (TensorUtils<TensorType>::getDims(state, in) > MAX_CUTORCH_DIMS) { return false; } - if (THCudaTensor_nDimension(state, in) == 0) { + if (TensorUtils<TensorType>::getDims(state, in) == 0) { // Zero-dim tensor; do nothing *out = init; return true; } - float* devOut = out; + AccT* devOut = out; if (!outOnDevice) { // Use the stream-specific scratch space for the reduction kernel // to write out its value - devOut = (float*) THCState_getCurrentDeviceScratchSpace(state); + devOut = (AccT*) THCState_getCurrentDeviceScratchSpace(state); } // It is possible that the tensor dimensions are able to be collapsed, @@ -240,10 +257,12 @@ bool THCudaTensor_reduceAll(THCState* state, // dimension, and the loop to translate the linear index to the array // index can be similarly collapsed. That is what this unrolling is for. 
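// [annotation, not part of the patch] How the two passes above compose, as a
// hedged host-side sketch (the concrete shapes are illustrative):
//
//   // pass 1: grid = blocks, block = THC_REDUCE_ALL_BLOCK_SIZE;
//   //   each block reduces its strided slice of the input and writes one
//   //   AccT partial into the per-device scratch space, scratch[0..blocks)
//   // pass 2: grid = 1, block = blocks;
//   //   a single block folds the partials: devOut[0] = reduce(scratch)
//   // `blocks` is capped by scratch capacity -- getTwoPassBlocks divides
//   // the scratch size by sizeof(AccT), which is why it is keyed on AccT.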
#define HANDLE_CASE(TYPE, IN) \ - callReduceAll<ModifyOp, ReduceOp, \ - typename TensorUtils<THCudaTensor>::DataType, \ + callReduceAll<ModifyOp, ReduceOp, ReduceAccOp, \ + typename TensorUtils<TensorType>::DataType, \ + AccT, \ TYPE, IN>( \ - state, inInfo, inElements, init, modifyOp, reduceOp, devOut); + state, inInfo, inElements, init, modifyOp, \ + reduceOp, reduceAccOp, devOut); #define HANDLE_IN_CASE(TYPE, IN) \ { \ @@ -257,9 +276,6 @@ bool THCudaTensor_reduceAll(THCState* state, case 2: \ HANDLE_CASE(TYPE, 2); \ break; \ - case 3: \ - HANDLE_CASE(TYPE, 3); \ - break; \ default: \ HANDLE_CASE(TYPE, -1); \ break; \ @@ -267,15 +283,16 @@ bool THCudaTensor_reduceAll(THCState* state, } \ } - if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, in)) { - TensorInfo<float, unsigned int> inInfo = - getTensorInfo<THCudaTensor, unsigned int>(state, in); + if (TensorUtils<TensorType>::canUse32BitIndexMath(state, in)) { + TensorInfo<typename TensorUtils<TensorType>::DataType, unsigned int> inInfo = + getTensorInfo<TensorType, unsigned int>(state, in); inInfo.collapseDims(); HANDLE_IN_CASE(unsigned int, inInfo.dims); } else { - TensorInfo<float, unsigned long long> inInfo = - getTensorInfo<THCudaTensor, unsigned long long>(state, in); + TensorInfo<typename TensorUtils<TensorType>::DataType, + unsigned long long> inInfo = + getTensorInfo<TensorType, unsigned long long>(state, in); inInfo.collapseDims(); // For large tensors, we only compile the completely contiguous @@ -293,7 +310,7 @@ bool THCudaTensor_reduceAll(THCState* state, // If our destination is not on the device, copy the value back to // the host (synchronous!) if (!outOnDevice) { - cudaMemcpy(out, devOut, sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(out, devOut, sizeof(AccT), cudaMemcpyDeviceToHost); } return true; diff --git a/lib/THC/THCTensorCopy.cu b/lib/THC/THCTensorCopy.cu index ea38d2e..e74d23d 100644 --- a/lib/THC/THCTensorCopy.cu +++ b/lib/THC/THCTensorCopy.cu @@ -1,5 +1,6 @@ #include "THCApply.cuh" #include "THCHalf.h" +#include "THCNumerics.cuh" inline int curGPU() { int curDev; diff --git a/lib/THC/THCTensorMasked.cu b/lib/THC/THCTensorMasked.cu index 6ee5bad..f3cb4ef 100644 --- a/lib/THC/THCTensorMasked.cu +++ b/lib/THC/THCTensorMasked.cu @@ -1,9 +1,6 @@ #include "THCTensorMath.h" #include "THCGeneral.h" -#include "THCDeviceUtils.cuh" -#include "THCBlas.h" #include "THCTensorCopy.h" -#include "THCTensorRandom.h" #include "THCApply.cuh" #include "THCReduce.cuh" @@ -13,202 +10,47 @@ #include <thrust/system/cuda/execution_policy.h> #endif +template <typename T, typename MaskT> struct TensorMaskedFillOp { - TensorMaskedFillOp(float v) : value(v) {} - __device__ __forceinline__ void operator()(float* t, float* mask) { - // Really mask should be `0` or `1` but we can't propagate errors here. 
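// [annotation, not part of the patch] The masked ops below (both the removed
// float-mask bodies and the templated replacements) share one indexing
// trick: an exclusive scan of the mask gives, at each set position, the
// number of set positions before it, which is exactly the compacted offset.
// A hedged sketch with illustrative values:
//
//   mask      = {0, 1, 1, 0, 1}
//   exclusive = {0, 0, 1, 2, 2}   // running count of earlier 1s
//   // maskedCopy:   out[i] = src[exclusive[i]]  wherever mask[i] is set
//   // maskedSelect: out[exclusive[i]] = in[i]   wherever mask[i] is set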
- if (*mask != 0.0f) { + TensorMaskedFillOp(T v) : value(v) {} + __device__ inline void operator()(T* t, MaskT* mask) { + if (*mask) { *t = value; } } - float value; + T value; }; -void THCudaTensor_maskedFill(THCState* state, - THCudaTensor *tensor, THCudaTensor *mask, float value) -{ - THAssert(THCudaTensor_checkGPU(state, 2, tensor, mask)); - THArgCheck(THCudaTensor_nElement(state, tensor) == - THCudaTensor_nElement(state, mask), - 2, "sizes do not match"); - - if (!THC_pointwiseApply2(state, tensor, mask, TensorMaskedFillOp(value))) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - - THCudaCheck(cudaGetLastError()); -} - +template <typename T, typename MaskT, typename MaskPrefixSumT> struct TensorMaskedCopyOp { - TensorMaskedCopyOp(float* s) : src(s) {} + TensorMaskedCopyOp(T* s) : in(s) {} - __device__ __forceinline__ void operator()(float* out, float* mask, float* maskPrefixSum) { - // Really mask should be `0` or `1` but we can't propagate errors here. - if (*mask != 0.0f) { - // We've already checked that this offset is <= 2^24, so this is ok. - *out = src[(int) *maskPrefixSum]; + __device__ inline void operator()(T* out, + MaskT* mask, + MaskPrefixSumT* maskPrefixSum) { + if (*mask) { + *out = in[*maskPrefixSum]; } } // Where we are copying from - float* src; + T* in; }; - -void THCudaTensor_maskedCopy(THCState* state, - THCudaTensor *tensor, THCudaTensor *mask, THCudaTensor *src) -{ - THAssert(THCudaTensor_checkGPU(state, 3, tensor, src, mask)); - long maskSize = THCudaTensor_nElement(state, mask); - long tensorSize = THCudaTensor_nElement(state, tensor); - long srcSize = THCudaTensor_nElement(state, src); - - // Since we are performing a prefix sum of mask, it cannot exceed - // the size allowed in consecutive integers in float32 - THArgCheck(maskSize <= (long) FLOAT32_MAX_CONSECUTIVE_INT, - 3, "mask nElements exceeds single-precision float " - "consecutive integer precision size (2^24)"); - - // `mask` and `tensor` must have the same number of elements - THArgCheck(maskSize == tensorSize, 2, - "mask and tensor must have the same number of elements"); - - THCudaTensor* contigMask = THCudaTensor_newContiguous(state, mask); - long oneElements = (long) THCudaTensor_sumall(state, contigMask); - - // The number of `1` elements present in the mask must be <= the - // number of elements available in `src` - if (oneElements > srcSize) { - THCudaTensor_free(state, contigMask); - THArgCheck(false, 2, "source nElements must be == mask `1` elements"); - } - - // Use a prefix sum to determine the copy locations of the masked elements - THCudaTensor* maskPrefixSum = THCudaTensor_new(state); - THCudaTensor_resizeAs(state, maskPrefixSum, contigMask); - - // We are getting elements from `src` based on an offset from - // `maskPrefixSum`, so that should be made contiguous too - THCudaTensor* contigSrc = THCudaTensor_newContiguous(state, src); - - thrust::device_ptr<float> - maskData(THCudaTensor_data(state, contigMask)); - thrust::device_ptr<float> - maskPrefixSumData(THCudaTensor_data(state, maskPrefixSum)); - thrust::exclusive_scan( -#if CUDA_VERSION >= 7000 - thrust::cuda::par.on(THCState_getCurrentStream(state)), -#endif - maskData, - maskData + THCudaTensor_nElement(state, contigMask), - maskPrefixSumData); - - // update `tensor` where `mask` == 1 but pull from `src` at - // maskPrefixSum - bool status = THC_pointwiseApply3( - state, tensor, contigMask, maskPrefixSum, - TensorMaskedCopyOp(THCudaTensor_data(state, contigSrc))); - - THCudaTensor_free(state, contigSrc); - 
THCudaTensor_free(state, maskPrefixSum); - THCudaTensor_free(state, contigMask); - - THArgCheck(status, 2, CUTORCH_DIM_WARNING); - THCudaCheck(cudaGetLastError()); -} - +template <typename T, typename MaskT, typename MaskPrefixSumT> struct TensorMaskedSelectOp { - TensorMaskedSelectOp(float* t) : out(t) {} - __device__ __forceinline__ void operator()(float* mask, float* maskPrefixSum, float* in) { - // Really mask should be `0` or `1` but we can't propagate errors here. - if (*mask != 0.0f) { - out[(int) *maskPrefixSum] = *in; + TensorMaskedSelectOp(T* t) : out(t) {} + __device__ inline void operator()(MaskT* mask, + MaskPrefixSumT* maskPrefixSum, + T* in) { + if (*mask) { + out[*maskPrefixSum] = *in; } } - float* out; + T* out; }; -void THCudaTensor_maskedSelect(THCState* state, - THCudaTensor *tensor, THCudaTensor *src, THCudaTensor *mask) -{ - THAssert(THCudaTensor_checkGPU(state, 3, tensor, src, mask)); - THArgCheck(THCudaTensor_nElement(state, mask) == THCudaTensor_nElement(state, src), - 2, "sizes do not match"); - - // Since we are performing a prefix sum of mask, it cannot exceed - // the size allowed in consecutive integers in float32 - THArgCheck(THCudaTensor_nElement(state, mask) <= - (long) FLOAT32_MAX_CONSECUTIVE_INT, - 3, "mask nElements exceeds single-precision float " - "consecutive integer precision size (2^24)"); - - // Determine our output size - THCudaTensor* contigMask = THCudaTensor_newContiguous(state, mask); - long totalElements = (long) THCudaTensor_sumall(state, contigMask); - - // This should be contiguous already, so no need to make it contig - // for the apply kernel - THCudaTensor_resize1d(state, tensor, totalElements); - - // Use a prefix sum to determine the output locations of the masked elements - THCudaTensor* maskPrefixSum = THCudaTensor_new(state); - THCudaTensor_resizeAs(state, maskPrefixSum, contigMask); - - thrust::device_ptr<float> - maskData(THCudaTensor_data(state, contigMask)); - thrust::device_ptr<float> - maskPrefixSumData(THCudaTensor_data(state, maskPrefixSum)); - thrust::exclusive_scan( -#if CUDA_VERSION >= 7000 - thrust::cuda::par.on(THCState_getCurrentStream(state)), -#endif - maskData, - maskData + THCudaTensor_nElement(state, contigMask), - maskPrefixSumData); - - // Then copy over the masked elements at their desired output index - bool status = THC_pointwiseApply3( - state, contigMask, maskPrefixSum, - src, TensorMaskedSelectOp(THCudaTensor_data(state, tensor))); - - THCudaTensor_free(state, contigMask); - THCudaTensor_free(state, maskPrefixSum); - - THArgCheck(status, 2, CUTORCH_DIM_WARNING); - THCudaCheck(cudaGetLastError()); -} - -void THCudaTensor_maskedFillByte(THCState* state, THCudaTensor *tensor, THByteTensor *mask, float value) -{ - THAssert(THCudaTensor_checkGPU(state, 1, tensor)); - THLongStorage* maskSize = THByteTensor_newSizeOf(mask); - THCudaTensor* maskCuda = THCudaTensor_newWithSize(state, maskSize, NULL); - THLongStorage_free(maskSize); - THCudaTensor_copyByte(state, maskCuda, mask); - THCudaTensor_maskedFill(state, tensor, maskCuda, value); - THCudaTensor_free(state, maskCuda); -} - -void THCudaTensor_maskedCopyByte(THCState* state, THCudaTensor *tensor, THByteTensor *mask, THCudaTensor *src) -{ - THAssert(THCudaTensor_checkGPU(state, 2, tensor, src)); - THLongStorage* maskSize = THByteTensor_newSizeOf(mask); - THCudaTensor* maskCuda = THCudaTensor_newWithSize(state, maskSize, NULL); - THLongStorage_free(maskSize); - THCudaTensor_copyByte(state, maskCuda, mask); - THCudaTensor_maskedCopy(state, tensor, maskCuda, 
src); - THCudaTensor_free(state, maskCuda); -} - -void THCudaTensor_maskedSelectByte(THCState* state, THCudaTensor *tensor, THCudaTensor *src, THByteTensor *mask) -{ - THAssert(THCudaTensor_checkGPU(state, 2, tensor, src)); - THLongStorage* maskSize = THByteTensor_newSizeOf(mask); - THCudaTensor* maskCuda = THCudaTensor_newWithSize(state, maskSize, NULL); - THLongStorage_free(maskSize); - THCudaTensor_copyByte(state, maskCuda, mask); - THCudaTensor_maskedSelect(state, tensor, src, maskCuda); - THCudaTensor_free(state, maskCuda); -} +#include "generic/THCTensorMasked.cu" +#include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorMath.cu b/lib/THC/THCTensorMath.cu index 5e140ef..f2a7607 100644 --- a/lib/THC/THCTensorMath.cu +++ b/lib/THC/THCTensorMath.cu @@ -1,12 +1,9 @@ #include "THCTensorMath.h" #include "THCGeneral.h" #include "THCTensorCopy.h" -#include "THCTensorRandom.h" #include "THCApply.cuh" -#include "THCReduce.cuh" -#include "THCReduceAll.cuh" +#include "THCNumerics.cuh" -#include <thrust/functional.h> #include <cfloat> void THCudaTensor_cat(THCState *state, THCudaTensor *result, THCudaTensor *ta, THCudaTensor *tb, int dimension) @@ -147,132 +144,6 @@ void THCudaTensor_addcdiv(THCState *state, THCudaTensor *self_, THCudaTensor *t, THCudaCheck(cudaGetLastError()); } -float THCudaTensor_minall(THCState *state, THCudaTensor *self) -{ - THAssert(THCudaTensor_checkGPU(state, 1, self)); - float val = FLT_MAX; - if (!THCudaTensor_reduceAll(state, self, - thrust::identity<float>(), - thrust::minimum<float>(), - FLT_MAX, &val, 0)) { - THArgCheck(false, 1, CUTORCH_DIM_WARNING); - } - - THCudaCheck(cudaGetLastError()); - return val; -} - -float THCudaTensor_maxall(THCState *state, THCudaTensor *self) -{ - THAssert(THCudaTensor_checkGPU(state, 1, self)); - float val = -FLT_MAX; - if (!THCudaTensor_reduceAll(state, self, - thrust::identity<float>(), - thrust::maximum<float>(), - -FLT_MAX, &val, 0)) { - THArgCheck(false, 1, CUTORCH_DIM_WARNING); - } - - THCudaCheck(cudaGetLastError()); - return val; -} - -float THCudaTensor_sumall(THCState *state, THCudaTensor *self) -{ - THAssert(THCudaTensor_checkGPU(state, 1, self)); - float val = 0.0f; - if (!THCudaTensor_reduceAll(state, self, - thrust::identity<float>(), - thrust::plus<float>(), - 0.0f, &val, 0)) { - THArgCheck(false, 1, CUTORCH_DIM_WARNING); - } - - THCudaCheck(cudaGetLastError()); - return val; -} - -float THCudaTensor_prodall(THCState *state, THCudaTensor *self) -{ - THAssert(THCudaTensor_checkGPU(state, 1, self)); - float val = 1.0f; - if (!THCudaTensor_reduceAll(state, self, - thrust::identity<float>(), - thrust::multiplies<float>(), - 1.0f, &val, 0)) { - THArgCheck(false, 1, CUTORCH_DIM_WARNING); - } - - THCudaCheck(cudaGetLastError()); - return val; -} - -void THCudaTensor_sum(THCState* state, THCudaTensor *self, THCudaTensor *src, long dimension) -{ - THAssert(THCudaTensor_checkGPU(state, 2, self, src)); - if (!THCudaTensor_reduceDim( - state, self, src, - thrust::identity<float>(), thrust::plus<float>(), 0.0f, dimension)) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - - THCudaCheck(cudaGetLastError()); -} - -void THCudaTensor_prod(THCState* state, THCudaTensor *self, THCudaTensor *src, long dimension) -{ - THAssert(THCudaTensor_checkGPU(state, 2, self, src)); - if (!THCudaTensor_reduceDim( - state, self, src, - thrust::identity<float>(), thrust::multiplies<float>(), 1.0f, dimension)) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - - THCudaCheck(cudaGetLastError()); -} - -struct logicalall_functor -{ - __device__ 
inline float operator()(float x, float y) const - { - return x && y; - } -}; - -struct logicalany_functor -{ - __device__ float operator()(float x, float y) const - { - return x || y; - } -}; - -int THCudaTensor_logicalall(THCState *state, THCudaTensor *self) { - THAssert(THCudaTensor_checkGPU(state, 1, self)); - float result = 0.0f; - if (!THCudaTensor_reduceAll(state, self, - thrust::identity<float>(), - logicalall_functor(), - 1.0f, &result, 0)) { - THArgCheck(false, 1, CUTORCH_DIM_WARNING); - } - - return (int) result; -} - -int THCudaTensor_logicalany(THCState *state, THCudaTensor *self) { - THAssert(THCudaTensor_checkGPU(state, 1, self)); - float result = 0.0f; - if (!THCudaTensor_reduceAll(state, self, - thrust::identity<float>(), - logicalany_functor(), - 0.0f, &result, 0)) { - THArgCheck(false, 1, CUTORCH_DIM_WARNING); - } - - return (int) result; -} - template <typename T> struct TensorFillOp { TensorFillOp(T v) : val(v) {} diff --git a/lib/THC/THCTensorMath.h b/lib/THC/THCTensorMath.h index 6593da2..89d8989 100644 --- a/lib/THC/THCTensorMath.h +++ b/lib/THC/THCTensorMath.h @@ -7,12 +7,26 @@ #include "generic/THCTensorMath.h" #include "THCGenerateAllTypes.h" +#include "generic/THCTensorMathBlas.h" +#include "THCGenerateAllTypes.h" + #include "generic/THCTensorMathPairwise.h" #include "THCGenerateAllTypes.h" #include "generic/THCTensorMathPointwise.h" #include "THCGenerateAllTypes.h" +#include "generic/THCTensorMathReduce.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMathCompare.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMathCompareT.h" +#include "THCGenerateAllTypes.h" + +#include "generic/THCTensorMasked.h" +#include "THCGenerateAllTypes.h" THC_API void THCudaTensor_tril(THCState *state, THCudaTensor *self, THCudaTensor *src, long k); THC_API void THCudaTensor_triu(THCState *state, THCudaTensor *self, THCudaTensor *src, long k); @@ -20,16 +34,6 @@ THC_API void THCudaTensor_triu(THCState *state, THCudaTensor *self, THCudaTensor THC_API void THCudaTensor_addcmul(THCState *state, THCudaTensor *self, THCudaTensor* t, float value, THCudaTensor *src1, THCudaTensor *src2); THC_API void THCudaTensor_addcdiv(THCState *state, THCudaTensor *self, THCudaTensor* t, float value, THCudaTensor *src1, THCudaTensor *src2); -THC_API float THCudaTensor_dot(THCState *state, THCudaTensor *self, THCudaTensor *src); - -THC_API float THCudaTensor_minall(THCState *state, THCudaTensor *self); -THC_API float THCudaTensor_maxall(THCState *state, THCudaTensor *self); -THC_API float THCudaTensor_sumall(THCState *state, THCudaTensor *self); -THC_API float THCudaTensor_prodall(THCState *state, THCudaTensor *self); -THC_API void THCudaTensor_min(THCState *state, THCudaTensor *values, THCudaTensor *indices, THCudaTensor *src, long dim); -THC_API void THCudaTensor_max(THCState *state, THCudaTensor *values, THCudaTensor *indices, THCudaTensor *src, long dim); -THC_API void THCudaTensor_sum(THCState *state, THCudaTensor *self, THCudaTensor *src, long dim); -THC_API void THCudaTensor_prod(THCState *state, THCudaTensor *self, THCudaTensor *src, long dim); THC_API void THCudaTensor_cumsum(THCState *state, THCudaTensor *self, THCudaTensor *src, long dim); THC_API void THCudaTensor_cumprod(THCState *state, THCudaTensor *self, THCudaTensor *src, long dim); @@ -38,14 +42,6 @@ THC_API void THCudaTensor_cmax(THCState *state, THCudaTensor *self, THCudaTensor THC_API void THCudaTensor_cminValue(THCState *state, THCudaTensor *self, THCudaTensor *src, float value); THC_API void 
THCudaTensor_cmaxValue(THCState *state, THCudaTensor *self, THCudaTensor *src, float value); -THC_API void THCudaTensor_addmv(THCState *state, THCudaTensor *self, float beta, THCudaTensor *t, float alpha, THCudaTensor *mat, THCudaTensor *vec); -THC_API void THCudaTensor_addmm(THCState *state, THCudaTensor *self, float beta, THCudaTensor *t, float alpha, THCudaTensor *mat1, THCudaTensor *mat2); -THC_API void THCudaTensor_addr(THCState *state, THCudaTensor *self, float beta, THCudaTensor *t, float alpha, THCudaTensor *vec1, THCudaTensor *vec2); -THC_API void THCudaTensor_addbmm(THCState *state, THCudaTensor *result, float beta, THCudaTensor *t, - float alpha, THCudaTensor *batch1, THCudaTensor *batch2); -THC_API void THCudaTensor_baddbmm(THCState *state, THCudaTensor *result, float beta, THCudaTensor *t, - float alpha, THCudaTensor *batch1, THCudaTensor *batch2); - THC_API void THCudaTensor_log(THCState *state, THCudaTensor *self, THCudaTensor *src); THC_API void THCudaTensor_log1p(THCState *state, THCudaTensor *self, THCudaTensor *src); THC_API void THCudaTensor_sigmoid(THCState *state, THCudaTensor *self, THCudaTensor *src); @@ -93,20 +89,6 @@ THC_API void THCudaTensor_qr(THCState *state, THCudaTensor *rq_, THCudaTensor *r THC_API void THCudaTensor_cat(THCState *state, THCudaTensor *result, THCudaTensor *ta, THCudaTensor *tb, int dimension); THC_API void THCudaTensor_catArray(THCState *state, THCudaTensor *result, THCudaTensor **inputs, int numInputs, int dimension); -THC_API void THCudaTensor_ltValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value); -THC_API void THCudaTensor_gtValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value); -THC_API void THCudaTensor_leValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value); -THC_API void THCudaTensor_geValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value); -THC_API void THCudaTensor_eqValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value); -THC_API void THCudaTensor_neValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value); - -THC_API void THCudaTensor_ltTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2); -THC_API void THCudaTensor_gtTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2); -THC_API void THCudaTensor_leTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2); -THC_API void THCudaTensor_geTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2); -THC_API void THCudaTensor_eqTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2); -THC_API void THCudaTensor_neTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2); - THC_API float THCudaTensor_meanall(THCState *state, THCudaTensor *self); THC_API void THCudaTensor_mean(THCState *state, THCudaTensor *self, THCudaTensor *src, long dim); THC_API float THCudaTensor_varall(THCState *state, THCudaTensor *self); @@ -131,19 +113,11 @@ THC_API void THCudaTensor_indexAdd_long(THCState *state, THCudaTensor *res_, int THC_API void THCudaTensor_indexFill_long(THCState *state, THCudaTensor *tensor, int dim, THLongTensor *index, float val); THC_API void THCudaTensor_indexSelect_long(THCState *state, THCudaTensor *tensor, THCudaTensor *src, int dim, THLongTensor *index); -THC_API void THCudaTensor_maskedFill(THCState *state, THCudaTensor *tensor, THCudaTensor *mask, float value); -THC_API void 
THCudaTensor_maskedCopy(THCState *state, THCudaTensor *tensor, THCudaTensor *mask, THCudaTensor *src); -THC_API void THCudaTensor_maskedSelect(THCState *state, THCudaTensor *tensor, THCudaTensor *src, THCudaTensor *mask); - -THC_API void THCudaTensor_maskedFillByte(THCState *state, THCudaTensor *tensor, THByteTensor *mask, float value); -THC_API void THCudaTensor_maskedCopyByte(THCState *state, THCudaTensor *tensor, THByteTensor *mask, THCudaTensor *src); -THC_API void THCudaTensor_maskedSelectByte(THCState *state, THCudaTensor *tensor, THCudaTensor *src, THByteTensor *mask); - THC_API void THCudaTensor_gather(THCState* state, THCudaTensor *tensor, THCudaTensor *src, int dim, THCudaTensor *index); THC_API void THCudaTensor_scatter(THCState* state, THCudaTensor *tensor, int dim, THCudaTensor *index, THCudaTensor *src); THC_API void THCudaTensor_scatterFill(THCState* state, THCudaTensor *tensor, int dim, THCudaTensor *index, float value); -THC_API int THCudaTensor_logicalall(THCState *state, THCudaTensor *self); -THC_API int THCudaTensor_logicalany(THCState *state, THCudaTensor *self); +THC_API int THCudaByteTensor_logicalall(THCState *state, THCudaByteTensor *self); +THC_API int THCudaByteTensor_logicalany(THCState *state, THCudaByteTensor *self); #endif diff --git a/lib/THC/THCTensorMath2.cu b/lib/THC/THCTensorMath2.cu index 0d6bb52..dc8544c 100644 --- a/lib/THC/THCTensorMath2.cu +++ b/lib/THC/THCTensorMath2.cu @@ -525,24 +525,24 @@ void THCudaTensor_norm(THCState *state, THCudaTensor* self, THCudaTensor* src, f { THAssert(THCudaTensor_checkGPU(state, 2, self, src)); if (value == 0.0f) { - THCudaTensor_reduceDim(state, self, src, - TensorNonZeroOp(), thrust::plus<float>(), - 0.0f, dimension); + THC_reduceDim(state, self, src, + TensorNonZeroOp(), thrust::plus<float>(), + 0.0f, dimension); } else if (value == 1.0f) { - THCudaTensor_reduceDim(state, self, src, - TensorNormOp<1>(value), thrust::plus<float>(), - 0.0f, dimension); + THC_reduceDim(state, self, src, + TensorNormOp<1>(value), thrust::plus<float>(), + 0.0f, dimension); } else if (value == 2.0f) { - THCudaTensor_reduceDim(state, self, src, - TensorNormOp<2>(value), thrust::plus<float>(), - 0.0f, dimension); + THC_reduceDim(state, self, src, + TensorNormOp<2>(value), thrust::plus<float>(), + 0.0f, dimension); THCudaTensor_pow(state, self, self, 0.5f); } else { - THCudaTensor_reduceDim(state, self, src, - TensorNormOp<-1>(value), thrust::plus<float>(), - 0.0f, dimension); + THC_reduceDim(state, self, src, + TensorNormOp<-1>(value), thrust::plus<float>(), + 0.0f, dimension); THCudaTensor_pow(state, self, self, 1.0f / value); } @@ -684,7 +684,7 @@ THC_API void THCudaTensor_cross(THCState *state, THCudaTensor *self, THCudaTenso for (i = 0; i < nd; i++) { THArgCheck(THCudaTensor_size(state, x, i) == THCudaTensor_size(state, y, i), 1, "dimension %i of x and y does not match", i); if (dimension < 0 && THCudaTensor_size(state, x, i) == 3) { - dimension = i; + dimension = i; } } diff --git a/lib/THC/THCTensorMathBlas.cu b/lib/THC/THCTensorMathBlas.cu index 0e373d8..0804d64 100644 --- a/lib/THC/THCTensorMathBlas.cu +++ b/lib/THC/THCTensorMathBlas.cu @@ -2,425 +2,7 @@ #include "THCGeneral.h" #include "THCBlas.h" #include "THCTensorCopy.h" -#include "THCTensorRandom.h" -#include "THCApply.cuh" -#include "THCReduce.cuh" +#include "THCNumerics.cuh" -float THCudaTensor_dot(THCState *state, THCudaTensor *self, THCudaTensor *src) -{ - THAssert(THCudaTensor_checkGPU(state, 2, self, src)); - THArgCheck(THCudaTensor_nElement(state, self) == 
THCudaTensor_nElement(state, src), 2, "sizes do not match"); - - { - self = THCudaTensor_newContiguous(state, self); - src = THCudaTensor_newContiguous(state, src); - - float result = THCudaBlas_dot(state, - THCudaTensor_nElement(state, self), - THCudaTensor_data(state, self), 1, - THCudaTensor_data(state, src), 1); - THCudaTensor_free(state, src); - THCudaTensor_free(state, self); - - return result; - } -} - -void THCudaTensor_addmv(THCState *state, THCudaTensor *r_, float beta, THCudaTensor *t, float alpha, THCudaTensor *mat, THCudaTensor *vec) -{ - THAssert(THCudaTensor_checkGPU(state, 4, r_, t, mat, vec)); - if( (mat->nDimension != 2) || (vec->nDimension != 1) ) - THError("matrix and vector expected"); - - if( mat->size[1] != vec->size[0] ) - THError("size mismatch"); - - if(t->nDimension != 1) - THError("size mismatch"); - - if(t->size[0] != mat->size[0]) - THError("size mismatch"); - - if(r_ != t) - { - THCudaTensor_resizeAs(state, r_, t); - THCudaTensor_copy(state, r_, t); - } - - if(mat->stride[0] == 1) - { - THCudaBlas_gemv(state, 'n', mat->size[0], mat->size[1], - alpha, THCudaTensor_data(state, mat), mat->stride[1], - THCudaTensor_data(state, vec), vec->stride[0], - beta, THCudaTensor_data(state, r_), r_->stride[0]); - } - else if(mat->stride[1] == 1) - { - THCudaBlas_gemv(state, 't', mat->size[1], mat->size[0], - alpha, THCudaTensor_data(state, mat), mat->stride[0], - THCudaTensor_data(state, vec), vec->stride[0], - beta, THCudaTensor_data(state, r_), r_->stride[0]); - } - else - { - THCudaTensor *cmat = THCudaTensor_newContiguous(state, mat); - - THCudaBlas_gemv(state, 't', mat->size[1], mat->size[0], - alpha, THCudaTensor_data(state, cmat), cmat->stride[0], - THCudaTensor_data(state, vec), vec->stride[0], - beta, THCudaTensor_data(state, r_), r_->stride[0]); - - THCudaTensor_free(state, cmat); - } -} - -void THCudaTensor_addmm(THCState *state, THCudaTensor *r_, float beta, THCudaTensor *t, float alpha, THCudaTensor *m1, THCudaTensor *m2) -{ - THAssert(THCudaTensor_checkGPU(state, 4, r_, t, m1, m2)); - char transpose_r, transpose_m1, transpose_m2; - THCudaTensor *r__, *m1_, *m2_; - - if( (m1->nDimension != 2) || (m2->nDimension != 2) ) - THError("matrix and matrix expected"); - - if(t->nDimension != 2) - THError("size mismatch"); - - if( (t->size[0] != m1->size[0]) || (t->size[1] != m2->size[1]) || (m1->size[1] != m2->size[0]) ) - THError("size mismatch"); - - if(t != r_) - { - THCudaTensor_resizeAs(state, r_, t); - THCudaTensor_copy(state, r_, t); - } - - /* r_ */ - if(r_->stride[0] == 1 && - r_->stride[1] != 0) - { - transpose_r = 'n'; - r__ = r_; - } - else if(r_->stride[1] == 1 && - r_->stride[0] != 0) - { - THCudaTensor *swap = m2; - m2 = m1; - m1 = swap; - transpose_r = 't'; - r__ = r_; - } - else - { - transpose_r = 'n'; - - THCudaTensor *transp_r_ = THCudaTensor_newTranspose(state, r_, 0, 1); - r__ = THCudaTensor_newClone(state, transp_r_); - THCudaTensor_free(state, transp_r_); - THCudaTensor_transpose(state, r__, NULL, 0, 1); - } - - /* m1 */ - if(m1->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && - m1->stride[(transpose_r == 'n' ? 1 : 0)] != 0) - { - transpose_m1 = 'n'; - m1_ = m1; - } - else if(m1->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && - m1->stride[(transpose_r == 'n' ? 0 : 1)] != 0) - { - transpose_m1 = 't'; - m1_ = m1; - } - else - { - transpose_m1 = (transpose_r == 'n' ? 't' : 'n'); - m1_ = THCudaTensor_newContiguous(state, m1); - } - - /* m2 */ - if(m2->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && - m2->stride[(transpose_r == 'n' ? 
1 : 0)] != 0) - { - transpose_m2 = 'n'; - m2_ = m2; - } - else if(m2->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && - m2->stride[(transpose_r == 'n' ? 0 : 1)] != 0) - { - transpose_m2 = 't'; - m2_ = m2; - } - else - { - transpose_m2 = (transpose_r == 'n' ? 't' : 'n'); - m2_ = THCudaTensor_newContiguous(state, m2); - } - - /* do the operation */ - THCudaBlas_gemm(state, - transpose_m1, - transpose_m2, - r__->size[(transpose_r == 'n' ? 0 : 1)], - r__->size[(transpose_r == 'n' ? 1 : 0)], - m1_->size[(transpose_r == 'n' ? 1 : 0)], - alpha, - THCudaTensor_data(state, m1_), - (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]), - THCudaTensor_data(state, m2_), - (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]), - beta, - THCudaTensor_data(state, r__), - r__->stride[(transpose_r == 'n' ? 1 : 0)]); - - /* free intermediate variables */ - if(m1_ != m1) - THCudaTensor_free(state, m1_); - - if(m2_ != m2) - THCudaTensor_free(state, m2_); - - if(r__ != r_) - THCudaTensor_freeCopyTo(state, r__, r_); -} - -void THCudaTensor_addr(THCState *state, THCudaTensor *r_, float beta, THCudaTensor *t, float alpha, THCudaTensor *vec1, THCudaTensor *vec2) -{ - THAssert(THCudaTensor_checkGPU(state, 4, r_, t, vec1, vec2)); - if( (vec1->nDimension != 1) || (vec2->nDimension != 1) ) - THError("vector and vector expected"); - - if(t->nDimension != 2) - THError("size mismatch"); - - if( (t->size[0] != vec1->size[0]) || (t->size[1] != vec2->size[0]) ) - THError("size mismatch"); - - if(r_ != t) - { - THCudaTensor_resizeAs(state, r_, t); - THCudaTensor_copy(state, r_, t); - } - - if(beta != 1) - THCudaTensor_mul(state, r_, r_, beta); - - if(r_->stride[0] == 1) - { - THCudaBlas_ger(state, vec1->size[0], vec2->size[0], - alpha, THCudaTensor_data(state, vec1), vec1->stride[0], - THCudaTensor_data(state, vec2), vec2->stride[0], - THCudaTensor_data(state, r_), r_->stride[1]); - } - else if(r_->stride[1] == 1) - { - THCudaBlas_ger(state, vec2->size[0], vec1->size[0], - alpha, THCudaTensor_data(state, vec2), vec2->stride[0], - THCudaTensor_data(state, vec1), vec1->stride[0], - THCudaTensor_data(state, r_), r_->stride[0]); - } - else - { - THCudaTensor *cr = THCudaTensor_newClone(state, r_); - - THCudaBlas_ger(state, vec2->size[0], vec1->size[0], - alpha, THCudaTensor_data(state, vec2), vec2->stride[0], - THCudaTensor_data(state, vec1), vec1->stride[0], - THCudaTensor_data(state, cr), cr->stride[0]); - - THCudaTensor_freeCopyTo(state, cr, r_); - } -} - -void THCudaTensor_addbmm(THCState *state, THCudaTensor *result, float beta, THCudaTensor *t, - float alpha, THCudaTensor *batch1, THCudaTensor *batch2) { - THAssert(THCudaTensor_checkGPU(state, 4, result, t, batch1, batch2)); - THArgCheck(THCudaTensor_nDimension(state, t) == 2, 4, "expected 2D tensor"); - THArgCheck(THCudaTensor_nDimension(state, batch1) == 3, 6, "expected 3D tensor"); - THArgCheck(THCudaTensor_nDimension(state, batch2) == 3, 7, "expected 3D tensor"); - - long batchnum = THCudaTensor_size(state, batch1, 0); - long m1d1 = THCudaTensor_size(state, batch1, 1); - long innerdim = THCudaTensor_size(state, batch1, 2); - long m2d2 = THCudaTensor_size(state, batch2, 2); - - THArgCheck(batchnum == THCudaTensor_size(state, batch2, 0), 7, - "equal number of batches expected"); - // M is t, as listed in the docs under addbmm - THArgCheck(m1d1 == THCudaTensor_size(state, t, 0), 6, - "first dimension must match first dimension of M"); - THArgCheck(m2d2 == 
THCudaTensor_size(state, t, 1), 7, - "second dimension must match second dimension of M"); - THArgCheck(innerdim == THCudaTensor_size(state, batch2, 1), 6, - "second dimension must match first dimension of batch2"); - - if (t != result) { - THCudaTensor_resizeAs(state, result, t); - THCudaTensor_copy(state, result, t); - } - - THCudaTensor *slice1 = THCudaTensor_new(state); - THCudaTensor *slice2 = THCudaTensor_new(state); - for (long i=0; i<batchnum; i++) { - THCudaTensor_select(state, slice1, batch1, 0, i); - THCudaTensor_select(state, slice2, batch2, 0, i); - - THCudaTensor_addmm(state, result, beta, result, alpha, slice1, slice2); - beta = 1; - } - THCudaTensor_free(state, slice1); - THCudaTensor_free(state, slice2); -} - -void THCudaTensor_baddbmm(THCState *state, THCudaTensor *result, float beta, THCudaTensor *t, - float alpha, THCudaTensor *batch1, THCudaTensor *batch2) { - THAssert(THCudaTensor_checkGPU(state, 4, result, t, batch1, batch2)); - THArgCheck(THCudaTensor_nDimension(state, t) == 3, 4, "expected 3D tensor"); - THArgCheck(THCudaTensor_nDimension(state, batch1) == 3, 6, "expected 3D tensor"); - THArgCheck(THCudaTensor_nDimension(state, batch2) == 3, 7, "expected 3D tensor"); - THArgCheck(THCudaTensor_size(state, t, 0) == THCudaTensor_size(state, batch1, 0), 6, - "equal number of batches expected"); - THArgCheck(THCudaTensor_size(state, t, 0) == THCudaTensor_size(state, batch2, 0), 7, - "equal number of batches expected"); - THArgCheck(THCudaTensor_size(state, t, 1) == THCudaTensor_size(state, batch1, 1), 6, - "wrong matrix size"); - THArgCheck(THCudaTensor_size(state, t, 2) == THCudaTensor_size(state, batch2, 2), 7, - "wrong matrix size"); - THArgCheck(THCudaTensor_size(state, batch1, 2) == THCudaTensor_size(state, batch2, 1), 6, - "wrong matrix size"); - - if (t != result) { - THCudaTensor_resizeAs(state, result, t); - THCudaTensor_copy(state, result, t); - } - - bool transpose_result; - char transpose_batch1, transpose_batch2; - long lda, ldb, ldc; - THCudaTensor *result_, *batch1_, *batch2_; - if (result->stride[1] == 1) - { - transpose_result = false; - result_ = result; - ldc = result_->stride[2]; - } - else if (result->stride[2] == 1) - { - transpose_result = true; - - THCudaTensor *swap = batch2; - batch2 = batch1; - batch1 = swap; - - result_ = result; - ldc = result_->stride[1]; - } - else - { - transpose_result = false; - - THCudaTensor *transp_r_ = THCudaTensor_newTranspose(state, result, 1, 2); - result_ = THCudaTensor_newClone(state, transp_r_); - THCudaTensor_free(state, transp_r_); - THCudaTensor_transpose(state, result_, NULL, 1, 2); - - ldc = result_->stride[2]; - } - - if (batch1->stride[transpose_result ? 2 : 1] == 1) - { - transpose_batch1 = 'n'; - batch1_ = batch1; - lda = batch1_->stride[transpose_result ? 1 : 2]; - } - else if (batch1->stride[transpose_result ? 1 : 2] == 1) - { - transpose_batch1 = 't'; - batch1_ = batch1; - lda = batch1_->stride[transpose_result ? 2 : 1]; - } - else - { - transpose_batch1 = transpose_result ? 'n' : 't'; - batch1_ = THCudaTensor_newContiguous(state, batch1); - lda = batch1_->stride[1]; - } - - if (batch2->stride[transpose_result ? 2 : 1] == 1) - { - transpose_batch2 = 'n'; - batch2_ = batch2; - ldb = batch2_->stride[transpose_result ? 1 : 2]; - } - else if (batch2->stride[transpose_result ? 1 : 2] == 1) - { - transpose_batch2 = 't'; - batch2_ = batch2; - ldb = batch2_->stride[transpose_result ? 2 : 1]; - } - else - { - transpose_batch2 = transpose_result ? 
'n' : 't'; - batch2_ = THCudaTensor_newContiguous(state, batch2); - ldb = batch2_->stride[1]; - } - - // Compute pointers to matrices in each batch. - long num_batches = result_->size[0]; - size_t matrices_size = num_batches * sizeof(float*); - const float **matrices1 = (const float **)THAlloc(matrices_size); - const float **matrices2 = (const float **)THAlloc(matrices_size); - float **result_matrices = (float **)THAlloc(matrices_size); - for (int i = 0; i < num_batches; ++i) - { - matrices1[i] = THCudaTensor_data(state, batch1_) + i * batch1_->stride[0]; - matrices2[i] = THCudaTensor_data(state, batch2_) + i * batch2_->stride[0]; - result_matrices[i] = THCudaTensor_data(state, result_) + i * result_->stride[0]; - } - - // Copy pointers to device. - const float **d_matrices1, **d_matrices2; - float **d_result_matrices; - THCudaCheck(THCudaMalloc(state, (void**)&d_matrices1, matrices_size)); - THCudaCheck(THCudaMalloc(state, (void**)&d_matrices2, matrices_size)); - THCudaCheck(THCudaMalloc(state, (void**)&d_result_matrices, matrices_size)); - - THCudaCheck(cudaMemcpyAsync(d_matrices1, matrices1, matrices_size, - cudaMemcpyHostToDevice, THCState_getCurrentStream(state))); - THCudaCheck(cudaMemcpyAsync(d_matrices2, matrices2, matrices_size, - cudaMemcpyHostToDevice, THCState_getCurrentStream(state))); - THCudaCheck(cudaMemcpyAsync(d_result_matrices, result_matrices, matrices_size, - cudaMemcpyHostToDevice, THCState_getCurrentStream(state))); - - THCudaBlas_gemmBatched( - state, - transpose_batch1, - transpose_batch2, - result_->size[transpose_result ? 2 : 1], - result_->size[transpose_result ? 1 : 2], - batch1_->size[transpose_result ? 1 : 2], - alpha, - d_matrices1, lda, - d_matrices2, ldb, - beta, - d_result_matrices, ldc, - num_batches); - - THCudaFree(state, d_matrices1); - THCudaFree(state, d_matrices2); - THCudaFree(state, d_result_matrices); - THFree(matrices1); - THFree(matrices2); - THFree(result_matrices); - - if (batch1_ != batch1) - THCudaTensor_free(state, batch1_); - - if (batch2_ != batch2) - THCudaTensor_free(state, batch2_); - - if (result_ != result) - THCudaTensor_freeCopyTo(state, result_, result); -} +#include "generic/THCTensorMathBlas.cu" +#include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorMathCompare.cu b/lib/THC/THCTensorMathCompare.cu index 71cfdd5..8e01e83 100644 --- a/lib/THC/THCTensorMathCompare.cu +++ b/lib/THC/THCTensorMathCompare.cu @@ -1,109 +1,85 @@ #include "THCTensorMath.h" #include "THCGeneral.h" -#include "THCBlas.h" #include "THCTensorCopy.h" -#include "THCTensorRandom.h" #include "THCApply.cuh" -#include "THCReduce.cuh" - -template<class Op> -void THCudaTensor_logicalValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, Op op) -{ - THCudaTensor_resizeAs(state, self_, src); - - if (!THC_pointwiseApply2(state, self_, src, op)) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - - THCudaCheck(cudaGetLastError()); -} +#include "THCNumerics.cuh" +template <typename T, typename TOut> struct TensorLTValueOp { - TensorLTValueOp(float v) : value(v) {} - __device__ __forceinline__ void operator()(float* out, float* in) { - *out = (*in < value); + TensorLTValueOp(T v) : value(v) {} + __device__ __forceinline__ void operator()(TOut* out, T* in) { + *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::lt(*in, value)); } - const float value; + const T value; }; -void THCudaTensor_ltValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value) -{ - THAssert(THCudaTensor_checkGPU(state, 2, self_, src)); - 
THCudaTensor_logicalValue(state, self_, src, TensorLTValueOp(value)); -} - +template <typename T, typename TOut> struct TensorGTValueOp { - TensorGTValueOp(float v) : value(v) {} - __device__ __forceinline__ void operator()(float* out, float* in) { - *out = (*in > value); + TensorGTValueOp(T v) : value(v) {} + __device__ __forceinline__ void operator()(TOut* out, T* in) { + *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::gt(*in, value)); } - const float value; + const T value; }; -void THCudaTensor_gtValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value) -{ - THAssert(THCudaTensor_checkGPU(state, 2, self_, src)); - THCudaTensor_logicalValue(state, self_, src, TensorGTValueOp(value)); -} +template <typename T, typename TOut> struct TensorLEValueOp { - TensorLEValueOp(float v) : value(v) {} - __device__ __forceinline__ void operator()(float* out, float* in) { - *out = (*in <= value); + TensorLEValueOp(T v) : value(v) {} + __device__ __forceinline__ void operator()(TOut* out, T* in) { + *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::le(*in, value)); } - const float value; + const T value; }; -void THCudaTensor_leValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value) -{ - THAssert(THCudaTensor_checkGPU(state, 2, self_, src)); - THCudaTensor_logicalValue(state, self_, src, TensorLEValueOp(value)); -} - +template <typename T, typename TOut> struct TensorGEValueOp { - TensorGEValueOp(float v) : value(v) {} - __device__ __forceinline__ void operator()(float* out, float* in) { - *out = (*in >= value); + TensorGEValueOp(T v) : value(v) {} + __device__ __forceinline__ void operator()(TOut* out, T* in) { + *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::ge(*in, value)); } - const float value; + const T value; }; -void THCudaTensor_geValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value) -{ - THAssert(THCudaTensor_checkGPU(state, 2, self_, src)); - THCudaTensor_logicalValue(state, self_, src, TensorGEValueOp(value)); -} - +template <typename T, typename TOut> struct TensorEQValueOp { - TensorEQValueOp(float v) : value(v) {} - __device__ __forceinline__ void operator()(float* out, float* in) { - *out = (*in == value); + TensorEQValueOp(T v) : value(v) {} + __device__ __forceinline__ void operator()(TOut* out, T* in) { + *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::eq(*in, value)); } - const float value; + const T value; }; -void THCudaTensor_eqValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value) -{ - THAssert(THCudaTensor_checkGPU(state, 2, self_, src)); - THCudaTensor_logicalValue(state, self_, src, TensorEQValueOp(value)); -} - +template <typename T, typename TOut> struct TensorNEValueOp { - TensorNEValueOp(float v) : value(v) {} - __device__ __forceinline__ void operator()(float* out, float* in) { - *out = (*in != value); + TensorNEValueOp(T v) : value(v) {} + __device__ __forceinline__ void operator()(TOut* out, T* in) { + *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::ne(*in, value)); } - const float value; + const T value; }; -void THCudaTensor_neValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value) -{ - THAssert(THCudaTensor_checkGPU(state, 2, self_, src)); - THCudaTensor_logicalValue(state, self_, src, TensorNEValueOp(value)); +template<typename TensorType, typename TensorTypeOut, class Op> +void THC_logicalValue(THCState *state, + TensorTypeOut *self_, + TensorType *src, + Op op) { + THLongStorage* st = TensorUtils<TensorType>::newSizeOf(state, src); + 
TensorUtils<TensorTypeOut>::resize(state, self_, st, NULL); + THLongStorage_free(st); + + if (!THC_pointwiseApply2(state, self_, src, op)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); } + +#include "generic/THCTensorMathCompare.cu" +#include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorMathCompareT.cu b/lib/THC/THCTensorMathCompareT.cu index 0567a89..a5a130d 100644 --- a/lib/THC/THCTensorMathCompareT.cu +++ b/lib/THC/THCTensorMathCompareT.cu @@ -1,96 +1,72 @@ #include "THCTensorMath.h" #include "THCGeneral.h" -#include "THCBlas.h" #include "THCTensorCopy.h" #include "THCApply.cuh" +#include "THCNumerics.cuh" #include "THCReduce.cuh" -template<class Op> -void THCudaTensor_logicalTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2, Op op) -{ - THCudaTensor_resizeAs(state, self_, src1); - THArgCheck(THCudaTensor_nElement(state, src1) == THCudaTensor_nElement(state, src2), 3, "sizes do not match"); - - if (!THC_pointwiseApply3(state, self_, src1, src2, op)) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - - THCudaCheck(cudaGetLastError()); -} - +template <typename T, typename TOut> struct TensorLTOp { - __device__ __forceinline__ void operator()(float* out, float* a, float* b) { - *out = (float) (*a < *b); + __device__ inline void operator()(TOut* out, T* a, T* b) { + *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::lt(*a, *b)); } }; +template <typename T, typename TOut> struct TensorGTOp { - __device__ __forceinline__ void operator()(float* out, float* a, float* b) { - *out = (float) (*a > *b); + __device__ inline void operator()(TOut* out, T* a, T* b) { + *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::gt(*a, *b)); } }; +template <typename T, typename TOut> struct TensorLEOp { - __device__ __forceinline__ void operator()(float* out, float* a, float* b) { - *out = (float) (*a <= *b); + __device__ inline void operator()(TOut* out, T* a, T* b) { + *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::le(*a, *b)); } }; +template <typename T, typename TOut> struct TensorGEOp { - __device__ __forceinline__ void operator()(float* out, float* a, float* b) { - *out = (float) (*a >= *b); + __device__ inline void operator()(TOut* out, T* a, T* b) { + *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::ge(*a, *b)); } }; +template <typename T, typename TOut> struct TensorEQOp { - __device__ __forceinline__ void operator()(float* out, float* a, float* b) { - *out = (float) (*a == *b); + __device__ inline void operator()(TOut* out, T* a, T* b) { + *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::eq(*a, *b)); } }; +template <typename T, typename TOut> struct TensorNEOp { - __device__ __forceinline__ void operator()(float* out, float* a, float* b) { - *out = (float) (*a != *b); + __device__ inline void operator()(TOut* out, T* a, T* b) { + *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::ne(*a, *b)); } }; -void THCudaTensor_ltTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2) -{ - THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2)); - THCudaTensor_logicalTensor(state, self_, src1, src2, TensorLTOp()); -} - - -void THCudaTensor_gtTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2) -{ - THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2)); - THCudaTensor_logicalTensor(state, self_, src1, src2, TensorGTOp()); -} +template<typename TensorType, typename TensorTypeOut, typename Op> +void THC_logicalTensor(THCState *state, + 
TensorTypeOut *self_, + TensorType *src1, + TensorType *src2, + Op op) { + THLongStorage* st = TensorUtils<TensorType>::newSizeOf(state, src1); + TensorUtils<TensorTypeOut>::resize(state, self_, st, NULL); + THLongStorage_free(st); + THArgCheck(TensorUtils<TensorType>::getNumElements(state, src1) == + TensorUtils<TensorType>::getNumElements(state, src2), 3, + "sizes do not match"); -void THCudaTensor_leTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2) -{ - THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2)); - THCudaTensor_logicalTensor(state, self_, src1, src2, TensorLEOp()); -} - - -void THCudaTensor_geTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2) -{ - THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2)); - THCudaTensor_logicalTensor(state, self_, src1, src2, TensorGEOp()); -} - + if (!THC_pointwiseApply3(state, self_, src1, src2, op)) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } -void THCudaTensor_eqTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2) -{ - THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2)); - THCudaTensor_logicalTensor(state, self_, src1, src2, TensorEQOp()); + THCudaCheck(cudaGetLastError()); } - -void THCudaTensor_neTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2) -{ - THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2)); - THCudaTensor_logicalTensor(state, self_, src1, src2, TensorNEOp()); -} +#include "generic/THCTensorMathCompareT.cu" +#include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorMathMagma.cu b/lib/THC/THCTensorMathMagma.cu index b4fbc03..d599722 100644 --- a/lib/THC/THCTensorMathMagma.cu +++ b/lib/THC/THCTensorMathMagma.cu @@ -409,7 +409,7 @@ void THCudaTensor_getri(THCState *state, THCudaTensor *ra_, THCudaTensor *a) THCudaCheck(THCudaMalloc(state, (void**)&ipiv_gpu, n * sizeof(int))); // Run LU - THCudaBlas_getrf(state, n, d_matrices1, n, ipiv_gpu, info_gpu, 1); + THCudaBlas_Sgetrf(state, n, d_matrices1, n, ipiv_gpu, info_gpu, 1); THCudaCheck(cudaMemcpy(&info, info_gpu, sizeof(int), cudaMemcpyDeviceToHost)); @@ -419,7 +419,7 @@ void THCudaTensor_getri(THCState *state, THCudaTensor *ra_, THCudaTensor *a) THError("CUBLAS getrf : Argument %d : illegal value", -info); // Inverse - THCudaBlas_getri(state, n, d_matrices1_const, n, ipiv_gpu, d_matrices2, n, info_gpu, 1); + THCudaBlas_Sgetri(state, n, d_matrices1_const, n, ipiv_gpu, d_matrices2, n, info_gpu, 1); if (info > 0) THError("CUBLAS getri : U(%d,%d) is 0, U is singular", info, info); else if (info < 0) diff --git a/lib/THC/THCTensorMathPairwise.cu b/lib/THC/THCTensorMathPairwise.cu index 2c081d1..bbcd76f 100644 --- a/lib/THC/THCTensorMathPairwise.cu +++ b/lib/THC/THCTensorMathPairwise.cu @@ -1,10 +1,9 @@ #include "THCTensorMath.h" #include "THCGeneral.h" -#include "THCBlas.h" #include "THCHalf.h" #include "THCTensorCopy.h" #include "THCApply.cuh" -#include "THCReduce.cuh" +#include "THCNumerics.cuh" template <typename T> struct TensorAddConstantOp { diff --git a/lib/THC/THCTensorMathPointwise.cu b/lib/THC/THCTensorMathPointwise.cu index 72b16e8..a8a1031 100644 --- a/lib/THC/THCTensorMathPointwise.cu +++ b/lib/THC/THCTensorMathPointwise.cu @@ -1,9 +1,9 @@ #include "THCTensorMath.h" #include "THCGeneral.h" -#include "THCBlas.h" #include "THCHalf.h" #include "THCTensorCopy.h" #include "THCApply.cuh" +#include "THCNumerics.cuh" #include "THCReduce.cuh" #define IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(NAME, CFUNC) \ diff --git 
a/lib/THC/THCTensorMathReduce.cu b/lib/THC/THCTensorMathReduce.cu new file mode 100644 index 0000000..27949ad --- /dev/null +++ b/lib/THC/THCTensorMathReduce.cu @@ -0,0 +1,358 @@ +#include "THCTensorMath.h" +#include "THCGeneral.h" +#include "THCNumerics.cuh" +#include "THCReduce.cuh" +#include "THCReduceAll.cuh" +#include <thrust/functional.h> + +// Reduction operators that support `half`, unlike Thrust +template <typename InT, typename AccT> +struct ReduceAdd { + inline __device__ AccT operator()(AccT a, InT b) const { + return a + (AccT) b; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct ReduceAdd<half, half> { + inline __device__ half operator()(half a, half b) const { +#ifdef CUDA_HALF_INSTRUCTIONS + return __hadd(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return __float2half(fa + fb); +#endif + } +}; + +template <> +struct ReduceAdd<half, float> { + inline __device__ float operator()(float a, half b) const { + return a + __half2float(b); + } +}; +#endif // CUDA_HALF_TENSOR + +template <typename InT, typename AccT> +struct ReduceMultiply { + inline __device__ AccT operator()(AccT a, InT b) const { + return a * (AccT) b; + } +}; + +#ifdef CUDA_HALF_TENSOR +template <> +struct ReduceMultiply<half, half> { + inline __device__ half operator()(half a, half b) const { +#ifdef CUDA_HALF_INSTRUCTIONS + return __hmul(a, b); +#else + float fa = __half2float(a); + float fb = __half2float(b); + return __float2half(fa * fb); +#endif + } +}; + +template <> +struct ReduceMultiply<half, float> { + inline __device__ float operator()(float a, half b) const { + return a * __half2float(b); + } +}; +#endif // CUDA_HALF_TENSOR + +template <typename T> +struct ReduceMin { + inline __device__ T operator()(T a, T b) const { + return THCNumerics<T>::lt(a, b) ? a : b; + } +}; + +template <typename T> +struct ReduceMax { + inline __device__ T operator()(T a, T b) const { + return THCNumerics<T>::gt(a, b) ? a : b; + } +}; + +struct LogicalAll { + inline __device__ unsigned char operator()(unsigned char x, + unsigned char y) const { + return (x && y); + } +}; + +struct LogicalAny { + inline __device__ unsigned char operator()(unsigned char x, + unsigned char y) const { + return (x || y); + } +}; + + +THC_API int +THCudaByteTensor_logicalall(THCState *state, THCudaByteTensor *self) { + THAssert(THCudaByteTensor_checkGPU(state, 1, self)); + unsigned char result; + if (!THC_reduceAll(state, self, + thrust::identity<unsigned char>(), + LogicalAll(), + LogicalAll(), + (unsigned char) 1, &result, 0)) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + + return (int) result; +} + +THC_API int +THCudaByteTensor_logicalany(THCState *state, THCudaByteTensor *self) { + THAssert(THCudaByteTensor_checkGPU(state, 1, self)); + unsigned char result; + if (!THC_reduceAll(state, self, + thrust::identity<unsigned char>(), + LogicalAny(), + LogicalAny(), + (unsigned char) 0, &result, 0)) { + THArgCheck(false, 1, CUTORCH_DIM_WARNING); + } + + return (int) result; +} + + +#include <thrust/functional.h> + +/* A set of reduction kernels that take in binary ops on thrust pairs (of value, index). + These are useful when you not only have to do a reduction, but you might have + to preserve the location of contention (for example min/max operations). + The structure of the kernels follows the structure of the reduction kernels. 
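+
+   For example (an illustrative host-side sketch, not code from this commit),
+   a max+argmax over one row reduces to folding MaxValuePair, defined at the
+   bottom of this file, across (value, 1-based index) pairs:
+
+     thrust::pair<float, long> acc(-FLT_MAX, 1);
+     for (long col = 0; col < row_size; ++col)
+       acc = MaxValuePair<float, long>()(thrust::make_pair(row[col], col + 1), acc);
+
+   leaving acc.first holding the row maximum and acc.second its Lua-style index.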
+*/ +template <typename K, typename Index, class BinaryFunction> +__global__ void +kernelTransformReduceOuterDimIndex(K *tgt1, + Index *tgt2, + K *src_, + unsigned num_orows, + unsigned num_irows, + unsigned row_size, + thrust::pair<K, Index> init, + BinaryFunction binary_op) { + for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { + for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x; + irow < num_irows; + irow += gridDim.y * blockDim.x) { + K *src = src_ + orow * row_size * num_irows + irow; + thrust::pair<K, Index> acc = init; + + for (unsigned col = 0; col < row_size; ++col) { + // +1 for Lua index + acc = binary_op(thrust::make_pair<K, Index>(*src, col+1), + acc); + src += num_irows; + } + + tgt1[orow * num_irows + irow] = acc.first; + tgt2[orow * num_irows + irow] = acc.second; + } + } +} + +template <typename TensorTypeK, + typename TensorTypeIndex, + typename BinaryFunction> +__host__ void +THC_transformReduceOuterDimIndex(THCState *state, + TensorTypeK *tgt1, + TensorTypeIndex *tgt2, + TensorTypeK *src, + long rdim, + const thrust::pair< + typename TensorUtils<TensorTypeK>::DataType, + typename TensorUtils<TensorTypeIndex>::DataType>& init, + BinaryFunction binary_op) { + unsigned ndim = TensorUtils<TensorTypeK>::getDims(state, src); + unsigned num_orows = 1; + for (unsigned dim = 0; dim < rdim; dim++) { + num_orows *= TensorUtils<TensorTypeK>::getSize(state, src, dim); + } + unsigned row_size = TensorUtils<TensorTypeK>::getSize(state, src, rdim); + unsigned num_irows = 1; + for (unsigned dim = rdim + 1; dim < ndim; dim++) { + num_irows *= TensorUtils<TensorTypeK>::getSize(state, src, dim); + } + + dim3 threads(min(512, num_irows)); + unsigned maxGridDim = 1024; + dim3 grid(min(maxGridDim, num_orows), + min(maxGridDim, THCCeilDiv(num_irows, threads.x))); + + kernelTransformReduceOuterDimIndex + <<<grid, threads, 0, THCState_getCurrentStream(state)>>>( + TensorUtils<TensorTypeK>::getData(state, tgt1), + TensorUtils<TensorTypeIndex>::getData(state, tgt2), + TensorUtils<TensorTypeK>::getData(state, src), + num_orows, num_irows, row_size, init, binary_op); + + THCudaCheck(cudaGetLastError()); +} + +/* Reduce the innermost dimension of a tensor (on thrust::pair functors which are (value, index)) + * + * For an n-d tensor (n <= 4) where the reduction is along the innermost dimension: + * + * - block.x is the innermost dimension, i.e. dimension 0; + * - block.y and grid.y make up dimension 1; and + * - grid.x and grid.z are the remaining two outer dimensions (if any) + * + * Reduction along other dimensions is handled in a separate kernel. + */ +template <typename K, typename Index, class BinaryFunction> +__global__ void +kernelTransformReduceInnermostDimIndex(K *tgt1, + Index* tgt2, + K *src_, + unsigned num_rows, + unsigned row_size, + thrust::pair<K, Index> init, + BinaryFunction binary_op) { + __shared__ K sbuf[32][16 + 1]; // avoid bank conflict + __shared__ Index ibuf[32][16 + 1]; // avoid bank conflict + + for (unsigned block_row = blockIdx.x * blockDim.y; + block_row < num_rows; + block_row += blockDim.y * gridDim.x) { + unsigned row = block_row + threadIdx.y; + thrust::pair<K, Index> acc = init; + if (row < num_rows) { + K *src = src_ + row * row_size; + // Sequential reduction within a thread.
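+      // Each thread strides across the row in steps of blockDim.x (16 here),
+      // folding elements into a running (value, 1-based index) pair; the
+      // per-thread partials are then combined through the shared-memory
+      // buffers in the tree reduction below.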
+ for (unsigned col = threadIdx.x; col < row_size; col += blockDim.x) { + acc = binary_op(thrust::make_pair<K, Index>(src[col], col + 1), acc); + } + } + + sbuf[threadIdx.y][threadIdx.x] = acc.first; + ibuf[threadIdx.y][threadIdx.x] = acc.second; + + __syncthreads(); + + // Reduce intermediate values to single value. + K* sline = &sbuf[threadIdx.y][0]; + Index* iline = &ibuf[threadIdx.y][0]; + for (unsigned s = 8; s > 0; s >>= 1) { + if (row < num_rows && threadIdx.x < s) { + thrust::pair<K, Index> arg1 = + thrust::make_pair<K, Index>(sline[threadIdx.x], iline[threadIdx.x]); + thrust::pair<K, Index> arg2 = + thrust::make_pair<K, Index>(sline[threadIdx.x + s], iline[threadIdx.x + s]); + thrust::pair<K, Index> res = binary_op(arg1, arg2); + + sline[threadIdx.x] = res.first; + iline[threadIdx.x] = res.second; + } + __syncthreads(); + } + + if (row < num_rows && threadIdx.x == 0) { + tgt1[row] = sline[0]; + tgt2[row] = iline[0]; + } + __syncthreads(); + } +} + +template <typename TensorTypeK, + typename TensorTypeIndex, + typename BinaryFunction> +__host__ void +THC_transformReduceInnermostDimIndex(THCState *state, + TensorTypeK *tgt1, + TensorTypeIndex *tgt2, + TensorTypeK *src, + const thrust::pair< + typename TensorUtils<TensorTypeK>::DataType, + typename TensorUtils<TensorTypeIndex>::DataType>& init, + BinaryFunction binary_op) { + unsigned ndim = TensorUtils<TensorTypeK>::getDims(state, src); + unsigned num_rows = 1; + for (unsigned dim = 0; dim < ndim - 1; dim++) { + num_rows *= TensorUtils<TensorTypeK>::getSize(state, src, dim); + } + unsigned row_size = TensorUtils<TensorTypeK>::getSize(state, src, ndim - 1); + + dim3 threads(16, 32); + dim3 grid(min(1024, THCCeilDiv(num_rows, threads.y))); + + kernelTransformReduceInnermostDimIndex + <<<grid, threads, 0, THCState_getCurrentStream(state)>>>( + TensorUtils<TensorTypeK>::getData(state, tgt1), + TensorUtils<TensorTypeIndex>::getData(state, tgt2), + TensorUtils<TensorTypeK>::getData(state, src), + num_rows, row_size, init, binary_op); + + THCudaCheck(cudaGetLastError()); +} + +template <typename TensorTypeK, + typename TensorTypeIndex, + typename BinaryFunction> +void +THC_reduceDimIndex(THCState *state, + TensorTypeK *tgt1_, + TensorTypeIndex *tgt2_, + TensorTypeK *src, + long dimension, + const thrust::pair< + typename TensorUtils<TensorTypeK>::DataType, + typename TensorUtils<TensorTypeIndex>::DataType>& init, + BinaryFunction binary_op) +{ + THArgCheck(dimension >= 0 && + dimension < TensorUtils<TensorTypeK>::getDims(state, src), + 3, "dimension out of range"); + + THLongStorage *dim = TensorUtils<TensorTypeK>::newSizeOf(state, src); + THLongStorage_set(dim, dimension, 1); + TensorUtils<TensorTypeK>::resize(state, tgt1_, dim, NULL); + TensorUtils<TensorTypeIndex>::resize(state, tgt2_, dim, NULL); + THLongStorage_free(dim); + + TensorTypeK *tgt1 = TensorUtils<TensorTypeK>::newContiguous(state, tgt1_); + TensorTypeIndex *tgt2 = TensorUtils<TensorTypeIndex>::newContiguous(state, tgt2_); + src = TensorUtils<TensorTypeK>::newContiguous(state, src); + + if (dimension == TensorUtils<TensorTypeK>::getDims(state, src) - 1) { + THC_transformReduceInnermostDimIndex(state, tgt1, tgt2, src, init, binary_op); + } else { + THC_transformReduceOuterDimIndex(state, tgt1, tgt2, src, dimension, init, binary_op); + } + + TensorUtils<TensorTypeK>::free(state, src); + TensorUtils<TensorTypeK>::freeCopyTo(state, tgt1, tgt1_); + TensorUtils<TensorTypeIndex>::freeCopyTo(state, tgt2, tgt2_); +} + +template <typename T, typename Index> +struct MaxValuePair { + 
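+  // Keep whichever (value, index) pair carries the larger value, so the
+  // reduction yields an argmax alongside the max itself.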
__host__ __device__ + thrust::pair<T, Index> operator()(const thrust::pair<T, Index>& a, + const thrust::pair<T, Index>& b) { + return THCNumerics<T>::ge(a.first, b.first) ? a : b; + } +}; + +template <typename T, typename Index> +struct MinValuePair { + __host__ __device__ + thrust::pair<T, Index> operator()(const thrust::pair<T, Index>& a, + const thrust::pair<T, Index>& b) { + return THCNumerics<T>::le(a.first, b.first) ? a : b; + } +}; + +#include "generic/THCTensorMathReduce.cu" +#include "THCGenerateAllTypes.h" diff --git a/lib/THC/THCTensorMathTransformReduce.cu b/lib/THC/THCTensorMathTransformReduce.cu deleted file mode 100644 index 912b7b1..0000000 --- a/lib/THC/THCTensorMathTransformReduce.cu +++ /dev/null @@ -1,213 +0,0 @@ -#include "THCTensorMath.h" -#include "THCGeneral.h" -#include "THCBlas.h" -#include "THCTensorCopy.h" -#include "THCTensorRandom.h" -#include "THCApply.cuh" -#include "THCReduce.cuh" - -#include <thrust/functional.h> - -/* A set of reduction kernels that take in binary ops on thrust pairs (of value, index). - These are useful when you not only have to do a reduction, but you might have - to preserve the location of contention (for example min/max operations). - The structure of the kernels follows the structure of the reduction kernels. -*/ -template<class BinaryFunction> -__global__ void THCudaTensor_kernel_transformReduceOuterDimIndex(float *tgt1, float *tgt2, - float *src_, - unsigned num_orows, - unsigned num_irows, - unsigned row_size, - thrust::pair<float,float> init, - BinaryFunction binary_op) -{ - for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { - for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) { - float *src = src_ + orow * row_size * num_irows + irow; - thrust::pair<float,float> acc = init; - - for (unsigned col = 0; col < row_size; ++col) { - acc = binary_op(thrust::make_pair(*src, col+1), acc); // i+1 for 1-indexing - src += num_irows; - } - tgt1[orow * num_irows + irow] = acc.first; - tgt2[orow * num_irows + irow] = acc.second; - } - } -} - -template<class BinaryFunction> -__host__ void THCudaTensor_transformReduceOuterDimIndex(THCState *state, THCudaTensor *tgt1, THCudaTensor *tgt2, - THCudaTensor *src, - long rdim, thrust::pair<float,float> init, - BinaryFunction binary_op) -{ - unsigned ndim = THCudaTensor_nDimension(state, src); - unsigned num_orows = 1; - for (unsigned dim = 0; dim < rdim; dim++) { - num_orows *= THCudaTensor_size(state, src, dim); - } - unsigned row_size = THCudaTensor_size(state, src, rdim); - unsigned num_irows = 1; - for (unsigned dim = rdim + 1; dim < ndim; dim++) { - num_irows *= THCudaTensor_size(state, src, dim); - } - - dim3 threads(min(512, num_irows)); - unsigned maxGridDim = 1024; - dim3 grid(min(maxGridDim, num_orows), min(maxGridDim, THCCeilDiv(num_irows, threads.x))); - - THCudaTensor_kernel_transformReduceOuterDimIndex<<<grid, threads, 0, THCState_getCurrentStream(state)>>>( - THCudaTensor_data(state, tgt1), THCudaTensor_data(state, tgt2), - THCudaTensor_data(state, src), num_orows, num_irows, row_size, init, binary_op); - cudaError errcode = cudaGetLastError(); - if(errcode != cudaSuccess) { - THError(cudaGetErrorString(errcode)); - } -} - -/* Reduce the innermost dimension of a tensor (on thrust::pair functors which are (value, index)) - * - * For an n-d tensor (n <= 4) where the reduction is along the innermost dimension: - * - * - block.x is the innermost dimension, i.e. 
dimension 0; - * - block.y and grid.y make up dimension 1; and - * - grid.x and grid z are the remaining two outer dimensions (if any) - * - * Reduction along other dimensions is handled in a separate kernel. - */ -template<class BinaryFunction> -__global__ void THCudaTensor_kernel_transformReduceInnermostDimIndex( - float *tgt1, float* tgt2, float *src_, - unsigned num_rows, unsigned row_size, - thrust::pair<float,float> init, BinaryFunction binary_op) -{ - __shared__ float sbuf[32][16]; - __shared__ float ibuf[32][16]; - - for (unsigned block_row = blockIdx.x * blockDim.y; block_row < num_rows; block_row += blockDim.y * gridDim.x) { - unsigned row = block_row + threadIdx.y; - thrust::pair<float,float> acc = init; - if (row < num_rows) { - float *src = src_ + row * row_size; - // Sequential reduction within a thread. - for (unsigned col = threadIdx.x; col < row_size; col += blockDim.x) { - acc = binary_op(thrust::make_pair(src[col], col+1), acc); - } - } - - sbuf[threadIdx.y][threadIdx.x] = acc.first; - ibuf[threadIdx.y][threadIdx.x] = acc.second; - - // Reduce intermediate values to single value. - float* sline = &sbuf[threadIdx.y][0]; - float* iline = &ibuf[threadIdx.y][0]; - for (unsigned s = 8; s > 0; s >>= 1) { - if (row < num_rows && threadIdx.x < s) { - thrust::pair<float,float> arg1 = thrust::make_pair<float,float>(sline[threadIdx.x], iline[threadIdx.x]); - thrust::pair<float,float> arg2 = thrust::make_pair<float,float>(sline[threadIdx.x + s], iline[threadIdx.x + s]); - thrust::pair<float,float> res = binary_op(arg1, arg2); - sline[threadIdx.x] = res.first; - iline[threadIdx.x] = res.second; - } - __syncthreads(); - } - - if (row < num_rows && threadIdx.x == 0) { - tgt1[row] = sline[0]; - tgt2[row] = iline[0]; - } - __syncthreads(); - } -} - -template<class BinaryFunction> -__host__ void THCudaTensor_transformReduceInnermostDimIndex( - THCState *state, THCudaTensor *tgt1, THCudaTensor *tgt2, THCudaTensor *src, - thrust::pair<float,float> init, BinaryFunction binary_op) -{ - unsigned ndim = THCudaTensor_nDimension(state, src); - unsigned num_rows = 1; - for (unsigned dim = 0; dim < ndim - 1; dim++) { - num_rows *= THCudaTensor_size(state, src, dim); - } - unsigned row_size = THCudaTensor_size(state, src, ndim - 1); - - dim3 threads(16, 32); - dim3 grid(min(1024, THCCeilDiv(num_rows, threads.y))); - - THCudaTensor_kernel_transformReduceInnermostDimIndex<<<grid, threads, 0, THCState_getCurrentStream(state)>>>( - THCudaTensor_data(state, tgt1), THCudaTensor_data(state, tgt2), - THCudaTensor_data(state, src), num_rows, row_size, init, binary_op); - cudaError errcode = cudaGetLastError(); - if(errcode != cudaSuccess) { - THError(cudaGetErrorString(errcode)); - } -} - -template<class BinaryFunction> -void THCudaTensor_reduceDimIndex(THCState *state, THCudaTensor *tgt1_, THCudaTensor *tgt2_, THCudaTensor *src, - long dimension, thrust::pair<float,float> init, - BinaryFunction binary_op) -{ - THArgCheck(dimension >= 0 && dimension < THCudaTensor_nDimension(state, src), 3, "dimension out of range"); - - THLongStorage *dim = THCudaTensor_newSizeOf(state, src); - THLongStorage_set(dim, dimension, 1); - THCudaTensor_resize(state, tgt1_, dim, NULL); - THCudaTensor_resize(state, tgt2_, dim, NULL); - THLongStorage_free(dim); - - THCudaTensor *tgt1 = THCudaTensor_newContiguous(state, tgt1_); - THCudaTensor *tgt2 = THCudaTensor_newContiguous(state, tgt2_); - src = THCudaTensor_newContiguous(state, src); - - if(dimension == THCudaTensor_nDimension(state, src)-1) { - 
THCudaTensor_transformReduceInnermostDimIndex(state, tgt1, tgt2, src, init, binary_op); - } else { - THCudaTensor_transformReduceOuterDimIndex(state, tgt1, tgt2, src, dimension, init, binary_op); - } - - THCudaTensor_free(state, src); - THCudaTensor_freeCopyTo(state, tgt1, tgt1_); - THCudaTensor_freeCopyTo(state, tgt2, tgt2_); -} - -struct maxvalue_functor -{ - __host__ __device__ thrust::pair<float,float> operator()(const thrust::pair<float,float> &a, - const thrust::pair<float,float> &b) - { - if (a.first > b.first) return a; - else return b; - } -}; - -void THCudaTensor_max(THCState *state, THCudaTensor *values, THCudaTensor *indices, THCudaTensor *src, long dimension) -{ - THAssert(THCudaTensor_checkGPU(state, 3, values, indices, src)); - const float minfloat32 = -3.402823466e+38f; - thrust::pair<float,float> init = thrust::make_pair<float,float>(minfloat32, 1); - return THCudaTensor_reduceDimIndex(state, values, indices, src, dimension, init, - maxvalue_functor()); -} - -struct minvalue_functor -{ - __host__ __device__ thrust::pair<float,float> operator()(const thrust::pair<float,float> &a, - const thrust::pair<float,float> &b) - { - if (a.first < b.first) return a; - else return b; - } -}; - -void THCudaTensor_min(THCState *state, THCudaTensor *values, THCudaTensor *indices, THCudaTensor *src, long dimension) -{ - THAssert(THCudaTensor_checkGPU(state, 3, values, indices, src)); - const float maxfloat32 = 3.402823466e+38f; - thrust::pair<float,float> init = thrust::make_pair<float,float>(maxfloat32, 1); - return THCudaTensor_reduceDimIndex(state, values, indices, src, dimension, init, - minvalue_functor()); -} diff --git a/lib/THC/THCTensorTypeUtils.cu b/lib/THC/THCTensorTypeUtils.cu index fc9bd60..96cd3bc 100644 --- a/lib/THC/THCTensorTypeUtils.cu +++ b/lib/THC/THCTensorTypeUtils.cu @@ -33,6 +33,12 @@ TensorUtils<TENSOR_TYPE>::newContiguous(THCState* state, \ return TENSOR_TYPE##_newContiguous(state, t); \ } \ \ +THLongStorage* \ +TensorUtils<TENSOR_TYPE>::newSizeOf(THCState* state, \ + TENSOR_TYPE* t) { \ + return TENSOR_TYPE##_newSizeOf(state, t); \ +} \ + \ void \ TensorUtils<TENSOR_TYPE>::retain(THCState* state, \ TENSOR_TYPE* t) { \ @@ -53,6 +59,14 @@ TensorUtils<TENSOR_TYPE>::freeCopyTo(THCState* state, \ } \ \ void \ +TensorUtils<TENSOR_TYPE>::resize(THCState* state, \ + TENSOR_TYPE* out, \ + THLongStorage* sizes, \ + THLongStorage* strides) { \ + TENSOR_TYPE##_resize(state, out, sizes, strides); \ +} \ + \ +void \ TensorUtils<TENSOR_TYPE>::resizeAs(THCState* state, \ TENSOR_TYPE* dst, \ TENSOR_TYPE* src) { \ diff --git a/lib/THC/THCTensorTypeUtils.cuh b/lib/THC/THCTensorTypeUtils.cuh index 4456f47..22a2f92 100644 --- a/lib/THC/THCTensorTypeUtils.cuh +++ b/lib/THC/THCTensorTypeUtils.cuh @@ -31,17 +31,22 @@ template <typename TensorType> struct TensorUtils { }; -#define TENSOR_UTILS(TENSOR_TYPE, DATA_TYPE) \ +#define TENSOR_UTILS(TENSOR_TYPE, DATA_TYPE, ACC_DATA_TYPE) \ template <> \ struct TensorUtils<TENSOR_TYPE> { \ typedef DATA_TYPE DataType; \ + typedef ACC_DATA_TYPE AccDataType; \ \ static TENSOR_TYPE* newTensor(THCState* state); \ static TENSOR_TYPE* newContiguous(THCState* state, TENSOR_TYPE* t); \ + static THLongStorage* newSizeOf(THCState* state, TENSOR_TYPE* t); \ static void retain(THCState* state, TENSOR_TYPE* t); \ static void free(THCState* state, TENSOR_TYPE* t); \ static void freeCopyTo(THCState* state, TENSOR_TYPE* src, \ TENSOR_TYPE* dst); \ + static void resize(THCState* state, TENSOR_TYPE* out, \ + THLongStorage* sizes, \ + THLongStorage* strides); \ static 
void resizeAs(THCState* state, TENSOR_TYPE* dst, \ TENSOR_TYPE* src); \ static DATA_TYPE* getData(THCState* state, TENSOR_TYPE* t); \ @@ -61,16 +66,16 @@ struct TensorUtils { static bool canUse32BitIndexMath(THCState* state, TENSOR_TYPE* t); \ } -TENSOR_UTILS(THCudaByteTensor, unsigned char); -TENSOR_UTILS(THCudaCharTensor, char); -TENSOR_UTILS(THCudaShortTensor, short); -TENSOR_UTILS(THCudaIntTensor, int); -TENSOR_UTILS(THCudaLongTensor, long); -TENSOR_UTILS(THCudaTensor, float); -TENSOR_UTILS(THCudaDoubleTensor, double); +TENSOR_UTILS(THCudaByteTensor, unsigned char, long); +TENSOR_UTILS(THCudaCharTensor, char, long); +TENSOR_UTILS(THCudaShortTensor, short, long); +TENSOR_UTILS(THCudaIntTensor, int, long); +TENSOR_UTILS(THCudaLongTensor, long, long); +TENSOR_UTILS(THCudaTensor, float, float); +TENSOR_UTILS(THCudaDoubleTensor, double, double); #ifdef CUDA_HALF_TENSOR -TENSOR_UTILS(THCudaHalfTensor, half); +TENSOR_UTILS(THCudaHalfTensor, half, float); #endif #undef TENSOR_UTILS @@ -91,15 +96,6 @@ getTensorInfo(THCState* state, TensorType* t) { TensorUtils<TensorType>::getData(state, t), dims, sz, st); } -/// `half` has some type conversion issues associated with it, since it -/// is a struct without a constructor/implicit conversion constructor. -/// We use this to convert scalar values to the given type that the -/// tensor expects. -template <typename In, typename Out> -struct ScalarConvert { - static __host__ __device__ Out to(const In v) { return (Out) v; } -}; - template <typename T> struct ScalarNegate { static __host__ __device__ T to(const T v) { return -v; } @@ -111,36 +107,6 @@ struct ScalarInv { }; #ifdef CUDA_HALF_TENSOR - -template <typename Out> -struct ScalarConvert<half, Out> { - static __host__ __device__ Out to(const half v) { -#ifdef __CUDA_ARCH__ - return (Out) __half2float(v); -#else - return (Out) THC_half2float(v); -#endif - } -}; - -template <typename In> -struct ScalarConvert<In, half> { - static __host__ __device__ half to(const In v) { -#ifdef __CUDA_ARCH__ - return __float2half((float) v); -#else - return THC_float2half((float) v); -#endif - } -}; - -template <> -struct ScalarConvert<half, half> { - static __host__ __device__ half to(const half v) { - return v; - } -}; - template <> struct ScalarNegate<half> { static __host__ __device__ half to(const half v) { diff --git a/lib/THC/generic/THCTensorMasked.cu b/lib/THC/generic/THCTensorMasked.cu new file mode 100644 index 0000000..e6a5704 --- /dev/null +++ b/lib/THC/generic/THCTensorMasked.cu @@ -0,0 +1,191 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMasked.cu" +#else + + +THC_API void +THCTensor_(maskedFill)(THCState* state, + THCTensor *tensor, THCudaByteTensor *mask, real value) +{ + THAssert(THCTensor_(checkGPU)(state, 2, tensor, mask)); + THArgCheck(THCTensor_(nElement)(state, tensor) == + THCudaByteTensor_nElement(state, mask), + 2, "sizes do not match"); + + if (!THC_pointwiseApply2(state, tensor, mask, + TensorMaskedFillOp<real, unsigned char>(value))) { + THArgCheck(false, 2, CUTORCH_DIM_WARNING); + } + + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(maskedFillByte)(THCState* state, + THCTensor *tensor, THByteTensor *mask, real value) +{ + THAssert(THCTensor_(checkGPU)(state, 1, tensor)); + THLongStorage* maskSizes = THByteTensor_newSizeOf(mask); + THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, maskSizes, NULL); + THLongStorage_free(maskSizes); + THCudaByteTensor_copyByte(state, maskCuda, mask); + THCTensor_(maskedFill)(state, tensor, 
maskCuda, value); + THCudaByteTensor_free(state, maskCuda); +} + +THC_API void +THCTensor_(maskedCopy)(THCState* state, + THCTensor *tensor, THCudaByteTensor *mask, THCTensor *src) +{ + THAssert(THCTensor_(checkGPU)(state, 3, tensor, src, mask)); + long maskSize = THCudaByteTensor_nElement(state, mask); + long tensorSize = THCTensor_(nElement)(state, tensor); + long srcSize = THCTensor_(nElement)(state, src); + + // `mask` and `tensor` must have the same number of elements + THArgCheck(maskSize == tensorSize, 2, + "mask and tensor must have the same number of elements"); + + // Determine our output size + long totalElements = THCudaByteTensor_sumall(state, mask); + + // The number of `1` elements present in the mask must be <= the + // number of elements available in `src` + if (totalElements > srcSize) { + THArgCheck(false, 2, "source nElements must be >= mask `1` elements"); + } + + // FIXME: there appears to be a bug in Thrust (CUDA 7.0) for mixed + // iterator prefix sums? Convert `mask` to the same datatype as what + // we're accumulating the prefix sum in (long) to get around it + THCudaLongTensor* maskLong = THCudaLongTensor_new(state); + THLongStorage* maskSizes = THCudaByteTensor_newSizeOf(state, mask); + THCudaLongTensor_resize(state, maskLong, maskSizes, NULL); + THCudaLongTensor_copyCudaByte(state, maskLong, mask); + + // Use a prefix sum to determine the output locations of the masked elements + THCudaLongTensor* maskPrefixSum = THCudaLongTensor_new(state); + THCudaLongTensor_resize(state, maskPrefixSum, maskSizes, NULL); + THLongStorage_free(maskSizes); + + thrust::device_ptr<long> + maskData(THCudaLongTensor_data(state, maskLong)); + thrust::device_ptr<long> + maskPrefixSumData(THCudaLongTensor_data(state, maskPrefixSum)); + + thrust::exclusive_scan( +#if CUDA_VERSION >= 7000 + thrust::cuda::par.on(THCState_getCurrentStream(state)), +#endif + maskData, + maskData + THCudaLongTensor_nElement(state, maskLong), + maskPrefixSumData); + + // We are getting elements from `src` based on an offset from + // `maskPrefixSum`, so `src` should be made contiguous too + THCTensor* contigSrc = THCTensor_(newContiguous)(state, src); + + // update `tensor` where `mask` == 1 but pull from `src` at + // maskPrefixSum + bool status = THC_pointwiseApply3( + state, tensor, mask, maskPrefixSum, + TensorMaskedCopyOp<real, unsigned char, long>( + THCTensor_(data)(state, contigSrc))); + + THCTensor_(free)(state, contigSrc); + THCudaLongTensor_free(state, maskLong); + THCudaLongTensor_free(state, maskPrefixSum); + + THArgCheck(status, 2, CUTORCH_DIM_WARNING); + THCudaCheck(cudaGetLastError()); +} + +THC_API void +THCTensor_(maskedCopyByte)(THCState* state, + THCTensor *tensor, THByteTensor *mask, THCTensor *src) { + THAssert(THCTensor_(checkGPU)(state, 2, tensor, src)); + THLongStorage* maskSizes = THByteTensor_newSizeOf(mask); + THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, maskSizes, NULL); + THLongStorage_free(maskSizes); + THCudaByteTensor_copyByte(state, maskCuda, mask); + THCTensor_(maskedCopy)(state, tensor, maskCuda, src); + THCudaByteTensor_free(state, maskCuda); +} + +THC_API void +THCTensor_(maskedSelect)(THCState* state, + THCTensor* tensor, THCTensor* src, THCudaByteTensor* mask) { + THAssert(THCTensor_(checkGPU)(state, 3, tensor, src, mask)); + THArgCheck(THCudaByteTensor_nElement(state, mask) == + THCTensor_(nElement)(state, src), + 2, "sizes do not match"); + + // Determine our output size + long totalElements = THCudaByteTensor_sumall(state, mask); + THCTensor*
tensorContig = THCTensor_(newContiguous)(state, tensor); + + THCTensor_(resize1d)(state, tensorContig, totalElements); + if (tensor != tensorContig) { + THCTensor_(resize1d)(state, tensor, totalElements); + } + + // FIXME: there appears to be a bug in Thrust (CUDA 7.0) for mixed + // iterator prefix sums? Convert `mask` to the same datatype as what + // we're accumulating the prefix sum in (long) to get around it + THCudaLongTensor* maskLong = THCudaLongTensor_new(state); + THLongStorage* maskSizes = THCudaByteTensor_newSizeOf(state, mask); + THCudaLongTensor_resize(state, maskLong, maskSizes, NULL); + THCudaLongTensor_copyCudaByte(state, maskLong, mask); + + // Use a prefix sum to determine the output locations of the masked elements + THCudaLongTensor* maskPrefixSum = THCudaLongTensor_new(state); + THCudaLongTensor_resize(state, maskPrefixSum, maskSizes, NULL); + THLongStorage_free(maskSizes); + + thrust::device_ptr<long> + maskData(THCudaLongTensor_data(state, maskLong)); + thrust::device_ptr<long> + maskPrefixSumData(THCudaLongTensor_data(state, maskPrefixSum)); + + thrust::exclusive_scan( +#if CUDA_VERSION >= 7000 + thrust::cuda::par.on(THCState_getCurrentStream(state)), +#endif + maskData, + maskData + THCudaLongTensor_nElement(state, maskLong), + maskPrefixSumData); + + // Then copy over the masked elements at their desired output index + bool status = THC_pointwiseApply3( + state, mask, maskPrefixSum, + src, TensorMaskedSelectOp<real, unsigned char, long>( + THCTensor_(data)(state, tensor))); + + THCudaLongTensor_free(state, maskLong); + THCudaLongTensor_free(state, maskPrefixSum); + + if (tensor != tensorContig) { + THCTensor_(freeCopyTo)(state, tensorContig, tensor); + } else { + THCTensor_(free)(state, tensorContig); + } + + THArgCheck(status, 2, CUTORCH_DIM_WARNING); + THCudaCheck(cudaGetLastError()); +} + +// FIXME: remove now that we have THCudaByteTensor? +THC_API void +THCTensor_(maskedSelectByte)(THCState* state, + THCTensor *tensor, THCTensor *src, THByteTensor *mask) +{ + THAssert(THCTensor_(checkGPU)(state, 2, tensor, src)); + THLongStorage* maskSizes = THByteTensor_newSizeOf(mask); + THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, maskSizes, NULL); + THLongStorage_free(maskSizes); + THCudaByteTensor_copyByte(state, maskCuda, mask); + THCTensor_(maskedSelect)(state, tensor, src, maskCuda); + THCudaByteTensor_free(state, maskCuda); +} + +#endif diff --git a/lib/THC/generic/THCTensorMasked.h b/lib/THC/generic/THCTensorMasked.h new file mode 100644 index 0000000..98f5aee --- /dev/null +++ b/lib/THC/generic/THCTensorMasked.h @@ -0,0 +1,38 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMasked.h" +#else + +THC_API void THCTensor_(maskedFill)(THCState *state, + THCTensor *tensor, + THCudaByteTensor *mask, + real value); + +// FIXME: remove now that we have THCudaByteTensor? +THC_API void THCTensor_(maskedFillByte)(THCState *state, + THCTensor *tensor, + THByteTensor *mask, + real value); + +THC_API void THCTensor_(maskedCopy)(THCState *state, + THCTensor *tensor, + THCudaByteTensor *mask, + THCTensor *src); + +// FIXME: remove now that we have THCudaByteTensor? +THC_API void THCTensor_(maskedCopyByte)(THCState *state, + THCTensor *tensor, + THByteTensor *mask, + THCTensor *src); + +THC_API void THCTensor_(maskedSelect)(THCState *state, + THCTensor *tensor, + THCTensor *src, + THCudaByteTensor *mask); + +// FIXME: remove now that we have THCudaByteTensor? 
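+// Like the other *Byte variants above, this takes a host THByteTensor mask,
+// copies it into a temporary THCudaByteTensor, and forwards to maskedSelect.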
+THC_API void THCTensor_(maskedSelectByte)(THCState *state, + THCTensor *tensor, + THCTensor *src, + THByteTensor *mask); + +#endif diff --git a/lib/THC/generic/THCTensorMathBlas.cu b/lib/THC/generic/THCTensorMathBlas.cu new file mode 100644 index 0000000..91e922c --- /dev/null +++ b/lib/THC/generic/THCTensorMathBlas.cu @@ -0,0 +1,563 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathBlas.cu" +#else + +THC_API real +THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + THAssert(THCTensor_(checkGPU)(state, 2, self, src)); + THArgCheck(THCTensor_(nElement)(state, self) == + THCTensor_(nElement)(state, src), 2, "sizes do not match"); + + self = THCTensor_(newContiguous)(state, self); + src = THCTensor_(newContiguous)(state, src); + +#ifdef THC_REAL_IS_FLOAT + real result = THCudaBlas_Sdot(state, + THCTensor_(nElement)(state, self), + THCTensor_(data)(state, self), 1, + THCTensor_(data)(state, src), 1); +#elif defined(THC_REAL_IS_DOUBLE) + real result = THCudaBlas_Ddot(state, + THCTensor_(nElement)(state, self), + THCTensor_(data)(state, self), 1, + THCTensor_(data)(state, src), 1); +#endif + + THCTensor_(free)(state, src); + THCTensor_(free)(state, self); + return result; + +#else + THError("unimplemented data type"); + return ScalarConvert<int, real>::to(0); +#endif +} + +THC_API void +THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real alpha, THCTensor *mat, THCTensor *vec) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + THAssert(THCTensor_(checkGPU)(state, 4, r_, t, mat, vec)); + if( (mat->nDimension != 2) || (vec->nDimension != 1) ) + THError("matrix and vector expected"); + + if( mat->size[1] != vec->size[0] ) + THError("size mismatch"); + + if(t->nDimension != 1) + THError("size mismatch"); + + if(t->size[0] != mat->size[0]) + THError("size mismatch"); + + if(r_ != t) + { + THCTensor_(resizeAs)(state, r_, t); + THCTensor_(copy)(state, r_, t); + } + + if(mat->stride[0] == 1) + { +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv(state, 'n', mat->size[0], mat->size[1], + alpha, THCTensor_(data)(state, mat), mat->stride[1], + THCTensor_(data)(state, vec), vec->stride[0], + beta, THCTensor_(data)(state, r_), r_->stride[0]); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv(state, 'n', mat->size[0], mat->size[1], + alpha, THCTensor_(data)(state, mat), mat->stride[1], + THCTensor_(data)(state, vec), vec->stride[0], + beta, THCTensor_(data)(state, r_), r_->stride[0]); +#endif + } + else if(mat->stride[1] == 1) + { +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv(state, 't', mat->size[1], mat->size[0], + alpha, THCTensor_(data)(state, mat), mat->stride[0], + THCTensor_(data)(state, vec), vec->stride[0], + beta, THCTensor_(data)(state, r_), r_->stride[0]); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv(state, 't', mat->size[1], mat->size[0], + alpha, THCTensor_(data)(state, mat), mat->stride[0], + THCTensor_(data)(state, vec), vec->stride[0], + beta, THCTensor_(data)(state, r_), r_->stride[0]); +#endif + } + else + { + THCTensor *cmat = THCTensor_(newContiguous)(state, mat); + +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sgemv(state, 't', mat->size[1], mat->size[0], + alpha, THCTensor_(data)(state, cmat), cmat->stride[0], + THCTensor_(data)(state, vec), vec->stride[0], + beta, THCTensor_(data)(state, r_), r_->stride[0]); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemv(state, 't', mat->size[1], mat->size[0], + alpha, THCTensor_(data)(state, 
cmat), cmat->stride[0], + THCTensor_(data)(state, vec), vec->stride[0], + beta, THCTensor_(data)(state, r_), r_->stride[0]); +#endif + + THCTensor_(free)(state, cmat); + } + +#else + THError("unimplemented data type"); +#endif +} + +THC_API void +THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real alpha, THCTensor *vec1, THCTensor *vec2) +{ +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + THAssert(THCTensor_(checkGPU)(state, 4, r_, t, vec1, vec2)); + if ( (vec1->nDimension != 1) || (vec2->nDimension != 1) ) { + THError("vector and vector expected"); + } + + if (t->nDimension != 2) { + THError("size mismatch"); + } + + if ( (t->size[0] != vec1->size[0]) || (t->size[1] != vec2->size[0]) ) { + THError("size mismatch"); + } + + if (r_ != t) { + THCTensor_(resizeAs)(state, r_, t); + THCTensor_(copy)(state, r_, t); + } + + if(beta != 1) { + THCTensor_(mul)(state, r_, r_, beta); + } + + if(r_->stride[0] == 1) + { +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sger(state, vec1->size[0], vec2->size[0], + alpha, THCTensor_(data)(state, vec1), vec1->stride[0], + THCTensor_(data)(state, vec2), vec2->stride[0], + THCTensor_(data)(state, r_), r_->stride[1]); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dger(state, vec1->size[0], vec2->size[0], + alpha, THCTensor_(data)(state, vec1), vec1->stride[0], + THCTensor_(data)(state, vec2), vec2->stride[0], + THCTensor_(data)(state, r_), r_->stride[1]); +#endif + } + else if(r_->stride[1] == 1) + { +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sger(state, vec2->size[0], vec1->size[0], + alpha, THCTensor_(data)(state, vec2), vec2->stride[0], + THCTensor_(data)(state, vec1), vec1->stride[0], + THCTensor_(data)(state, r_), r_->stride[0]); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dger(state, vec2->size[0], vec1->size[0], + alpha, THCTensor_(data)(state, vec2), vec2->stride[0], + THCTensor_(data)(state, vec1), vec1->stride[0], + THCTensor_(data)(state, r_), r_->stride[0]); +#endif + } + else + { + THCTensor *cr = THCTensor_(newClone)(state, r_); + +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_Sger(state, vec2->size[0], vec1->size[0], + alpha, THCTensor_(data)(state, vec2), vec2->stride[0], + THCTensor_(data)(state, vec1), vec1->stride[0], + THCTensor_(data)(state, cr), cr->stride[0]); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dger(state, vec2->size[0], vec1->size[0], + alpha, THCTensor_(data)(state, vec2), vec2->stride[0], + THCTensor_(data)(state, vec1), vec1->stride[0], + THCTensor_(data)(state, cr), cr->stride[0]); +#endif + + THCTensor_(freeCopyTo)(state, cr, r_); + } +#else + THError("unimplemented data type"); +#endif +} + +THC_API void +THCTensor_(addmm)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real alpha, THCTensor *m1, THCTensor *m2) +{ +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + + THAssert(THCTensor_(checkGPU)(state, 4, r_, t, m1, m2)); + char transpose_r, transpose_m1, transpose_m2; + THCTensor *r__, *m1_, *m2_; + + if( (m1->nDimension != 2) || (m2->nDimension != 2) ) + THError("matrix and matrix expected"); + + if(t->nDimension != 2) + THError("size mismatch"); + + if( (t->size[0] != m1->size[0]) || (t->size[1] != m2->size[1]) || (m1->size[1] != m2->size[0]) ) + THError("size mismatch"); + + if(t != r_) + { + THCTensor_(resizeAs)(state, r_, t); + THCTensor_(copy)(state, r_, t); + } + + /* r_ */ + if(r_->stride[0] == 1 && + r_->stride[1] != 0) + { + transpose_r = 'n'; + r__ = r_; + } + else if(r_->stride[1] == 1 && + r_->stride[0] != 0) + { + THCTensor 
*swap = m2; + m2 = m1; + m1 = swap; + transpose_r = 't'; + r__ = r_; + } + else + { + transpose_r = 'n'; + + THCTensor *transp_r_ = THCTensor_(newTranspose)(state, r_, 0, 1); + r__ = THCTensor_(newClone)(state, transp_r_); + THCTensor_(free)(state, transp_r_); + THCTensor_(transpose)(state, r__, NULL, 0, 1); + } + + /* m1 */ + if(m1->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && + m1->stride[(transpose_r == 'n' ? 1 : 0)] != 0) + { + transpose_m1 = 'n'; + m1_ = m1; + } + else if(m1->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && + m1->stride[(transpose_r == 'n' ? 0 : 1)] != 0) + { + transpose_m1 = 't'; + m1_ = m1; + } + else + { + transpose_m1 = (transpose_r == 'n' ? 't' : 'n'); + m1_ = THCTensor_(newContiguous)(state, m1); + } + + /* m2 */ + if(m2->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && + m2->stride[(transpose_r == 'n' ? 1 : 0)] != 0) + { + transpose_m2 = 'n'; + m2_ = m2; + } + else if(m2->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && + m2->stride[(transpose_r == 'n' ? 0 : 1)] != 0) + { + transpose_m2 = 't'; + m2_ = m2; + } + else + { + transpose_m2 = (transpose_r == 'n' ? 't' : 'n'); + m2_ = THCTensor_(newContiguous)(state, m2); + } + +#ifdef THC_REAL_IS_HALF + THCudaBlas_Hgemm(state, + transpose_m1, + transpose_m2, + r__->size[(transpose_r == 'n' ? 0 : 1)], + r__->size[(transpose_r == 'n' ? 1 : 0)], + m1_->size[(transpose_r == 'n' ? 1 : 0)], + alpha, + THCTensor_(data)(state, m1_), + (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]), + THCTensor_(data)(state, m2_), + (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]), + beta, + THCTensor_(data)(state, r__), + r__->stride[(transpose_r == 'n' ? 1 : 0)]); +#elif defined(THC_REAL_IS_FLOAT) + THCudaBlas_Sgemm(state, + transpose_m1, + transpose_m2, + r__->size[(transpose_r == 'n' ? 0 : 1)], + r__->size[(transpose_r == 'n' ? 1 : 0)], + m1_->size[(transpose_r == 'n' ? 1 : 0)], + alpha, + THCTensor_(data)(state, m1_), + (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]), + THCTensor_(data)(state, m2_), + (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]), + beta, + THCTensor_(data)(state, r__), + r__->stride[(transpose_r == 'n' ? 1 : 0)]); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_Dgemm(state, + transpose_m1, + transpose_m2, + r__->size[(transpose_r == 'n' ? 0 : 1)], + r__->size[(transpose_r == 'n' ? 1 : 0)], + m1_->size[(transpose_r == 'n' ? 1 : 0)], + alpha, + THCTensor_(data)(state, m1_), + (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]), + THCTensor_(data)(state, m2_), + (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]), + beta, + THCTensor_(data)(state, r__), + r__->stride[(transpose_r == 'n' ? 
1 : 0)]); +#endif + + /* free intermediate variables */ + if(m1_ != m1) { + THCTensor_(free)(state, m1_); + } + + if(m2_ != m2) { + THCTensor_(free)(state, m2_); + } + + if(r__ != r_) { + THCTensor_(freeCopyTo)(state, r__, r_); + } +#else + THError("unimplemented data type"); +#endif +} + +THC_API void +THCTensor_(addbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, + real alpha, THCTensor *batch1, THCTensor *batch2) { +#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + THAssert(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2)); + THArgCheck(THCTensor_(nDimension)(state, t) == 2, 4, "expected 2D tensor"); + THArgCheck(THCTensor_(nDimension)(state, batch1) == 3, 6, "expected 3D tensor"); + THArgCheck(THCTensor_(nDimension)(state, batch2) == 3, 7, "expected 3D tensor"); + + long batchnum = THCTensor_(size)(state, batch1, 0); + long m1d1 = THCTensor_(size)(state, batch1, 1); + long innerdim = THCTensor_(size)(state, batch1, 2); + long m2d2 = THCTensor_(size)(state, batch2, 2); + + THArgCheck(batchnum == THCTensor_(size)(state, batch2, 0), 7, + "equal number of batches expected"); + // M is t, as listed in the docs under addbmm + THArgCheck(m1d1 == THCTensor_(size)(state, t, 0), 6, + "first dimension must match first dimension of M"); + THArgCheck(m2d2 == THCTensor_(size)(state, t, 1), 7, + "second dimension must match second dimension of M"); + THArgCheck(innerdim == THCTensor_(size)(state, batch2, 1), 6, + "second dimension must match first dimension of batch2"); + + if (t != result) { + THCTensor_(resizeAs)(state, result, t); + THCTensor_(copy)(state, result, t); + } + + THCTensor *slice1 = THCTensor_(new)(state); + THCTensor *slice2 = THCTensor_(new)(state); + for (long i=0; i<batchnum; i++) { + THCTensor_(select)(state, slice1, batch1, 0, i); + THCTensor_(select)(state, slice2, batch2, 0, i); + + THCTensor_(addmm)(state, result, beta, result, alpha, slice1, slice2); + beta = ScalarConvert<int, real>::to(1); + } + THCTensor_(free)(state, slice1); + THCTensor_(free)(state, slice2); +#else + THError("unimplemented data type"); +#endif +} + +THC_API void +THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, + real alpha, THCTensor *batch1, THCTensor *batch2) { +#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) + THAssert(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2)); + THArgCheck(THCTensor_(nDimension)(state, t) == 3, 4, "expected 3D tensor"); + THArgCheck(THCTensor_(nDimension)(state, batch1) == 3, 6, "expected 3D tensor"); + THArgCheck(THCTensor_(nDimension)(state, batch2) == 3, 7, "expected 3D tensor"); + THArgCheck(THCTensor_(size)(state, t, 0) == THCTensor_(size)(state, batch1, 0), 6, + "equal number of batches expected"); + THArgCheck(THCTensor_(size)(state, t, 0) == THCTensor_(size)(state, batch2, 0), 7, + "equal number of batches expected"); + THArgCheck(THCTensor_(size)(state, t, 1) == THCTensor_(size)(state, batch1, 1), 6, + "wrong matrix size"); + THArgCheck(THCTensor_(size)(state, t, 2) == THCTensor_(size)(state, batch2, 2), 7, + "wrong matrix size"); + THArgCheck(THCTensor_(size)(state, batch1, 2) == THCTensor_(size)(state, batch2, 1), 6, + "wrong matrix size"); + + if (t != result) { + THCTensor_(resizeAs)(state, result, t); + THCTensor_(copy)(state, result, t); + } + + bool transpose_result; + char transpose_batch1, transpose_batch2; + long lda, ldb, ldc; + THCTensor *result_, *batch1_, *batch2_; + if (result->stride[1] == 1) + { + transpose_result = false; + 
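+    // result->stride[1] == 1: each batch matrix is effectively column-major,
+    // so it can be handed to gemmBatched directly, with ldc taken from the
+    // remaining matrix stride.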
result_ = result; + ldc = result_->stride[2]; + } + else if (result->stride[2] == 1) + { + transpose_result = true; + + THCTensor *swap = batch2; + batch2 = batch1; + batch1 = swap; + + result_ = result; + ldc = result_->stride[1]; + } + else + { + transpose_result = false; + + THCTensor *transp_r_ = THCTensor_(newTranspose)(state, result, 1, 2); + result_ = THCTensor_(newClone)(state, transp_r_); + THCTensor_(free)(state, transp_r_); + THCTensor_(transpose)(state, result_, NULL, 1, 2); + + ldc = result_->stride[2]; + } + + if (batch1->stride[transpose_result ? 2 : 1] == 1) + { + transpose_batch1 = 'n'; + batch1_ = batch1; + lda = batch1_->stride[transpose_result ? 1 : 2]; + } + else if (batch1->stride[transpose_result ? 1 : 2] == 1) + { + transpose_batch1 = 't'; + batch1_ = batch1; + lda = batch1_->stride[transpose_result ? 2 : 1]; + } + else + { + transpose_batch1 = transpose_result ? 'n' : 't'; + batch1_ = THCTensor_(newContiguous)(state, batch1); + lda = batch1_->stride[1]; + } + + if (batch2->stride[transpose_result ? 2 : 1] == 1) + { + transpose_batch2 = 'n'; + batch2_ = batch2; + ldb = batch2_->stride[transpose_result ? 1 : 2]; + } + else if (batch2->stride[transpose_result ? 1 : 2] == 1) + { + transpose_batch2 = 't'; + batch2_ = batch2; + ldb = batch2_->stride[transpose_result ? 2 : 1]; + } + else + { + transpose_batch2 = transpose_result ? 'n' : 't'; + batch2_ = THCTensor_(newContiguous)(state, batch2); + ldb = batch2_->stride[1]; + } + + // Compute pointers to matrices in each batch. + long num_batches = result_->size[0]; + size_t matrices_size = num_batches * sizeof(real*); + const real **matrices1 = (const real **)THAlloc(matrices_size); + const real **matrices2 = (const real **)THAlloc(matrices_size); + real **result_matrices = (real **)THAlloc(matrices_size); + for (int i = 0; i < num_batches; ++i) + { + matrices1[i] = THCTensor_(data)(state, batch1_) + i * batch1_->stride[0]; + matrices2[i] = THCTensor_(data)(state, batch2_) + i * batch2_->stride[0]; + result_matrices[i] = THCTensor_(data)(state, result_) + i * result_->stride[0]; + } + + // Copy pointers to device. + const real **d_matrices1, **d_matrices2; + real **d_result_matrices; + THCudaCheck(THCudaMalloc(state, (void**)&d_matrices1, matrices_size)); + THCudaCheck(THCudaMalloc(state, (void**)&d_matrices2, matrices_size)); + THCudaCheck(THCudaMalloc(state, (void**)&d_result_matrices, matrices_size)); + + THCudaCheck(cudaMemcpyAsync(d_matrices1, matrices1, matrices_size, + cudaMemcpyHostToDevice, THCState_getCurrentStream(state))); + THCudaCheck(cudaMemcpyAsync(d_matrices2, matrices2, matrices_size, + cudaMemcpyHostToDevice, THCState_getCurrentStream(state))); + THCudaCheck(cudaMemcpyAsync(d_result_matrices, result_matrices, matrices_size, + cudaMemcpyHostToDevice, THCState_getCurrentStream(state))); + +#ifdef THC_REAL_IS_FLOAT + THCudaBlas_SgemmBatched( + state, + transpose_batch1, + transpose_batch2, + result_->size[transpose_result ? 2 : 1], + result_->size[transpose_result ? 1 : 2], + batch1_->size[transpose_result ? 1 : 2], + alpha, + d_matrices1, lda, + d_matrices2, ldb, + beta, + d_result_matrices, ldc, + num_batches); +#elif defined(THC_REAL_IS_DOUBLE) + THCudaBlas_DgemmBatched( + state, + transpose_batch1, + transpose_batch2, + result_->size[transpose_result ? 2 : 1], + result_->size[transpose_result ? 1 : 2], + batch1_->size[transpose_result ? 
1 : 2], + alpha, + d_matrices1, lda, + d_matrices2, ldb, + beta, + d_result_matrices, ldc, + num_batches); +#endif + + THCudaFree(state, d_matrices1); + THCudaFree(state, d_matrices2); + THCudaFree(state, d_result_matrices); + THFree(matrices1); + THFree(matrices2); + THFree(result_matrices); + + if (batch1_ != batch1) { + THCTensor_(free)(state, batch1_); + } + + if (batch2_ != batch2) { + THCTensor_(free)(state, batch2_); + } + + if (result_ != result) { + THCTensor_(freeCopyTo)(state, result_, result); + } + +#else + THError("unimplemented data type"); +#endif +} + +#endif diff --git a/lib/THC/generic/THCTensorMathBlas.h b/lib/THC/generic/THCTensorMathBlas.h new file mode 100644 index 0000000..68f95e3 --- /dev/null +++ b/lib/THC/generic/THCTensorMathBlas.h @@ -0,0 +1,13 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathBlas.h" +#else + +THC_API real THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src); +THC_API void THCTensor_(addmv)(THCState *state, THCTensor *self, real beta, THCTensor *t, real alpha, THCTensor *mat, THCTensor *vec); +THC_API void THCTensor_(addmm)(THCState *state, THCTensor *self, real beta, THCTensor *t, real alpha, THCTensor *mat1, THCTensor *mat2); +THC_API void THCTensor_(addr)(THCState *state, THCTensor *self, real beta, THCTensor *t, real alpha, THCTensor *vec1, THCTensor *vec2); +THC_API void THCTensor_(addbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, real alpha, THCTensor *batch1, THCTensor *batch2); +THC_API void THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, real alpha, THCTensor *batch1, THCTensor *batch2); + + +#endif diff --git a/lib/THC/generic/THCTensorMathCompare.cu b/lib/THC/generic/THCTensorMathCompare.cu new file mode 100644 index 0000000..77f1ab5 --- /dev/null +++ b/lib/THC/generic/THCTensorMathCompare.cu @@ -0,0 +1,101 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathCompare.cu" +#else + +THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value) +{ + THAssert(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorLTValueOp<typename TensorUtils<THCTensor>::DataType, + unsigned char>(value)); +} + +THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value) +{ + THAssert(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorGTValueOp<typename TensorUtils<THCTensor>::DataType, + unsigned char>(value)); +} + +THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value) +{ + THAssert(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorLEValueOp<typename TensorUtils<THCTensor>::DataType, + unsigned char>(value)); +} + +THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value) +{ + THAssert(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorGEValueOp<typename TensorUtils<THCTensor>::DataType, + unsigned char>(value)); +} + +THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value) +{ + THAssert(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorEQValueOp<typename TensorUtils<THCTensor>::DataType, + unsigned char>(value)); +} + +THC_API void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value) 
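+// Emits a byte (0/1) mask like the other comparison ops here; the *ValueT
+// variants below write the result into a tensor of the source's own type.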
+{ + THAssert(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorNEValueOp<typename TensorUtils<THCTensor>::DataType, + unsigned char>(value)); +} + +THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value) +{ + THAssert(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorLTValueOp<typename TensorUtils<THCTensor>::DataType, + typename TensorUtils<THCTensor>::DataType>(value)); +} + +THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value) +{ + THAssert(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorGTValueOp<typename TensorUtils<THCTensor>::DataType, + typename TensorUtils<THCTensor>::DataType>(value)); +} + +THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value) +{ + THAssert(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorLEValueOp<typename TensorUtils<THCTensor>::DataType, + typename TensorUtils<THCTensor>::DataType>(value)); +} + +THC_API void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value) +{ + THAssert(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorGEValueOp<typename TensorUtils<THCTensor>::DataType, + typename TensorUtils<THCTensor>::DataType>(value)); +} + +THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value) +{ + THAssert(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorEQValueOp<typename TensorUtils<THCTensor>::DataType, + typename TensorUtils<THCTensor>::DataType>(value)); +} + +THC_API void THCTensor_(neValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value) +{ + THAssert(THCTensor_(checkGPU)(state, 2, self_, src)); + THC_logicalValue(state, self_, src, + TensorNEValueOp<typename TensorUtils<THCTensor>::DataType, + typename TensorUtils<THCTensor>::DataType>(value)); +} + +#endif diff --git a/lib/THC/generic/THCTensorMathCompare.h b/lib/THC/generic/THCTensorMathCompare.h new file mode 100644 index 0000000..7b8837c --- /dev/null +++ b/lib/THC/generic/THCTensorMathCompare.h @@ -0,0 +1,20 @@ +#ifndef THC_GENERIC_FILE +#define THC_GENERIC_FILE "generic/THCTensorMathCompare.h" +#else + +THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value); + +THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value); +THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value); +THC_API void 
diff --git a/lib/THC/generic/THCTensorMathCompare.h b/lib/THC/generic/THCTensorMathCompare.h
new file mode 100644
index 0000000..7b8837c
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathCompare.h
@@ -0,0 +1,20 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathCompare.h"
+#else
+
+THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value);
+
+THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(neValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value);
+
+
+#endif
diff --git a/lib/THC/generic/THCTensorMathCompareT.cu b/lib/THC/generic/THCTensorMathCompareT.cu
new file mode 100644
index 0000000..4b59abf
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathCompareT.cu
@@ -0,0 +1,113 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathCompareT.cu"
+#else
+
+THC_API void
+THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorLTOp<typename TensorUtils<THCTensor>::DataType,
+                    unsigned char>());
+}
+
+THC_API void
+THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorGTOp<typename TensorUtils<THCTensor>::DataType,
+                    unsigned char>());
+}
+
+THC_API void
+THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorLEOp<typename TensorUtils<THCTensor>::DataType,
+                    unsigned char>());
+}
+
+THC_API void
+THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorGEOp<typename TensorUtils<THCTensor>::DataType,
+                    unsigned char>());
+}
+
+THC_API void
+THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorEQOp<typename TensorUtils<THCTensor>::DataType,
+                    unsigned char>());
+}
+
+THC_API void
+THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorNEOp<typename TensorUtils<THCTensor>::DataType,
+                    unsigned char>());
+}
+
+THC_API void
+THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorLTOp<typename TensorUtils<THCTensor>::DataType,
+                    typename TensorUtils<THCTensor>::DataType>());
+}
+
+THC_API void
+THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorGTOp<typename TensorUtils<THCTensor>::DataType,
+                    typename TensorUtils<THCTensor>::DataType>());
+}
+
+THC_API void
+THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorLEOp<typename TensorUtils<THCTensor>::DataType,
+                    typename TensorUtils<THCTensor>::DataType>());
+}
+
+THC_API void
+THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorGEOp<typename TensorUtils<THCTensor>::DataType,
+                    typename TensorUtils<THCTensor>::DataType>());
+}
+
+THC_API void
+THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorEQOp<typename TensorUtils<THCTensor>::DataType,
+                    typename TensorUtils<THCTensor>::DataType>());
+}
+
+THC_API void
+THCTensor_(neTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+  THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+  THC_logicalTensor(state, self_, src1, src2,
+                    TensorNEOp<typename TensorUtils<THCTensor>::DataType,
+                    typename TensorUtils<THCTensor>::DataType>());
+}
+
+#endif
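A Lua sketch (not part of the diff) of the tensor-tensor comparison kernels above, again yielding a byte mask; calling :sum() on the mask assumes the generic byte-tensor reductions added later in this patch:

require 'cutorch'

-- Elementwise comparison of two tensors of the same shape.
local a = torch.CudaTensor(4):uniform()
local b = torch.CudaTensor(4):uniform()
local m = a:gt(b)      -- CudaByteTensor mask
print(m:sum())         -- number of positions where a > b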
diff --git a/lib/THC/generic/THCTensorMathCompareT.h b/lib/THC/generic/THCTensorMathCompareT.h
new file mode 100644
index 0000000..0d76835
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathCompareT.h
@@ -0,0 +1,19 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathCompareT.h"
+#else
+
+THC_API void THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2);
+
+THC_API void THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(neTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2);
+
+#endif
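A Lua sketch (not part of the diff) of the T-suffixed variants, which keep the 0/1 result in the source tensor type instead of a byte mask; that res:lt(a, 0.5) dispatches to ltValueT is an assumption about the cwrap bindings:

require 'cutorch'

-- Assumed to hit ltValueT: result tensor has the same type as the source.
local a = torch.CudaTensor(4):uniform()
local res = torch.CudaTensor(4)
res:lt(a, 0.5)   -- 0/1 values stored as floats, usable in arithmetic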
diff --git a/lib/THC/generic/THCTensorMathReduce.cu b/lib/THC/generic/THCTensorMathReduce.cu
new file mode 100644
index 0000000..e17013c
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathReduce.cu
@@ -0,0 +1,135 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathReduce.cu"
+#else
+
+THC_API void
+THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, long dimension) {
+  THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+  if (!THC_reduceDim(state, self, src,
+                     thrust::identity<real>(),
+                     ReduceAdd<real, real>(),
+                     ScalarConvert<int, real>::to(0),
+                     dimension)) {
+    THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, long dimension) {
+  THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+  if (!THC_reduceDim(state, self, src,
+                     thrust::identity<real>(),
+                     ReduceMultiply<real, real>(),
+                     ScalarConvert<int, real>::to(1),
+                     dimension)) {
+    THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+  }
+
+  THCudaCheck(cudaGetLastError());
+}
+
+THC_API accreal
+THCTensor_(sumall)(THCState *state, THCTensor *self) {
+  THAssert(THCTensor_(checkGPU)(state, 1, self));
+  accreal val;
+  if (!THC_reduceAll(state, self,
+                     thrust::identity<real>(),
+                     ReduceAdd<real, accreal>(),
+                     ReduceAdd<accreal, accreal>(),
+                     ScalarConvert<int, accreal>::to(0),
+                     &val, 0)) {
+    THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+  }
+
+  THCudaCheck(cudaGetLastError());
+  return val;
+}
+
+THC_API accreal
+THCTensor_(prodall)(THCState *state, THCTensor *self) {
+  THAssert(THCTensor_(checkGPU)(state, 1, self));
+  accreal val;
+  if (!THC_reduceAll(state, self,
+                     thrust::identity<real>(),
+                     ReduceMultiply<real, accreal>(),
+                     ReduceMultiply<accreal, accreal>(),
+                     ScalarConvert<int, accreal>::to(1),
+                     &val, 0)) {
+    THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+  }
+
+  THCudaCheck(cudaGetLastError());
+  return val;
+}
+
+THC_API real
+THCTensor_(minall)(THCState *state, THCTensor *self) {
+  THAssert(THCTensor_(checkGPU)(state, 1, self));
+  real val;
+  if (!THC_reduceAll(state, self,
+                     thrust::identity<real>(),
+                     ReduceMin<real>(),
+                     ReduceMin<real>(),
+                     THCNumerics<real>::max(), &val, 0)) {
+    THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+  }
+
+  THCudaCheck(cudaGetLastError());
+  return val;
+}
+
+THC_API real
+THCTensor_(maxall)(THCState *state, THCTensor *self) {
+  THAssert(THCTensor_(checkGPU)(state, 1, self));
+  real val;
+  if (!THC_reduceAll(state, self,
+                     thrust::identity<real>(),
+                     ReduceMax<real>(),
+                     ReduceMax<real>(),
+                     THCNumerics<real>::min(), &val, 0)) {
+    THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+  }
+
+  THCudaCheck(cudaGetLastError());
+  return val;
+}
+
+THC_API void
+THCTensor_(max)(THCState *state,
+                THCTensor *values,
+                THCudaLongTensor *indices,
+                THCTensor *src,
+                long dimension) {
+  THAssert(THCTensor_(checkGPU)(state, 3, values, indices, src));
+
+  thrust::pair<typename TensorUtils<THCTensor>::DataType, long>
+    init =
+    thrust::make_pair<typename TensorUtils<THCTensor>::DataType, long>(
+      THCNumerics<typename TensorUtils<THCTensor>::DataType>::min(), 1);
+
+  return THC_reduceDimIndex(
+    state, values, indices, src, dimension, init,
+    MaxValuePair<typename TensorUtils<THCTensor>::DataType, long>());
+}
+
+THC_API void
+THCTensor_(min)(THCState *state,
+                THCTensor *values,
+                THCudaLongTensor *indices,
+                THCTensor *src,
+                long dimension) {
+  THAssert(THCTensor_(checkGPU)(state, 3, values, indices, src));
+
+  thrust::pair<typename TensorUtils<THCTensor>::DataType, long>
+    init =
+    thrust::make_pair<typename TensorUtils<THCTensor>::DataType, long>(
+      THCNumerics<typename TensorUtils<THCTensor>::DataType>::max(), 1);
+
+  return THC_reduceDimIndex(
+    state, values, indices, src, dimension, init,
+    MinValuePair<typename TensorUtils<THCTensor>::DataType, long>());
+}
+
+#endif
diff --git a/lib/THC/generic/THCTensorMathReduce.h b/lib/THC/generic/THCTensorMathReduce.h
new file mode 100644
index 0000000..f584d68
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathReduce.h
@@ -0,0 +1,23 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathReduce.h"
+#else
+
+THC_API void THCTensor_(sum)(THCState *state, THCTensor *self, THCTensor *src, long dim);
+THC_API void THCTensor_(prod)(THCState *state, THCTensor *self, THCTensor *src, long dim);
+
+THC_API accreal THCTensor_(sumall)(THCState *state, THCTensor *self);
+THC_API accreal THCTensor_(prodall)(THCState *state, THCTensor *self);
+
+THC_API void THCTensor_(min)(THCState *state,
+                             THCTensor *values,
+                             THCudaLongTensor *indices,
+                             THCTensor *src, long dim);
+THC_API void THCTensor_(max)(THCState *state,
+                             THCTensor *values,
+                             THCudaLongTensor *indices,
+                             THCTensor *src, long dim);
+
+THC_API real THCTensor_(minall)(THCState *state, THCTensor *self);
+THC_API real THCTensor_(maxall)(THCState *state, THCTensor *self);
+
+#endif
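A Lua sketch (not part of the diff) of the generic reductions above: full reductions accumulate in accreal rather than real, and dimension-wise min/max return indices as a CudaLongTensor per the signatures in this file:

require 'cutorch'

local t = torch.CudaTensor(3, 4):uniform()
print(t:sum())                     -- sumall: returned as a Lua number
local vals, idx = torch.max(t, 2)  -- per-row max plus argmax indices
print(torch.type(idx))             -- expected: torch.CudaLongTensor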
diff --git a/test/test.lua b/test/test.lua
index 143f92c..960944b 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -863,10 +863,10 @@ function test.allAndAny()
       local size1 = chooseInt(10, 100)
       local t = nil
       if torch.uniform(0, 1) > 0.5 then
-         t = torch.CudaTensor(size1):fill(1)
+         t = torch.CudaByteTensor(size1):fill(1)
       else
          local size2 = chooseInt(10, 100)
-         t = torch.CudaTensor(size1, size2):fill(1)
+         t = torch.CudaByteTensor(size1, size2):fill(1)
 
          if torch.uniform(0, 1) > 0.5 then
            t = t:transpose(1, 2)
@@ -1323,7 +1323,7 @@ function test.cross()
    local ndims = chooseInt(1, 10)
    local crossdim = chooseInt(1, ndims)
    sizes = {}
-   for i = 1, ndims do 
+   for i = 1, ndims do
      sizes[i] = chooseInt(1, math.min(20, math.sqrt(nelems)))
      nelems = nelems / sizes[i]
    end
@@ -2359,7 +2359,7 @@ function test.maskedSelect()
    local mask = torch.ByteTensor(n_row,n_col):bernoulli()
    local y = x:maskedSelect(mask)
    x=x:cuda()
-   mask=mask:cuda()
+   mask=mask:cudaByte()
    local y_cuda = x:maskedSelect(mask)
    tester:assertTensorEq(y, y_cuda:float(), 0.00001, "Error in maskedSelect")
    checkMultiDevice(x, 'maskedSelect', mask)
@@ -2369,7 +2369,7 @@ function test.maskedSelect()
    local mask = torch.ByteTensor(n_row,n_col):bernoulli()
    local y = x:t():maskedSelect(mask)
    x=x:cuda()
-   mask=mask:cuda()
+   mask=mask:cudaByte()
    local y_cuda = x:t():maskedSelect(mask)
    tester:assertTensorEq(y, y_cuda:float(), 0.00001,
                          "Error in maskedSelect non-contiguous")
@@ -2380,7 +2380,7 @@ function test.maskedSelect()
    local y = torch.FloatTensor()
    y:maskedSelect(x, mask)
    x=x:cuda()
-   mask=mask:cuda()
+   mask=mask:cudaByte()
    local y_cuda = torch.CudaTensor()
    y_cuda:maskedSelect(x, mask)
    tester:assertTensorEq(y, y_cuda:float(), 0.00001,
@@ -2392,7 +2392,7 @@ function test.maskedSelect()
    local y = torch.FloatTensor()
    y:maskedSelect(x:t(), mask)
    x=x:cuda()
-   mask=mask:cuda()
+   mask=mask:cudaByte()
    local y_cuda = torch.CudaTensor()
    y_cuda:maskedSelect(x:t(), mask)
    tester:assertTensorEq(y, y_cuda:float(), 0.00001,
@@ -2425,7 +2425,7 @@ function test.maskedCopy()
    local mask = torch.ByteTensor(n_row,n_col):bernoulli()
    y:maskedCopy(mask, x:clone())
    local y_cuda=x:cuda():fill(-1)
-   mask=mask:cuda()
+   mask=mask:cudaByte()
    x=x:cuda()
    y_cuda:maskedCopy(mask, x)
    tester:assertTensorEq(y, y_cuda:float(), 0.00001,
@@ -2439,7 +2439,7 @@ function test.maskedCopy()
    y:maskedCopy(mask, x:t())
    local y_cuda=x:cuda():fill(-1)
    x=x:cuda()
-   mask=mask:cuda()
+   mask=mask:cudaByte()
    y_cuda:maskedCopy(mask, x:t())
    tester:assertTensorEq(y, y_cuda:float(), 0.00001,
                          "Error in maskedCopy (non-contiguous source)")
@@ -2451,7 +2451,7 @@ function test.maskedCopy()
    y:t():maskedCopy(mask, x:t())
    local y_cuda=x:cuda():fill(-1)
    x=x:cuda()
-   mask=mask:cuda()
+   mask=mask:cudaByte()
    y_cuda:t():maskedCopy(mask, x:t())
    tester:assertTensorEq(y, y_cuda:float(), 0.00001,
                          "Error in maskedCopy (non-contiguous dest)")
@@ -2501,7 +2501,7 @@ function test.maskedFill()
    local mask = torch.ByteTensor(n_row,n_col):bernoulli()
    x:maskedFill(mask, 334)
    local x_cuda=gt:cuda()
-   mask=mask:cuda()
+   mask=mask:cudaByte()
    x_cuda:maskedFill(mask, 334)
    tester:assertTensorEq(x, x_cuda:float(), 0.00001, "Error in maskedFill")
    checkMultiDevice(x_cuda, 'maskedFill', mask, 334)
@@ -2511,7 +2511,7 @@ function test.maskedFill()
    mask = mask:byte()
    x:t():maskedFill(mask, 334)
    local x_cuda = gt:cuda()
-   mask=mask:cuda()
+   mask=mask:cudaByte()
    x_cuda:t():maskedFill(mask, 334)
    tester:assertTensorEq(x, x_cuda:float(), 0.00001,
                          "Error in maskedFill non-contiguous")
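As the test changes above show, the CUDA masked operations now take a CudaByteTensor mask (converted from a host ByteTensor with :cudaByte()) rather than a float CudaTensor. A condensed Lua sketch of the same pattern, not part of the diff:

require 'cutorch'

local x = torch.CudaTensor(10):uniform()
local mask = torch.ByteTensor(10):bernoulli():cudaByte()
x:maskedFill(mask, 0)               -- zero out the masked positions
local picked = x:maskedSelect(mask) -- gather the masked positions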
diff --git a/torch/generic/Tensor.c b/torch/generic/Tensor.c
index a4dbed9..97e80e9 100644
--- a/torch/generic/Tensor.c
+++ b/torch/generic/Tensor.c
@@ -531,7 +531,7 @@ static int torch_Tensor_(indexFill)(lua_State *L)
   return 1;
 }
 
-#endif
+#endif // THC_REAL_IS_FLOAT
 
 static int torch_Tensor_(transpose)(lua_State *L)
 {
@@ -655,10 +655,8 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
   THCState *state = cutorch_getstate(L);
   THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
   THLongStorage *idx = NULL;
-#ifdef THC_REAL_IS_FLOAT
   THByteTensor *mask;
-  THCudaTensor *maskCuda;
-#endif
+  THCudaByteTensor *maskCuda;
 
   if(lua_isnumber(L, 2))
   {
@@ -849,16 +847,18 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
       THCTensor_(free)(state, tensor);
       lua_pushboolean(L, 1);
     }
-    // FIXME: pending generic implementation of
-    // maskedFillByte/maskedCopyByte/maskedFill/maskedCopy
-#ifdef THC_REAL_IS_FLOAT
     else if((mask = luaT_toudata(L, 2, "torch.ByteTensor")))
     {
       THCTensor *vals;
      if (lua_isnumber(L, 3))
      {
-        THCTensor_(maskedFillByte)(state, tensor, mask,
-                                   (real)(luaL_checknumber(L,3)));
+#ifdef THC_REAL_IS_HALF
+        real value = THC_float2half((float) luaL_checknumber(L, 3));
+#else
+        real value = (real) luaL_checknumber(L, 3);
+#endif
+
+        THCTensor_(maskedFillByte)(state, tensor, mask, value);
      }
      else if((vals = luaT_toudata(L, 3, torch_Tensor)))
      {
@@ -869,13 +869,18 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
        luaL_error(L,"number or tensor expected");
      }
    }
-    else if((maskCuda = luaT_toudata(L, 2, "torch.CudaTensor")))
+    else if((maskCuda = luaT_toudata(L, 2, "torch.CudaByteTensor")))
    {
      THCTensor *vals;
      if (lua_isnumber(L, 3))
      {
-        THCTensor_(maskedFill)(state, tensor, maskCuda,
-                               (real)(luaL_checknumber(L,3)));
+#ifdef THC_REAL_IS_HALF
+        real value = THC_float2half((float) luaL_checknumber(L, 3));
+#else
+        real value = (real) luaL_checknumber(L, 3);
+#endif
+
+        THCTensor_(maskedFill)(state, tensor, maskCuda, value);
      }
      else if((vals = luaT_toudata(L, 3, torch_Tensor)))
      {
@@ -886,7 +891,6 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
        luaL_error(L,"number or tensor expected");
      }
    }
-#endif // THC_REAL_IS_FLOAT
    else
    {
      lua_pushboolean(L, 0);
@@ -900,10 +904,8 @@ static int torch_Tensor_(__index__)(lua_State *L)
   THCState *state = cutorch_getstate(L);
   THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
   THLongStorage *idx = NULL;
-#ifdef THC_REAL_IS_FLOAT
   THByteTensor *mask;
-  THCudaTensor *maskCuda;
-#endif
+  THCudaByteTensor *maskCuda;
 
   if(lua_isnumber(L, 2))
   {
@@ -1039,8 +1041,6 @@ static int torch_Tensor_(__index__)(lua_State *L)
    lua_pushboolean(L, 1);
    return 2;
  }
-  // FIXME: pending generic implementation of maskedSelectByte/maskedSelect
-#ifdef THC_REAL_IS_FLOAT
  else if((mask = luaT_toudata(L, 2, "torch.ByteTensor")))
  {
    THCTensor *vals = THCTensor_(new)(state);
@@ -1049,7 +1049,7 @@ static int torch_Tensor_(__index__)(lua_State *L)
    lua_pushboolean(L, 1);
    return 2;
  }
-  else if((maskCuda = luaT_toudata(L, 2, "torch.CudaTensor")))
+  else if((maskCuda = luaT_toudata(L, 2, "torch.CudaByteTensor")))
  {
    THCTensor *vals = THCTensor_(new)(state);
    THCTensor_(maskedSelect)(state, vals, tensor, maskCuda);
@@ -1057,7 +1057,6 @@ static int torch_Tensor_(__index__)(lua_State *L)
    lua_pushboolean(L, 1);
    return 2;
  }
-#endif // THC_REAL_IS_FLOAT
  else
  {
    lua_pushboolean(L, 0);