github.com/torch/cutorch.git
 TensorMath.lua | 288
 TensorOperator.c | 182
 generic/TensorOperator.c | 263
 init.c | 18
 lib/THC/CMakeLists.txt | 13
 lib/THC/THCBlas.cu | 263
 lib/THC/THCBlas.h | 37
 lib/THC/THCGenerateAllTypes.h | 8
 lib/THC/THCHalf.cu | 9
 lib/THC/THCHalf.h | 9
 lib/THC/THCNumerics.cuh | 239
 lib/THC/THCReduce.cuh | 145
 lib/THC/THCReduceAll.cuh | 179
 lib/THC/THCTensorCopy.cu | 1
 lib/THC/THCTensorMasked.cu | 204
 lib/THC/THCTensorMath.cu | 131
 lib/THC/THCTensorMath.h | 58
 lib/THC/THCTensorMath2.cu | 26
 lib/THC/THCTensorMathBlas.cu | 424
 lib/THC/THCTensorMathCompare.cu | 120
 lib/THC/THCTensorMathCompareT.cu | 98
 lib/THC/THCTensorMathMagma.cu | 4
 lib/THC/THCTensorMathPairwise.cu | 3
 lib/THC/THCTensorMathPointwise.cu | 2
 lib/THC/THCTensorMathReduce.cu | 358
 lib/THC/THCTensorMathTransformReduce.cu | 213
 lib/THC/THCTensorTypeUtils.cu | 14
 lib/THC/THCTensorTypeUtils.cuh | 62
 lib/THC/generic/THCTensorMasked.cu | 191
 lib/THC/generic/THCTensorMasked.h | 38
 lib/THC/generic/THCTensorMathBlas.cu | 563
 lib/THC/generic/THCTensorMathBlas.h | 13
 lib/THC/generic/THCTensorMathCompare.cu | 101
 lib/THC/generic/THCTensorMathCompare.h | 20
 lib/THC/generic/THCTensorMathCompareT.cu | 113
 lib/THC/generic/THCTensorMathCompareT.h | 19
 lib/THC/generic/THCTensorMathReduce.cu | 135
 lib/THC/generic/THCTensorMathReduce.h | 23
 test/test.lua | 24
 torch/generic/Tensor.c | 39
 40 files changed, 2981 insertions(+), 1669 deletions(-)
diff --git a/TensorMath.lua b/TensorMath.lua
index 17d7547..9cc8305 100644
--- a/TensorMath.lua
+++ b/TensorMath.lua
@@ -29,13 +29,13 @@ local unpack = unpack or table.unpack
-- specific to CUDA
local typenames = {'CudaByteTensor',
- 'CudaCharTensor',
- 'CudaShortTensor',
- 'CudaIntTensor',
- 'CudaLongTensor',
- 'CudaTensor',
- 'CudaDoubleTensor',
- 'CudaHalfTensor'
+ 'CudaCharTensor',
+ 'CudaShortTensor',
+ 'CudaIntTensor',
+ 'CudaLongTensor',
+ 'CudaTensor',
+ 'CudaDoubleTensor',
+ 'CudaHalfTensor'
}
for _, typename in ipairs(typenames) do
@@ -430,12 +430,21 @@ local handledTypereals = {'unsigned char',
'double',
'half'
}
+local handledTypeaccreals = {'long',
+ 'long',
+ 'long',
+ 'long',
+ 'long',
+ 'double',
+ 'float'
+}
for k, Tensor in pairs(handledTypenames) do
if Tensor == 'CudaHalfTensor' then
interface:print("#ifdef CUDA_HALF_TENSOR")
end
local real = handledTypereals[k]
+ local accreal = handledTypeaccreals[k]
function interface.luaname2wrapname(self, name)
return string.format('cutorch_%s_%s', Tensor, name)
@@ -515,6 +524,18 @@ for k, Tensor in pairs(handledTypenames) do
{name=real, default=1},
{name=Tensor}})
+ wrap("mul",
+ cname("mul"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=real}})
+
+ wrap("div",
+ cname("div"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'}},
+ {name=Tensor, method={default=1}},
+ {name=real}})
+
for _, name in ipairs({"cmul", "cpow", "cdiv"}) do
wrap(name,
cname(name),
@@ -523,6 +544,224 @@ for k, Tensor in pairs(handledTypenames) do
{name=Tensor}})
end
+ for _,name in ipairs({"min", "max"}) do
+ wrap(name,
+ cname(name .. "all"),
+ {{name=Tensor},
+ {name=real, creturned=true}},
+ cname(name),
+ {{name=Tensor, default=true, returned=true},
+ {name='CudaLongTensor', default=true, returned=true},
+ {name=Tensor},
+ {name="index"}})
+ end
+
+ if Tensor == 'CudaByteTensor' then
+ for _,name in pairs({'all', 'any'}) do
+ wrap(name,
+ cname('logical' .. name),
+ {{name=Tensor},
+ {name="boolean", creturned=true}})
+ end
+ end
+
+ for _,name in pairs({'lt','gt','le','ge','eq','ne'}) do
+ wrap(name,
+ cname(name .. 'Value'),
+ {{name='CudaByteTensor',default=true, returned=true},
+ {name=Tensor},
+ {name=real}},
+ cname(name .. 'ValueT'),
+ {{name=Tensor, returned=true},
+ {name=Tensor},
+ {name=real}},
+ cname(name .. 'Tensor'),
+ {{name='CudaByteTensor',default=true, returned=true},
+ {name=Tensor},
+ {name=Tensor}},
+ cname(name .. 'TensorT'),
+ {{name=Tensor, returned=true},
+ {name=Tensor},
+ {name=Tensor}})
+ end
+
+ wrap("sum",
+ cname("sumall"),
+ {{name=Tensor},
+ {name=accreal, creturned=true}},
+ cname("sum"),
+ {{name=Tensor, default=true, returned=true},
+ {name=Tensor},
+ {name="index"}})
+
+ wrap("prod",
+ cname("prodall"),
+ {{name=Tensor},
+ {name=accreal, creturned=true}},
+ cname("prod"),
+ {{name=Tensor, default=true, returned=true},
+ {name=Tensor},
+ {name="index"}})
+
+ wrap("maskedFill",
+ cname("maskedFill"),
+ {{name=Tensor, returned=true, method={default='nil'}},
+ {name='CudaByteTensor'},
+ {name=real}})
+
+ wrap("maskedCopy",
+ cname("maskedCopy"),
+ {{name=Tensor, returned=true, method={default='nil'}},
+ {name='CudaByteTensor'},
+ {name=Tensor}})
+
+ wrap("maskedSelect",
+ cname("maskedSelect"),
+ {{name=Tensor, returned=true, default=true},
+ {name=Tensor},
+ {name='CudaByteTensor'}})
+
+ -- BLAS functions
+ if real == 'float' or real == 'double' or real == 'half' then
+ wrap("mv",
+ cname("addmv"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'},
+ init=function(arg)
+ return table.concat(
+ {
+ arg.__metatable.init(arg),
+ string.format("TH%s_checkGPU(cutorch_getstate(L), 1, %s);",
+ Tensor, arg.args[5]:carg()),
+ string.format("TH%s_resize1d(cutorch_getstate(L), %s, %s->size[0]);", Tensor, arg:carg(), arg.args[5]:carg())
+ }, '\n')
+ end,
+ precall=function(arg)
+ return table.concat(
+ {
+ string.format("TH%s_zero(cutorch_getstate(L), %s);", Tensor, arg:carg()),
+ arg.__metatable.precall(arg)
+ }, '\n')
+ end
+ },
+ {name=real, default=1, invisible=true},
+ {name=Tensor, default=1, invisible=true},
+ {name=real, default=1, invisible=true},
+ {name=Tensor, dim=2},
+ {name=Tensor, dim=1}}
+ )
+
+ wrap("mm",
+ cname("addmm"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'},
+ init=function(arg)
+ return table.concat(
+ {
+ arg.__metatable.init(arg),
+ string.format("TH%s_checkGPU(cutorch_getstate(L), 2, %s, %s);",
+ Tensor, arg.args[5]:carg(), arg.args[6]:carg()),
+ string.format("TH%s_resize2d(cutorch_getstate(L), %s, %s->size[0], %s->size[1]);",
+ Tensor, arg:carg(), arg.args[5]:carg(), arg.args[6]:carg())
+ }, '\n')
+ end,
+ },
+ {name=real, default=0, invisible=true},
+ {name=Tensor, default=1, invisible=true},
+ {name=real, default=1, invisible=true},
+ {name=Tensor, dim=2},
+ {name=Tensor, dim=2}}
+ )
+
+ wrap("bmm",
+ cname("baddbmm"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'},
+ init=function(arg)
+ return table.concat(
+ {
+ arg.__metatable.init(arg),
+ string.format("TH%s_checkGPU(cutorch_getstate(L), 2, %s, %s);",
+ Tensor, arg.args[5]:carg(), arg.args[6]:carg()),
+ string.format("TH%s_resize3d(cutorch_getstate(L), %s, %s->size[0], %s->size[1], %s->size[2]);",
+ Tensor, arg:carg(), arg.args[5]:carg(), arg.args[5]:carg(), arg.args[6]:carg())
+ }, '\n')
+ end,
+ },
+ {name=real, default=0, invisible=true},
+ {name=Tensor, default=1, invisible=true},
+ {name=real, default=1, invisible=true},
+ {name=Tensor, dim=3},
+ {name=Tensor, dim=3}}
+ )
+
+ wrap("ger",
+ cname("addr"),
+ {{name=Tensor, default=true, returned=true, method={default='nil'},
+ init=function(arg)
+ return table.concat(
+ {
+ arg.__metatable.init(arg),
+ string.format("TH%s_checkGPU(cutorch_getstate(L), 2, %s, %s);",
+ Tensor, arg.args[5]:carg(), arg.args[6]:carg()),
+ string.format("TH%s_resize2d(cutorch_getstate(L), %s, %s->size[0], %s->size[0]);", Tensor, arg:carg(), arg.args[5]:carg(), arg.args[6]:carg())
+ }, '\n')
+ end,
+ precall=function(arg)
+ return table.concat(
+ {
+ string.format("TH%s_zero(cutorch_getstate(L), %s);", Tensor, arg:carg()),
+ arg.__metatable.precall(arg)
+ }, '\n')
+ end
+ },
+ {name=real, default=1, invisible=true},
+ {name=Tensor, default=1, invisible=true},
+ {name=real, default=1, invisible=true},
+ {name=Tensor, dim=1},
+ {name=Tensor, dim=1}}
+ )
+
+ for _,f in ipairs({
+ {name="addmv", dim1=1, dim2=2, dim3=1},
+ {name="addmm", dim1=2, dim2=2, dim3=2},
+ {name="addr", dim1=2, dim2=1, dim3=1},
+ {name="baddbmm", dim1=3, dim2=3, dim3=3},
+ {name="addbmm", dim1=2, dim2=3, dim3=3},
+ }
+ ) do
+
+ interface:wrap(f.name,
+ cname(f.name),
+ {{name=Tensor, default=true, returned=true},
+ {name=real, default=1},
+ {name=Tensor, dim=f.dim1},
+ {name=real, default=1},
+ {name=Tensor, dim=f.dim2},
+ {name=Tensor, dim=f.dim3}})
+
+ -- there is an ambiguity here, hence the more complicated setup
+ method:wrap(f.name,
+ cname(f.name),
+ {{name=Tensor, returned=true, dim=f.dim1},
+ {name=real, default=1, invisible=true},
+ {name=Tensor, default=1, dim=f.dim1},
+ {name=real, default=1},
+ {name=Tensor, dim=f.dim2},
+ {name=Tensor, dim=f.dim3}},
+ cname(f.name),
+ {{name=Tensor, returned=true, dim=f.dim1},
+ {name=real},
+ {name=Tensor, default=1, dim=f.dim1},
+ {name=real},
+ {name=Tensor, dim=f.dim2},
+ {name=Tensor, dim=f.dim3}})
+ end
+ end
+
+ wrap("dot",
+ cname("dot"),
+ {{name=Tensor},
+ {name=Tensor},
+ {name=real, creturned=true}})
+
method:register("m_cutorch_" .. Tensor .. "Math__")
interface:print(method:tostring())
method:clearhistory()
@@ -677,20 +916,20 @@ wrap("addcdiv",
wrap("maskedFill",
cname("maskedFill"),
{{name=Tensor, returned=true, method={default='nil'}},
- {name=Tensor},
+ {name='CudaByteTensor'},
{name=real}})
wrap("maskedCopy",
cname("maskedCopy"),
{{name=Tensor, returned=true, method={default='nil'}},
- {name=Tensor},
+ {name='CudaByteTensor'},
{name=Tensor}})
wrap("maskedSelect",
cname("maskedSelect"),
{{name=Tensor, returned=true, default=true},
{name=Tensor},
- {name=Tensor}})
+ {name='CudaByteTensor'}})
wrap("gather",
cname("gather"),
@@ -914,7 +1153,7 @@ for _,name in ipairs({"min", "max"}) do
{name=real, creturned=true}},
cname(name),
{{name=Tensor, default=true, returned=true},
- {name=Tensor, default=true, returned=true},
+ {name='CudaLongTensor', default=true, returned=true},
{name=Tensor},
{name="index"}})
end
@@ -1018,20 +1257,21 @@ wrap("clamp",
for _,name in pairs({'lt','gt','le','ge','eq','ne'}) do
wrap(name,
cname(name .. 'Value'),
- {{name=Tensor, default=true, returned=true},
- {name=Tensor},
- {name=real}},
+ {{name='CudaByteTensor',default=true, returned=true},
+ {name=Tensor},
+ {name=real}},
+ cname(name .. 'ValueT'),
+ {{name=Tensor, returned=true},
+ {name=Tensor},
+ {name=real}},
cname(name .. 'Tensor'),
- {{name=Tensor, default=true, returned=true},
- {name=Tensor},
- {name=Tensor}})
-end
-
-for _,name in pairs({'all', 'any'}) do
- wrap(name,
- cname('logical' .. name),
- {{name=Tensor},
- {name="boolean", creturned=true}})
+ {{name='CudaByteTensor',default=true, returned=true},
+ {name=Tensor},
+ {name=Tensor}},
+ cname(name .. 'TensorT'),
+ {{name=Tensor, returned=true},
+ {name=Tensor},
+ {name=Tensor}})
end
wrap("cat",
diff --git a/TensorOperator.c b/TensorOperator.c
index bbd33b5..ae7c2b3 100644
--- a/TensorOperator.c
+++ b/TensorOperator.c
@@ -2,180 +2,12 @@
#include "luaT.h"
#include "THC.h"
-static int cutorch_CudaTensorOperator___add__(lua_State *L)
-{
- THCudaTensor *tensor1 = luaT_toudata(L, 1, "torch.CudaTensor");
- THCudaTensor *tensor2 = luaT_toudata(L, 2, "torch.CudaTensor");
- THCudaTensor *r;
- THCState *state = cutorch_getstate(L);
- THAssert(THCudaTensor_checkGPU(state, 2, tensor1, tensor2));
+#include "THCTensorMath.h"
- if(!tensor1 && !tensor2)
- luaL_error(L, "expecting two Tensors or one Tensor and one number");
- else
- {
- r = THCudaTensor_new(state);
- luaT_pushudata(L, r, "torch.CudaTensor");
+#define cutorch_TensorOperator_(NAME) TH_CONCAT_4(cutorch_,CReal,TensorOperator_,NAME)
+#define torch_Tensor_(NAME) TH_CONCAT_4(torch_,CReal,Tensor_,NAME)
+#define torch_Tensor TH_CONCAT_STRING_3(torch.,CReal,Tensor)
+#define cutorch_Tensor_(NAME) TH_CONCAT_4(cutorch_,CReal,Tensor_,NAME)
- if(!tensor1 && tensor2)
- {
- THCudaTensor_resizeAs(state, r, tensor2);
- THCudaTensor_copy(state, r, tensor2);
- THCudaTensor_add(state, r, r, luaL_checknumber(L, 1));
- }
- else if(tensor1 && !tensor2)
- {
- THCudaTensor_resizeAs(state, r, tensor1);
- THCudaTensor_copy(state, r, tensor1);
- THCudaTensor_add(state, r, r, luaL_checknumber(L, 2));
- }
- else
- {
- THCudaTensor_resizeAs(state, r, tensor1);
- THCudaTensor_copy(state, r, tensor1);
- THCudaTensor_cadd(state, r, r, 1, tensor2);
- }
- }
- return 1;
-}
-
-static int cutorch_CudaTensorOperator___sub__(lua_State *L)
-{
- THCudaTensor *tensor1 = luaT_toudata(L, 1, "torch.CudaTensor");
- THCudaTensor *tensor2 = luaT_toudata(L, 2, "torch.CudaTensor");
- THCudaTensor *r;
- THCState *state = cutorch_getstate(L);
- THAssert(THCudaTensor_checkGPU(state, 2, tensor1, tensor2));
-
- if(!tensor1 && !tensor2)
- luaL_error(L, "expecting two Tensors or one Tensor and one number");
- else
- {
- r = THCudaTensor_new(state);
- luaT_pushudata(L, r, "torch.CudaTensor");
-
- if(!tensor1 && tensor2)
- {
- THCudaTensor_resizeAs(state, r, tensor2);
- THCudaTensor_fill(state, r, luaL_checknumber(L, 1));
- THCudaTensor_cadd(state, r, r, -1, tensor2);
- }
- else if(tensor1 && !tensor2)
- {
- THCudaTensor_resizeAs(state, r, tensor1);
- THCudaTensor_copy(state, r, tensor1);
- THCudaTensor_add(state, r, r, -luaL_checknumber(L, 2));
- }
- else
- {
- THCudaTensor_resizeAs(state, r, tensor1);
- THCudaTensor_copy(state, r, tensor1);
- THCudaTensor_cadd(state, r, r, -1, tensor2);
- }
- }
- return 1;
-}
-
-static int cutorch_CudaTensorOperator___unm__(lua_State *L)
-{
- THCudaTensor *tensor = luaT_checkudata(L, 1, "torch.CudaTensor");
- THCudaTensor *r;
- THCState *state = cutorch_getstate(L);
- THAssert(THCudaTensor_checkGPU(state, 1, tensor));
-
- r = THCudaTensor_new(state);
- luaT_pushudata(L, r, "torch.CudaTensor");
- THCudaTensor_resizeAs(state, r, tensor);
- THCudaTensor_copy(state, r, tensor);
- THCudaTensor_mul(state, r, r, -1);
-
- return 1;
-}
-
-static int cutorch_CudaTensorOperator___mul__(lua_State *L)
-{
- THCudaTensor *tensor1 = luaT_toudata(L, 1, "torch.CudaTensor");
- THCudaTensor *tensor2 = luaT_toudata(L, 2, "torch.CudaTensor");
- THCudaTensor *r;
- THCState *state = cutorch_getstate(L);
- THAssert(THCudaTensor_checkGPU(state, 2, tensor1, tensor2));
-
- if(!tensor1 && !tensor2)
- luaL_error(L, "expecting two Tensors or one Tensor and one number");
- else
- {
- r = THCudaTensor_new(state);
- luaT_pushudata(L, r, "torch.CudaTensor");
-
- if(!tensor1 && tensor2)
- {
- THCudaTensor_resizeAs(state, r, tensor2);
- THCudaTensor_copy(state, r, tensor2);
- THCudaTensor_mul(state, r, r, luaL_checknumber(L, 1));
- }
- else if(tensor1 && !tensor2)
- {
- THCudaTensor_resizeAs(state, r, tensor1);
- THCudaTensor_copy(state, r, tensor1);
- THCudaTensor_mul(state, r, r, luaL_checknumber(L, 2));
- }
- else
- {
- int dimt = tensor1->nDimension;
- int dims = tensor2->nDimension;
-
- if(dimt == 1 && dims == 1)
- lua_pushnumber(L, THCudaTensor_dot(state, tensor1, tensor2)); /* ok, we wasted r, but who cares */
- else if(dimt == 2 && dims == 1)
- {
- THCudaTensor_resize1d(state, r, tensor1->size[0]);
- THCudaTensor_zero(state, r);
- THCudaTensor_addmv(state, r, 1, r, 1, tensor1, tensor2);
- }
- else if(dimt == 2 && dims == 2)
- {
- THCudaTensor_resize2d(state, r, tensor1->size[0], tensor2->size[1]);
- THCudaTensor_zero(state, r);
- THCudaTensor_addmm(state, r, 1, r, 1, tensor1, tensor2);
- }
- else
- luaL_error(L, "multiplication between %dD and %dD tensors not yet supported", tensor1->nDimension, tensor2->nDimension);
- }
- }
- return 1;
-}
-
-static int cutorch_CudaTensorOperator___div__(lua_State *L)
-{
- THCudaTensor *tensor = luaT_checkudata(L, 1, "torch.CudaTensor");
- THCudaTensor *r;
- THCState *state = cutorch_getstate(L);
- THAssert(THCudaTensor_checkGPU(state, 1, tensor));
-
- luaL_argcheck(L, lua_isnumber(L,2), 2, "number expected");
-
- r = THCudaTensor_new(state);
- luaT_pushudata(L, r, "torch.CudaTensor");
-
- THCudaTensor_resizeAs(state, r, tensor);
- THCudaTensor_copy(state, r, tensor);
- THCudaTensor_mul(state, r, r, 1/lua_tonumber(L, 2));
-
- return 1;
-}
-
-static const struct luaL_Reg cutorch_CudaTensorOperator__ [] = {
- {"__add__", cutorch_CudaTensorOperator___add__},
- {"__sub__", cutorch_CudaTensorOperator___sub__},
- {"__unm__", cutorch_CudaTensorOperator___unm__},
- {"__mul__", cutorch_CudaTensorOperator___mul__},
- {"__div__", cutorch_CudaTensorOperator___div__},
- {NULL, NULL}
-};
-
-void cutorch_CudaTensorOperator_init(lua_State *L)
-{
- luaT_pushmetatable(L, "torch.CudaTensor");
- luaL_setfuncs(L, cutorch_CudaTensorOperator__, 0);
- lua_pop(L, 1);
-}
+#include "generic/TensorOperator.c"
+#include "THCGenerateAllTypes.h"
diff --git a/generic/TensorOperator.c b/generic/TensorOperator.c
new file mode 100644
index 0000000..5844a9d
--- /dev/null
+++ b/generic/TensorOperator.c
@@ -0,0 +1,263 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/TensorOperator.c"
+#else
+
+static int cutorch_TensorOperator_(__add__)(lua_State *L)
+{
+ THCTensor *tensor1 = luaT_toudata(L, 1, torch_Tensor);
+ THCTensor *tensor2 = luaT_toudata(L, 2, torch_Tensor);
+ THCTensor *r;
+ THCState *state = cutorch_getstate(L);
+ THAssert(THCTensor_(checkGPU)(state, 2, tensor1, tensor2));
+
+ if(!tensor1 && !tensor2)
+ luaL_error(L, "expecting two Tensors or one Tensor and one number");
+ else
+ {
+ r = THCTensor_(new)(state);
+ luaT_pushudata(L, r, torch_Tensor);
+
+ if(!tensor1 && tensor2)
+ {
+ THCTensor_(resizeAs)(state, r, tensor2);
+ THCTensor_(copy)(state, r, tensor2);
+ double v = luaL_checknumber(L, 1);
+#ifdef THC_REAL_IS_HALF
+ half value = THC_float2half((float) v);
+#else
+ real value = (real) v;
+#endif
+
+ THCTensor_(add)(state, r, r, value);
+ }
+ else if(tensor1 && !tensor2)
+ {
+ THCTensor_(resizeAs)(state, r, tensor1);
+ THCTensor_(copy)(state, r, tensor1);
+
+ double v = luaL_checknumber(L, 2);
+#ifdef THC_REAL_IS_HALF
+ half value = THC_float2half((float) v);
+#else
+ real value = (real) v;
+#endif
+
+ THCTensor_(add)(state, r, r, value);
+ }
+ else
+ {
+ THCTensor_(resizeAs)(state, r, tensor1);
+ THCTensor_(copy)(state, r, tensor1);
+
+#ifdef THC_REAL_IS_HALF
+ half one = THC_float2half(1.0f);
+#else
+ real one = (real) 1;
+#endif
+
+ THCTensor_(cadd)(state, r, r, one, tensor2);
+ }
+ }
+ return 1;
+}
+
+static int cutorch_TensorOperator_(__sub__)(lua_State *L)
+{
+ THCTensor *tensor1 = luaT_toudata(L, 1, torch_Tensor);
+ THCTensor *tensor2 = luaT_toudata(L, 2, torch_Tensor);
+ THCTensor *r;
+ THCState *state = cutorch_getstate(L);
+ THAssert(THCTensor_(checkGPU)(state, 2, tensor1, tensor2));
+
+ if(!tensor1 && !tensor2)
+ luaL_error(L, "expecting two Tensors or one Tensor and one number");
+ else
+ {
+ r = THCTensor_(new)(state);
+ luaT_pushudata(L, r, torch_Tensor);
+
+#ifdef THC_REAL_IS_HALF
+ half neg = THC_float2half(-1.0f);
+#else
+ real neg = (real) -1;
+#endif
+
+ if(!tensor1 && tensor2)
+ {
+ THCTensor_(resizeAs)(state, r, tensor2);
+
+ double v = luaL_checknumber(L, 1);
+#ifdef THC_REAL_IS_HALF
+ half value = THC_float2half((float) v);
+#else
+ real value = (real) v;
+#endif
+
+ THCTensor_(fill)(state, r, value);
+ THCTensor_(cadd)(state, r, r, neg, tensor2);
+ }
+ else if(tensor1 && !tensor2)
+ {
+ THCTensor_(resizeAs)(state, r, tensor1);
+ THCTensor_(copy)(state, r, tensor1);
+
+ double v = -luaL_checknumber(L, 2);
+#ifdef THC_REAL_IS_HALF
+ half value = THC_float2half((float) v);
+#else
+ real value = (real) v;
+#endif
+
+ THCTensor_(add)(state, r, r, value);
+ }
+ else
+ {
+ THCTensor_(resizeAs)(state, r, tensor1);
+ THCTensor_(copy)(state, r, tensor1);
+ THCTensor_(cadd)(state, r, r, neg, tensor2);
+ }
+ }
+ return 1;
+}
+
+static int cutorch_TensorOperator_(__unm__)(lua_State *L)
+{
+ THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+ THCTensor *r;
+ THCState *state = cutorch_getstate(L);
+ THAssert(THCTensor_(checkGPU)(state, 1, tensor));
+
+ r = THCTensor_(new)(state);
+ luaT_pushudata(L, r, torch_Tensor);
+ THCTensor_(resizeAs)(state, r, tensor);
+ THCTensor_(copy)(state, r, tensor);
+
+#ifdef THC_REAL_IS_HALF
+ half neg = THC_float2half(-1.0f);
+#else
+ real neg = (real) -1;
+#endif
+
+ THCTensor_(mul)(state, r, r, neg);
+
+ return 1;
+}
+
+static int cutorch_TensorOperator_(__mul__)(lua_State *L)
+{
+ // FIXME: implement
+#ifdef THC_REAL_IS_FLOAT
+ THCTensor *tensor1 = luaT_toudata(L, 1, torch_Tensor);
+ THCTensor *tensor2 = luaT_toudata(L, 2, torch_Tensor);
+ THCTensor *r;
+ THCState *state = cutorch_getstate(L);
+ THAssert(THCTensor_(checkGPU)(state, 2, tensor1, tensor2));
+
+ if(!tensor1 && !tensor2)
+ luaL_error(L, "expecting two Tensors or one Tensor and one number");
+ else
+ {
+ r = THCTensor_(new)(state);
+ luaT_pushudata(L, r, torch_Tensor);
+
+ if(!tensor1 && tensor2)
+ {
+ THCTensor_(resizeAs)(state, r, tensor2);
+ THCTensor_(copy)(state, r, tensor2);
+
+ double v = luaL_checknumber(L, 1);
+#ifdef THC_REAL_IS_HALF
+ half value = THC_float2half((float) v);
+#else
+ real value = (real) v;
+#endif
+
+ THCTensor_(mul)(state, r, r, value);
+ }
+ else if(tensor1 && !tensor2)
+ {
+ THCTensor_(resizeAs)(state, r, tensor1);
+ THCTensor_(copy)(state, r, tensor1);
+
+ double v = luaL_checknumber(L, 2);
+#ifdef THC_REAL_IS_HALF
+ half value = THC_float2half((float) v);
+#else
+ real value = (real) v;
+#endif
+
+ THCTensor_(mul)(state, r, r, value);
+ }
+ else
+ {
+ int dimt = tensor1->nDimension;
+ int dims = tensor2->nDimension;
+
+ if(dimt == 1 && dims == 1)
+ lua_pushnumber(L, THCTensor_(dot)(state, tensor1, tensor2)); /* ok, we wasted r, but who cares */
+ else if(dimt == 2 && dims == 1)
+ {
+ THCTensor_(resize1d)(state, r, tensor1->size[0]);
+ THCTensor_(zero)(state, r);
+ THCTensor_(addmv)(state, r, 1, r, 1, tensor1, tensor2);
+ }
+ else if(dimt == 2 && dims == 2)
+ {
+ THCTensor_(resize2d)(state, r, tensor1->size[0], tensor2->size[1]);
+ THCTensor_(zero)(state, r);
+ THCTensor_(addmm)(state, r, 1, r, 1, tensor1, tensor2);
+ }
+ else
+ luaL_error(L, "multiplication between %dD and %dD tensors not yet supported", tensor1->nDimension, tensor2->nDimension);
+ }
+ }
+ return 1;
+#else
+ THError("unimplemented");
+#endif
+}
+
+static int cutorch_TensorOperator_(__div__)(lua_State *L)
+{
+ THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
+ THCTensor *r;
+ THCState *state = cutorch_getstate(L);
+ THAssert(THCTensor_(checkGPU)(state, 1, tensor));
+
+ luaL_argcheck(L, lua_isnumber(L,2), 2, "number expected");
+
+ r = THCTensor_(new)(state);
+ luaT_pushudata(L, r, torch_Tensor);
+
+ THCTensor_(resizeAs)(state, r, tensor);
+ THCTensor_(copy)(state, r, tensor);
+
+ double v = luaL_checknumber(L, 2);
+#ifdef THC_REAL_IS_HALF
+ half value = THC_float2half(1.0f / (float) v);
+#else
+ real value = (real) 1 / (real) v;
+#endif
+
+ THCTensor_(mul)(state, r, r, value);
+
+ return 1;
+}
+
+static const struct luaL_Reg cutorch_TensorOperator_(_) [] = {
+ {"__add__", cutorch_TensorOperator_(__add__)},
+ {"__sub__", cutorch_TensorOperator_(__sub__)},
+ {"__unm__", cutorch_TensorOperator_(__unm__)},
+ {"__mul__", cutorch_TensorOperator_(__mul__)},
+ {"__div__", cutorch_TensorOperator_(__div__)},
+ {NULL, NULL}
+};
+
+void cutorch_TensorOperator_(init)(lua_State *L)
+{
+ luaT_pushmetatable(L, torch_Tensor);
+ luaT_setfuncs(L, cutorch_TensorOperator_(_), 0);
+ lua_pop(L, 1);
+}
+
+#endif
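
Every place in the generic operators above where a Lua number crosses into the tensor's element type repeats the same #ifdef THC_REAL_IS_HALF conversion. Restated as a tiny helper for clarity (not part of the patch; it assumes the generic file's real type and the cutorch_TensorOperator_ naming macro are in scope, and the helper name is hypothetical):

    /* Hypothetical helper: convert a Lua double to the tensor's `real` type. */
    static real cutorch_TensorOperator_(luaNumberToReal)(double v)
    {
    #ifdef THC_REAL_IS_HALF
      return THC_float2half((float) v);
    #else
      return (real) v;
    #endif
    }

    /* Usage inside an operator body:
     *   double v = luaL_checknumber(L, 2);
     *   THCTensor_(add)(state, r, r, cutorch_TensorOperator_(luaNumberToReal)(v));
     */
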
diff --git a/init.c b/init.c
index 9352ef7..d486341 100644
--- a/init.c
+++ b/init.c
@@ -26,7 +26,16 @@ extern void cutorch_CudaDoubleTensor_init(lua_State* L);
extern void cutorch_CudaHalfTensor_init(lua_State* L);
#endif
+extern void cutorch_CudaByteTensorOperator_init(lua_State* L);
+extern void cutorch_CudaCharTensorOperator_init(lua_State* L);
+extern void cutorch_CudaShortTensorOperator_init(lua_State* L);
+extern void cutorch_CudaIntTensorOperator_init(lua_State* L);
+extern void cutorch_CudaLongTensorOperator_init(lua_State* L);
extern void cutorch_CudaTensorOperator_init(lua_State* L);
+extern void cutorch_CudaDoubleTensorOperator_init(lua_State* L);
+#ifdef CUDA_HALF_TENSOR
+extern void cutorch_CudaHalfTensorOperator_init(lua_State* L);
+#endif
extern void cutorch_CudaByteTensorMath_init(lua_State* L);
extern void cutorch_CudaCharTensorMath_init(lua_State* L);
@@ -980,7 +989,16 @@ int luaopen_libcutorch(lua_State *L)
cutorch_CudaHalfTensor_init(L);
#endif
+ cutorch_CudaByteTensorOperator_init(L);
+ cutorch_CudaCharTensorOperator_init(L);
+ cutorch_CudaShortTensorOperator_init(L);
+ cutorch_CudaIntTensorOperator_init(L);
+ cutorch_CudaLongTensorOperator_init(L);
cutorch_CudaTensorOperator_init(L);
+ cutorch_CudaDoubleTensorOperator_init(L);
+#ifdef CUDA_HALF_TENSOR
+ cutorch_CudaHalfTensorOperator_init(L);
+#endif
cutorch_CudaByteTensorMath_init(L);
cutorch_CudaCharTensorMath_init(L);
diff --git a/lib/THC/CMakeLists.txt b/lib/THC/CMakeLists.txt
index dfc7ec3..d11364c 100644
--- a/lib/THC/CMakeLists.txt
+++ b/lib/THC/CMakeLists.txt
@@ -117,8 +117,8 @@ SET(src-cuda
THCTensorMathMagma.cu
THCTensorMathPairwise.cu
THCTensorMathPointwise.cu
+ THCTensorMathReduce.cu
THCTensorMathScan.cu
- THCTensorMathTransformReduce.cu
THCTensorMasked.cu
THCTensorIndex.cu
THCTensorConv.cu
@@ -181,6 +181,7 @@ INSTALL(FILES
THCDeviceTensorUtils-inl.cuh
THCGenerateAllTypes.h
THCHalf.h
+ THCNumerics.cuh
THCTensorInfo.cuh
THCTensorTypeUtils.cuh
DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC")
@@ -198,10 +199,20 @@ INSTALL(FILES
generic/THCTensorCopy.c
generic/THCTensorCopy.cu
generic/THCTensorCopy.h
+ generic/THCTensorMasked.h
+ generic/THCTensorMasked.cu
generic/THCTensorMath.h
generic/THCTensorMath.cu
+ generic/THCTensorMathBlas.cu
+ generic/THCTensorMathBlas.h
+ generic/THCTensorMathCompare.h
+ generic/THCTensorMathCompare.cu
+ generic/THCTensorMathCompareT.h
+ generic/THCTensorMathCompareT.cu
generic/THCTensorMathPairwise.h
generic/THCTensorMathPairwise.cu
generic/THCTensorMathPointwise.h
generic/THCTensorMathPointwise.cu
+ generic/THCTensorMathReduce.h
+ generic/THCTensorMathReduce.cu
DESTINATION "${THC_INSTALL_INCLUDE_SUBDIR}/THC/generic")
diff --git a/lib/THC/THCBlas.cu b/lib/THC/THCBlas.cu
index 1edbcb0..5b99506 100644
--- a/lib/THC/THCBlas.cu
+++ b/lib/THC/THCBlas.cu
@@ -1,109 +1,79 @@
#include "THCBlas.h"
#include "THCGeneral.h"
+#include "THCHalf.h"
-void THCudaBlas_swap(THCState *state, long n, float *x, long incx, float *y, long incy)
+float THCudaBlas_Sdot(THCState *state, long n, float *x, long incx, float *y, long incy)
{
- if(n == 1)
- {
- incx = 1;
- incy = 1;
- }
-
- if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
- {
- int i_n = (int)n;
- int i_incx = (int)incx;
- int i_incy = (int)incy;
- THCublasCheck(cublasSswap(THCState_getCurrentBlasHandle(state), i_n, x, i_incx, y, i_incy));
- return;
- }
- THError("Cublas_swap only supports n, incx and"
- " incy upto signed integer limits: %d", INT_MAX);
-}
-
-void THCudaBlas_scal(THCState *state, long n, float a, float *x, long incx)
-{
- if(n == 1)
- incx = 1;
-
- if( (n <= INT_MAX) && (incx <= INT_MAX) )
- {
- int i_n = (int)n;
- int i_incx = (int)incx;
- THCublasCheck(cublasSscal(THCState_getCurrentBlasHandle(state), i_n, &a, x, i_incx));
- return;
- }
- THError("Cublas_scal only supports n and incx "
- "upto signed integer limits: %d", INT_MAX);
-}
-
-void THCudaBlas_copy(THCState *state, long n, float *x, long incx, float *y, long incy)
-{
- if(n == 1)
- {
+ if (n == 1) {
incx = 1;
incy = 1;
}
- if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
- {
+ if ((n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX)) {
int i_n = (int)n;
int i_incx = (int)incx;
int i_incy = (int)incy;
- THCublasCheck(cublasScopy(THCState_getCurrentBlasHandle(state), i_n, x, i_incx, y, i_incy));
- return;
+ float result;
+ THCublasCheck(cublasSdot(THCState_getCurrentBlasHandle(state), i_n, x, i_incx, y, i_incy, &result));
+ return result;
}
- THError("Cublas_copy only supports n, incx and incy "
- "upto signed integer limits: %d", INT_MAX);
+ THError("Cublas_Sdot only supports n, incx and incy "
+ "up to signed integer limits: %d", INT_MAX);
+ return 0;
}
-void THCudaBlas_axpy(THCState *state, long n, float a, float *x, long incx, float *y, long incy)
+double THCudaBlas_Ddot(THCState *state, long n, double *x, long incx, double *y, long incy)
{
- if(n == 1)
- {
+ if (n == 1) {
incx = 1;
incy = 1;
}
- if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
- {
+ if ((n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX)) {
int i_n = (int)n;
int i_incx = (int)incx;
int i_incy = (int)incy;
- THCublasCheck(cublasSaxpy(THCState_getCurrentBlasHandle(state), i_n, &a, x, i_incx, y, i_incy));
- return;
+ double result;
+ THCublasCheck(cublasDdot(THCState_getCurrentBlasHandle(state), i_n, x, i_incx, y, i_incy, &result));
+ return result;
}
- THError("Cublas_axpy only supports n, incx and incy "
- "upto signed integer limits: %d", INT_MAX);
+ THError("Cublas_Ddot only supports n, incx and incy "
+ "up to signed integer limits: %d", INT_MAX);
+ return 0;
}
-float THCudaBlas_dot(THCState *state, long n, float *x, long incx, float *y, long incy)
+/* Level 2 */
+void THCudaBlas_Sgemv(THCState *state, char trans, long m, long n, float alpha, float *a, long lda, float *x, long incx, float beta, float *y, long incy)
{
if(n == 1)
- {
- incx = 1;
- incy = 1;
- }
+ lda = m;
- if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
+ cublasOperation_t op;
+ if (trans == 't') op = CUBLAS_OP_T;
+ else if (trans == 'n') op = CUBLAS_OP_N;
+ else if (trans == 'c') op = CUBLAS_OP_C;
+
+ if( (m <= INT_MAX) && (n <= INT_MAX) &&
+ (lda > 0) && (lda <= INT_MAX) &&
+ (incx > 0) && (incx <= INT_MAX) &&
+ (incy > 0) && (incy <= INT_MAX) )
{
+ int i_m = (int)m;
int i_n = (int)n;
+ int i_lda = (int)lda;
int i_incx = (int)incx;
int i_incy = (int)incy;
- float result;
- THCublasCheck(cublasSdot(THCState_getCurrentBlasHandle(state), i_n, x, i_incx, y, i_incy, &result));
- cudaDeviceSynchronize();
- return result;
+
+ THCublasCheck(cublasSgemv(THCState_getCurrentBlasHandle(state), op, i_m, i_n, &alpha, a, i_lda, x, i_incx, &beta, y, i_incy));
+ return;
}
- THError("Cublas_dot only supports n, incx and incy "
- "upto signed integer limits: %d", INT_MAX);
- return -1;
+ THError("Cublas_Sgemv only supports m, n, lda, incx, incy"
+ "in the range 0 < [val] <= %d", INT_MAX);
}
-/* Level 2 */
-void THCudaBlas_gemv(THCState *state, char trans, long m, long n, float alpha, float *a, long lda, float *x, long incx, float beta, float *y, long incy)
+void THCudaBlas_Dgemv(THCState *state, char trans, long m, long n, double alpha, double *a, long lda, double *x, long incx, double beta, double *y, long incy)
{
if(n == 1)
lda = m;
@@ -124,14 +94,14 @@ void THCudaBlas_gemv(THCState *state, char trans, long m, long n, float alpha, f
int i_incx = (int)incx;
int i_incy = (int)incy;
- THCublasCheck(cublasSgemv(THCState_getCurrentBlasHandle(state), op, i_m, i_n, &alpha, a, i_lda, x, i_incx, &beta, y, i_incy));
+ THCublasCheck(cublasDgemv(THCState_getCurrentBlasHandle(state), op, i_m, i_n, &alpha, a, i_lda, x, i_incx, &beta, y, i_incy));
return;
}
- THError("Cublas_gemv only supports m, n, lda, incx, incy"
+ THError("Cublas_Dgemv only supports m, n, lda, incx, incy"
"in the range 0 < [val] <= %d", INT_MAX);
}
-void THCudaBlas_ger(THCState *state, long m, long n, float alpha, float *x, long incx, float *y, long incy, float *a, long lda)
+void THCudaBlas_Sger(THCState *state, long m, long n, float alpha, float *x, long incx, float *y, long incy, float *a, long lda)
{
if(n == 1)
lda = m;
@@ -147,10 +117,31 @@ void THCudaBlas_ger(THCState *state, long m, long n, float alpha, float *x, long
THCublasCheck(cublasSger(THCState_getCurrentBlasHandle(state), i_m, i_n, &alpha, x, i_incx, y, i_incy, a, i_lda));
return;
}
- THError("Cublas_ger only supports m, n, lda, incx, incy"
+ THError("Cublas_Sger only supports m, n, lda, incx, incy"
"with the bound [val] <= %d", INT_MAX);
}
+void THCudaBlas_Dger(THCState *state, long m, long n, double alpha, double *x, long incx, double *y, long incy, double *a, long lda)
+{
+ if(n == 1)
+ lda = m;
+
+ if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
+ {
+ int i_m = (int)m;
+ int i_n = (int)n;
+ int i_lda = (int)lda;
+ int i_incx = (int)incx;
+ int i_incy = (int)incy;
+
+ THCublasCheck(cublasDger(THCState_getCurrentBlasHandle(state), i_m, i_n, &alpha, x, i_incx, y, i_incy, a, i_lda));
+ return;
+ }
+ THError("Cublas_Dger only supports m, n, lda, incx, incy"
+ "with the bound [val] <= %d", INT_MAX);
+}
+
+
cublasOperation_t convertTransToCublasOperation(char trans) {
if (trans == 't') return CUBLAS_OP_T;
else if (trans == 'n') return CUBLAS_OP_N;
@@ -193,7 +184,7 @@ void adjustLd(char transa, char transb, long m, long n, long k, long *lda, long
}
/* Level 3 */
-void THCudaBlas_gemm(THCState *state, char transa, char transb, long m, long n, long k, float alpha, float *a, long lda, float *b, long ldb, float beta, float *c, long ldc)
+void THCudaBlas_Sgemm(THCState *state, char transa, char transb, long m, long n, long k, float alpha, float *a, long lda, float *b, long ldb, float beta, float *c, long ldc)
{
adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc);
cublasOperation_t opa = convertTransToCublasOperation(transa);
@@ -211,17 +202,84 @@ void THCudaBlas_gemm(THCState *state, char transa, char transb, long m, long n,
THCublasCheck(cublasSgemm(THCState_getCurrentBlasHandle(state), opa, opb, i_m, i_n, i_k, &alpha, a, i_lda, b, i_ldb, &beta, c, i_ldc));
return;
}
- THError("Cublas_gemm only supports m, n, k, lda, ldb, ldc"
+ THError("Cublas_Sgemm only supports m, n, k, lda, ldb, ldc"
"with the bound [val] <= %d", INT_MAX);
}
-void THCudaBlas_gemmBatched(THCState *state, char transa, char transb, long m, long n, long k,
- float alpha, const float *a[], long lda, const float *b[], long ldb,
- float beta, float *c[], long ldc, long batchCount)
+#ifdef CUDA_HALF_TENSOR
+// In CUDA 8.0, definition of data types for sgemmex changed
+#if CUDA_VERSION < 8000
+# define CUDA_R_16F CUBLAS_DATA_HALF
+#endif
+
+void THCudaBlas_Hgemm(THCState *state, char transa, char transb, long m, long n, long k, half alpha, half *a, long lda, half *b, long ldb, half beta, half *c, long ldc)
+{
+ adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc);
+ cublasOperation_t opa = convertTransToCublasOperation(transa);
+ cublasOperation_t opb = convertTransToCublasOperation(transb);
+
+ if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX) )
+ {
+ int i_m = (int)m;
+ int i_n = (int)n;
+ int i_k = (int)k;
+ int i_lda = (int)lda;
+ int i_ldb = (int)ldb;
+ int i_ldc = (int)ldc;
+
+ // Check for native Hgemm support
+ if (THC_nativeHalfInstructions(state)) {
+ THCublasCheck(cublasHgemm(THCState_getCurrentBlasHandle(state), opa, opb,
+ i_m, i_n, i_k, &alpha, a, i_lda, b, i_ldb,
+ &beta, c, i_ldc));
+ } else {
+ // Simulated Hgemm
+ float fAlpha = THC_half2float(alpha);
+ float fBeta = THC_half2float(beta);
+
+ THCublasCheck(cublasSgemmEx(THCState_getCurrentBlasHandle(state), opa, opb,
+ i_m, i_n, i_k, &fAlpha,
+ a, CUDA_R_16F, i_lda, b, CUDA_R_16F,
+ i_ldb, &fBeta, c, CUDA_R_16F, i_ldc));
+ }
+
+ return;
+ }
+ THError("Cublas_Hgemm only supports m, n, k, lda, ldb, ldc"
+ "with th bound [val] <= %d", INT_MAX);
+}
+#endif
+
+void THCudaBlas_Dgemm(THCState *state, char transa, char transb, long m, long n, long k, double alpha, double *a, long lda, double *b, long ldb, double beta, double *c, long ldc)
+{
+ adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc);
+ cublasOperation_t opa = convertTransToCublasOperation(transa);
+ cublasOperation_t opb = convertTransToCublasOperation(transb);
+
+ if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX) )
+ {
+ int i_m = (int)m;
+ int i_n = (int)n;
+ int i_k = (int)k;
+ int i_lda = (int)lda;
+ int i_ldb = (int)ldb;
+ int i_ldc = (int)ldc;
+
+ THCublasCheck(cublasDgemm(THCState_getCurrentBlasHandle(state), opa, opb, i_m, i_n, i_k, &alpha, a, i_lda, b, i_ldb, &beta, c, i_ldc));
+ return;
+ }
+ THError("Cublas_Dgemm only supports m, n, k, lda, ldb, ldc"
+ "with the bound [val] <= %d", INT_MAX);
+}
+
+
+void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, long m, long n, long k,
+ float alpha, const float *a[], long lda, const float *b[], long ldb,
+ float beta, float *c[], long ldc, long batchCount)
{
if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) )
{
- THError("Cublas_gemm only supports m, n, k, lda, ldb, ldc, batchCount"
+ THError("Cublas_SgemmBatched only supports m, n, k, lda, ldb, ldc, batchCount"
"with the bound [val] <= %d", INT_MAX);
}
@@ -235,22 +293,61 @@ void THCudaBlas_gemmBatched(THCState *state, char transa, char transb, long m, l
(int)batchCount));
}
+void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, long m, long n, long k,
+ double alpha, const double *a[], long lda, const double *b[], long ldb,
+ double beta, double *c[], long ldc, long batchCount)
+{
+ if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) )
+ {
+ THError("Cublas_DgemmBatched only supports m, n, k, lda, ldb, ldc, batchCount"
+ "with the bound [val] <= %d", INT_MAX);
+ }
+
+ adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc);
+ cublasOperation_t opa = convertTransToCublasOperation(transa);
+ cublasOperation_t opb = convertTransToCublasOperation(transb);
+
+ THCublasCheck(cublasDgemmBatched(THCState_getCurrentBlasHandle(state),
+ opa, opb, (int)m, (int)n, (int)k,
+ &alpha, a, (int)lda, b, (int)ldb, &beta, c, (int)ldc,
+ (int)batchCount));
+}
+
/* Inverse */
-void THCudaBlas_getrf(THCState *state, int n, float **a, int lda, int *pivot, int *info, int batchSize) {
+void THCudaBlas_Sgetrf(THCState *state, int n, float **a, int lda, int *pivot, int *info, int batchSize) {
if( (n >= INT_MAX) || (lda >= INT_MAX) || (batchSize >= INT_MAX) )
{
- THError("Cublas_getrf only supports n, lda, batchSize"
+ THError("Cublas_Sgetrf only supports n, lda, batchSize"
"with the bound [val] <= %d", INT_MAX);
}
THCublasCheck(cublasSgetrfBatched(THCState_getCurrentBlasHandle(state), n, a, lda, pivot, info, batchSize));
}
-void THCudaBlas_getri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize) {
+void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot, int *info, int batchSize) {
+ if( (n >= INT_MAX) || (lda >= INT_MAX) || (batchSize >= INT_MAX) )
+ {
+ THError("Cublas_Dgetrf only supports n, lda, batchSize"
+ "with the bound [val] <= %d", INT_MAX);
+ }
+ THCublasCheck(cublasDgetrfBatched(THCState_getCurrentBlasHandle(state), n, a, lda, pivot, info, batchSize));
+}
+
+void THCudaBlas_Sgetri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize) {
if( (n >= INT_MAX) || (lda >= INT_MAX)|| (ldc >= INT_MAX) || (batchSize >= INT_MAX) )
{
- THError("Cublas_getrf only supports n, lda, ldc, batchSize"
+ THError("Cublas_Sgetri only supports n, lda, ldc, batchSize"
"with the bound [val] <= %d", INT_MAX);
}
THCublasCheck(cublasSgetriBatched(THCState_getCurrentBlasHandle(state), n, a, lda, pivot, c, ldc, info, batchSize));
}
+
+void THCudaBlas_Dgetri(THCState *state, int n, const double **a, int lda, int *pivot, double **c, int ldc, int *info, int batchSize) {
+
+ if( (n >= INT_MAX) || (lda >= INT_MAX)|| (ldc >= INT_MAX) || (batchSize >= INT_MAX) )
+ {
+ THError("Cublas_Dgetri only supports n, lda, ldc, batchSize"
+ "with the bound [val] <= %d", INT_MAX);
+ }
+ THCublasCheck(cublasDgetriBatched(THCState_getCurrentBlasHandle(state), n, a, lda, pivot, c, ldc, info, batchSize));
+}
diff --git a/lib/THC/THCBlas.h b/lib/THC/THCBlas.h
index 613b78e..45f58eb 100644
--- a/lib/THC/THCBlas.h
+++ b/lib/THC/THCBlas.h
@@ -2,26 +2,37 @@
#define THC_BLAS_INC
#include "THCGeneral.h"
+#include "THCHalf.h"
/* Level 1 */
-THC_API void THCudaBlas_swap(THCState *state, long n, float *x, long incx, float *y, long incy);
-THC_API void THCudaBlas_scal(THCState *state, long n, float a, float *x, long incx);
-THC_API void THCudaBlas_copy(THCState *state, long n, float *x, long incx, float *y, long incy);
-THC_API void THCudaBlas_axpy(THCState *state, long n, float a, float *x, long incx, float *y, long incy);
-THC_API float THCudaBlas_dot(THCState *state, long n, float *x, long incx, float *y, long incy);
+THC_API float THCudaBlas_Sdot(THCState *state, long n, float *x, long incx, float *y, long incy);
+THC_API double THCudaBlas_Ddot(THCState *state, long n, double *x, long incx, double *y, long incy);
/* Level 2 */
-THC_API void THCudaBlas_gemv(THCState *state, char trans, long m, long n, float alpha, float *a, long lda, float *x, long incx, float beta, float *y, long incy);
-THC_API void THCudaBlas_ger(THCState *state, long m, long n, float alpha, float *x, long incx, float *y, long incy, float *a, long lda);
+THC_API void THCudaBlas_Sgemv(THCState *state, char trans, long m, long n, float alpha, float *a, long lda, float *x, long incx, float beta, float *y, long incy);
+THC_API void THCudaBlas_Dgemv(THCState *state, char trans, long m, long n, double alpha, double *a, long lda, double *x, long incx, double beta, double *y, long incy);
+THC_API void THCudaBlas_Sger(THCState *state, long m, long n, float alpha, float *x, long incx, float *y, long incy, float *a, long lda);
+THC_API void THCudaBlas_Dger(THCState *state, long m, long n, double alpha, double *x, long incx, double *y, long incy, double *a, long lda);
/* Level 3 */
-THC_API void THCudaBlas_gemm(THCState *state, char transa, char transb, long m, long n, long k, float alpha, float *a, long lda, float *b, long ldb, float beta, float *c, long ldc);
-THC_API void THCudaBlas_gemmBatched(THCState *state, char transa, char transb, long m, long n, long k,
- float alpha, const float *a[], long lda, const float *b[], long ldb,
- float beta, float *c[], long ldc, long batchCount);
+THC_API void THCudaBlas_Sgemm(THCState *state, char transa, char transb, long m, long n, long k, float alpha, float *a, long lda, float *b, long ldb, float beta, float *c, long ldc);
+THC_API void THCudaBlas_Dgemm(THCState *state, char transa, char transb, long m, long n, long k, double alpha, double *a, long lda, double *b, long ldb, double beta, double *c, long ldc);
+
+#ifdef CUDA_HALF_TENSOR
+THC_API void THCudaBlas_Hgemm(THCState *state, char transa, char transb, long m, long n, long k, half alpha, half *a, long lda, half *b, long ldb, half beta, half *c, long ldc);
+#endif
+
+THC_API void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, long m, long n, long k,
+ float alpha, const float *a[], long lda, const float *b[], long ldb,
+ float beta, float *c[], long ldc, long batchCount);
+THC_API void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, long m, long n, long k,
+ double alpha, const double *a[], long lda, const double *b[], long ldb,
+ double beta, double *c[], long ldc, long batchCount);
/* Inverse */
-THC_API void THCudaBlas_getrf(THCState *state, int n, float **a, int lda, int *pivot, int *info, int batchSize);
-THC_API void THCudaBlas_getri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize);
+THC_API void THCudaBlas_Sgetrf(THCState *state, int n, float **a, int lda, int *pivot, int *info, int batchSize);
+THC_API void THCudaBlas_Dgetrf(THCState *state, int n, double **a, int lda, int *pivot, int *info, int batchSize);
+THC_API void THCudaBlas_Sgetri(THCState *state, int n, const float **a, int lda, int *pivot, float **c, int ldc, int *info, int batchSize);
+THC_API void THCudaBlas_Dgetri(THCState *state, int n, const double **a, int lda, int *pivot, double **c, int ldc, int *info, int batchSize);
#endif
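
THCBlas now exposes precision-suffixed entry points (Sdot/Ddot, Sgemv/Dgemv, Sgemm/Dgemm/Hgemm, ...), so type-generic callers have to pick the variant that matches real. A sketch of that dispatch, assuming the THC_REAL_IS_* macros from THCGenerateAllTypes.h; the real selection logic is in lib/THC/generic/THCTensorMathBlas.cu (listed in the diffstat, not shown in this excerpt), and the wrapper name here is hypothetical:

    static void THCTensor_(gemmSketch)(THCState *state, char transa, char transb,
                                       long m, long n, long k,
                                       real alpha, real *a, long lda,
                                       real *b, long ldb,
                                       real beta, real *c, long ldc)
    {
    #if defined(THC_REAL_IS_FLOAT)
      THCudaBlas_Sgemm(state, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
    #elif defined(THC_REAL_IS_DOUBLE)
      THCudaBlas_Dgemm(state, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
    #elif defined(THC_REAL_IS_HALF)
      THCudaBlas_Hgemm(state, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
    #else
      THError("BLAS gemm is only wired up for float/double/half tensors in this sketch");
    #endif
    }
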
diff --git a/lib/THC/THCGenerateAllTypes.h b/lib/THC/THCGenerateAllTypes.h
index 28037bd..3900ca5 100644
--- a/lib/THC/THCGenerateAllTypes.h
+++ b/lib/THC/THCGenerateAllTypes.h
@@ -81,7 +81,9 @@
#undef THC_REAL_IS_LONG
#define real float
-#define accreal double
+/* FIXME: fp64 has bad performance on some platforms; avoid using it unless
+ we opt into it? */
+#define accreal float
#define Real Float
#define CReal Cuda
#define THC_REAL_IS_FLOAT
@@ -109,7 +111,9 @@
#ifdef CUDA_HALF_TENSOR
#define real half
-#define accreal half
+/* FIXME: fp64 has bad performance on some platforms; avoid using it unless
+ we opt into it? */
+#define accreal float
#define Real Half
#define CReal CudaHalf
#define THC_REAL_IS_HALF
diff --git a/lib/THC/THCHalf.cu b/lib/THC/THCHalf.cu
index 7847ef6..7777bf7 100644
--- a/lib/THC/THCHalf.cu
+++ b/lib/THC/THCHalf.cu
@@ -119,3 +119,12 @@ half THC_float2half(float a)
memcpy(&ret, &ir, sizeof(half));
return ret;
}
+
+THC_EXTERNC int THC_nativeHalfInstructions(THCState *state) {
+ cudaDeviceProp* prop =
+ THCState_getCurrentDeviceProperties(state);
+
+ // CC 5.3+
+ return (prop->major > 5 ||
+ (prop->major == 5 && prop->minor == 3));
+}
diff --git a/lib/THC/THCHalf.h b/lib/THC/THCHalf.h
index d87647b..7440de4 100644
--- a/lib/THC/THCHalf.h
+++ b/lib/THC/THCHalf.h
@@ -3,12 +3,12 @@
#include "THCGeneral.h"
-// We compile with CudaHalfTensor support if we have this:
+/* We compile with CudaHalfTensor support if we have this: */
#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16
#define CUDA_HALF_TENSOR 1
#endif
-// Native fp16 ALU instructions are available if we have this:
+/* Kernel side: Native fp16 ALU instructions are available if we have this: */
#if defined(CUDA_HALF_TENSOR) && (__CUDA_ARCH__ >= 530)
#define CUDA_HALF_INSTRUCTIONS 1
#endif
@@ -23,6 +23,9 @@ THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, long len);
THC_EXTERNC half THC_float2half(float a);
THC_EXTERNC float THC_half2float(half a);
-#endif // CUDA_HALF_TENSOR
+/* Check for native fp16 support on the current device (CC 5.3+) */
+THC_EXTERNC int THC_nativeHalfInstructions(THCState *state);
+
+#endif /* CUDA_HALF_TENSOR */
#endif
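
The patch keeps two distinct half-precision capability checks: CUDA_HALF_INSTRUCTIONS is a compile-time, device-side test (__CUDA_ARCH__ >= 530), while THC_nativeHalfInstructions(state) is a host-side, run-time test of the current device (compute capability 5.3+), used above by THCudaBlas_Hgemm to choose between cublasHgemm and cublasSgemmEx. A brief sketch of how each is meant to be used; everything except the two checks themselves is a hypothetical name:

    // Device code: resolved at compile time per architecture.
    __device__ half halfAddSketch(half a, half b) {
    #ifdef CUDA_HALF_INSTRUCTIONS          // defined when __CUDA_ARCH__ >= 530
      return __hadd(a, b);                 // native fp16 ALU instruction
    #else
      return __float2half(__half2float(a) + __half2float(b));
    #endif
    }

    // Host code: resolved at run time for the device currently in use.
    void pickGemmPathSketch(THCState *state) {
      if (THC_nativeHalfInstructions(state)) {
        /* use cublasHgemm (true fp16 math) */
      } else {
        /* use cublasSgemmEx (fp16 storage, fp32 math) */
      }
    }
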
diff --git a/lib/THC/THCNumerics.cuh b/lib/THC/THCNumerics.cuh
new file mode 100644
index 0000000..bc299d4
--- /dev/null
+++ b/lib/THC/THCNumerics.cuh
@@ -0,0 +1,239 @@
+#ifndef THC_NUMERICS_INC
+#define THC_NUMERICS_INC
+
+#include <cuda.h>
+#include <limits.h>
+#include "THCHalf.h"
+
+/// Class for numeric limits of the particular data type, which
+/// includes support for `half`.
+/// Unfortunately since `half` does not have a constructor, these have
+/// to be expressed as functions (either that or non-const statics).
+template <typename T>
+struct THCNumerics {
+};
+
+template <>
+struct THCNumerics<unsigned char> {
+ static inline __host__ __device__ unsigned char min() { return 0; }
+ static inline __host__ __device__ unsigned char max() { return UCHAR_MAX; }
+
+ static inline __host__ __device__ bool lt(unsigned char a, unsigned char b) { return a < b; }
+ static inline __host__ __device__ bool le(unsigned char a, unsigned char b) { return a <= b; }
+ static inline __host__ __device__ bool gt(unsigned char a, unsigned char b) { return a > b; }
+ static inline __host__ __device__ bool ge(unsigned char a, unsigned char b) { return a >= b; }
+ static inline __host__ __device__ bool eq(unsigned char a, unsigned char b) { return a == b; }
+ static inline __host__ __device__ bool ne(unsigned char a, unsigned char b) { return a != b; }
+};
+
+template <>
+struct THCNumerics<char> {
+ static inline __host__ __device__ char min() { return CHAR_MIN; }
+ static inline __host__ __device__ char max() { return CHAR_MAX; }
+
+ static inline __host__ __device__ bool lt(char a, char b) { return a < b; }
+ static inline __host__ __device__ bool le(char a, char b) { return a <= b; }
+ static inline __host__ __device__ bool gt(char a, char b) { return a > b; }
+ static inline __host__ __device__ bool ge(char a, char b) { return a >= b; }
+ static inline __host__ __device__ bool eq(char a, char b) { return a == b; }
+ static inline __host__ __device__ bool ne(char a, char b) { return a != b; }
+};
+
+template <>
+struct THCNumerics<short> {
+ static inline __host__ __device__ short min() { return SHRT_MIN; }
+ static inline __host__ __device__ short max() { return SHRT_MAX; }
+
+ static inline __host__ __device__ bool lt(short a, short b) { return a < b; }
+ static inline __host__ __device__ bool le(short a, short b) { return a <= b; }
+ static inline __host__ __device__ bool gt(short a, short b) { return a > b; }
+ static inline __host__ __device__ bool ge(short a, short b) { return a >= b; }
+ static inline __host__ __device__ bool eq(short a, short b) { return a == b; }
+ static inline __host__ __device__ bool ne(short a, short b) { return a != b; }
+};
+
+template <>
+struct THCNumerics<int> {
+ static inline __host__ __device__ int min() { return INT_MIN; }
+ static inline __host__ __device__ int max() { return INT_MAX; }
+
+ static inline __host__ __device__ bool lt(int a, int b) { return a < b; }
+ static inline __host__ __device__ bool le(int a, int b) { return a <= b; }
+ static inline __host__ __device__ bool gt(int a, int b) { return a > b; }
+ static inline __host__ __device__ bool ge(int a, int b) { return a >= b; }
+ static inline __host__ __device__ bool eq(int a, int b) { return a == b; }
+ static inline __host__ __device__ bool ne(int a, int b) { return a != b; }
+};
+
+template <>
+struct THCNumerics<long> {
+ static inline __host__ __device__ long min() { return LONG_MIN; }
+ static inline __host__ __device__ long max() { return LONG_MAX; }
+
+ static inline __host__ __device__ bool lt(long a, long b) { return a < b; }
+ static inline __host__ __device__ bool le(long a, long b) { return a <= b; }
+ static inline __host__ __device__ bool gt(long a, long b) { return a > b; }
+ static inline __host__ __device__ bool ge(long a, long b) { return a >= b; }
+ static inline __host__ __device__ bool eq(long a, long b) { return a == b; }
+ static inline __host__ __device__ bool ne(long a, long b) { return a != b; }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct THCNumerics<half> {
+ static inline __host__ __device__ half min() { half h; h.x = 0xfbff; return h; }
+ static inline __host__ __device__ half max() { half h; h.x = 0x7bff; return h; }
+
+ static inline __host__ __device__ bool lt(half a, half b) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+ return __hlt(a, b);
+#else
+ float fa = __half2float(a);
+ float fb = __half2float(b);
+ return fa < fb;
+#endif
+#else // __CUDA_ARCH__
+ return THC_half2float(a) < THC_half2float(b);
+#endif
+ }
+
+ static inline __host__ __device__ bool le(half a, half b) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+ return __hle(a, b);
+#else
+ float fa = __half2float(a);
+ float fb = __half2float(b);
+ return fa <= fb;
+#endif
+#else // __CUDA_ARCH__
+ return THC_half2float(a) <= THC_half2float(b);
+#endif
+ }
+
+ static inline __host__ __device__ bool gt(half a, half b) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+ return __hgt(a, b);
+#else
+ float fa = __half2float(a);
+ float fb = __half2float(b);
+ return fa > fb;
+#endif
+#else // __CUDA_ARCH__
+ return THC_half2float(a) > THC_half2float(b);
+#endif
+ }
+
+ static inline __host__ __device__ bool ge(half a, half b) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+ return __hge(a, b);
+#else
+ float fa = __half2float(a);
+ float fb = __half2float(b);
+ return fa >= fb;
+#endif
+#else // __CUDA_ARCH__
+ return THC_half2float(a) >= THC_half2float(b);
+#endif
+ }
+
+ static inline __host__ __device__ bool eq(half a, half b) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+ return __heq(a, b);
+#else
+ float fa = __half2float(a);
+ float fb = __half2float(b);
+ return fa == fb;
+#endif
+#else // __CUDA_ARCH__
+ return THC_half2float(a) == THC_half2float(b);
+#endif
+ }
+
+ static inline __host__ __device__ bool ne(half a, half b) {
+#ifdef __CUDA_ARCH__
+#ifdef CUDA_HALF_INSTRUCTIONS
+ return __hne(a, b);
+#else
+ float fa = __half2float(a);
+ float fb = __half2float(b);
+ return fa != fb;
+#endif
+#else // __CUDA_ARCH__
+ return THC_half2float(a) != THC_half2float(b);
+#endif
+ }
+};
+#endif
+
+template <>
+struct THCNumerics<float> {
+ static inline __host__ __device__ float min() { return -FLT_MAX; }
+ static inline __host__ __device__ float max() { return FLT_MAX; }
+
+ static inline __host__ __device__ bool lt(float a, float b) { return a < b; }
+ static inline __host__ __device__ bool le(float a, float b) { return a <= b; }
+ static inline __host__ __device__ bool gt(float a, float b) { return a > b; }
+ static inline __host__ __device__ bool ge(float a, float b) { return a >= b; }
+ static inline __host__ __device__ bool eq(float a, float b) { return a == b; }
+ static inline __host__ __device__ bool ne(float a, float b) { return a != b; }
+};
+
+template <>
+struct THCNumerics<double> {
+ static inline __host__ __device__ double min() { return -DBL_MAX; }
+ static inline __host__ __device__ double max() { return DBL_MAX; }
+
+ static inline __host__ __device__ bool lt(double a, double b) { return a < b; }
+ static inline __host__ __device__ bool le(double a, double b) { return a <= b; }
+ static inline __host__ __device__ bool gt(double a, double b) { return a > b; }
+ static inline __host__ __device__ bool ge(double a, double b) { return a >= b; }
+ static inline __host__ __device__ bool eq(double a, double b) { return a == b; }
+ static inline __host__ __device__ bool ne(double a, double b) { return a != b; }
+};
+
+/// `half` has some type conversion issues associated with it, since it
+/// is a struct without a constructor/implicit conversion constructor.
+/// We use this to convert scalar values to the given type that the
+/// tensor expects.
+template <typename In, typename Out>
+struct ScalarConvert {
+ static __host__ __device__ Out to(const In v) { return (Out) v; }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <typename Out>
+struct ScalarConvert<half, Out> {
+ static __host__ __device__ Out to(const half v) {
+#ifdef __CUDA_ARCH__
+ return (Out) __half2float(v);
+#else
+ return (Out) THC_half2float(v);
+#endif
+ }
+};
+
+template <typename In>
+struct ScalarConvert<In, half> {
+ static __host__ __device__ half to(const In v) {
+#ifdef __CUDA_ARCH__
+ return __float2half((float) v);
+#else
+ return THC_float2half((float) v);
+#endif
+ }
+};
+
+template <>
+struct ScalarConvert<half, half> {
+ static __host__ __device__ half to(const half v) {
+ return v;
+ }
+};
+#endif
+
+#endif // THC_NUMERICS_INC
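
THCNumerics<T> and ScalarConvert give kernels a uniform way to compare and convert element values, which is what lets a single templated kernel cover every element type, including half, where operator< and plain casts are not available. An illustrative functor in that style; the name is hypothetical, and the real functors live in the comparison files added by this patch:

    // Writes 1 into an unsigned char output when the input is < value.
    template <typename T>
    struct TensorLTValueOpSketch {
      explicit TensorLTValueOpSketch(T v) : value(v) {}

      __device__ __forceinline__ void operator()(unsigned char* out, T* in) const {
        // THCNumerics<half>::lt falls back to float math when native fp16
        // comparisons are unavailable; ScalarConvert handles the bool -> out cast.
        *out = ScalarConvert<bool, unsigned char>::to(THCNumerics<T>::lt(*in, value));
      }

      const T value;
    };
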
diff --git a/lib/THC/THCReduce.cuh b/lib/THC/THCReduce.cuh
index 89f599f..a7135de 100644
--- a/lib/THC/THCReduce.cuh
+++ b/lib/THC/THCReduce.cuh
@@ -8,6 +8,7 @@
// arguments without copying or temporary storage.
//
+#include "THCTensorTypeUtils.cuh"
#include "THCReduceApplyUtils.cuh"
// Threads per thread block
@@ -29,14 +30,14 @@ template <typename ModifyOp,
__launch_bounds__(32 * 16, 4)
#endif
__global__ void
-THCudaTensor_reduceNoncontigDim(TensorInfo<T, IndexType> out,
- TensorInfo<T, IndexType> in,
- IndexType reductionStride,
- IndexType reductionSize,
- IndexType totalSlices,
- T init,
- ModifyOp modifyOp,
- ReduceOp reduceOp) {
+kernelReduceNoncontigDim(TensorInfo<T, IndexType> out,
+ TensorInfo<T, IndexType> in,
+ IndexType reductionStride,
+ IndexType reductionSize,
+ IndexType totalSlices,
+ T init,
+ ModifyOp modifyOp,
+ ReduceOp reduceOp) {
const IndexType sliceIndex = getReduceNoncontigDimSliceIndex<IndexType>();
if (sliceIndex >= totalSlices) {
@@ -77,13 +78,13 @@ template <typename ModifyOp,
typename IndexType,
int ADims, int BDims>
__global__ void
-THCudaTensor_reduceContigDim(TensorInfo<T, IndexType> out,
- TensorInfo<T, IndexType> in,
- IndexType reductionSize,
- IndexType totalSlices,
- T init,
- ModifyOp modifyOp,
- ReduceOp reduceOp) {
+kernelReduceContigDim(TensorInfo<T, IndexType> out,
+ TensorInfo<T, IndexType> in,
+ IndexType reductionSize,
+ IndexType totalSlices,
+ T init,
+ ModifyOp modifyOp,
+ ReduceOp reduceOp) {
const IndexType sliceIndex = getReduceContigDimSliceIndex<IndexType>();
if (sliceIndex >= totalSlices) {
@@ -107,7 +108,9 @@ THCudaTensor_reduceContigDim(TensorInfo<T, IndexType> out,
}
// Reduce within the block
- extern __shared__ T smem[];
+ // FIXME: extern name
+ extern __shared__ char smemChar[];
+ T* smem = (T*) smemChar;
r = reduceBlock<T, ReduceOp>(smem, blockDim.x, r, reduceOp, init);
if (threadIdx.x == 0) {
@@ -139,14 +142,16 @@ inline dim3 getContigReduceBlock(long numSlices, long reductionSize) {
// Scale up block size based on the reduction dimension size
long warpsInReductionSize = THCCeilDiv(reductionSize, 32L);
- int numWarps =
- warpsInReductionSize > (long) maxWarps ? maxWarps : (int) warpsInReductionSize;
+ int numWarps = warpsInReductionSize > (long) maxWarps ?
+ maxWarps : (int) warpsInReductionSize;
+
return dim3(numWarps * 32);
}
inline bool getNoncontigReduceGrid(long elements, dim3& grid) {
// One output point per thread
- return THC_getGridFromTiles(THCCeilDiv(elements, (long) THC_NONCONTIG_REDUCE_BLOCK_SIZE), grid);
+ return THC_getGridFromTiles(THCCeilDiv(elements,
+ (long) THC_NONCONTIG_REDUCE_BLOCK_SIZE), grid);
}
inline bool getContigReduceGrid(long elements, dim3& grid) {
@@ -156,26 +161,26 @@ inline bool getContigReduceGrid(long elements, dim3& grid) {
// Performs a reduction out[..., 0, ...] = reduce_i(modify(in[..., i, ...])) for
// all in where i and the out's 0 are indexed at dimension `dim`
-template <typename ModifyOp, typename ReduceOp>
-bool THCudaTensor_reduceDim(THCState* state,
- THCudaTensor* out,
- THCudaTensor* in,
- const ModifyOp& modifyOp,
- const ReduceOp& reduceOp,
- float init,
- int dim) {
- long inElements = THCudaTensor_nElement(state, in);
-
- long reductionSize = THCudaTensor_size(state, in, dim);
- long reductionStride = THCudaTensor_stride(state, in, dim);
+template <typename TensorType, typename ModifyOp, typename ReduceOp>
+bool THC_reduceDim(THCState* state,
+ TensorType* out,
+ TensorType* in,
+ const ModifyOp& modifyOp,
+ const ReduceOp& reduceOp,
+ typename TensorUtils<TensorType>::DataType init,
+ int dim) {
+ long inElements = TensorUtils<TensorType>::getNumElements(state, in);
+
+ long reductionSize = TensorUtils<TensorType>::getSize(state, in, dim);
+ long reductionStride = TensorUtils<TensorType>::getStride(state, in, dim);
long outElements = inElements / reductionSize;
- if (THCudaTensor_nDimension(state, out) > MAX_CUTORCH_DIMS ||
- THCudaTensor_nDimension(state, in) > MAX_CUTORCH_DIMS) {
+ if (TensorUtils<TensorType>::getDims(state, out) > MAX_CUTORCH_DIMS ||
+ TensorUtils<TensorType>::getDims(state, in) > MAX_CUTORCH_DIMS) {
return false;
}
- if (THCudaTensor_nDimension(state, in) == 0) {
+ if (TensorUtils<TensorType>::getDims(state, in) == 0) {
// Zero-dim tensor; do nothing
return true;
}
@@ -193,7 +198,7 @@ bool THCudaTensor_reduceDim(THCState* state,
}
block = getContigReduceBlock(outElements, reductionSize);
- smemSize = sizeof(float) * block.x;
+ smemSize = sizeof(typename TensorUtils<TensorType>::DataType) * block.x;
} else {
if (!getNoncontigReduceGrid(outElements, grid)) {
return false;
@@ -203,9 +208,9 @@ bool THCudaTensor_reduceDim(THCState* state,
}
// Resize out to correspond to the reduced size
- THLongStorage* sizes = THCudaTensor_newSizeOf(state, in);
+ THLongStorage* sizes = TensorUtils<TensorType>::newSizeOf(state, in);
THLongStorage_set(sizes, dim, 1);
- THCudaTensor_resize(state, out, sizes, NULL);
+ TensorUtils<TensorType>::resize(state, out, sizes, NULL);
THLongStorage_free(sizes);
// It is possible that the tensor dimensions are able to be collapsed,
@@ -216,80 +221,84 @@ bool THCudaTensor_reduceDim(THCState* state,
// (or vice versa), the contiguous tensor can be collapsed to one
// dimension, and the loop to translate the linear index to the array
// index can be similarly collapsed. That is what this unrolling is for.
-#define HANDLE_CASE(T, TYPE, OUT, IN) \
+#define HANDLE_CASE(TYPE, OUT, IN) \
if (contigReduction) { \
- THCudaTensor_reduceContigDim<ModifyOp, ReduceOp, T, TYPE, OUT, IN> \
+ kernelReduceContigDim<ModifyOp, ReduceOp, \
+ typename TensorUtils<TensorType>::DataType, \
+ TYPE, OUT, IN> \
<<<grid, block, smemSize, THCState_getCurrentStream(state)>>>( \
outInfo, inInfo, reductionSize, \
(TYPE) outElements, init, modifyOp, reduceOp); \
} else { \
- THCudaTensor_reduceNoncontigDim<ModifyOp, ReduceOp, T, TYPE, OUT, IN> \
+ kernelReduceNoncontigDim<ModifyOp, ReduceOp, \
+ typename TensorUtils<TensorType>::DataType, \
+ TYPE, OUT, IN> \
<<<grid, block, 0, THCState_getCurrentStream(state)>>>( \
outInfo, inInfo, reductionStride, reductionSize, \
(TYPE) outElements, init, modifyOp, reduceOp); \
} \
-#define HANDLE_IN_CASE(T, TYPE, OUT, IN) \
+#define HANDLE_IN_CASE(TYPE, OUT, IN) \
{ \
if (inInfo.isContiguous()) { \
- HANDLE_CASE(T, TYPE, OUT, -2); \
+ HANDLE_CASE(TYPE, OUT, -2); \
} else { \
switch (IN) { \
case 1: \
- HANDLE_CASE(T, TYPE, OUT, 1); \
+ HANDLE_CASE(TYPE, OUT, 1); \
break; \
case 2: \
- HANDLE_CASE(T, TYPE, OUT, 2); \
+ HANDLE_CASE(TYPE, OUT, 2); \
break; \
default: \
- HANDLE_CASE(T, TYPE, OUT, -1); \
+ HANDLE_CASE(TYPE, OUT, -1); \
break; \
} \
} \
}
-#define HANDLE_OUT_CASE(T, TYPE, OUT, IN) \
+#define HANDLE_OUT_CASE(TYPE, OUT, IN) \
{ \
if (outInfo.isContiguous()) { \
- HANDLE_IN_CASE(T, TYPE, -2, IN); \
+ HANDLE_IN_CASE(TYPE, -2, IN); \
} else { \
switch (OUT) { \
case 1: \
- HANDLE_IN_CASE(T, TYPE, 1, IN); \
+ HANDLE_IN_CASE(TYPE, 1, IN); \
break; \
case 2: \
- HANDLE_IN_CASE(T, TYPE, 2, IN); \
- break; \
- case 3: \
- HANDLE_IN_CASE(T, TYPE, 3, IN); \
+ HANDLE_IN_CASE(TYPE, 2, IN); \
break; \
default: \
- HANDLE_IN_CASE(T, TYPE, -1, IN); \
+ HANDLE_IN_CASE(TYPE, -1, IN); \
break; \
} \
} \
}
- if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, out) &&
- TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, in)) {
- TensorInfo<float, unsigned int> outInfo =
- getTensorInfo<THCudaTensor, unsigned int>(state, out);
+ if (TensorUtils<TensorType>::canUse32BitIndexMath(state, out) &&
+ TensorUtils<TensorType>::canUse32BitIndexMath(state, in)) {
+ TensorInfo<typename TensorUtils<TensorType>::DataType,
+ unsigned int> outInfo =
+ getTensorInfo<TensorType, unsigned int>(state, out);
outInfo.collapseDims();
- TensorInfo<float, unsigned int> inInfo =
- getTensorInfo<THCudaTensor, unsigned int>(state, in);
+ TensorInfo<typename TensorUtils<TensorType>::DataType,
+ unsigned int> inInfo =
+ getTensorInfo<TensorType, unsigned int>(state, in);
inInfo.reduceDim(dim);
inInfo.collapseDims();
- HANDLE_OUT_CASE(typename TensorUtils<THCudaTensor>::DataType,
- unsigned int, outInfo.dims, inInfo.dims);
+ HANDLE_OUT_CASE(unsigned int, outInfo.dims, inInfo.dims);
} else {
- TensorInfo<float, unsigned long> outInfo =
- getTensorInfo<THCudaTensor, unsigned long>(state, out);
+ TensorInfo<typename TensorUtils<TensorType>::DataType,
+ unsigned long> outInfo =
+ getTensorInfo<TensorType, unsigned long>(state, out);
outInfo.collapseDims();
- TensorInfo<float, unsigned long> inInfo =
- getTensorInfo<THCudaTensor, unsigned long>(state, in);
+ TensorInfo<typename TensorUtils<TensorType>::DataType,
+ unsigned long> inInfo =
+ getTensorInfo<TensorType, unsigned long>(state, in);
inInfo.reduceDim(dim);
inInfo.collapseDims();
@@ -297,11 +306,9 @@ bool THCudaTensor_reduceDim(THCState* state,
// version and the completely generic version, to reduce
// compilation time.
if (outInfo.isContiguous() && inInfo.isContiguous()) {
- HANDLE_CASE(typename TensorUtils<THCudaTensor>::DataType,
- unsigned long, -2, -2);
+ HANDLE_CASE(unsigned long, -2, -2);
} else {
- HANDLE_CASE(typename TensorUtils<THCudaTensor>::DataType,
- unsigned long, -1, -1);
+ HANDLE_CASE(unsigned long, -1, -1);
}
}
#undef HANDLE_CASE
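
Both kernels above replace extern __shared__ T smem[] with an untyped char array that is cast to T*. That is the usual workaround for templated dynamic shared memory: the extern array has a single linkage-level name, so two instantiations with different T would otherwise collide. Here is a standalone sketch of the workaround, assuming a power-of-two block size; blockSum is an illustrative name, not a cutorch function.

#include <cstdio>
#include <cuda_runtime.h>

template <typename T>
__global__ void blockSum(const T* in, T* out, int n) {
  // extern __shared__ T smem[] would give every instantiation the same
  // linkage name, so declare untyped storage and cast, as the diff does.
  extern __shared__ char smemChar[];
  T* smem = (T*) smemChar;

  // Each thread strides over the input and keeps a private partial sum.
  T r = 0;
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    r += in[i];
  }
  smem[threadIdx.x] = r;
  __syncthreads();

  // Tree reduction across the (power-of-two) block.
  for (int s = blockDim.x / 2; s > 0; s /= 2) {
    if (threadIdx.x < s) {
      smem[threadIdx.x] += smem[threadIdx.x + s];
    }
    __syncthreads();
  }
  if (threadIdx.x == 0) {
    *out = smem[0];
  }
}

int main() {
  const int n = 1 << 16;
  float *in, *out;
  cudaMallocManaged((void**) &in, n * sizeof(float));
  cudaMallocManaged((void**) &out, sizeof(float));
  for (int i = 0; i < n; ++i) in[i] = 1.0f;

  // The third launch parameter sizes the untyped shared array in bytes.
  blockSum<float><<<1, 256, 256 * sizeof(float)>>>(in, out, n);
  cudaDeviceSynchronize();
  printf("sum = %f\n", *out);  // expect 65536
  cudaFree(in);
  cudaFree(out);
  return 0;
}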
diff --git a/lib/THC/THCReduceAll.cuh b/lib/THC/THCReduceAll.cuh
index 3fe81a9..498fb53 100644
--- a/lib/THC/THCReduceAll.cuh
+++ b/lib/THC/THCReduceAll.cuh
@@ -20,26 +20,30 @@
// Kernel that handles an entire reduction of a tensor in one pass
template <typename ModifyOp,
typename ReduceOp,
- typename T,
+ typename ReduceAccOp,
+ typename InT,
+ typename AccT,
typename IndexType,
int ADims>
__global__ void
-THCudaTensor_reduceAll(TensorInfo<T, IndexType> in,
- IndexType totalElements,
- T init,
- ModifyOp modifyOp,
- ReduceOp reduceOp,
- T* out) {
+kernelReduceAll(TensorInfo<InT, IndexType> in,
+ IndexType totalElements,
+ AccT init,
+ ModifyOp modifyOp,
+ ReduceOp reduceOp,
+ ReduceAccOp reduceAccOp,
+ AccT* out) {
// With a block-wide stride, have each thread perform its own reduction.
- T r = init;
+ AccT r = init;
for (IndexType i = threadIdx.x; i < totalElements; i += blockDim.x) {
- const IndexType inOffset = IndexToOffset<T, IndexType, ADims>::get(i, in);
+ const IndexType inOffset = IndexToOffset<InT, IndexType, ADims>::get(i, in);
r = reduceOp(r, modifyOp(in.data[inOffset]));
}
// Reduce within the block
- extern __shared__ T smem[];
- r = reduceBlock<T, ReduceOp>(smem, blockDim.x, r, reduceOp, init);
+ extern __shared__ char smemChar[];
+ AccT* smem = (AccT*) smemChar;
+ r = reduceBlock<AccT, ReduceAccOp>(smem, blockDim.x, r, reduceAccOp, init);
if (threadIdx.x == 0) {
// Write out reduced value
@@ -62,29 +66,33 @@ __device__ __forceinline__ IndexType getEndIndex(IndexType totalSize) {
// Kernel that handles an entire reduction of a tensor in two passes
template <typename ModifyOp,
typename ReduceOp,
- typename T,
+ typename ReduceAccOp,
+ typename InT,
+ typename AccT,
typename IndexType,
int ADims>
__global__ void
-THCudaTensor_reduceAllPass1(TensorInfo<T, IndexType> in,
- IndexType totalElements,
- T init,
- ModifyOp modifyOp,
- ReduceOp reduceOp,
- T* scratchSpace) {
+kernelReduceAllPass1(TensorInfo<InT, IndexType> in,
+ IndexType totalElements,
+ AccT init,
+ ModifyOp modifyOp,
+ ReduceOp reduceOp,
+ ReduceAccOp reduceAccOp,
+ AccT* scratchSpace) {
const IndexType startIndex = getStartIndex<IndexType>(totalElements);
const IndexType endIndex = getEndIndex<IndexType>(totalElements);
// With a block-wide stride, have each thread perform its own reduction.
- T r = init;
+ AccT r = init;
for (IndexType i = startIndex + threadIdx.x; i < endIndex; i += blockDim.x) {
- const IndexType inOffset = IndexToOffset<T, IndexType, ADims>::get(i, in);
+ const IndexType inOffset = IndexToOffset<InT, IndexType, ADims>::get(i, in);
r = reduceOp(r, modifyOp(in.data[inOffset]));
}
// Reduce within the block
- extern __shared__ T smem[];
- r = reduceBlock<T, ReduceOp>(smem, blockDim.x, r, reduceOp, init);
+ extern __shared__ char smemChar[];
+ AccT* smem = (AccT*) smemChar;
+ r = reduceBlock<AccT, ReduceAccOp>(smem, blockDim.x, r, reduceAccOp, init);
if (threadIdx.x == 0) {
// Write out block-wide reduced value
@@ -94,18 +102,19 @@ THCudaTensor_reduceAllPass1(TensorInfo<T, IndexType> in,
template <typename ReduceOp, typename T, typename IndexType>
__global__ void
-THCudaTensor_reduceAllPass2(int numPass1Blocks,
- T init,
- ReduceOp reduceOp,
- T* scratchSpace,
- T* out) {
+kernelReduceAllPass2(int numPass1Blocks,
+ T init,
+ ReduceOp reduceOp,
+ T* scratchSpace,
+ T* out) {
T r = init;
if (threadIdx.x < numPass1Blocks) {
r = scratchSpace[threadIdx.x];
}
// Reduce within the block
- extern __shared__ T smem[];
+ extern __shared__ char smemChar[];
+ T* smem = (T*) smemChar;
r = reduceBlock<T, ReduceOp>(smem, numPass1Blocks, r, reduceOp, init);
if (threadIdx.x == 0) {
@@ -119,13 +128,13 @@ inline bool isTwoPassReductionSize(long elements) {
return (elements > THC_TWO_PASS_REDUCTION_SIZE);
}
-template <typename T>
+template <typename InT, typename AccT>
inline long getTwoPassBlocks(THCState* state, long elements) {
long numBlocks = THCCeilDiv(elements, THC_REDUCE_ALL_BLOCK_SIZE);
// We can only have as many blocks as there is scratch space
long scratchSpace =
- THCState_getCurrentDeviceScratchSpaceSize(state) / sizeof(T);
+ THCState_getCurrentDeviceScratchSpaceSize(state) / sizeof(AccT);
THAssert(scratchSpace > 0);
if (numBlocks > scratchSpace) {
@@ -136,22 +145,22 @@ inline long getTwoPassBlocks(THCState* state, long elements) {
}
// Get the block/grid size that we want
-template <typename T>
+template <typename InT, typename AccT>
inline void getPass1ReduceBlockGrid(THCState* state, long elements,
dim3& grid, dim3& block) {
- grid = dim3(getTwoPassBlocks<T>(state, elements));
+ grid = dim3(getTwoPassBlocks<InT, AccT>(state, elements));
block = dim3(THC_REDUCE_ALL_BLOCK_SIZE);
}
-template <typename T>
+template <typename InT, typename AccT>
inline void getPass2ReduceBlockGrid(THCState* state, long elements,
dim3& grid, dim3& block) {
grid = dim3(1);
// We only need as many threads as there were blocks originally
- block = dim3(getTwoPassBlocks<T>(state, elements));
+ block = dim3(getTwoPassBlocks<InT, AccT>(state, elements));
}
-template <typename T>
+template <typename InT, typename AccT>
inline void getSinglePassReduceBlockGrid(long elements,
dim3& grid, dim3& block) {
grid = dim3(1);
@@ -160,75 +169,83 @@ inline void getSinglePassReduceBlockGrid(long elements,
template <typename ModifyOp,
typename ReduceOp,
- typename T,
+ typename ReduceAccOp,
+ typename InT,
+ typename AccT,
typename IndexType,
int ADims>
void callReduceAll(THCState* state,
- const TensorInfo<T, IndexType>& in,
+ const TensorInfo<InT, IndexType>& in,
long totalElements,
- T init,
+ AccT init,
const ModifyOp& modifyOp,
const ReduceOp& reduceOp,
- T* devOut) {
+ const ReduceAccOp& reduceAccOp,
+ AccT* devOut) {
dim3 grid;
dim3 block;
if (isTwoPassReductionSize(totalElements)) {
- getPass1ReduceBlockGrid<T>(state, totalElements, grid, block);
- size_t smemSize = block.x * sizeof(T);
+ getPass1ReduceBlockGrid<InT, AccT>(state, totalElements, grid, block);
+ size_t smemSize = block.x * sizeof(AccT);
- THCudaTensor_reduceAllPass1<ModifyOp, ReduceOp, T, IndexType, ADims>
+ kernelReduceAllPass1<ModifyOp, ReduceOp, ReduceAccOp, InT, AccT, IndexType, ADims>
<<<grid, block, smemSize, THCState_getCurrentStream(state)>>>(
- in, (IndexType) totalElements, init, modifyOp, reduceOp,
- (T*) THCState_getCurrentDeviceScratchSpace(state));
+ in, (IndexType) totalElements, init, modifyOp, reduceOp, reduceAccOp,
+ (AccT*) THCState_getCurrentDeviceScratchSpace(state));
int numPass1Blocks = grid.x;
- getPass2ReduceBlockGrid<T>(state, totalElements, grid, block);
- smemSize = block.x * sizeof(T);
+ getPass2ReduceBlockGrid<InT, AccT>(state, totalElements, grid, block);
+ smemSize = block.x * sizeof(AccT);
- THCudaTensor_reduceAllPass2<ReduceOp, T, IndexType>
+ kernelReduceAllPass2<ReduceAccOp, AccT, IndexType>
<<<grid, block, smemSize, THCState_getCurrentStream(state)>>>(
- numPass1Blocks, init, reduceOp,
- (T*) THCState_getCurrentDeviceScratchSpace(state),
+ numPass1Blocks, init, reduceAccOp,
+ (AccT*) THCState_getCurrentDeviceScratchSpace(state),
devOut);
} else {
- getSinglePassReduceBlockGrid<T>(totalElements, grid, block);
- size_t smemSize = block.x * sizeof(T);
+ getSinglePassReduceBlockGrid<InT, AccT>(totalElements, grid, block);
+ size_t smemSize = block.x * sizeof(AccT);
- THCudaTensor_reduceAll<ModifyOp, ReduceOp, T, IndexType, ADims>
+ kernelReduceAll<ModifyOp, ReduceOp, ReduceAccOp, InT, AccT, IndexType, ADims>
<<<grid, block, smemSize, THCState_getCurrentStream(state)>>>(
- in, (IndexType) totalElements, init, modifyOp, reduceOp, devOut);
+ in, (IndexType) totalElements, init, modifyOp, reduceOp, reduceAccOp, devOut);
}
}
-// Reduces the entire tensor to one floating-point value. `out` points
-// to host-resident memory.
-template <typename ModifyOp, typename ReduceOp>
-bool THCudaTensor_reduceAll(THCState* state,
- THCudaTensor* in,
- const ModifyOp& modifyOp,
- const ReduceOp& reduceOp,
- float init,
- float* out,
- int outOnDevice) {
- long inElements = THCudaTensor_nElement(state, in);
-
- if (THCudaTensor_nDimension(state, in) > MAX_CUTORCH_DIMS) {
+// Reduces the entire tensor to one value. `out` points to
+// host-resident memory.
+template <typename TensorType,
+ typename ModifyOp,
+ typename ReduceOp,
+ typename ReduceAccOp,
+ typename AccT>
+bool THC_reduceAll(THCState* state,
+ TensorType* in,
+ const ModifyOp& modifyOp,
+ const ReduceOp& reduceOp,
+ const ReduceAccOp& reduceAccOp,
+ AccT init,
+ AccT* out,
+ int outOnDevice) {
+ long inElements = TensorUtils<TensorType>::getNumElements(state, in);
+
+ if (TensorUtils<TensorType>::getDims(state, in) > MAX_CUTORCH_DIMS) {
return false;
}
- if (THCudaTensor_nDimension(state, in) == 0) {
+ if (TensorUtils<TensorType>::getDims(state, in) == 0) {
// Zero-dim tensor; do nothing
*out = init;
return true;
}
- float* devOut = out;
+ AccT* devOut = out;
if (!outOnDevice) {
// Use the stream-specific scratch space for the reduction kernel
// to write out its value
- devOut = (float*) THCState_getCurrentDeviceScratchSpace(state);
+ devOut = (AccT*) THCState_getCurrentDeviceScratchSpace(state);
}
// It is possible that the tensor dimensions are able to be collapsed,
@@ -240,10 +257,12 @@ bool THCudaTensor_reduceAll(THCState* state,
// dimension, and the loop to translate the linear index to the array
// index can be similarly collapsed. That is what this unrolling is for.
#define HANDLE_CASE(TYPE, IN) \
- callReduceAll<ModifyOp, ReduceOp, \
- typename TensorUtils<THCudaTensor>::DataType, \
+ callReduceAll<ModifyOp, ReduceOp, ReduceAccOp, \
+ typename TensorUtils<TensorType>::DataType, \
+ AccT, \
TYPE, IN>( \
- state, inInfo, inElements, init, modifyOp, reduceOp, devOut);
+ state, inInfo, inElements, init, modifyOp, \
+ reduceOp, reduceAccOp, devOut);
#define HANDLE_IN_CASE(TYPE, IN) \
{ \
@@ -257,9 +276,6 @@ bool THCudaTensor_reduceAll(THCState* state,
case 2: \
HANDLE_CASE(TYPE, 2); \
break; \
- case 3: \
- HANDLE_CASE(TYPE, 3); \
- break; \
default: \
HANDLE_CASE(TYPE, -1); \
break; \
@@ -267,15 +283,16 @@ bool THCudaTensor_reduceAll(THCState* state,
} \
}
- if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, in)) {
- TensorInfo<float, unsigned int> inInfo =
- getTensorInfo<THCudaTensor, unsigned int>(state, in);
+ if (TensorUtils<TensorType>::canUse32BitIndexMath(state, in)) {
+ TensorInfo<typename TensorUtils<TensorType>::DataType, unsigned int> inInfo =
+ getTensorInfo<TensorType, unsigned int>(state, in);
inInfo.collapseDims();
HANDLE_IN_CASE(unsigned int, inInfo.dims);
} else {
- TensorInfo<float, unsigned long long> inInfo =
- getTensorInfo<THCudaTensor, unsigned long long>(state, in);
+ TensorInfo<typename TensorUtils<TensorType>::DataType,
+ unsigned long long> inInfo =
+ getTensorInfo<TensorType, unsigned long long>(state, in);
inInfo.collapseDims();
// For large tensors, we only compile the completely contiguous
@@ -293,7 +310,7 @@ bool THCudaTensor_reduceAll(THCState* state,
// If our destination is not on the device, copy the value back to
// the host (synchronous!)
if (!outOnDevice) {
- cudaMemcpy(out, devOut, sizeof(float), cudaMemcpyDeviceToHost);
+ cudaMemcpy(out, devOut, sizeof(AccT), cudaMemcpyDeviceToHost);
}
return true;
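
THC_reduceAll above switches between a single-pass kernel and a two-pass scheme once the element count crosses THC_TWO_PASS_REDUCTION_SIZE: in pass 1 every block reduces its chunk into per-block scratch space, and in pass 2 a single block folds those partials into the final value. A minimal standalone sketch of that two-pass shape for a float sum follows, with fixed block and grid sizes and no ModifyOp; sumPass1 and sumPass2 are illustrative names.

#include <cstdio>
#include <cuda_runtime.h>

// Pass 1: every block reduces its contiguous chunk into scratch[blockIdx.x].
__global__ void sumPass1(const float* in, float* scratch, int n) {
  extern __shared__ char smemChar[];
  float* smem = (float*) smemChar;

  int chunk = (n + gridDim.x - 1) / gridDim.x;
  int begin = blockIdx.x * chunk;
  int end = min(begin + chunk, n);

  float r = 0.0f;
  for (int i = begin + threadIdx.x; i < end; i += blockDim.x) r += in[i];
  smem[threadIdx.x] = r;
  __syncthreads();
  for (int s = blockDim.x / 2; s > 0; s /= 2) {
    if (threadIdx.x < s) smem[threadIdx.x] += smem[threadIdx.x + s];
    __syncthreads();
  }
  if (threadIdx.x == 0) scratch[blockIdx.x] = smem[0];
}

// Pass 2: one block folds the per-block partials into the final value.
__global__ void sumPass2(const float* scratch, float* out, int numPass1Blocks) {
  extern __shared__ char smemChar[];
  float* smem = (float*) smemChar;

  smem[threadIdx.x] = threadIdx.x < numPass1Blocks ? scratch[threadIdx.x] : 0.0f;
  __syncthreads();
  for (int s = blockDim.x / 2; s > 0; s /= 2) {
    if (threadIdx.x < s) smem[threadIdx.x] += smem[threadIdx.x + s];
    __syncthreads();
  }
  if (threadIdx.x == 0) *out = smem[0];
}

int main() {
  const int n = 1 << 20, blocks = 64, threads = 256;
  float *in, *scratch, *out;
  cudaMallocManaged((void**) &in, n * sizeof(float));
  cudaMallocManaged((void**) &scratch, blocks * sizeof(float));
  cudaMallocManaged((void**) &out, sizeof(float));
  for (int i = 0; i < n; ++i) in[i] = 1.0f;

  sumPass1<<<blocks, threads, threads * sizeof(float)>>>(in, scratch, n);
  sumPass2<<<1, blocks, blocks * sizeof(float)>>>(scratch, out, blocks);
  cudaDeviceSynchronize();
  printf("sum = %f\n", *out);  // expect 1048576
  return 0;
}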
diff --git a/lib/THC/THCTensorCopy.cu b/lib/THC/THCTensorCopy.cu
index ea38d2e..e74d23d 100644
--- a/lib/THC/THCTensorCopy.cu
+++ b/lib/THC/THCTensorCopy.cu
@@ -1,5 +1,6 @@
#include "THCApply.cuh"
#include "THCHalf.h"
+#include "THCNumerics.cuh"
inline int curGPU() {
int curDev;
diff --git a/lib/THC/THCTensorMasked.cu b/lib/THC/THCTensorMasked.cu
index 6ee5bad..f3cb4ef 100644
--- a/lib/THC/THCTensorMasked.cu
+++ b/lib/THC/THCTensorMasked.cu
@@ -1,9 +1,6 @@
#include "THCTensorMath.h"
#include "THCGeneral.h"
-#include "THCDeviceUtils.cuh"
-#include "THCBlas.h"
#include "THCTensorCopy.h"
-#include "THCTensorRandom.h"
#include "THCApply.cuh"
#include "THCReduce.cuh"
@@ -13,202 +10,47 @@
#include <thrust/system/cuda/execution_policy.h>
#endif
+template <typename T, typename MaskT>
struct TensorMaskedFillOp {
- TensorMaskedFillOp(float v) : value(v) {}
- __device__ __forceinline__ void operator()(float* t, float* mask) {
- // Really mask should be `0` or `1` but we can't propagate errors here.
- if (*mask != 0.0f) {
+ TensorMaskedFillOp(T v) : value(v) {}
+ __device__ inline void operator()(T* t, MaskT* mask) {
+ if (*mask) {
*t = value;
}
}
- float value;
+ T value;
};
-void THCudaTensor_maskedFill(THCState* state,
- THCudaTensor *tensor, THCudaTensor *mask, float value)
-{
- THAssert(THCudaTensor_checkGPU(state, 2, tensor, mask));
- THArgCheck(THCudaTensor_nElement(state, tensor) ==
- THCudaTensor_nElement(state, mask),
- 2, "sizes do not match");
-
- if (!THC_pointwiseApply2(state, tensor, mask, TensorMaskedFillOp(value))) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
-
- THCudaCheck(cudaGetLastError());
-}
-
+template <typename T, typename MaskT, typename MaskPrefixSumT>
struct TensorMaskedCopyOp {
- TensorMaskedCopyOp(float* s) : src(s) {}
+ TensorMaskedCopyOp(T* s) : in(s) {}
- __device__ __forceinline__ void operator()(float* out, float* mask, float* maskPrefixSum) {
- // Really mask should be `0` or `1` but we can't propagate errors here.
- if (*mask != 0.0f) {
- // We've already checked that this offset is <= 2^24, so this is ok.
- *out = src[(int) *maskPrefixSum];
+ __device__ inline void operator()(T* out,
+ MaskT* mask,
+ MaskPrefixSumT* maskPrefixSum) {
+ if (*mask) {
+ *out = in[*maskPrefixSum];
}
}
// Where we are copying from
- float* src;
+ T* in;
};
-
-void THCudaTensor_maskedCopy(THCState* state,
- THCudaTensor *tensor, THCudaTensor *mask, THCudaTensor *src)
-{
- THAssert(THCudaTensor_checkGPU(state, 3, tensor, src, mask));
- long maskSize = THCudaTensor_nElement(state, mask);
- long tensorSize = THCudaTensor_nElement(state, tensor);
- long srcSize = THCudaTensor_nElement(state, src);
-
- // Since we are performing a prefix sum of mask, it cannot exceed
- // the size allowed in consecutive integers in float32
- THArgCheck(maskSize <= (long) FLOAT32_MAX_CONSECUTIVE_INT,
- 3, "mask nElements exceeds single-precision float "
- "consecutive integer precision size (2^24)");
-
- // `mask` and `tensor` must have the same number of elements
- THArgCheck(maskSize == tensorSize, 2,
- "mask and tensor must have the same number of elements");
-
- THCudaTensor* contigMask = THCudaTensor_newContiguous(state, mask);
- long oneElements = (long) THCudaTensor_sumall(state, contigMask);
-
- // The number of `1` elements present in the mask must be <= the
- // number of elements available in `src`
- if (oneElements > srcSize) {
- THCudaTensor_free(state, contigMask);
- THArgCheck(false, 2, "source nElements must be == mask `1` elements");
- }
-
- // Use a prefix sum to determine the copy locations of the masked elements
- THCudaTensor* maskPrefixSum = THCudaTensor_new(state);
- THCudaTensor_resizeAs(state, maskPrefixSum, contigMask);
-
- // We are getting elements from `src` based on an offset from
- // `maskPrefixSum`, so that should be made contiguous too
- THCudaTensor* contigSrc = THCudaTensor_newContiguous(state, src);
-
- thrust::device_ptr<float>
- maskData(THCudaTensor_data(state, contigMask));
- thrust::device_ptr<float>
- maskPrefixSumData(THCudaTensor_data(state, maskPrefixSum));
- thrust::exclusive_scan(
-#if CUDA_VERSION >= 7000
- thrust::cuda::par.on(THCState_getCurrentStream(state)),
-#endif
- maskData,
- maskData + THCudaTensor_nElement(state, contigMask),
- maskPrefixSumData);
-
- // update `tensor` where `mask` == 1 but pull from `src` at
- // maskPrefixSum
- bool status = THC_pointwiseApply3(
- state, tensor, contigMask, maskPrefixSum,
- TensorMaskedCopyOp(THCudaTensor_data(state, contigSrc)));
-
- THCudaTensor_free(state, contigSrc);
- THCudaTensor_free(state, maskPrefixSum);
- THCudaTensor_free(state, contigMask);
-
- THArgCheck(status, 2, CUTORCH_DIM_WARNING);
- THCudaCheck(cudaGetLastError());
-}
-
+template <typename T, typename MaskT, typename MaskPrefixSumT>
struct TensorMaskedSelectOp {
- TensorMaskedSelectOp(float* t) : out(t) {}
- __device__ __forceinline__ void operator()(float* mask, float* maskPrefixSum, float* in) {
- // Really mask should be `0` or `1` but we can't propagate errors here.
- if (*mask != 0.0f) {
- out[(int) *maskPrefixSum] = *in;
+ TensorMaskedSelectOp(T* t) : out(t) {}
+ __device__ inline void operator()(MaskT* mask,
+ MaskPrefixSumT* maskPrefixSum,
+ T* in) {
+ if (*mask) {
+ out[*maskPrefixSum] = *in;
}
}
- float* out;
+ T* out;
};
-void THCudaTensor_maskedSelect(THCState* state,
- THCudaTensor *tensor, THCudaTensor *src, THCudaTensor *mask)
-{
- THAssert(THCudaTensor_checkGPU(state, 3, tensor, src, mask));
- THArgCheck(THCudaTensor_nElement(state, mask) == THCudaTensor_nElement(state, src),
- 2, "sizes do not match");
-
- // Since we are performing a prefix sum of mask, it cannot exceed
- // the size allowed in consecutive integers in float32
- THArgCheck(THCudaTensor_nElement(state, mask) <=
- (long) FLOAT32_MAX_CONSECUTIVE_INT,
- 3, "mask nElements exceeds single-precision float "
- "consecutive integer precision size (2^24)");
-
- // Determine our output size
- THCudaTensor* contigMask = THCudaTensor_newContiguous(state, mask);
- long totalElements = (long) THCudaTensor_sumall(state, contigMask);
-
- // This should be contiguous already, so no need to make it contig
- // for the apply kernel
- THCudaTensor_resize1d(state, tensor, totalElements);
-
- // Use a prefix sum to determine the output locations of the masked elements
- THCudaTensor* maskPrefixSum = THCudaTensor_new(state);
- THCudaTensor_resizeAs(state, maskPrefixSum, contigMask);
-
- thrust::device_ptr<float>
- maskData(THCudaTensor_data(state, contigMask));
- thrust::device_ptr<float>
- maskPrefixSumData(THCudaTensor_data(state, maskPrefixSum));
- thrust::exclusive_scan(
-#if CUDA_VERSION >= 7000
- thrust::cuda::par.on(THCState_getCurrentStream(state)),
-#endif
- maskData,
- maskData + THCudaTensor_nElement(state, contigMask),
- maskPrefixSumData);
-
- // Then copy over the masked elements at their desired output index
- bool status = THC_pointwiseApply3(
- state, contigMask, maskPrefixSum,
- src, TensorMaskedSelectOp(THCudaTensor_data(state, tensor)));
-
- THCudaTensor_free(state, contigMask);
- THCudaTensor_free(state, maskPrefixSum);
-
- THArgCheck(status, 2, CUTORCH_DIM_WARNING);
- THCudaCheck(cudaGetLastError());
-}
-
-void THCudaTensor_maskedFillByte(THCState* state, THCudaTensor *tensor, THByteTensor *mask, float value)
-{
- THAssert(THCudaTensor_checkGPU(state, 1, tensor));
- THLongStorage* maskSize = THByteTensor_newSizeOf(mask);
- THCudaTensor* maskCuda = THCudaTensor_newWithSize(state, maskSize, NULL);
- THLongStorage_free(maskSize);
- THCudaTensor_copyByte(state, maskCuda, mask);
- THCudaTensor_maskedFill(state, tensor, maskCuda, value);
- THCudaTensor_free(state, maskCuda);
-}
-
-void THCudaTensor_maskedCopyByte(THCState* state, THCudaTensor *tensor, THByteTensor *mask, THCudaTensor *src)
-{
- THAssert(THCudaTensor_checkGPU(state, 2, tensor, src));
- THLongStorage* maskSize = THByteTensor_newSizeOf(mask);
- THCudaTensor* maskCuda = THCudaTensor_newWithSize(state, maskSize, NULL);
- THLongStorage_free(maskSize);
- THCudaTensor_copyByte(state, maskCuda, mask);
- THCudaTensor_maskedCopy(state, tensor, maskCuda, src);
- THCudaTensor_free(state, maskCuda);
-}
-
-void THCudaTensor_maskedSelectByte(THCState* state, THCudaTensor *tensor, THCudaTensor *src, THByteTensor *mask)
-{
- THAssert(THCudaTensor_checkGPU(state, 2, tensor, src));
- THLongStorage* maskSize = THByteTensor_newSizeOf(mask);
- THCudaTensor* maskCuda = THCudaTensor_newWithSize(state, maskSize, NULL);
- THLongStorage_free(maskSize);
- THCudaTensor_copyByte(state, maskCuda, mask);
- THCudaTensor_maskedSelect(state, tensor, src, maskCuda);
- THCudaTensor_free(state, maskCuda);
-}
+#include "generic/THCTensorMasked.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMath.cu b/lib/THC/THCTensorMath.cu
index 5e140ef..f2a7607 100644
--- a/lib/THC/THCTensorMath.cu
+++ b/lib/THC/THCTensorMath.cu
@@ -1,12 +1,9 @@
#include "THCTensorMath.h"
#include "THCGeneral.h"
#include "THCTensorCopy.h"
-#include "THCTensorRandom.h"
#include "THCApply.cuh"
-#include "THCReduce.cuh"
-#include "THCReduceAll.cuh"
+#include "THCNumerics.cuh"
-#include <thrust/functional.h>
#include <cfloat>
void THCudaTensor_cat(THCState *state, THCudaTensor *result, THCudaTensor *ta, THCudaTensor *tb, int dimension)
@@ -147,132 +144,6 @@ void THCudaTensor_addcdiv(THCState *state, THCudaTensor *self_, THCudaTensor *t,
THCudaCheck(cudaGetLastError());
}
-float THCudaTensor_minall(THCState *state, THCudaTensor *self)
-{
- THAssert(THCudaTensor_checkGPU(state, 1, self));
- float val = FLT_MAX;
- if (!THCudaTensor_reduceAll(state, self,
- thrust::identity<float>(),
- thrust::minimum<float>(),
- FLT_MAX, &val, 0)) {
- THArgCheck(false, 1, CUTORCH_DIM_WARNING);
- }
-
- THCudaCheck(cudaGetLastError());
- return val;
-}
-
-float THCudaTensor_maxall(THCState *state, THCudaTensor *self)
-{
- THAssert(THCudaTensor_checkGPU(state, 1, self));
- float val = -FLT_MAX;
- if (!THCudaTensor_reduceAll(state, self,
- thrust::identity<float>(),
- thrust::maximum<float>(),
- -FLT_MAX, &val, 0)) {
- THArgCheck(false, 1, CUTORCH_DIM_WARNING);
- }
-
- THCudaCheck(cudaGetLastError());
- return val;
-}
-
-float THCudaTensor_sumall(THCState *state, THCudaTensor *self)
-{
- THAssert(THCudaTensor_checkGPU(state, 1, self));
- float val = 0.0f;
- if (!THCudaTensor_reduceAll(state, self,
- thrust::identity<float>(),
- thrust::plus<float>(),
- 0.0f, &val, 0)) {
- THArgCheck(false, 1, CUTORCH_DIM_WARNING);
- }
-
- THCudaCheck(cudaGetLastError());
- return val;
-}
-
-float THCudaTensor_prodall(THCState *state, THCudaTensor *self)
-{
- THAssert(THCudaTensor_checkGPU(state, 1, self));
- float val = 1.0f;
- if (!THCudaTensor_reduceAll(state, self,
- thrust::identity<float>(),
- thrust::multiplies<float>(),
- 1.0f, &val, 0)) {
- THArgCheck(false, 1, CUTORCH_DIM_WARNING);
- }
-
- THCudaCheck(cudaGetLastError());
- return val;
-}
-
-void THCudaTensor_sum(THCState* state, THCudaTensor *self, THCudaTensor *src, long dimension)
-{
- THAssert(THCudaTensor_checkGPU(state, 2, self, src));
- if (!THCudaTensor_reduceDim(
- state, self, src,
- thrust::identity<float>(), thrust::plus<float>(), 0.0f, dimension)) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
-
- THCudaCheck(cudaGetLastError());
-}
-
-void THCudaTensor_prod(THCState* state, THCudaTensor *self, THCudaTensor *src, long dimension)
-{
- THAssert(THCudaTensor_checkGPU(state, 2, self, src));
- if (!THCudaTensor_reduceDim(
- state, self, src,
- thrust::identity<float>(), thrust::multiplies<float>(), 1.0f, dimension)) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
-
- THCudaCheck(cudaGetLastError());
-}
-
-struct logicalall_functor
-{
- __device__ inline float operator()(float x, float y) const
- {
- return x && y;
- }
-};
-
-struct logicalany_functor
-{
- __device__ float operator()(float x, float y) const
- {
- return x || y;
- }
-};
-
-int THCudaTensor_logicalall(THCState *state, THCudaTensor *self) {
- THAssert(THCudaTensor_checkGPU(state, 1, self));
- float result = 0.0f;
- if (!THCudaTensor_reduceAll(state, self,
- thrust::identity<float>(),
- logicalall_functor(),
- 1.0f, &result, 0)) {
- THArgCheck(false, 1, CUTORCH_DIM_WARNING);
- }
-
- return (int) result;
-}
-
-int THCudaTensor_logicalany(THCState *state, THCudaTensor *self) {
- THAssert(THCudaTensor_checkGPU(state, 1, self));
- float result = 0.0f;
- if (!THCudaTensor_reduceAll(state, self,
- thrust::identity<float>(),
- logicalany_functor(),
- 0.0f, &result, 0)) {
- THArgCheck(false, 1, CUTORCH_DIM_WARNING);
- }
-
- return (int) result;
-}
-
template <typename T>
struct TensorFillOp {
TensorFillOp(T v) : val(v) {}
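
The minall/maxall/sumall/prodall and logicalall/logicalany wrappers removed above all instantiate the same recipe: an identity ModifyOp, a binary ReduceOp, and the operation's neutral element as init. Outside cutorch the same recipe is a one-liner with Thrust; the sketch below (not cutorch code) shows the min and sum instances of it.

#include <cstdio>
#include <cfloat>
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>

int main() {
  thrust::device_vector<float> t(4);
  t[0] = 3.f; t[1] = -1.f; t[2] = 7.f; t[3] = 2.f;

  // minall: ModifyOp = identity, ReduceOp = minimum, init = FLT_MAX
  float minval = thrust::transform_reduce(t.begin(), t.end(),
                                          thrust::identity<float>(),
                                          FLT_MAX,
                                          thrust::minimum<float>());

  // sumall: ModifyOp = identity, ReduceOp = plus, init = 0
  float sumval = thrust::transform_reduce(t.begin(), t.end(),
                                          thrust::identity<float>(),
                                          0.0f,
                                          thrust::plus<float>());

  printf("min = %f, sum = %f\n", minval, sumval);  // min = -1, sum = 11
  return 0;
}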
diff --git a/lib/THC/THCTensorMath.h b/lib/THC/THCTensorMath.h
index 6593da2..89d8989 100644
--- a/lib/THC/THCTensorMath.h
+++ b/lib/THC/THCTensorMath.h
@@ -7,12 +7,26 @@
#include "generic/THCTensorMath.h"
#include "THCGenerateAllTypes.h"
+#include "generic/THCTensorMathBlas.h"
+#include "THCGenerateAllTypes.h"
+
#include "generic/THCTensorMathPairwise.h"
#include "THCGenerateAllTypes.h"
#include "generic/THCTensorMathPointwise.h"
#include "THCGenerateAllTypes.h"
+#include "generic/THCTensorMathReduce.h"
+#include "THCGenerateAllTypes.h"
+
+#include "generic/THCTensorMathCompare.h"
+#include "THCGenerateAllTypes.h"
+
+#include "generic/THCTensorMathCompareT.h"
+#include "THCGenerateAllTypes.h"
+
+#include "generic/THCTensorMasked.h"
+#include "THCGenerateAllTypes.h"
THC_API void THCudaTensor_tril(THCState *state, THCudaTensor *self, THCudaTensor *src, long k);
THC_API void THCudaTensor_triu(THCState *state, THCudaTensor *self, THCudaTensor *src, long k);
@@ -20,16 +34,6 @@ THC_API void THCudaTensor_triu(THCState *state, THCudaTensor *self, THCudaTensor
THC_API void THCudaTensor_addcmul(THCState *state, THCudaTensor *self, THCudaTensor* t, float value, THCudaTensor *src1, THCudaTensor *src2);
THC_API void THCudaTensor_addcdiv(THCState *state, THCudaTensor *self, THCudaTensor* t, float value, THCudaTensor *src1, THCudaTensor *src2);
-THC_API float THCudaTensor_dot(THCState *state, THCudaTensor *self, THCudaTensor *src);
-
-THC_API float THCudaTensor_minall(THCState *state, THCudaTensor *self);
-THC_API float THCudaTensor_maxall(THCState *state, THCudaTensor *self);
-THC_API float THCudaTensor_sumall(THCState *state, THCudaTensor *self);
-THC_API float THCudaTensor_prodall(THCState *state, THCudaTensor *self);
-THC_API void THCudaTensor_min(THCState *state, THCudaTensor *values, THCudaTensor *indices, THCudaTensor *src, long dim);
-THC_API void THCudaTensor_max(THCState *state, THCudaTensor *values, THCudaTensor *indices, THCudaTensor *src, long dim);
-THC_API void THCudaTensor_sum(THCState *state, THCudaTensor *self, THCudaTensor *src, long dim);
-THC_API void THCudaTensor_prod(THCState *state, THCudaTensor *self, THCudaTensor *src, long dim);
THC_API void THCudaTensor_cumsum(THCState *state, THCudaTensor *self, THCudaTensor *src, long dim);
THC_API void THCudaTensor_cumprod(THCState *state, THCudaTensor *self, THCudaTensor *src, long dim);
@@ -38,14 +42,6 @@ THC_API void THCudaTensor_cmax(THCState *state, THCudaTensor *self, THCudaTensor
THC_API void THCudaTensor_cminValue(THCState *state, THCudaTensor *self, THCudaTensor *src, float value);
THC_API void THCudaTensor_cmaxValue(THCState *state, THCudaTensor *self, THCudaTensor *src, float value);
-THC_API void THCudaTensor_addmv(THCState *state, THCudaTensor *self, float beta, THCudaTensor *t, float alpha, THCudaTensor *mat, THCudaTensor *vec);
-THC_API void THCudaTensor_addmm(THCState *state, THCudaTensor *self, float beta, THCudaTensor *t, float alpha, THCudaTensor *mat1, THCudaTensor *mat2);
-THC_API void THCudaTensor_addr(THCState *state, THCudaTensor *self, float beta, THCudaTensor *t, float alpha, THCudaTensor *vec1, THCudaTensor *vec2);
-THC_API void THCudaTensor_addbmm(THCState *state, THCudaTensor *result, float beta, THCudaTensor *t,
- float alpha, THCudaTensor *batch1, THCudaTensor *batch2);
-THC_API void THCudaTensor_baddbmm(THCState *state, THCudaTensor *result, float beta, THCudaTensor *t,
- float alpha, THCudaTensor *batch1, THCudaTensor *batch2);
-
THC_API void THCudaTensor_log(THCState *state, THCudaTensor *self, THCudaTensor *src);
THC_API void THCudaTensor_log1p(THCState *state, THCudaTensor *self, THCudaTensor *src);
THC_API void THCudaTensor_sigmoid(THCState *state, THCudaTensor *self, THCudaTensor *src);
@@ -93,20 +89,6 @@ THC_API void THCudaTensor_qr(THCState *state, THCudaTensor *rq_, THCudaTensor *r
THC_API void THCudaTensor_cat(THCState *state, THCudaTensor *result, THCudaTensor *ta, THCudaTensor *tb, int dimension);
THC_API void THCudaTensor_catArray(THCState *state, THCudaTensor *result, THCudaTensor **inputs, int numInputs, int dimension);
-THC_API void THCudaTensor_ltValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value);
-THC_API void THCudaTensor_gtValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value);
-THC_API void THCudaTensor_leValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value);
-THC_API void THCudaTensor_geValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value);
-THC_API void THCudaTensor_eqValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value);
-THC_API void THCudaTensor_neValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value);
-
-THC_API void THCudaTensor_ltTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2);
-THC_API void THCudaTensor_gtTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2);
-THC_API void THCudaTensor_leTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2);
-THC_API void THCudaTensor_geTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2);
-THC_API void THCudaTensor_eqTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2);
-THC_API void THCudaTensor_neTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2);
-
THC_API float THCudaTensor_meanall(THCState *state, THCudaTensor *self);
THC_API void THCudaTensor_mean(THCState *state, THCudaTensor *self, THCudaTensor *src, long dim);
THC_API float THCudaTensor_varall(THCState *state, THCudaTensor *self);
@@ -131,19 +113,11 @@ THC_API void THCudaTensor_indexAdd_long(THCState *state, THCudaTensor *res_, int
THC_API void THCudaTensor_indexFill_long(THCState *state, THCudaTensor *tensor, int dim, THLongTensor *index, float val);
THC_API void THCudaTensor_indexSelect_long(THCState *state, THCudaTensor *tensor, THCudaTensor *src, int dim, THLongTensor *index);
-THC_API void THCudaTensor_maskedFill(THCState *state, THCudaTensor *tensor, THCudaTensor *mask, float value);
-THC_API void THCudaTensor_maskedCopy(THCState *state, THCudaTensor *tensor, THCudaTensor *mask, THCudaTensor *src);
-THC_API void THCudaTensor_maskedSelect(THCState *state, THCudaTensor *tensor, THCudaTensor *src, THCudaTensor *mask);
-
-THC_API void THCudaTensor_maskedFillByte(THCState *state, THCudaTensor *tensor, THByteTensor *mask, float value);
-THC_API void THCudaTensor_maskedCopyByte(THCState *state, THCudaTensor *tensor, THByteTensor *mask, THCudaTensor *src);
-THC_API void THCudaTensor_maskedSelectByte(THCState *state, THCudaTensor *tensor, THCudaTensor *src, THByteTensor *mask);
-
THC_API void THCudaTensor_gather(THCState* state, THCudaTensor *tensor, THCudaTensor *src, int dim, THCudaTensor *index);
THC_API void THCudaTensor_scatter(THCState* state, THCudaTensor *tensor, int dim, THCudaTensor *index, THCudaTensor *src);
THC_API void THCudaTensor_scatterFill(THCState* state, THCudaTensor *tensor, int dim, THCudaTensor *index, float value);
-THC_API int THCudaTensor_logicalall(THCState *state, THCudaTensor *self);
-THC_API int THCudaTensor_logicalany(THCState *state, THCudaTensor *self);
+THC_API int THCudaByteTensor_logicalall(THCState *state, THCudaByteTensor *self);
+THC_API int THCudaByteTensor_logicalany(THCState *state, THCudaByteTensor *self);
#endif
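
Each generic/...h plus THCGenerateAllTypes.h pair added above stamps one generic declaration set out once per element type (CudaByteTensor through CudaHalfTensor), which is what allows the float-only prototypes to be deleted. The real mechanism repeatedly re-includes the generic header with the real/Real macros redefined; the single-file analogue below only illustrates that per-type stamping idea and uses made-up names, not the TH macros.

#include <cstdio>

// One "generic" body, parameterized on the element type and a type tag.
#define DECLARE_FOR_TYPE(real, Real)                       \
  real Real##Tensor_sumall(const real* data, int n) {      \
    real s = 0;                                            \
    for (int i = 0; i < n; ++i) s += data[i];              \
    return s;                                              \
  }

// This plays the role of THCGenerateAllTypes.h: expand the body once per type.
DECLARE_FOR_TYPE(float, CudaFloat)
DECLARE_FOR_TYPE(double, CudaDouble)
DECLARE_FOR_TYPE(int, CudaInt)

int main() {
  float f[3] = {1.f, 2.f, 3.f};
  int i[3] = {1, 2, 3};
  printf("%f %d\n", CudaFloatTensor_sumall(f, 3), CudaIntTensor_sumall(i, 3));
  return 0;
}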
diff --git a/lib/THC/THCTensorMath2.cu b/lib/THC/THCTensorMath2.cu
index 0d6bb52..dc8544c 100644
--- a/lib/THC/THCTensorMath2.cu
+++ b/lib/THC/THCTensorMath2.cu
@@ -525,24 +525,24 @@ void THCudaTensor_norm(THCState *state, THCudaTensor* self, THCudaTensor* src, f
{
THAssert(THCudaTensor_checkGPU(state, 2, self, src));
if (value == 0.0f) {
- THCudaTensor_reduceDim(state, self, src,
- TensorNonZeroOp(), thrust::plus<float>(),
- 0.0f, dimension);
+ THC_reduceDim(state, self, src,
+ TensorNonZeroOp(), thrust::plus<float>(),
+ 0.0f, dimension);
} else if (value == 1.0f) {
- THCudaTensor_reduceDim(state, self, src,
- TensorNormOp<1>(value), thrust::plus<float>(),
- 0.0f, dimension);
+ THC_reduceDim(state, self, src,
+ TensorNormOp<1>(value), thrust::plus<float>(),
+ 0.0f, dimension);
} else if (value == 2.0f) {
- THCudaTensor_reduceDim(state, self, src,
- TensorNormOp<2>(value), thrust::plus<float>(),
- 0.0f, dimension);
+ THC_reduceDim(state, self, src,
+ TensorNormOp<2>(value), thrust::plus<float>(),
+ 0.0f, dimension);
THCudaTensor_pow(state, self, self, 0.5f);
} else {
- THCudaTensor_reduceDim(state, self, src,
- TensorNormOp<-1>(value), thrust::plus<float>(),
- 0.0f, dimension);
+ THC_reduceDim(state, self, src,
+ TensorNormOp<-1>(value), thrust::plus<float>(),
+ 0.0f, dimension);
THCudaTensor_pow(state, self, self, 1.0f / value);
}
@@ -684,7 +684,7 @@ THC_API void THCudaTensor_cross(THCState *state, THCudaTensor *self, THCudaTenso
for (i = 0; i < nd; i++) {
THArgCheck(THCudaTensor_size(state, x, i) == THCudaTensor_size(state, y, i), 1, "dimension %i of x and y does not match", i);
if (dimension < 0 && THCudaTensor_size(state, x, i) == 3) {
- dimension = i;
+ dimension = i;
}
}
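
The norm changes above only rename the entry point: a p-norm is still computed as a reduction whose ModifyOp maps each element to |x|^p, whose ReduceOp is plus, and whose result is raised to 1/p afterwards, with p = 1 and p = 2 specialized to avoid the general powf. A small standalone Thrust sketch of that decomposition follows; AbsPow is an illustrative stand-in for TensorNormOp.

#include <cstdio>
#include <math.h>
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>

// Stand-in for TensorNormOp: map each element to |x|^p before the sum.
struct AbsPow {
  float p;
  __host__ __device__ AbsPow(float p_) : p(p_) {}
  __host__ __device__ float operator()(float x) const { return powf(fabsf(x), p); }
};

int main() {
  thrust::device_vector<float> v(3);
  v[0] = 3.f; v[1] = -4.f; v[2] = 0.f;

  float p = 2.0f;
  float sum = thrust::transform_reduce(v.begin(), v.end(), AbsPow(p), 0.0f,
                                       thrust::plus<float>());
  printf("norm = %f\n", powf(sum, 1.0f / p));  // 5.0, the 2-norm of (3, -4, 0)
  return 0;
}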
diff --git a/lib/THC/THCTensorMathBlas.cu b/lib/THC/THCTensorMathBlas.cu
index 0e373d8..0804d64 100644
--- a/lib/THC/THCTensorMathBlas.cu
+++ b/lib/THC/THCTensorMathBlas.cu
@@ -2,425 +2,7 @@
#include "THCGeneral.h"
#include "THCBlas.h"
#include "THCTensorCopy.h"
-#include "THCTensorRandom.h"
-#include "THCApply.cuh"
-#include "THCReduce.cuh"
+#include "THCNumerics.cuh"
-float THCudaTensor_dot(THCState *state, THCudaTensor *self, THCudaTensor *src)
-{
- THAssert(THCudaTensor_checkGPU(state, 2, self, src));
- THArgCheck(THCudaTensor_nElement(state, self) == THCudaTensor_nElement(state, src), 2, "sizes do not match");
-
- {
- self = THCudaTensor_newContiguous(state, self);
- src = THCudaTensor_newContiguous(state, src);
-
- float result = THCudaBlas_dot(state,
- THCudaTensor_nElement(state, self),
- THCudaTensor_data(state, self), 1,
- THCudaTensor_data(state, src), 1);
- THCudaTensor_free(state, src);
- THCudaTensor_free(state, self);
-
- return result;
- }
-}
-
-void THCudaTensor_addmv(THCState *state, THCudaTensor *r_, float beta, THCudaTensor *t, float alpha, THCudaTensor *mat, THCudaTensor *vec)
-{
- THAssert(THCudaTensor_checkGPU(state, 4, r_, t, mat, vec));
- if( (mat->nDimension != 2) || (vec->nDimension != 1) )
- THError("matrix and vector expected");
-
- if( mat->size[1] != vec->size[0] )
- THError("size mismatch");
-
- if(t->nDimension != 1)
- THError("size mismatch");
-
- if(t->size[0] != mat->size[0])
- THError("size mismatch");
-
- if(r_ != t)
- {
- THCudaTensor_resizeAs(state, r_, t);
- THCudaTensor_copy(state, r_, t);
- }
-
- if(mat->stride[0] == 1)
- {
- THCudaBlas_gemv(state, 'n', mat->size[0], mat->size[1],
- alpha, THCudaTensor_data(state, mat), mat->stride[1],
- THCudaTensor_data(state, vec), vec->stride[0],
- beta, THCudaTensor_data(state, r_), r_->stride[0]);
- }
- else if(mat->stride[1] == 1)
- {
- THCudaBlas_gemv(state, 't', mat->size[1], mat->size[0],
- alpha, THCudaTensor_data(state, mat), mat->stride[0],
- THCudaTensor_data(state, vec), vec->stride[0],
- beta, THCudaTensor_data(state, r_), r_->stride[0]);
- }
- else
- {
- THCudaTensor *cmat = THCudaTensor_newContiguous(state, mat);
-
- THCudaBlas_gemv(state, 't', mat->size[1], mat->size[0],
- alpha, THCudaTensor_data(state, cmat), cmat->stride[0],
- THCudaTensor_data(state, vec), vec->stride[0],
- beta, THCudaTensor_data(state, r_), r_->stride[0]);
-
- THCudaTensor_free(state, cmat);
- }
-}
-
-void THCudaTensor_addmm(THCState *state, THCudaTensor *r_, float beta, THCudaTensor *t, float alpha, THCudaTensor *m1, THCudaTensor *m2)
-{
- THAssert(THCudaTensor_checkGPU(state, 4, r_, t, m1, m2));
- char transpose_r, transpose_m1, transpose_m2;
- THCudaTensor *r__, *m1_, *m2_;
-
- if( (m1->nDimension != 2) || (m2->nDimension != 2) )
- THError("matrix and matrix expected");
-
- if(t->nDimension != 2)
- THError("size mismatch");
-
- if( (t->size[0] != m1->size[0]) || (t->size[1] != m2->size[1]) || (m1->size[1] != m2->size[0]) )
- THError("size mismatch");
-
- if(t != r_)
- {
- THCudaTensor_resizeAs(state, r_, t);
- THCudaTensor_copy(state, r_, t);
- }
-
- /* r_ */
- if(r_->stride[0] == 1 &&
- r_->stride[1] != 0)
- {
- transpose_r = 'n';
- r__ = r_;
- }
- else if(r_->stride[1] == 1 &&
- r_->stride[0] != 0)
- {
- THCudaTensor *swap = m2;
- m2 = m1;
- m1 = swap;
- transpose_r = 't';
- r__ = r_;
- }
- else
- {
- transpose_r = 'n';
-
- THCudaTensor *transp_r_ = THCudaTensor_newTranspose(state, r_, 0, 1);
- r__ = THCudaTensor_newClone(state, transp_r_);
- THCudaTensor_free(state, transp_r_);
- THCudaTensor_transpose(state, r__, NULL, 0, 1);
- }
-
- /* m1 */
- if(m1->stride[(transpose_r == 'n' ? 0 : 1)] == 1 &&
- m1->stride[(transpose_r == 'n' ? 1 : 0)] != 0)
- {
- transpose_m1 = 'n';
- m1_ = m1;
- }
- else if(m1->stride[(transpose_r == 'n' ? 1 : 0)] == 1 &&
- m1->stride[(transpose_r == 'n' ? 0 : 1)] != 0)
- {
- transpose_m1 = 't';
- m1_ = m1;
- }
- else
- {
- transpose_m1 = (transpose_r == 'n' ? 't' : 'n');
- m1_ = THCudaTensor_newContiguous(state, m1);
- }
-
- /* m2 */
- if(m2->stride[(transpose_r == 'n' ? 0 : 1)] == 1 &&
- m2->stride[(transpose_r == 'n' ? 1 : 0)] != 0)
- {
- transpose_m2 = 'n';
- m2_ = m2;
- }
- else if(m2->stride[(transpose_r == 'n' ? 1 : 0)] == 1 &&
- m2->stride[(transpose_r == 'n' ? 0 : 1)] != 0)
- {
- transpose_m2 = 't';
- m2_ = m2;
- }
- else
- {
- transpose_m2 = (transpose_r == 'n' ? 't' : 'n');
- m2_ = THCudaTensor_newContiguous(state, m2);
- }
-
- /* do the operation */
- THCudaBlas_gemm(state,
- transpose_m1,
- transpose_m2,
- r__->size[(transpose_r == 'n' ? 0 : 1)],
- r__->size[(transpose_r == 'n' ? 1 : 0)],
- m1_->size[(transpose_r == 'n' ? 1 : 0)],
- alpha,
- THCudaTensor_data(state, m1_),
- (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]),
- THCudaTensor_data(state, m2_),
- (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]),
- beta,
- THCudaTensor_data(state, r__),
- r__->stride[(transpose_r == 'n' ? 1 : 0)]);
-
- /* free intermediate variables */
- if(m1_ != m1)
- THCudaTensor_free(state, m1_);
-
- if(m2_ != m2)
- THCudaTensor_free(state, m2_);
-
- if(r__ != r_)
- THCudaTensor_freeCopyTo(state, r__, r_);
-}
-
-void THCudaTensor_addr(THCState *state, THCudaTensor *r_, float beta, THCudaTensor *t, float alpha, THCudaTensor *vec1, THCudaTensor *vec2)
-{
- THAssert(THCudaTensor_checkGPU(state, 4, r_, t, vec1, vec2));
- if( (vec1->nDimension != 1) || (vec2->nDimension != 1) )
- THError("vector and vector expected");
-
- if(t->nDimension != 2)
- THError("size mismatch");
-
- if( (t->size[0] != vec1->size[0]) || (t->size[1] != vec2->size[0]) )
- THError("size mismatch");
-
- if(r_ != t)
- {
- THCudaTensor_resizeAs(state, r_, t);
- THCudaTensor_copy(state, r_, t);
- }
-
- if(beta != 1)
- THCudaTensor_mul(state, r_, r_, beta);
-
- if(r_->stride[0] == 1)
- {
- THCudaBlas_ger(state, vec1->size[0], vec2->size[0],
- alpha, THCudaTensor_data(state, vec1), vec1->stride[0],
- THCudaTensor_data(state, vec2), vec2->stride[0],
- THCudaTensor_data(state, r_), r_->stride[1]);
- }
- else if(r_->stride[1] == 1)
- {
- THCudaBlas_ger(state, vec2->size[0], vec1->size[0],
- alpha, THCudaTensor_data(state, vec2), vec2->stride[0],
- THCudaTensor_data(state, vec1), vec1->stride[0],
- THCudaTensor_data(state, r_), r_->stride[0]);
- }
- else
- {
- THCudaTensor *cr = THCudaTensor_newClone(state, r_);
-
- THCudaBlas_ger(state, vec2->size[0], vec1->size[0],
- alpha, THCudaTensor_data(state, vec2), vec2->stride[0],
- THCudaTensor_data(state, vec1), vec1->stride[0],
- THCudaTensor_data(state, cr), cr->stride[0]);
-
- THCudaTensor_freeCopyTo(state, cr, r_);
- }
-}
-
-void THCudaTensor_addbmm(THCState *state, THCudaTensor *result, float beta, THCudaTensor *t,
- float alpha, THCudaTensor *batch1, THCudaTensor *batch2) {
- THAssert(THCudaTensor_checkGPU(state, 4, result, t, batch1, batch2));
- THArgCheck(THCudaTensor_nDimension(state, t) == 2, 4, "expected 2D tensor");
- THArgCheck(THCudaTensor_nDimension(state, batch1) == 3, 6, "expected 3D tensor");
- THArgCheck(THCudaTensor_nDimension(state, batch2) == 3, 7, "expected 3D tensor");
-
- long batchnum = THCudaTensor_size(state, batch1, 0);
- long m1d1 = THCudaTensor_size(state, batch1, 1);
- long innerdim = THCudaTensor_size(state, batch1, 2);
- long m2d2 = THCudaTensor_size(state, batch2, 2);
-
- THArgCheck(batchnum == THCudaTensor_size(state, batch2, 0), 7,
- "equal number of batches expected");
- // M is t, as listed in the docs under addbmm
- THArgCheck(m1d1 == THCudaTensor_size(state, t, 0), 6,
- "first dimension must match first dimension of M");
- THArgCheck(m2d2 == THCudaTensor_size(state, t, 1), 7,
- "second dimension must match second dimension of M");
- THArgCheck(innerdim == THCudaTensor_size(state, batch2, 1), 6,
- "second dimension must match first dimension of batch2");
-
- if (t != result) {
- THCudaTensor_resizeAs(state, result, t);
- THCudaTensor_copy(state, result, t);
- }
-
- THCudaTensor *slice1 = THCudaTensor_new(state);
- THCudaTensor *slice2 = THCudaTensor_new(state);
- for (long i=0; i<batchnum; i++) {
- THCudaTensor_select(state, slice1, batch1, 0, i);
- THCudaTensor_select(state, slice2, batch2, 0, i);
-
- THCudaTensor_addmm(state, result, beta, result, alpha, slice1, slice2);
- beta = 1;
- }
- THCudaTensor_free(state, slice1);
- THCudaTensor_free(state, slice2);
-}
-
-void THCudaTensor_baddbmm(THCState *state, THCudaTensor *result, float beta, THCudaTensor *t,
- float alpha, THCudaTensor *batch1, THCudaTensor *batch2) {
- THAssert(THCudaTensor_checkGPU(state, 4, result, t, batch1, batch2));
- THArgCheck(THCudaTensor_nDimension(state, t) == 3, 4, "expected 3D tensor");
- THArgCheck(THCudaTensor_nDimension(state, batch1) == 3, 6, "expected 3D tensor");
- THArgCheck(THCudaTensor_nDimension(state, batch2) == 3, 7, "expected 3D tensor");
- THArgCheck(THCudaTensor_size(state, t, 0) == THCudaTensor_size(state, batch1, 0), 6,
- "equal number of batches expected");
- THArgCheck(THCudaTensor_size(state, t, 0) == THCudaTensor_size(state, batch2, 0), 7,
- "equal number of batches expected");
- THArgCheck(THCudaTensor_size(state, t, 1) == THCudaTensor_size(state, batch1, 1), 6,
- "wrong matrix size");
- THArgCheck(THCudaTensor_size(state, t, 2) == THCudaTensor_size(state, batch2, 2), 7,
- "wrong matrix size");
- THArgCheck(THCudaTensor_size(state, batch1, 2) == THCudaTensor_size(state, batch2, 1), 6,
- "wrong matrix size");
-
- if (t != result) {
- THCudaTensor_resizeAs(state, result, t);
- THCudaTensor_copy(state, result, t);
- }
-
- bool transpose_result;
- char transpose_batch1, transpose_batch2;
- long lda, ldb, ldc;
- THCudaTensor *result_, *batch1_, *batch2_;
- if (result->stride[1] == 1)
- {
- transpose_result = false;
- result_ = result;
- ldc = result_->stride[2];
- }
- else if (result->stride[2] == 1)
- {
- transpose_result = true;
-
- THCudaTensor *swap = batch2;
- batch2 = batch1;
- batch1 = swap;
-
- result_ = result;
- ldc = result_->stride[1];
- }
- else
- {
- transpose_result = false;
-
- THCudaTensor *transp_r_ = THCudaTensor_newTranspose(state, result, 1, 2);
- result_ = THCudaTensor_newClone(state, transp_r_);
- THCudaTensor_free(state, transp_r_);
- THCudaTensor_transpose(state, result_, NULL, 1, 2);
-
- ldc = result_->stride[2];
- }
-
- if (batch1->stride[transpose_result ? 2 : 1] == 1)
- {
- transpose_batch1 = 'n';
- batch1_ = batch1;
- lda = batch1_->stride[transpose_result ? 1 : 2];
- }
- else if (batch1->stride[transpose_result ? 1 : 2] == 1)
- {
- transpose_batch1 = 't';
- batch1_ = batch1;
- lda = batch1_->stride[transpose_result ? 2 : 1];
- }
- else
- {
- transpose_batch1 = transpose_result ? 'n' : 't';
- batch1_ = THCudaTensor_newContiguous(state, batch1);
- lda = batch1_->stride[1];
- }
-
- if (batch2->stride[transpose_result ? 2 : 1] == 1)
- {
- transpose_batch2 = 'n';
- batch2_ = batch2;
- ldb = batch2_->stride[transpose_result ? 1 : 2];
- }
- else if (batch2->stride[transpose_result ? 1 : 2] == 1)
- {
- transpose_batch2 = 't';
- batch2_ = batch2;
- ldb = batch2_->stride[transpose_result ? 2 : 1];
- }
- else
- {
- transpose_batch2 = transpose_result ? 'n' : 't';
- batch2_ = THCudaTensor_newContiguous(state, batch2);
- ldb = batch2_->stride[1];
- }
-
- // Compute pointers to matrices in each batch.
- long num_batches = result_->size[0];
- size_t matrices_size = num_batches * sizeof(float*);
- const float **matrices1 = (const float **)THAlloc(matrices_size);
- const float **matrices2 = (const float **)THAlloc(matrices_size);
- float **result_matrices = (float **)THAlloc(matrices_size);
- for (int i = 0; i < num_batches; ++i)
- {
- matrices1[i] = THCudaTensor_data(state, batch1_) + i * batch1_->stride[0];
- matrices2[i] = THCudaTensor_data(state, batch2_) + i * batch2_->stride[0];
- result_matrices[i] = THCudaTensor_data(state, result_) + i * result_->stride[0];
- }
-
- // Copy pointers to device.
- const float **d_matrices1, **d_matrices2;
- float **d_result_matrices;
- THCudaCheck(THCudaMalloc(state, (void**)&d_matrices1, matrices_size));
- THCudaCheck(THCudaMalloc(state, (void**)&d_matrices2, matrices_size));
- THCudaCheck(THCudaMalloc(state, (void**)&d_result_matrices, matrices_size));
-
- THCudaCheck(cudaMemcpyAsync(d_matrices1, matrices1, matrices_size,
- cudaMemcpyHostToDevice, THCState_getCurrentStream(state)));
- THCudaCheck(cudaMemcpyAsync(d_matrices2, matrices2, matrices_size,
- cudaMemcpyHostToDevice, THCState_getCurrentStream(state)));
- THCudaCheck(cudaMemcpyAsync(d_result_matrices, result_matrices, matrices_size,
- cudaMemcpyHostToDevice, THCState_getCurrentStream(state)));
-
- THCudaBlas_gemmBatched(
- state,
- transpose_batch1,
- transpose_batch2,
- result_->size[transpose_result ? 2 : 1],
- result_->size[transpose_result ? 1 : 2],
- batch1_->size[transpose_result ? 1 : 2],
- alpha,
- d_matrices1, lda,
- d_matrices2, ldb,
- beta,
- d_result_matrices, ldc,
- num_batches);
-
- THCudaFree(state, d_matrices1);
- THCudaFree(state, d_matrices2);
- THCudaFree(state, d_result_matrices);
- THFree(matrices1);
- THFree(matrices2);
- THFree(result_matrices);
-
- if (batch1_ != batch1)
- THCudaTensor_free(state, batch1_);
-
- if (batch2_ != batch2)
- THCudaTensor_free(state, batch2_);
-
- if (result_ != result)
- THCudaTensor_freeCopyTo(state, result_, result);
-}
+#include "generic/THCTensorMathBlas.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMathCompare.cu b/lib/THC/THCTensorMathCompare.cu
index 71cfdd5..8e01e83 100644
--- a/lib/THC/THCTensorMathCompare.cu
+++ b/lib/THC/THCTensorMathCompare.cu
@@ -1,109 +1,85 @@
#include "THCTensorMath.h"
#include "THCGeneral.h"
-#include "THCBlas.h"
#include "THCTensorCopy.h"
-#include "THCTensorRandom.h"
#include "THCApply.cuh"
-#include "THCReduce.cuh"
-
-template<class Op>
-void THCudaTensor_logicalValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, Op op)
-{
- THCudaTensor_resizeAs(state, self_, src);
-
- if (!THC_pointwiseApply2(state, self_, src, op)) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
-
- THCudaCheck(cudaGetLastError());
-}
+#include "THCNumerics.cuh"
+template <typename T, typename TOut>
struct TensorLTValueOp {
- TensorLTValueOp(float v) : value(v) {}
- __device__ __forceinline__ void operator()(float* out, float* in) {
- *out = (*in < value);
+ TensorLTValueOp(T v) : value(v) {}
+ __device__ __forceinline__ void operator()(TOut* out, T* in) {
+ *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::lt(*in, value));
}
- const float value;
+ const T value;
};
-void THCudaTensor_ltValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value)
-{
- THAssert(THCudaTensor_checkGPU(state, 2, self_, src));
- THCudaTensor_logicalValue(state, self_, src, TensorLTValueOp(value));
-}
-
+template <typename T, typename TOut>
struct TensorGTValueOp {
- TensorGTValueOp(float v) : value(v) {}
- __device__ __forceinline__ void operator()(float* out, float* in) {
- *out = (*in > value);
+ TensorGTValueOp(T v) : value(v) {}
+ __device__ __forceinline__ void operator()(TOut* out, T* in) {
+ *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::gt(*in, value));
}
- const float value;
+ const T value;
};
-void THCudaTensor_gtValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value)
-{
- THAssert(THCudaTensor_checkGPU(state, 2, self_, src));
- THCudaTensor_logicalValue(state, self_, src, TensorGTValueOp(value));
-}
+template <typename T, typename TOut>
struct TensorLEValueOp {
- TensorLEValueOp(float v) : value(v) {}
- __device__ __forceinline__ void operator()(float* out, float* in) {
- *out = (*in <= value);
+ TensorLEValueOp(T v) : value(v) {}
+ __device__ __forceinline__ void operator()(TOut* out, T* in) {
+ *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::le(*in, value));
}
- const float value;
+ const T value;
};
-void THCudaTensor_leValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value)
-{
- THAssert(THCudaTensor_checkGPU(state, 2, self_, src));
- THCudaTensor_logicalValue(state, self_, src, TensorLEValueOp(value));
-}
-
+template <typename T, typename TOut>
struct TensorGEValueOp {
- TensorGEValueOp(float v) : value(v) {}
- __device__ __forceinline__ void operator()(float* out, float* in) {
- *out = (*in >= value);
+ TensorGEValueOp(T v) : value(v) {}
+ __device__ __forceinline__ void operator()(TOut* out, T* in) {
+ *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::ge(*in, value));
}
- const float value;
+ const T value;
};
-void THCudaTensor_geValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value)
-{
- THAssert(THCudaTensor_checkGPU(state, 2, self_, src));
- THCudaTensor_logicalValue(state, self_, src, TensorGEValueOp(value));
-}
-
+template <typename T, typename TOut>
struct TensorEQValueOp {
- TensorEQValueOp(float v) : value(v) {}
- __device__ __forceinline__ void operator()(float* out, float* in) {
- *out = (*in == value);
+ TensorEQValueOp(T v) : value(v) {}
+ __device__ __forceinline__ void operator()(TOut* out, T* in) {
+ *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::eq(*in, value));
}
- const float value;
+ const T value;
};
-void THCudaTensor_eqValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value)
-{
- THAssert(THCudaTensor_checkGPU(state, 2, self_, src));
- THCudaTensor_logicalValue(state, self_, src, TensorEQValueOp(value));
-}
-
+template <typename T, typename TOut>
struct TensorNEValueOp {
- TensorNEValueOp(float v) : value(v) {}
- __device__ __forceinline__ void operator()(float* out, float* in) {
- *out = (*in != value);
+ TensorNEValueOp(T v) : value(v) {}
+ __device__ __forceinline__ void operator()(TOut* out, T* in) {
+ *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::ne(*in, value));
}
- const float value;
+ const T value;
};
-void THCudaTensor_neValue(THCState *state, THCudaTensor *self_, THCudaTensor *src, float value)
-{
- THAssert(THCudaTensor_checkGPU(state, 2, self_, src));
- THCudaTensor_logicalValue(state, self_, src, TensorNEValueOp(value));
+template<typename TensorType, typename TensorTypeOut, class Op>
+void THC_logicalValue(THCState *state,
+ TensorTypeOut *self_,
+ TensorType *src,
+ Op op) {
+ THLongStorage* st = TensorUtils<TensorType>::newSizeOf(state, src);
+ TensorUtils<TensorTypeOut>::resize(state, self_, st, NULL);
+ THLongStorage_free(st);
+
+ if (!THC_pointwiseApply2(state, self_, src, op)) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+
+ THCudaCheck(cudaGetLastError());
}
+
+#include "generic/THCTensorMathCompare.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMathCompareT.cu b/lib/THC/THCTensorMathCompareT.cu
index 0567a89..a5a130d 100644
--- a/lib/THC/THCTensorMathCompareT.cu
+++ b/lib/THC/THCTensorMathCompareT.cu
@@ -1,96 +1,72 @@
#include "THCTensorMath.h"
#include "THCGeneral.h"
-#include "THCBlas.h"
#include "THCTensorCopy.h"
#include "THCApply.cuh"
+#include "THCNumerics.cuh"
#include "THCReduce.cuh"
-template<class Op>
-void THCudaTensor_logicalTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2, Op op)
-{
- THCudaTensor_resizeAs(state, self_, src1);
- THArgCheck(THCudaTensor_nElement(state, src1) == THCudaTensor_nElement(state, src2), 3, "sizes do not match");
-
- if (!THC_pointwiseApply3(state, self_, src1, src2, op)) {
- THArgCheck(false, 2, CUTORCH_DIM_WARNING);
- }
-
- THCudaCheck(cudaGetLastError());
-}
-
+template <typename T, typename TOut>
struct TensorLTOp {
- __device__ __forceinline__ void operator()(float* out, float* a, float* b) {
- *out = (float) (*a < *b);
+ __device__ inline void operator()(TOut* out, T* a, T* b) {
+ *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::lt(*a, *b));
}
};
+template <typename T, typename TOut>
struct TensorGTOp {
- __device__ __forceinline__ void operator()(float* out, float* a, float* b) {
- *out = (float) (*a > *b);
+ __device__ inline void operator()(TOut* out, T* a, T* b) {
+ *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::gt(*a, *b));
}
};
+template <typename T, typename TOut>
struct TensorLEOp {
- __device__ __forceinline__ void operator()(float* out, float* a, float* b) {
- *out = (float) (*a <= *b);
+ __device__ inline void operator()(TOut* out, T* a, T* b) {
+ *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::le(*a, *b));
}
};
+template <typename T, typename TOut>
struct TensorGEOp {
- __device__ __forceinline__ void operator()(float* out, float* a, float* b) {
- *out = (float) (*a >= *b);
+ __device__ inline void operator()(TOut* out, T* a, T* b) {
+ *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::ge(*a, *b));
}
};
+template <typename T, typename TOut>
struct TensorEQOp {
- __device__ __forceinline__ void operator()(float* out, float* a, float* b) {
- *out = (float) (*a == *b);
+ __device__ inline void operator()(TOut* out, T* a, T* b) {
+ *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::eq(*a, *b));
}
};
+template <typename T, typename TOut>
struct TensorNEOp {
- __device__ __forceinline__ void operator()(float* out, float* a, float* b) {
- *out = (float) (*a != *b);
+ __device__ inline void operator()(TOut* out, T* a, T* b) {
+ *out = ScalarConvert<bool, TOut>::to(THCNumerics<T>::ne(*a, *b));
}
};
-void THCudaTensor_ltTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2)
-{
- THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2));
- THCudaTensor_logicalTensor(state, self_, src1, src2, TensorLTOp());
-}
-
-
-void THCudaTensor_gtTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2)
-{
- THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2));
- THCudaTensor_logicalTensor(state, self_, src1, src2, TensorGTOp());
-}
+template<typename TensorType, typename TensorTypeOut, typename Op>
+void THC_logicalTensor(THCState *state,
+ TensorTypeOut *self_,
+ TensorType *src1,
+ TensorType *src2,
+ Op op) {
+ THLongStorage* st = TensorUtils<TensorType>::newSizeOf(state, src1);
+ TensorUtils<TensorTypeOut>::resize(state, self_, st, NULL);
+ THLongStorage_free(st);
+ THArgCheck(TensorUtils<TensorType>::getNumElements(state, src1) ==
+ TensorUtils<TensorType>::getNumElements(state, src2), 3,
+ "sizes do not match");
-void THCudaTensor_leTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2)
-{
- THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2));
- THCudaTensor_logicalTensor(state, self_, src1, src2, TensorLEOp());
-}
-
-
-void THCudaTensor_geTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2)
-{
- THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2));
- THCudaTensor_logicalTensor(state, self_, src1, src2, TensorGEOp());
-}
-
+ if (!THC_pointwiseApply3(state, self_, src1, src2, op)) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
-void THCudaTensor_eqTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2)
-{
- THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2));
- THCudaTensor_logicalTensor(state, self_, src1, src2, TensorEQOp());
+ THCudaCheck(cudaGetLastError());
}
-
-void THCudaTensor_neTensor(THCState *state, THCudaTensor *self_, THCudaTensor *src1, THCudaTensor *src2)
-{
- THAssert(THCudaTensor_checkGPU(state, 3, self_, src1, src2));
- THCudaTensor_logicalTensor(state, self_, src1, src2, TensorNEOp());
-}
+#include "generic/THCTensorMathCompareT.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMathMagma.cu b/lib/THC/THCTensorMathMagma.cu
index b4fbc03..d599722 100644
--- a/lib/THC/THCTensorMathMagma.cu
+++ b/lib/THC/THCTensorMathMagma.cu
@@ -409,7 +409,7 @@ void THCudaTensor_getri(THCState *state, THCudaTensor *ra_, THCudaTensor *a)
THCudaCheck(THCudaMalloc(state, (void**)&ipiv_gpu, n * sizeof(int)));
// Run LU
- THCudaBlas_getrf(state, n, d_matrices1, n, ipiv_gpu, info_gpu, 1);
+ THCudaBlas_Sgetrf(state, n, d_matrices1, n, ipiv_gpu, info_gpu, 1);
THCudaCheck(cudaMemcpy(&info, info_gpu, sizeof(int), cudaMemcpyDeviceToHost));
@@ -419,7 +419,7 @@ void THCudaTensor_getri(THCState *state, THCudaTensor *ra_, THCudaTensor *a)
THError("CUBLAS getrf : Argument %d : illegal value", -info);
// Inverse
- THCudaBlas_getri(state, n, d_matrices1_const, n, ipiv_gpu, d_matrices2, n, info_gpu, 1);
+ THCudaBlas_Sgetri(state, n, d_matrices1_const, n, ipiv_gpu, d_matrices2, n, info_gpu, 1);
if (info > 0)
THError("CUBLAS getri : U(%d,%d) is 0, U is singular", info, info);
else if (info < 0)
diff --git a/lib/THC/THCTensorMathPairwise.cu b/lib/THC/THCTensorMathPairwise.cu
index 2c081d1..bbcd76f 100644
--- a/lib/THC/THCTensorMathPairwise.cu
+++ b/lib/THC/THCTensorMathPairwise.cu
@@ -1,10 +1,9 @@
#include "THCTensorMath.h"
#include "THCGeneral.h"
-#include "THCBlas.h"
#include "THCHalf.h"
#include "THCTensorCopy.h"
#include "THCApply.cuh"
-#include "THCReduce.cuh"
+#include "THCNumerics.cuh"
template <typename T>
struct TensorAddConstantOp {
diff --git a/lib/THC/THCTensorMathPointwise.cu b/lib/THC/THCTensorMathPointwise.cu
index 72b16e8..a8a1031 100644
--- a/lib/THC/THCTensorMathPointwise.cu
+++ b/lib/THC/THCTensorMathPointwise.cu
@@ -1,9 +1,9 @@
#include "THCTensorMath.h"
#include "THCGeneral.h"
-#include "THCBlas.h"
#include "THCHalf.h"
#include "THCTensorCopy.h"
#include "THCApply.cuh"
+#include "THCNumerics.cuh"
#include "THCReduce.cuh"
#define IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(NAME, CFUNC) \
diff --git a/lib/THC/THCTensorMathReduce.cu b/lib/THC/THCTensorMathReduce.cu
new file mode 100644
index 0000000..27949ad
--- /dev/null
+++ b/lib/THC/THCTensorMathReduce.cu
@@ -0,0 +1,358 @@
+#include "THCTensorMath.h"
+#include "THCGeneral.h"
+#include "THCNumerics.cuh"
+#include "THCReduce.cuh"
+#include "THCReduceAll.cuh"
+#include <thrust/functional.h>
+
+// Reduction operators that support `half`, unlike Thrust
+template <typename InT, typename AccT>
+struct ReduceAdd {
+ inline __device__ AccT operator()(AccT a, InT b) const {
+ return a + (AccT) b;
+ }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct ReduceAdd<half, half> {
+ inline __device__ half operator()(half a, half b) const {
+#ifdef CUDA_HALF_INSTRUCTIONS
+ return __hadd(a, b);
+#else
+ float fa = __half2float(a);
+ float fb = __half2float(b);
+ return __float2half(fa + fb);
+#endif
+ }
+};
+
+template <>
+struct ReduceAdd<half, float> {
+ inline __device__ float operator()(float a, half b) const {
+ return a + __half2float(b);
+ }
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename InT, typename AccT>
+struct ReduceMultiply {
+ inline __device__ AccT operator()(AccT a, InT b) const {
+ return a * (AccT) b;
+ }
+};
+
+#ifdef CUDA_HALF_TENSOR
+template <>
+struct ReduceMultiply<half, half> {
+ inline __device__ half operator()(half a, half b) const {
+#ifdef CUDA_HALF_INSTRUCTIONS
+ return __hmul(a, b);
+#else
+ float fa = __half2float(a);
+ float fb = __half2float(b);
+ return __float2half(fa * fb);
+#endif
+ }
+};
+
+template <>
+struct ReduceMultiply<half, float> {
+ inline __device__ float operator()(float a, half b) const {
+ return a * __half2float(b);
+ }
+};
+#endif // CUDA_HALF_TENSOR
+
+template <typename T>
+struct ReduceMin {
+ inline __device__ T operator()(T a, T b) const {
+ return THCNumerics<T>::lt(a, b) ? a : b;
+ }
+};
+
+template <typename T>
+struct ReduceMax {
+ inline __device__ T operator()(T a, T b) const {
+ return THCNumerics<T>::gt(a, b) ? a : b;
+ }
+};
+
+struct LogicalAll {
+ inline __device__ unsigned char operator()(unsigned char x,
+ unsigned char y) const {
+ return (x && y);
+ }
+};
+
+struct LogicalAny {
+ inline __device__ unsigned char operator()(unsigned char x,
+ unsigned char y) const {
+ return (x || y);
+ }
+};
+
+
+THC_API int
+THCudaByteTensor_logicalall(THCState *state, THCudaByteTensor *self) {
+ THAssert(THCudaByteTensor_checkGPU(state, 1, self));
+ unsigned char result;
+ if (!THC_reduceAll(state, self,
+ thrust::identity<unsigned char>(),
+ LogicalAll(),
+ LogicalAll(),
+ (unsigned char) 1, &result, 0)) {
+ THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+ }
+
+ return (int) result;
+}
+
+THC_API int
+THCudaByteTensor_logicalany(THCState *state, THCudaByteTensor *self) {
+ THAssert(THCudaByteTensor_checkGPU(state, 1, self));
+ unsigned char result;
+ if (!THC_reduceAll(state, self,
+ thrust::identity<unsigned char>(),
+ LogicalAny(),
+ LogicalAny(),
+ (unsigned char) 0, &result, 0)) {
+ THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+ }
+
+ return (int) result;
+}
+
+
+#include <thrust/functional.h>
+
+/* A set of reduction kernels that take in binary ops on thrust pairs (of value, index).
+   These are useful when you not only have to do a reduction, but also need to
+   preserve the location of contention (for example, min/max operations).
+ The structure of the kernels follows the structure of the reduction kernels.
+*/
+template <typename K, typename Index, class BinaryFunction>
+__global__ void
+kernelTransformReduceOuterDimIndex(K *tgt1,
+ Index *tgt2,
+ K *src_,
+ unsigned num_orows,
+ unsigned num_irows,
+ unsigned row_size,
+ thrust::pair<K, Index> init,
+ BinaryFunction binary_op) {
+ for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) {
+ for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x;
+ irow < num_irows;
+ irow += gridDim.y * blockDim.x) {
+ K *src = src_ + orow * row_size * num_irows + irow;
+ thrust::pair<K, Index> acc = init;
+
+ for (unsigned col = 0; col < row_size; ++col) {
+ // +1 for Lua index
+ acc = binary_op(thrust::make_pair<K, Index>(*src, col+1),
+ acc);
+ src += num_irows;
+ }
+
+ tgt1[orow * num_irows + irow] = acc.first;
+ tgt2[orow * num_irows + irow] = acc.second;
+ }
+ }
+}
+
+template <typename TensorTypeK,
+ typename TensorTypeIndex,
+ typename BinaryFunction>
+__host__ void
+THC_transformReduceOuterDimIndex(THCState *state,
+ TensorTypeK *tgt1,
+ TensorTypeIndex *tgt2,
+ TensorTypeK *src,
+ long rdim,
+ const thrust::pair<
+ typename TensorUtils<TensorTypeK>::DataType,
+ typename TensorUtils<TensorTypeIndex>::DataType>& init,
+ BinaryFunction binary_op) {
+ unsigned ndim = TensorUtils<TensorTypeK>::getDims(state, src);
+ unsigned num_orows = 1;
+ for (unsigned dim = 0; dim < rdim; dim++) {
+ num_orows *= TensorUtils<TensorTypeK>::getSize(state, src, dim);
+ }
+ unsigned row_size = TensorUtils<TensorTypeK>::getSize(state, src, rdim);
+ unsigned num_irows = 1;
+ for (unsigned dim = rdim + 1; dim < ndim; dim++) {
+ num_irows *= TensorUtils<TensorTypeK>::getSize(state, src, dim);
+ }
+
+ dim3 threads(min(512, num_irows));
+ unsigned maxGridDim = 1024;
+ dim3 grid(min(maxGridDim, num_orows),
+ min(maxGridDim, THCCeilDiv(num_irows, threads.x)));
+
+ kernelTransformReduceOuterDimIndex
+ <<<grid, threads, 0, THCState_getCurrentStream(state)>>>(
+ TensorUtils<TensorTypeK>::getData(state, tgt1),
+ TensorUtils<TensorTypeIndex>::getData(state, tgt2),
+ TensorUtils<TensorTypeK>::getData(state, src),
+ num_orows, num_irows, row_size, init, binary_op);
+
+ THCudaCheck(cudaGetLastError());
+}
+
+/* Reduce the innermost dimension of a tensor (on thrust::pair functors which are (value, index))
+ *
+ * For an n-d tensor (n <= 4) where the reduction is along the innermost dimension:
+ *
+ * - block.x is the innermost dimension, i.e. dimension 0;
+ * - block.y and grid.y make up dimension 1; and
+ * - grid.x and grid.z are the remaining two outer dimensions (if any)
+ *
+ * Reduction along other dimensions is handled in a separate kernel.
+ */
+template <typename K, typename Index, class BinaryFunction>
+__global__ void
+kernelTransformReduceInnermostDimIndex(K *tgt1,
+ Index* tgt2,
+ K *src_,
+ unsigned num_rows,
+ unsigned row_size,
+ thrust::pair<K, Index> init,
+ BinaryFunction binary_op) {
+ __shared__ K sbuf[32][16 + 1]; // avoid bank conflict
+ __shared__ Index ibuf[32][16 + 1]; // avoid bank conflict
+
+ for (unsigned block_row = blockIdx.x * blockDim.y;
+ block_row < num_rows;
+ block_row += blockDim.y * gridDim.x) {
+ unsigned row = block_row + threadIdx.y;
+ thrust::pair<K, Index> acc = init;
+ if (row < num_rows) {
+ K *src = src_ + row * row_size;
+ // Sequential reduction within a thread.
+ for (unsigned col = threadIdx.x; col < row_size; col += blockDim.x) {
+ acc = binary_op(thrust::make_pair<K, Index>(src[col], col + 1), acc);
+ }
+ }
+
+ sbuf[threadIdx.y][threadIdx.x] = acc.first;
+ ibuf[threadIdx.y][threadIdx.x] = acc.second;
+
+ __syncthreads();
+
+ // Reduce intermediate values to single value.
+ K* sline = &sbuf[threadIdx.y][0];
+ Index* iline = &ibuf[threadIdx.y][0];
+ for (unsigned s = 8; s > 0; s >>= 1) {
+ if (row < num_rows && threadIdx.x < s) {
+ thrust::pair<K, Index> arg1 =
+ thrust::make_pair<K, Index>(sline[threadIdx.x], iline[threadIdx.x]);
+ thrust::pair<K, Index> arg2 =
+ thrust::make_pair<K, Index>(sline[threadIdx.x + s], iline[threadIdx.x + s]);
+ thrust::pair<K, Index> res = binary_op(arg1, arg2);
+
+ sline[threadIdx.x] = res.first;
+ iline[threadIdx.x] = res.second;
+ }
+ __syncthreads();
+ }
+
+ if (row < num_rows && threadIdx.x == 0) {
+ tgt1[row] = sline[0];
+ tgt2[row] = iline[0];
+ }
+ __syncthreads();
+ }
+}
+
+template <typename TensorTypeK,
+ typename TensorTypeIndex,
+ typename BinaryFunction>
+__host__ void
+THC_transformReduceInnermostDimIndex(THCState *state,
+ TensorTypeK *tgt1,
+ TensorTypeIndex *tgt2,
+ TensorTypeK *src,
+ const thrust::pair<
+ typename TensorUtils<TensorTypeK>::DataType,
+ typename TensorUtils<TensorTypeIndex>::DataType>& init,
+ BinaryFunction binary_op) {
+ unsigned ndim = TensorUtils<TensorTypeK>::getDims(state, src);
+ unsigned num_rows = 1;
+ for (unsigned dim = 0; dim < ndim - 1; dim++) {
+ num_rows *= TensorUtils<TensorTypeK>::getSize(state, src, dim);
+ }
+ unsigned row_size = TensorUtils<TensorTypeK>::getSize(state, src, ndim - 1);
+
+ dim3 threads(16, 32);
+ dim3 grid(min(1024, THCCeilDiv(num_rows, threads.y)));
+
+ kernelTransformReduceInnermostDimIndex
+ <<<grid, threads, 0, THCState_getCurrentStream(state)>>>(
+ TensorUtils<TensorTypeK>::getData(state, tgt1),
+ TensorUtils<TensorTypeIndex>::getData(state, tgt2),
+ TensorUtils<TensorTypeK>::getData(state, src),
+ num_rows, row_size, init, binary_op);
+
+ THCudaCheck(cudaGetLastError());
+}
+
+template <typename TensorTypeK,
+ typename TensorTypeIndex,
+ typename BinaryFunction>
+void
+THC_reduceDimIndex(THCState *state,
+ TensorTypeK *tgt1_,
+ TensorTypeIndex *tgt2_,
+ TensorTypeK *src,
+ long dimension,
+ const thrust::pair<
+ typename TensorUtils<TensorTypeK>::DataType,
+ typename TensorUtils<TensorTypeIndex>::DataType>& init,
+ BinaryFunction binary_op)
+{
+ THArgCheck(dimension >= 0 &&
+ dimension < TensorUtils<TensorTypeK>::getDims(state, src),
+ 3, "dimension out of range");
+
+ THLongStorage *dim = TensorUtils<TensorTypeK>::newSizeOf(state, src);
+ THLongStorage_set(dim, dimension, 1);
+ TensorUtils<TensorTypeK>::resize(state, tgt1_, dim, NULL);
+ TensorUtils<TensorTypeIndex>::resize(state, tgt2_, dim, NULL);
+ THLongStorage_free(dim);
+
+ TensorTypeK *tgt1 = TensorUtils<TensorTypeK>::newContiguous(state, tgt1_);
+ TensorTypeIndex *tgt2 = TensorUtils<TensorTypeIndex>::newContiguous(state, tgt2_);
+ src = TensorUtils<TensorTypeK>::newContiguous(state, src);
+
+ if (dimension == TensorUtils<TensorTypeK>::getDims(state, src) - 1) {
+ THC_transformReduceInnermostDimIndex(state, tgt1, tgt2, src, init, binary_op);
+ } else {
+ THC_transformReduceOuterDimIndex(state, tgt1, tgt2, src, dimension, init, binary_op);
+ }
+
+ TensorUtils<TensorTypeK>::free(state, src);
+ TensorUtils<TensorTypeK>::freeCopyTo(state, tgt1, tgt1_);
+ TensorUtils<TensorTypeIndex>::freeCopyTo(state, tgt2, tgt2_);
+}
+
+template <typename T, typename Index>
+struct MaxValuePair {
+ __host__ __device__
+ thrust::pair<T, Index> operator()(const thrust::pair<T, Index>& a,
+ const thrust::pair<T, Index>& b) {
+ return THCNumerics<T>::ge(a.first, b.first) ? a : b;
+ }
+};
+
+template <typename T, typename Index>
+struct MinValuePair {
+ __host__ __device__
+ thrust::pair<T, Index> operator()(const thrust::pair<T, Index>& a,
+ const thrust::pair<T, Index>& b) {
+ return THCNumerics<T>::le(a.first, b.first) ? a : b;
+ }
+};
+
+#include "generic/THCTensorMathReduce.cu"
+#include "THCGenerateAllTypes.h"
diff --git a/lib/THC/THCTensorMathTransformReduce.cu b/lib/THC/THCTensorMathTransformReduce.cu
deleted file mode 100644
index 912b7b1..0000000
--- a/lib/THC/THCTensorMathTransformReduce.cu
+++ /dev/null
@@ -1,213 +0,0 @@
-#include "THCTensorMath.h"
-#include "THCGeneral.h"
-#include "THCBlas.h"
-#include "THCTensorCopy.h"
-#include "THCTensorRandom.h"
-#include "THCApply.cuh"
-#include "THCReduce.cuh"
-
-#include <thrust/functional.h>
-
-/* A set of reduction kernels that take in binary ops on thrust pairs (of value, index).
- These are useful when you not only have to do a reduction, but you might have
- to preserve the location of contention (for example min/max operations).
- The structure of the kernels follows the structure of the reduction kernels.
-*/
-template<class BinaryFunction>
-__global__ void THCudaTensor_kernel_transformReduceOuterDimIndex(float *tgt1, float *tgt2,
- float *src_,
- unsigned num_orows,
- unsigned num_irows,
- unsigned row_size,
- thrust::pair<float,float> init,
- BinaryFunction binary_op)
-{
- for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) {
- for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) {
- float *src = src_ + orow * row_size * num_irows + irow;
- thrust::pair<float,float> acc = init;
-
- for (unsigned col = 0; col < row_size; ++col) {
- acc = binary_op(thrust::make_pair(*src, col+1), acc); // i+1 for 1-indexing
- src += num_irows;
- }
- tgt1[orow * num_irows + irow] = acc.first;
- tgt2[orow * num_irows + irow] = acc.second;
- }
- }
-}
-
-template<class BinaryFunction>
-__host__ void THCudaTensor_transformReduceOuterDimIndex(THCState *state, THCudaTensor *tgt1, THCudaTensor *tgt2,
- THCudaTensor *src,
- long rdim, thrust::pair<float,float> init,
- BinaryFunction binary_op)
-{
- unsigned ndim = THCudaTensor_nDimension(state, src);
- unsigned num_orows = 1;
- for (unsigned dim = 0; dim < rdim; dim++) {
- num_orows *= THCudaTensor_size(state, src, dim);
- }
- unsigned row_size = THCudaTensor_size(state, src, rdim);
- unsigned num_irows = 1;
- for (unsigned dim = rdim + 1; dim < ndim; dim++) {
- num_irows *= THCudaTensor_size(state, src, dim);
- }
-
- dim3 threads(min(512, num_irows));
- unsigned maxGridDim = 1024;
- dim3 grid(min(maxGridDim, num_orows), min(maxGridDim, THCCeilDiv(num_irows, threads.x)));
-
- THCudaTensor_kernel_transformReduceOuterDimIndex<<<grid, threads, 0, THCState_getCurrentStream(state)>>>(
- THCudaTensor_data(state, tgt1), THCudaTensor_data(state, tgt2),
- THCudaTensor_data(state, src), num_orows, num_irows, row_size, init, binary_op);
- cudaError errcode = cudaGetLastError();
- if(errcode != cudaSuccess) {
- THError(cudaGetErrorString(errcode));
- }
-}
-
-/* Reduce the innermost dimension of a tensor (on thrust::pair functors which are (value, index))
- *
- * For an n-d tensor (n <= 4) where the reduction is along the innermost dimension:
- *
- * - block.x is the innermost dimension, i.e. dimension 0;
- * - block.y and grid.y make up dimension 1; and
- * - grid.x and grid z are the remaining two outer dimensions (if any)
- *
- * Reduction along other dimensions is handled in a separate kernel.
- */
-template<class BinaryFunction>
-__global__ void THCudaTensor_kernel_transformReduceInnermostDimIndex(
- float *tgt1, float* tgt2, float *src_,
- unsigned num_rows, unsigned row_size,
- thrust::pair<float,float> init, BinaryFunction binary_op)
-{
- __shared__ float sbuf[32][16];
- __shared__ float ibuf[32][16];
-
- for (unsigned block_row = blockIdx.x * blockDim.y; block_row < num_rows; block_row += blockDim.y * gridDim.x) {
- unsigned row = block_row + threadIdx.y;
- thrust::pair<float,float> acc = init;
- if (row < num_rows) {
- float *src = src_ + row * row_size;
- // Sequential reduction within a thread.
- for (unsigned col = threadIdx.x; col < row_size; col += blockDim.x) {
- acc = binary_op(thrust::make_pair(src[col], col+1), acc);
- }
- }
-
- sbuf[threadIdx.y][threadIdx.x] = acc.first;
- ibuf[threadIdx.y][threadIdx.x] = acc.second;
-
- // Reduce intermediate values to single value.
- float* sline = &sbuf[threadIdx.y][0];
- float* iline = &ibuf[threadIdx.y][0];
- for (unsigned s = 8; s > 0; s >>= 1) {
- if (row < num_rows && threadIdx.x < s) {
- thrust::pair<float,float> arg1 = thrust::make_pair<float,float>(sline[threadIdx.x], iline[threadIdx.x]);
- thrust::pair<float,float> arg2 = thrust::make_pair<float,float>(sline[threadIdx.x + s], iline[threadIdx.x + s]);
- thrust::pair<float,float> res = binary_op(arg1, arg2);
- sline[threadIdx.x] = res.first;
- iline[threadIdx.x] = res.second;
- }
- __syncthreads();
- }
-
- if (row < num_rows && threadIdx.x == 0) {
- tgt1[row] = sline[0];
- tgt2[row] = iline[0];
- }
- __syncthreads();
- }
-}
-
-template<class BinaryFunction>
-__host__ void THCudaTensor_transformReduceInnermostDimIndex(
- THCState *state, THCudaTensor *tgt1, THCudaTensor *tgt2, THCudaTensor *src,
- thrust::pair<float,float> init, BinaryFunction binary_op)
-{
- unsigned ndim = THCudaTensor_nDimension(state, src);
- unsigned num_rows = 1;
- for (unsigned dim = 0; dim < ndim - 1; dim++) {
- num_rows *= THCudaTensor_size(state, src, dim);
- }
- unsigned row_size = THCudaTensor_size(state, src, ndim - 1);
-
- dim3 threads(16, 32);
- dim3 grid(min(1024, THCCeilDiv(num_rows, threads.y)));
-
- THCudaTensor_kernel_transformReduceInnermostDimIndex<<<grid, threads, 0, THCState_getCurrentStream(state)>>>(
- THCudaTensor_data(state, tgt1), THCudaTensor_data(state, tgt2),
- THCudaTensor_data(state, src), num_rows, row_size, init, binary_op);
- cudaError errcode = cudaGetLastError();
- if(errcode != cudaSuccess) {
- THError(cudaGetErrorString(errcode));
- }
-}
-
-template<class BinaryFunction>
-void THCudaTensor_reduceDimIndex(THCState *state, THCudaTensor *tgt1_, THCudaTensor *tgt2_, THCudaTensor *src,
- long dimension, thrust::pair<float,float> init,
- BinaryFunction binary_op)
-{
- THArgCheck(dimension >= 0 && dimension < THCudaTensor_nDimension(state, src), 3, "dimension out of range");
-
- THLongStorage *dim = THCudaTensor_newSizeOf(state, src);
- THLongStorage_set(dim, dimension, 1);
- THCudaTensor_resize(state, tgt1_, dim, NULL);
- THCudaTensor_resize(state, tgt2_, dim, NULL);
- THLongStorage_free(dim);
-
- THCudaTensor *tgt1 = THCudaTensor_newContiguous(state, tgt1_);
- THCudaTensor *tgt2 = THCudaTensor_newContiguous(state, tgt2_);
- src = THCudaTensor_newContiguous(state, src);
-
- if(dimension == THCudaTensor_nDimension(state, src)-1) {
- THCudaTensor_transformReduceInnermostDimIndex(state, tgt1, tgt2, src, init, binary_op);
- } else {
- THCudaTensor_transformReduceOuterDimIndex(state, tgt1, tgt2, src, dimension, init, binary_op);
- }
-
- THCudaTensor_free(state, src);
- THCudaTensor_freeCopyTo(state, tgt1, tgt1_);
- THCudaTensor_freeCopyTo(state, tgt2, tgt2_);
-}
-
-struct maxvalue_functor
-{
- __host__ __device__ thrust::pair<float,float> operator()(const thrust::pair<float,float> &a,
- const thrust::pair<float,float> &b)
- {
- if (a.first > b.first) return a;
- else return b;
- }
-};
-
-void THCudaTensor_max(THCState *state, THCudaTensor *values, THCudaTensor *indices, THCudaTensor *src, long dimension)
-{
- THAssert(THCudaTensor_checkGPU(state, 3, values, indices, src));
- const float minfloat32 = -3.402823466e+38f;
- thrust::pair<float,float> init = thrust::make_pair<float,float>(minfloat32, 1);
- return THCudaTensor_reduceDimIndex(state, values, indices, src, dimension, init,
- maxvalue_functor());
-}
-
-struct minvalue_functor
-{
- __host__ __device__ thrust::pair<float,float> operator()(const thrust::pair<float,float> &a,
- const thrust::pair<float,float> &b)
- {
- if (a.first < b.first) return a;
- else return b;
- }
-};
-
-void THCudaTensor_min(THCState *state, THCudaTensor *values, THCudaTensor *indices, THCudaTensor *src, long dimension)
-{
- THAssert(THCudaTensor_checkGPU(state, 3, values, indices, src));
- const float maxfloat32 = 3.402823466e+38f;
- thrust::pair<float,float> init = thrust::make_pair<float,float>(maxfloat32, 1);
- return THCudaTensor_reduceDimIndex(state, values, indices, src, dimension, init,
- minvalue_functor());
-}
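
The deleted implementation was float-only down to its initial values: max seeds the running pair with -3.402823466e+38f (negative FLT_MAX) and min with the positive counterpart. A type-generic version has to choose an initial value per element type instead. The small host-side sketch below expresses that requirement with std::numeric_limits; it is only an illustration of the idea, not the mechanism the generic reduction code uses.

// Host-side sketch (illustrative only) of choosing a per-type initial value
// for a max reduction instead of hard-coding FLT_MAX.
#include <cstdio>
#include <limits>

template <typename T>
T maxReduceInit() {
  // lowest() is the most negative finite value, the identity for max().
  return std::numeric_limits<T>::lowest();
}

int main() {
  printf("float  init: %g\n", (double)maxReduceInit<float>());
  printf("double init: %g\n", maxReduceInit<double>());
  printf("long   init: %ld\n", maxReduceInit<long>());
  return 0;
}
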
diff --git a/lib/THC/THCTensorTypeUtils.cu b/lib/THC/THCTensorTypeUtils.cu
index fc9bd60..96cd3bc 100644
--- a/lib/THC/THCTensorTypeUtils.cu
+++ b/lib/THC/THCTensorTypeUtils.cu
@@ -33,6 +33,12 @@ TensorUtils<TENSOR_TYPE>::newContiguous(THCState* state, \
return TENSOR_TYPE##_newContiguous(state, t); \
} \
\
+THLongStorage* \
+TensorUtils<TENSOR_TYPE>::newSizeOf(THCState* state, \
+ TENSOR_TYPE* t) { \
+ return TENSOR_TYPE##_newSizeOf(state, t); \
+} \
+ \
void \
TensorUtils<TENSOR_TYPE>::retain(THCState* state, \
TENSOR_TYPE* t) { \
@@ -53,6 +59,14 @@ TensorUtils<TENSOR_TYPE>::freeCopyTo(THCState* state, \
} \
\
void \
+TensorUtils<TENSOR_TYPE>::resize(THCState* state, \
+ TENSOR_TYPE* out, \
+ THLongStorage* sizes, \
+ THLongStorage* strides) { \
+ TENSOR_TYPE##_resize(state, out, sizes, strides); \
+} \
+ \
+void \
TensorUtils<TENSOR_TYPE>::resizeAs(THCState* state, \
TENSOR_TYPE* dst, \
TENSOR_TYPE* src) { \
diff --git a/lib/THC/THCTensorTypeUtils.cuh b/lib/THC/THCTensorTypeUtils.cuh
index 4456f47..22a2f92 100644
--- a/lib/THC/THCTensorTypeUtils.cuh
+++ b/lib/THC/THCTensorTypeUtils.cuh
@@ -31,17 +31,22 @@ template <typename TensorType>
struct TensorUtils {
};
-#define TENSOR_UTILS(TENSOR_TYPE, DATA_TYPE) \
+#define TENSOR_UTILS(TENSOR_TYPE, DATA_TYPE, ACC_DATA_TYPE) \
template <> \
struct TensorUtils<TENSOR_TYPE> { \
typedef DATA_TYPE DataType; \
+ typedef ACC_DATA_TYPE AccDataType; \
\
static TENSOR_TYPE* newTensor(THCState* state); \
static TENSOR_TYPE* newContiguous(THCState* state, TENSOR_TYPE* t); \
+ static THLongStorage* newSizeOf(THCState* state, TENSOR_TYPE* t); \
static void retain(THCState* state, TENSOR_TYPE* t); \
static void free(THCState* state, TENSOR_TYPE* t); \
static void freeCopyTo(THCState* state, TENSOR_TYPE* src, \
TENSOR_TYPE* dst); \
+ static void resize(THCState* state, TENSOR_TYPE* out, \
+ THLongStorage* sizes, \
+ THLongStorage* strides); \
static void resizeAs(THCState* state, TENSOR_TYPE* dst, \
TENSOR_TYPE* src); \
static DATA_TYPE* getData(THCState* state, TENSOR_TYPE* t); \
@@ -61,16 +66,16 @@ struct TensorUtils {
static bool canUse32BitIndexMath(THCState* state, TENSOR_TYPE* t); \
}
-TENSOR_UTILS(THCudaByteTensor, unsigned char);
-TENSOR_UTILS(THCudaCharTensor, char);
-TENSOR_UTILS(THCudaShortTensor, short);
-TENSOR_UTILS(THCudaIntTensor, int);
-TENSOR_UTILS(THCudaLongTensor, long);
-TENSOR_UTILS(THCudaTensor, float);
-TENSOR_UTILS(THCudaDoubleTensor, double);
+TENSOR_UTILS(THCudaByteTensor, unsigned char, long);
+TENSOR_UTILS(THCudaCharTensor, char, long);
+TENSOR_UTILS(THCudaShortTensor, short, long);
+TENSOR_UTILS(THCudaIntTensor, int, long);
+TENSOR_UTILS(THCudaLongTensor, long, long);
+TENSOR_UTILS(THCudaTensor, float, float);
+TENSOR_UTILS(THCudaDoubleTensor, double, double);
#ifdef CUDA_HALF_TENSOR
-TENSOR_UTILS(THCudaHalfTensor, half);
+TENSOR_UTILS(THCudaHalfTensor, half, float);
#endif
#undef TENSOR_UTILS
@@ -91,15 +96,6 @@ getTensorInfo(THCState* state, TensorType* t) {
TensorUtils<TensorType>::getData(state, t), dims, sz, st);
}
-/// `half` has some type conversion issues associated with it, since it
-/// is a struct without a constructor/implicit conversion constructor.
-/// We use this to convert scalar values to the given type that the
-/// tensor expects.
-template <typename In, typename Out>
-struct ScalarConvert {
- static __host__ __device__ Out to(const In v) { return (Out) v; }
-};
-
template <typename T>
struct ScalarNegate {
static __host__ __device__ T to(const T v) { return -v; }
@@ -111,36 +107,6 @@ struct ScalarInv {
};
#ifdef CUDA_HALF_TENSOR
-
-template <typename Out>
-struct ScalarConvert<half, Out> {
- static __host__ __device__ Out to(const half v) {
-#ifdef __CUDA_ARCH__
- return (Out) __half2float(v);
-#else
- return (Out) THC_half2float(v);
-#endif
- }
-};
-
-template <typename In>
-struct ScalarConvert<In, half> {
- static __host__ __device__ half to(const In v) {
-#ifdef __CUDA_ARCH__
- return __float2half((float) v);
-#else
- return THC_float2half((float) v);
-#endif
- }
-};
-
-template <>
-struct ScalarConvert<half, half> {
- static __host__ __device__ half to(const half v) {
- return v;
- }
-};
-
template <>
struct ScalarNegate<half> {
static __host__ __device__ half to(const half v) {
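
TENSOR_UTILS now records an accumulation type alongside the data type: the integer tensors accumulate in long, half accumulates in float, and float/double accumulate in themselves, while the ScalarConvert helpers move out of this header into THCNumerics.cuh. The standalone sketch below shows the same element-type-to-accumulator mapping as a minimal trait; AccumulateAs and sum are hypothetical names invented for the example.

// Sketch (simplified stand-in for the TENSOR_UTILS traits) of pairing an
// element type with the wider type used to accumulate it.
#include <cstdio>

template <typename T> struct AccumulateAs;             // primary: no default
template <> struct AccumulateAs<unsigned char> { typedef long type; };
template <> struct AccumulateAs<int>           { typedef long type; };
template <> struct AccumulateAs<float>         { typedef float type; };
template <> struct AccumulateAs<double>        { typedef double type; };

template <typename T>
typename AccumulateAs<T>::type sum(const T* data, int n) {
  typename AccumulateAs<T>::type acc = 0;
  for (int i = 0; i < n; ++i) acc += data[i];          // widen before adding
  return acc;
}

int main() {
  unsigned char bytes[4] = {200, 200, 200, 200};
  // Accumulating in long avoids the overflow an unsigned char sum would hit.
  printf("%ld\n", sum(bytes, 4));                      // prints 800
  float f[3] = {0.5f, 0.25f, 0.25f};
  printf("%f\n", (double)sum(f, 3));                   // prints 1.000000
  return 0;
}
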
diff --git a/lib/THC/generic/THCTensorMasked.cu b/lib/THC/generic/THCTensorMasked.cu
new file mode 100644
index 0000000..e6a5704
--- /dev/null
+++ b/lib/THC/generic/THCTensorMasked.cu
@@ -0,0 +1,191 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMasked.cu"
+#else
+
+
+THC_API void
+THCTensor_(maskedFill)(THCState* state,
+ THCTensor *tensor, THCudaByteTensor *mask, real value)
+{
+ THAssert(THCTensor_(checkGPU)(state, 2, tensor, mask));
+ THArgCheck(THCTensor_(nElement)(state, tensor) ==
+ THCudaByteTensor_nElement(state, mask),
+ 2, "sizes do not match");
+
+ if (!THC_pointwiseApply2(state, tensor, mask,
+ TensorMaskedFillOp<real, unsigned char>(value))) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+
+ THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(maskedFillByte)(THCState* state,
+ THCTensor *tensor, THByteTensor *mask, real value)
+{
+ THAssert(THCTensor_(checkGPU)(state, 1, tensor));
+ THLongStorage* maskSizes = THByteTensor_newSizeOf(mask);
+ THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, maskSizes, NULL);
+ THLongStorage_free(maskSizes);
+ THCudaByteTensor_copyByte(state, maskCuda, mask);
+ THCTensor_(maskedFill)(state, tensor, maskCuda, value);
+ THCudaByteTensor_free(state, maskCuda);
+}
+
+THC_API void
+THCTensor_(maskedCopy)(THCState* state,
+ THCTensor *tensor, THCudaByteTensor *mask, THCTensor *src)
+{
+ THAssert(THCTensor_(checkGPU)(state, 3, tensor, src, mask));
+ long maskSize = THCudaByteTensor_nElement(state, mask);
+ long tensorSize = THCTensor_(nElement)(state, tensor);
+ long srcSize = THCTensor_(nElement)(state, src);
+
+ // `mask` and `tensor` must have the same number of elements
+ THArgCheck(maskSize == tensorSize, 2,
+ "mask and tensor must have the same number of elements");
+
+ // Determine our output size
+ long totalElements = THCudaByteTensor_sumall(state, mask);
+
+ // The number of `1` elements present in the mask must be <= the
+ // number of elements available in `src`
+ if (totalElements > srcSize) {
+ THArgCheck(false, 2, "source nElements must be == mask `1` elements");
+ }
+
+ // FIXME: there appears to be a bug in Thrust (CUDA 7.0) for mixed
+ // iterator prefix sums? Convert `mask` to the same datatype as what
+ // we're accumulating the prefix sum in (long) to get around it
+ THCudaLongTensor* maskLong = THCudaLongTensor_new(state);
+ THLongStorage* maskSizes = THCudaByteTensor_newSizeOf(state, mask);
+ THCudaLongTensor_resize(state, maskLong, maskSizes, NULL);
+ THCudaLongTensor_copyCudaByte(state, maskLong, mask);
+
+ // Use a prefix sum to determine the output locations of the masked elements
+ THCudaLongTensor* maskPrefixSum = THCudaLongTensor_new(state);
+ THCudaLongTensor_resize(state, maskPrefixSum, maskSizes, NULL);
+ THLongStorage_free(maskSizes);
+
+ thrust::device_ptr<long>
+ maskData(THCudaLongTensor_data(state, maskLong));
+ thrust::device_ptr<long>
+ maskPrefixSumData(THCudaLongTensor_data(state, maskPrefixSum));
+
+ thrust::exclusive_scan(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par.on(THCState_getCurrentStream(state)),
+#endif
+ maskData,
+ maskData + THCudaLongTensor_nElement(state, maskLong),
+ maskPrefixSumData);
+
+ // We are getting elements from `src` based on an offset from
+ // `maskPrefixSum`, so that should be made contiguous too
+ THCTensor* contigSrc = THCTensor_(newContiguous)(state, src);
+
+ // update `tensor` where `mask` == 1 but pull from `src` at
+ // maskPrefixSum
+ bool status = THC_pointwiseApply3(
+ state, tensor, mask, maskPrefixSum,
+ TensorMaskedCopyOp<real, unsigned char, long>(
+ THCTensor_(data)(state, contigSrc)));
+
+ THCTensor_(free)(state, contigSrc);
+ THCudaLongTensor_free(state, maskLong);
+ THCudaLongTensor_free(state, maskPrefixSum);
+
+ THArgCheck(status, 2, CUTORCH_DIM_WARNING);
+ THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(maskedCopyByte)(THCState* state,
+ THCTensor *tensor, THByteTensor *mask, THCTensor *src) {
+ THAssert(THCTensor_(checkGPU)(state, 2, tensor, src));
+ THLongStorage* maskSizes = THByteTensor_newSizeOf(mask);
+ THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, maskSizes, NULL);
+ THLongStorage_free(maskSizes);
+ THCudaByteTensor_copyByte(state, maskCuda, mask);
+ THCTensor_(maskedCopy)(state, tensor, maskCuda, src);
+ THCudaByteTensor_free(state, maskCuda);
+}
+
+THC_API void
+THCTensor_(maskedSelect)(THCState* state,
+ THCTensor* tensor, THCTensor* src, THCudaByteTensor* mask) {
+ THAssert(THCTensor_(checkGPU)(state, 3, tensor, src, mask));
+ THArgCheck(THCudaByteTensor_nElement(state, mask) ==
+ THCTensor_(nElement)(state, src),
+ 2, "sizes do not match");
+
+ // Determine our output size
+ long totalElements = THCudaByteTensor_sumall(state, mask);
+ THCTensor* tensorContig = THCTensor_(newContiguous)(state, tensor);
+
+ THCTensor_(resize1d)(state, tensorContig, totalElements);
+ if (tensor != tensorContig) {
+ THCTensor_(resize1d)(state, tensor, totalElements);
+ }
+
+ // FIXME: there appears to be a bug in Thrust (CUDA 7.0) for mixed
+ // iterator prefix sums? Convert `mask` to the same datatype as what
+ // we're accumulating the prefix sum in (long) to get around it
+ THCudaLongTensor* maskLong = THCudaLongTensor_new(state);
+ THLongStorage* maskSizes = THCudaByteTensor_newSizeOf(state, mask);
+ THCudaLongTensor_resize(state, maskLong, maskSizes, NULL);
+ THCudaLongTensor_copyCudaByte(state, maskLong, mask);
+
+ // Use a prefix sum to determine the output locations of the masked elements
+ THCudaLongTensor* maskPrefixSum = THCudaLongTensor_new(state);
+ THCudaLongTensor_resize(state, maskPrefixSum, maskSizes, NULL);
+ THLongStorage_free(maskSizes);
+
+ thrust::device_ptr<long>
+ maskData(THCudaLongTensor_data(state, maskLong));
+ thrust::device_ptr<long>
+ maskPrefixSumData(THCudaLongTensor_data(state, maskPrefixSum));
+
+ thrust::exclusive_scan(
+#if CUDA_VERSION >= 7000
+ thrust::cuda::par.on(THCState_getCurrentStream(state)),
+#endif
+ maskData,
+ maskData + THCudaLongTensor_nElement(state, maskLong),
+ maskPrefixSumData);
+
+ // Then copy over the masked elements at their desired output index
+ bool status = THC_pointwiseApply3(
+ state, mask, maskPrefixSum,
+ src, TensorMaskedSelectOp<real, unsigned char, long>(
+ THCTensor_(data)(state, tensor)));
+
+ THCudaLongTensor_free(state, maskLong);
+ THCudaLongTensor_free(state, maskPrefixSum);
+
+ if (tensor != tensorContig) {
+ THCTensor_(freeCopyTo)(state, tensorContig, tensor);
+ } else {
+ THCTensor_(free)(state, tensorContig);
+ }
+
+ THArgCheck(status, 2, CUTORCH_DIM_WARNING);
+ THCudaCheck(cudaGetLastError());
+}
+
+// FIXME: remove now that we have THCudaByteTensor?
+THC_API void
+THCTensor_(maskedSelectByte)(THCState* state,
+ THCTensor *tensor, THCTensor *src, THByteTensor *mask)
+{
+ THAssert(THCTensor_(checkGPU)(state, 2, tensor, src));
+ THLongStorage* maskSizes = THByteTensor_newSizeOf(mask);
+ THCudaByteTensor* maskCuda = THCudaByteTensor_newWithSize(state, maskSizes, NULL);
+ THLongStorage_free(maskSizes);
+ THCudaByteTensor_copyByte(state, maskCuda, mask);
+ THCTensor_(maskedSelect)(state, tensor, src, maskCuda);
+ THCudaByteTensor_free(state, maskCuda);
+}
+
+#endif
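
maskedCopy and maskedSelect above both copy the byte mask into a long tensor (working around the Thrust mixed-iterator issue noted in the comments) and take an exclusive prefix sum over it, so each selected element knows its destination offset before the pointwise kernel runs. The sketch below isolates that indexing step with thrust::exclusive_scan; it uses host vectors for brevity, whereas the code above scans device pointers on the current stream.

// Sketch (host-side thrust for brevity) of using an exclusive prefix sum over
// a 0/1 mask to compute output offsets for a masked select.
#include <cstdio>
#include <thrust/host_vector.h>
#include <thrust/scan.h>

int main() {
  // mask over 6 elements; 1 means "selected"
  thrust::host_vector<long> mask(6);
  long m[6] = {0, 1, 1, 0, 1, 0};
  for (int i = 0; i < 6; ++i) mask[i] = m[i];

  thrust::host_vector<long> offsets(6);
  thrust::exclusive_scan(mask.begin(), mask.end(), offsets.begin());

  // Each selected position i writes src[i] to out[offsets[i]].
  for (int i = 0; i < 6; ++i) {
    if (mask[i]) {
      printf("input %d -> output slot %ld\n", i, (long)offsets[i]);
    }
  }
  // prints: input 1 -> 0, input 2 -> 1, input 4 -> 2
  return 0;
}
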
diff --git a/lib/THC/generic/THCTensorMasked.h b/lib/THC/generic/THCTensorMasked.h
new file mode 100644
index 0000000..98f5aee
--- /dev/null
+++ b/lib/THC/generic/THCTensorMasked.h
@@ -0,0 +1,38 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMasked.h"
+#else
+
+THC_API void THCTensor_(maskedFill)(THCState *state,
+ THCTensor *tensor,
+ THCudaByteTensor *mask,
+ real value);
+
+// FIXME: remove now that we have THCudaByteTensor?
+THC_API void THCTensor_(maskedFillByte)(THCState *state,
+ THCTensor *tensor,
+ THByteTensor *mask,
+ real value);
+
+THC_API void THCTensor_(maskedCopy)(THCState *state,
+ THCTensor *tensor,
+ THCudaByteTensor *mask,
+ THCTensor *src);
+
+// FIXME: remove now that we have THCudaByteTensor?
+THC_API void THCTensor_(maskedCopyByte)(THCState *state,
+ THCTensor *tensor,
+ THByteTensor *mask,
+ THCTensor *src);
+
+THC_API void THCTensor_(maskedSelect)(THCState *state,
+ THCTensor *tensor,
+ THCTensor *src,
+ THCudaByteTensor *mask);
+
+// FIXME: remove now that we have THCudaByteTensor?
+THC_API void THCTensor_(maskedSelectByte)(THCState *state,
+ THCTensor *tensor,
+ THCTensor *src,
+ THByteTensor *mask);
+
+#endif
diff --git a/lib/THC/generic/THCTensorMathBlas.cu b/lib/THC/generic/THCTensorMathBlas.cu
new file mode 100644
index 0000000..91e922c
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathBlas.cu
@@ -0,0 +1,563 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathBlas.cu"
+#else
+
+THC_API real
+THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src)
+{
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+ THArgCheck(THCTensor_(nElement)(state, self) ==
+ THCTensor_(nElement)(state, src), 2, "sizes do not match");
+
+ self = THCTensor_(newContiguous)(state, self);
+ src = THCTensor_(newContiguous)(state, src);
+
+#ifdef THC_REAL_IS_FLOAT
+ real result = THCudaBlas_Sdot(state,
+ THCTensor_(nElement)(state, self),
+ THCTensor_(data)(state, self), 1,
+ THCTensor_(data)(state, src), 1);
+#elif defined(THC_REAL_IS_DOUBLE)
+ real result = THCudaBlas_Ddot(state,
+ THCTensor_(nElement)(state, self),
+ THCTensor_(data)(state, self), 1,
+ THCTensor_(data)(state, src), 1);
+#endif
+
+ THCTensor_(free)(state, src);
+ THCTensor_(free)(state, self);
+ return result;
+
+#else
+ THError("unimplemented data type");
+ return ScalarConvert<int, real>::to(0);
+#endif
+}
+
+THC_API void
+THCTensor_(addmv)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real alpha, THCTensor *mat, THCTensor *vec)
+{
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ THAssert(THCTensor_(checkGPU)(state, 4, r_, t, mat, vec));
+ if( (mat->nDimension != 2) || (vec->nDimension != 1) )
+ THError("matrix and vector expected");
+
+ if( mat->size[1] != vec->size[0] )
+ THError("size mismatch");
+
+ if(t->nDimension != 1)
+ THError("size mismatch");
+
+ if(t->size[0] != mat->size[0])
+ THError("size mismatch");
+
+ if(r_ != t)
+ {
+ THCTensor_(resizeAs)(state, r_, t);
+ THCTensor_(copy)(state, r_, t);
+ }
+
+ if(mat->stride[0] == 1)
+ {
+#ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemv(state, 'n', mat->size[0], mat->size[1],
+ alpha, THCTensor_(data)(state, mat), mat->stride[1],
+ THCTensor_(data)(state, vec), vec->stride[0],
+ beta, THCTensor_(data)(state, r_), r_->stride[0]);
+#elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemv(state, 'n', mat->size[0], mat->size[1],
+ alpha, THCTensor_(data)(state, mat), mat->stride[1],
+ THCTensor_(data)(state, vec), vec->stride[0],
+ beta, THCTensor_(data)(state, r_), r_->stride[0]);
+#endif
+ }
+ else if(mat->stride[1] == 1)
+ {
+#ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemv(state, 't', mat->size[1], mat->size[0],
+ alpha, THCTensor_(data)(state, mat), mat->stride[0],
+ THCTensor_(data)(state, vec), vec->stride[0],
+ beta, THCTensor_(data)(state, r_), r_->stride[0]);
+#elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemv(state, 't', mat->size[1], mat->size[0],
+ alpha, THCTensor_(data)(state, mat), mat->stride[0],
+ THCTensor_(data)(state, vec), vec->stride[0],
+ beta, THCTensor_(data)(state, r_), r_->stride[0]);
+#endif
+ }
+ else
+ {
+ THCTensor *cmat = THCTensor_(newContiguous)(state, mat);
+
+#ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sgemv(state, 't', mat->size[1], mat->size[0],
+ alpha, THCTensor_(data)(state, cmat), cmat->stride[0],
+ THCTensor_(data)(state, vec), vec->stride[0],
+ beta, THCTensor_(data)(state, r_), r_->stride[0]);
+#elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemv(state, 't', mat->size[1], mat->size[0],
+ alpha, THCTensor_(data)(state, cmat), cmat->stride[0],
+ THCTensor_(data)(state, vec), vec->stride[0],
+ beta, THCTensor_(data)(state, r_), r_->stride[0]);
+#endif
+
+ THCTensor_(free)(state, cmat);
+ }
+
+#else
+ THError("unimplemented data type");
+#endif
+}
+
+THC_API void
+THCTensor_(addr)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real alpha, THCTensor *vec1, THCTensor *vec2)
+{
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ THAssert(THCTensor_(checkGPU)(state, 4, r_, t, vec1, vec2));
+ if ( (vec1->nDimension != 1) || (vec2->nDimension != 1) ) {
+ THError("vector and vector expected");
+ }
+
+ if (t->nDimension != 2) {
+ THError("size mismatch");
+ }
+
+ if ( (t->size[0] != vec1->size[0]) || (t->size[1] != vec2->size[0]) ) {
+ THError("size mismatch");
+ }
+
+ if (r_ != t) {
+ THCTensor_(resizeAs)(state, r_, t);
+ THCTensor_(copy)(state, r_, t);
+ }
+
+ if(beta != 1) {
+ THCTensor_(mul)(state, r_, r_, beta);
+ }
+
+ if(r_->stride[0] == 1)
+ {
+#ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sger(state, vec1->size[0], vec2->size[0],
+ alpha, THCTensor_(data)(state, vec1), vec1->stride[0],
+ THCTensor_(data)(state, vec2), vec2->stride[0],
+ THCTensor_(data)(state, r_), r_->stride[1]);
+#elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dger(state, vec1->size[0], vec2->size[0],
+ alpha, THCTensor_(data)(state, vec1), vec1->stride[0],
+ THCTensor_(data)(state, vec2), vec2->stride[0],
+ THCTensor_(data)(state, r_), r_->stride[1]);
+#endif
+ }
+ else if(r_->stride[1] == 1)
+ {
+#ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sger(state, vec2->size[0], vec1->size[0],
+ alpha, THCTensor_(data)(state, vec2), vec2->stride[0],
+ THCTensor_(data)(state, vec1), vec1->stride[0],
+ THCTensor_(data)(state, r_), r_->stride[0]);
+#elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dger(state, vec2->size[0], vec1->size[0],
+ alpha, THCTensor_(data)(state, vec2), vec2->stride[0],
+ THCTensor_(data)(state, vec1), vec1->stride[0],
+ THCTensor_(data)(state, r_), r_->stride[0]);
+#endif
+ }
+ else
+ {
+ THCTensor *cr = THCTensor_(newClone)(state, r_);
+
+#ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_Sger(state, vec2->size[0], vec1->size[0],
+ alpha, THCTensor_(data)(state, vec2), vec2->stride[0],
+ THCTensor_(data)(state, vec1), vec1->stride[0],
+ THCTensor_(data)(state, cr), cr->stride[0]);
+#elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dger(state, vec2->size[0], vec1->size[0],
+ alpha, THCTensor_(data)(state, vec2), vec2->stride[0],
+ THCTensor_(data)(state, vec1), vec1->stride[0],
+ THCTensor_(data)(state, cr), cr->stride[0]);
+#endif
+
+ THCTensor_(freeCopyTo)(state, cr, r_);
+ }
+#else
+ THError("unimplemented data type");
+#endif
+}
+
+THC_API void
+THCTensor_(addmm)(THCState *state, THCTensor *r_, real beta, THCTensor *t, real alpha, THCTensor *m1, THCTensor *m2)
+{
+#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+
+ THAssert(THCTensor_(checkGPU)(state, 4, r_, t, m1, m2));
+ char transpose_r, transpose_m1, transpose_m2;
+ THCTensor *r__, *m1_, *m2_;
+
+ if( (m1->nDimension != 2) || (m2->nDimension != 2) )
+ THError("matrix and matrix expected");
+
+ if(t->nDimension != 2)
+ THError("size mismatch");
+
+ if( (t->size[0] != m1->size[0]) || (t->size[1] != m2->size[1]) || (m1->size[1] != m2->size[0]) )
+ THError("size mismatch");
+
+ if(t != r_)
+ {
+ THCTensor_(resizeAs)(state, r_, t);
+ THCTensor_(copy)(state, r_, t);
+ }
+
+ /* r_ */
+ if(r_->stride[0] == 1 &&
+ r_->stride[1] != 0)
+ {
+ transpose_r = 'n';
+ r__ = r_;
+ }
+ else if(r_->stride[1] == 1 &&
+ r_->stride[0] != 0)
+ {
+ THCTensor *swap = m2;
+ m2 = m1;
+ m1 = swap;
+ transpose_r = 't';
+ r__ = r_;
+ }
+ else
+ {
+ transpose_r = 'n';
+
+ THCTensor *transp_r_ = THCTensor_(newTranspose)(state, r_, 0, 1);
+ r__ = THCTensor_(newClone)(state, transp_r_);
+ THCTensor_(free)(state, transp_r_);
+ THCTensor_(transpose)(state, r__, NULL, 0, 1);
+ }
+
+ /* m1 */
+ if(m1->stride[(transpose_r == 'n' ? 0 : 1)] == 1 &&
+ m1->stride[(transpose_r == 'n' ? 1 : 0)] != 0)
+ {
+ transpose_m1 = 'n';
+ m1_ = m1;
+ }
+ else if(m1->stride[(transpose_r == 'n' ? 1 : 0)] == 1 &&
+ m1->stride[(transpose_r == 'n' ? 0 : 1)] != 0)
+ {
+ transpose_m1 = 't';
+ m1_ = m1;
+ }
+ else
+ {
+ transpose_m1 = (transpose_r == 'n' ? 't' : 'n');
+ m1_ = THCTensor_(newContiguous)(state, m1);
+ }
+
+ /* m2 */
+ if(m2->stride[(transpose_r == 'n' ? 0 : 1)] == 1 &&
+ m2->stride[(transpose_r == 'n' ? 1 : 0)] != 0)
+ {
+ transpose_m2 = 'n';
+ m2_ = m2;
+ }
+ else if(m2->stride[(transpose_r == 'n' ? 1 : 0)] == 1 &&
+ m2->stride[(transpose_r == 'n' ? 0 : 1)] != 0)
+ {
+ transpose_m2 = 't';
+ m2_ = m2;
+ }
+ else
+ {
+ transpose_m2 = (transpose_r == 'n' ? 't' : 'n');
+ m2_ = THCTensor_(newContiguous)(state, m2);
+ }
+
+#ifdef THC_REAL_IS_HALF
+ THCudaBlas_Hgemm(state,
+ transpose_m1,
+ transpose_m2,
+ r__->size[(transpose_r == 'n' ? 0 : 1)],
+ r__->size[(transpose_r == 'n' ? 1 : 0)],
+ m1_->size[(transpose_r == 'n' ? 1 : 0)],
+ alpha,
+ THCTensor_(data)(state, m1_),
+ (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]),
+ THCTensor_(data)(state, m2_),
+ (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]),
+ beta,
+ THCTensor_(data)(state, r__),
+ r__->stride[(transpose_r == 'n' ? 1 : 0)]);
+#elif defined(THC_REAL_IS_FLOAT)
+ THCudaBlas_Sgemm(state,
+ transpose_m1,
+ transpose_m2,
+ r__->size[(transpose_r == 'n' ? 0 : 1)],
+ r__->size[(transpose_r == 'n' ? 1 : 0)],
+ m1_->size[(transpose_r == 'n' ? 1 : 0)],
+ alpha,
+ THCTensor_(data)(state, m1_),
+ (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]),
+ THCTensor_(data)(state, m2_),
+ (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]),
+ beta,
+ THCTensor_(data)(state, r__),
+ r__->stride[(transpose_r == 'n' ? 1 : 0)]);
+#elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_Dgemm(state,
+ transpose_m1,
+ transpose_m2,
+ r__->size[(transpose_r == 'n' ? 0 : 1)],
+ r__->size[(transpose_r == 'n' ? 1 : 0)],
+ m1_->size[(transpose_r == 'n' ? 1 : 0)],
+ alpha,
+ THCTensor_(data)(state, m1_),
+ (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]),
+ THCTensor_(data)(state, m2_),
+ (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]),
+ beta,
+ THCTensor_(data)(state, r__),
+ r__->stride[(transpose_r == 'n' ? 1 : 0)]);
+#endif
+
+ /* free intermediate variables */
+ if(m1_ != m1) {
+ THCTensor_(free)(state, m1_);
+ }
+
+ if(m2_ != m2) {
+ THCTensor_(free)(state, m2_);
+ }
+
+ if(r__ != r_) {
+ THCTensor_(freeCopyTo)(state, r__, r_);
+ }
+#else
+ THError("unimplemented data type");
+#endif
+}
+
+THC_API void
+THCTensor_(addbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t,
+ real alpha, THCTensor *batch1, THCTensor *batch2) {
+#if defined(THC_REAL_IS_HALF) || defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ THAssert(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2));
+ THArgCheck(THCTensor_(nDimension)(state, t) == 2, 4, "expected 2D tensor");
+ THArgCheck(THCTensor_(nDimension)(state, batch1) == 3, 6, "expected 3D tensor");
+ THArgCheck(THCTensor_(nDimension)(state, batch2) == 3, 7, "expected 3D tensor");
+
+ long batchnum = THCTensor_(size)(state, batch1, 0);
+ long m1d1 = THCTensor_(size)(state, batch1, 1);
+ long innerdim = THCTensor_(size)(state, batch1, 2);
+ long m2d2 = THCTensor_(size)(state, batch2, 2);
+
+ THArgCheck(batchnum == THCTensor_(size)(state, batch2, 0), 7,
+ "equal number of batches expected");
+ // M is t, as listed in the docs under addbmm
+ THArgCheck(m1d1 == THCTensor_(size)(state, t, 0), 6,
+ "first dimension must match first dimension of M");
+ THArgCheck(m2d2 == THCTensor_(size)(state, t, 1), 7,
+ "second dimension must match second dimension of M");
+ THArgCheck(innerdim == THCTensor_(size)(state, batch2, 1), 6,
+ "second dimension must match first dimension of batch2");
+
+ if (t != result) {
+ THCTensor_(resizeAs)(state, result, t);
+ THCTensor_(copy)(state, result, t);
+ }
+
+ THCTensor *slice1 = THCTensor_(new)(state);
+ THCTensor *slice2 = THCTensor_(new)(state);
+ for (long i=0; i<batchnum; i++) {
+ THCTensor_(select)(state, slice1, batch1, 0, i);
+ THCTensor_(select)(state, slice2, batch2, 0, i);
+
+ THCTensor_(addmm)(state, result, beta, result, alpha, slice1, slice2);
+ beta = ScalarConvert<int, real>::to(1);
+ }
+ THCTensor_(free)(state, slice1);
+ THCTensor_(free)(state, slice2);
+#else
+ THError("unimplemented data type");
+#endif
+}
+
+THC_API void
+THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t,
+ real alpha, THCTensor *batch1, THCTensor *batch2) {
+#if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE)
+ THAssert(THCTensor_(checkGPU)(state, 4, result, t, batch1, batch2));
+ THArgCheck(THCTensor_(nDimension)(state, t) == 3, 4, "expected 3D tensor");
+ THArgCheck(THCTensor_(nDimension)(state, batch1) == 3, 6, "expected 3D tensor");
+ THArgCheck(THCTensor_(nDimension)(state, batch2) == 3, 7, "expected 3D tensor");
+ THArgCheck(THCTensor_(size)(state, t, 0) == THCTensor_(size)(state, batch1, 0), 6,
+ "equal number of batches expected");
+ THArgCheck(THCTensor_(size)(state, t, 0) == THCTensor_(size)(state, batch2, 0), 7,
+ "equal number of batches expected");
+ THArgCheck(THCTensor_(size)(state, t, 1) == THCTensor_(size)(state, batch1, 1), 6,
+ "wrong matrix size");
+ THArgCheck(THCTensor_(size)(state, t, 2) == THCTensor_(size)(state, batch2, 2), 7,
+ "wrong matrix size");
+ THArgCheck(THCTensor_(size)(state, batch1, 2) == THCTensor_(size)(state, batch2, 1), 6,
+ "wrong matrix size");
+
+ if (t != result) {
+ THCTensor_(resizeAs)(state, result, t);
+ THCTensor_(copy)(state, result, t);
+ }
+
+ bool transpose_result;
+ char transpose_batch1, transpose_batch2;
+ long lda, ldb, ldc;
+ THCTensor *result_, *batch1_, *batch2_;
+ if (result->stride[1] == 1)
+ {
+ transpose_result = false;
+ result_ = result;
+ ldc = result_->stride[2];
+ }
+ else if (result->stride[2] == 1)
+ {
+ transpose_result = true;
+
+ THCTensor *swap = batch2;
+ batch2 = batch1;
+ batch1 = swap;
+
+ result_ = result;
+ ldc = result_->stride[1];
+ }
+ else
+ {
+ transpose_result = false;
+
+ THCTensor *transp_r_ = THCTensor_(newTranspose)(state, result, 1, 2);
+ result_ = THCTensor_(newClone)(state, transp_r_);
+ THCTensor_(free)(state, transp_r_);
+ THCTensor_(transpose)(state, result_, NULL, 1, 2);
+
+ ldc = result_->stride[2];
+ }
+
+ if (batch1->stride[transpose_result ? 2 : 1] == 1)
+ {
+ transpose_batch1 = 'n';
+ batch1_ = batch1;
+ lda = batch1_->stride[transpose_result ? 1 : 2];
+ }
+ else if (batch1->stride[transpose_result ? 1 : 2] == 1)
+ {
+ transpose_batch1 = 't';
+ batch1_ = batch1;
+ lda = batch1_->stride[transpose_result ? 2 : 1];
+ }
+ else
+ {
+ transpose_batch1 = transpose_result ? 'n' : 't';
+ batch1_ = THCTensor_(newContiguous)(state, batch1);
+ lda = batch1_->stride[1];
+ }
+
+ if (batch2->stride[transpose_result ? 2 : 1] == 1)
+ {
+ transpose_batch2 = 'n';
+ batch2_ = batch2;
+ ldb = batch2_->stride[transpose_result ? 1 : 2];
+ }
+ else if (batch2->stride[transpose_result ? 1 : 2] == 1)
+ {
+ transpose_batch2 = 't';
+ batch2_ = batch2;
+ ldb = batch2_->stride[transpose_result ? 2 : 1];
+ }
+ else
+ {
+ transpose_batch2 = transpose_result ? 'n' : 't';
+ batch2_ = THCTensor_(newContiguous)(state, batch2);
+ ldb = batch2_->stride[1];
+ }
+
+ // Compute pointers to matrices in each batch.
+ long num_batches = result_->size[0];
+ size_t matrices_size = num_batches * sizeof(real*);
+ const real **matrices1 = (const real **)THAlloc(matrices_size);
+ const real **matrices2 = (const real **)THAlloc(matrices_size);
+ real **result_matrices = (real **)THAlloc(matrices_size);
+ for (int i = 0; i < num_batches; ++i)
+ {
+ matrices1[i] = THCTensor_(data)(state, batch1_) + i * batch1_->stride[0];
+ matrices2[i] = THCTensor_(data)(state, batch2_) + i * batch2_->stride[0];
+ result_matrices[i] = THCTensor_(data)(state, result_) + i * result_->stride[0];
+ }
+
+ // Copy pointers to device.
+ const real **d_matrices1, **d_matrices2;
+ real **d_result_matrices;
+ THCudaCheck(THCudaMalloc(state, (void**)&d_matrices1, matrices_size));
+ THCudaCheck(THCudaMalloc(state, (void**)&d_matrices2, matrices_size));
+ THCudaCheck(THCudaMalloc(state, (void**)&d_result_matrices, matrices_size));
+
+ THCudaCheck(cudaMemcpyAsync(d_matrices1, matrices1, matrices_size,
+ cudaMemcpyHostToDevice, THCState_getCurrentStream(state)));
+ THCudaCheck(cudaMemcpyAsync(d_matrices2, matrices2, matrices_size,
+ cudaMemcpyHostToDevice, THCState_getCurrentStream(state)));
+ THCudaCheck(cudaMemcpyAsync(d_result_matrices, result_matrices, matrices_size,
+ cudaMemcpyHostToDevice, THCState_getCurrentStream(state)));
+
+#ifdef THC_REAL_IS_FLOAT
+ THCudaBlas_SgemmBatched(
+ state,
+ transpose_batch1,
+ transpose_batch2,
+ result_->size[transpose_result ? 2 : 1],
+ result_->size[transpose_result ? 1 : 2],
+ batch1_->size[transpose_result ? 1 : 2],
+ alpha,
+ d_matrices1, lda,
+ d_matrices2, ldb,
+ beta,
+ d_result_matrices, ldc,
+ num_batches);
+#elif defined(THC_REAL_IS_DOUBLE)
+ THCudaBlas_DgemmBatched(
+ state,
+ transpose_batch1,
+ transpose_batch2,
+ result_->size[transpose_result ? 2 : 1],
+ result_->size[transpose_result ? 1 : 2],
+ batch1_->size[transpose_result ? 1 : 2],
+ alpha,
+ d_matrices1, lda,
+ d_matrices2, ldb,
+ beta,
+ d_result_matrices, ldc,
+ num_batches);
+#endif
+
+ THCudaFree(state, d_matrices1);
+ THCudaFree(state, d_matrices2);
+ THCudaFree(state, d_result_matrices);
+ THFree(matrices1);
+ THFree(matrices2);
+ THFree(result_matrices);
+
+ if (batch1_ != batch1) {
+ THCTensor_(free)(state, batch1_);
+ }
+
+ if (batch2_ != batch2) {
+ THCTensor_(free)(state, batch2_);
+ }
+
+ if (result_ != result) {
+ THCTensor_(freeCopyTo)(state, result_, result);
+ }
+
+#else
+ THError("unimplemented data type");
+#endif
+}
+
+#endif
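
The batched path above builds host-side arrays of per-matrix pointers, copies them to the device on the current stream, and hands them to cuBLAS batched GEMM, so only the float and double instantiations reach THCudaBlas_SgemmBatched/DgemmBatched; every other real type falls through to the THError branch. A minimal Lua usage sketch, assuming the stock torch baddbmm binding (unchanged by this diff):

    -- C[i] = beta*C[i] + alpha*A[i]*B[i] for each batch i; beta = alpha = 1 by default
    local A = torch.CudaTensor(16, 4, 5):uniform()
    local B = torch.CudaTensor(16, 5, 3):uniform()
    local C = torch.CudaTensor(16, 4, 3):zero()
    C:baddbmm(A, B)  -- dispatches to the SgemmBatched path above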
diff --git a/lib/THC/generic/THCTensorMathBlas.h b/lib/THC/generic/THCTensorMathBlas.h
new file mode 100644
index 0000000..68f95e3
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathBlas.h
@@ -0,0 +1,13 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathBlas.h"
+#else
+
+THC_API real THCTensor_(dot)(THCState *state, THCTensor *self, THCTensor *src);
+THC_API void THCTensor_(addmv)(THCState *state, THCTensor *self, real beta, THCTensor *t, real alpha, THCTensor *mat, THCTensor *vec);
+THC_API void THCTensor_(addmm)(THCState *state, THCTensor *self, real beta, THCTensor *t, real alpha, THCTensor *mat1, THCTensor *mat2);
+THC_API void THCTensor_(addr)(THCState *state, THCTensor *self, real beta, THCTensor *t, real alpha, THCTensor *vec1, THCTensor *vec2);
+THC_API void THCTensor_(addbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, real alpha, THCTensor *batch1, THCTensor *batch2);
+THC_API void THCTensor_(baddbmm)(THCState *state, THCTensor *result, real beta, THCTensor *t, real alpha, THCTensor *batch1, THCTensor *batch2);
+
+
+#endif
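
These declarations make the BLAS entry points part of every generic THCTensor_() instantiation, even though (per the .cu above) only the float and double specializations are currently backed by cuBLAS. A short sketch of the corresponding Lua calls, assuming the existing CudaTensor wrappers:

    local v = torch.CudaTensor(4):fill(2)
    local w = torch.CudaTensor(4):fill(3)
    print(v:dot(w))           -- THCTensor_(dot)
    local M = torch.CudaTensor(4, 4):uniform()
    local y = torch.mv(M, v)  -- routed through THCTensor_(addmv)
    local P = torch.mm(M, M)  -- routed through THCTensor_(addmm)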
diff --git a/lib/THC/generic/THCTensorMathCompare.cu b/lib/THC/generic/THCTensorMathCompare.cu
new file mode 100644
index 0000000..77f1ab5
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathCompare.cu
@@ -0,0 +1,101 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathCompare.cu"
+#else
+
+THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
+{
+ THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THC_logicalValue(state, self_, src,
+ TensorLTValueOp<typename TensorUtils<THCTensor>::DataType,
+ unsigned char>(value));
+}
+
+THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
+{
+ THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THC_logicalValue(state, self_, src,
+ TensorGTValueOp<typename TensorUtils<THCTensor>::DataType,
+ unsigned char>(value));
+}
+
+THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
+{
+ THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THC_logicalValue(state, self_, src,
+ TensorLEValueOp<typename TensorUtils<THCTensor>::DataType,
+ unsigned char>(value));
+}
+
+THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
+{
+ THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THC_logicalValue(state, self_, src,
+ TensorGEValueOp<typename TensorUtils<THCTensor>::DataType,
+ unsigned char>(value));
+}
+
+THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
+{
+ THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THC_logicalValue(state, self_, src,
+ TensorEQValueOp<typename TensorUtils<THCTensor>::DataType,
+ unsigned char>(value));
+}
+
+THC_API void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value)
+{
+ THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THC_logicalValue(state, self_, src,
+ TensorNEValueOp<typename TensorUtils<THCTensor>::DataType,
+ unsigned char>(value));
+}
+
+THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
+{
+ THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THC_logicalValue(state, self_, src,
+ TensorLTValueOp<typename TensorUtils<THCTensor>::DataType,
+ typename TensorUtils<THCTensor>::DataType>(value));
+}
+
+THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
+{
+ THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THC_logicalValue(state, self_, src,
+ TensorGTValueOp<typename TensorUtils<THCTensor>::DataType,
+ typename TensorUtils<THCTensor>::DataType>(value));
+}
+
+THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
+{
+ THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THC_logicalValue(state, self_, src,
+ TensorLEValueOp<typename TensorUtils<THCTensor>::DataType,
+ typename TensorUtils<THCTensor>::DataType>(value));
+}
+
+THC_API void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
+{
+ THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THC_logicalValue(state, self_, src,
+ TensorGEValueOp<typename TensorUtils<THCTensor>::DataType,
+ typename TensorUtils<THCTensor>::DataType>(value));
+}
+
+THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
+{
+ THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THC_logicalValue(state, self_, src,
+ TensorEQValueOp<typename TensorUtils<THCTensor>::DataType,
+ typename TensorUtils<THCTensor>::DataType>(value));
+}
+
+THC_API void THCTensor_(neValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value)
+{
+ THAssert(THCTensor_(checkGPU)(state, 2, self_, src));
+ THC_logicalValue(state, self_, src,
+ TensorNEValueOp<typename TensorUtils<THCTensor>::DataType,
+ typename TensorUtils<THCTensor>::DataType>(value));
+}
+
+#endif
diff --git a/lib/THC/generic/THCTensorMathCompare.h b/lib/THC/generic/THCTensorMathCompare.h
new file mode 100644
index 0000000..7b8837c
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathCompare.h
@@ -0,0 +1,20 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathCompare.h"
+#else
+
+THC_API void THCTensor_(ltValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(gtValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(leValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(geValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(eqValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(neValue)(THCState *state, THCudaByteTensor *self_, THCTensor *src, real value);
+
+THC_API void THCTensor_(ltValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(gtValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(leValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(geValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(eqValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value);
+THC_API void THCTensor_(neValueT)(THCState *state, THCTensor *self_, THCTensor *src, real value);
+
+
+#endif
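
The plain Value variants write their 0/1 result into a CudaByteTensor, while the ValueT variants write into a tensor of the source's own type; both are generated for every CUDA tensor type here. A minimal Lua sketch, assuming the usual :lt wrapper maps to the byte-mask (ltValue) form:

    local x = torch.CudaTensor(10):uniform()
    local mask = x:lt(0.5)             -- CudaByteTensor mask (ltValue)
    local small = x:maskedSelect(mask)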
diff --git a/lib/THC/generic/THCTensorMathCompareT.cu b/lib/THC/generic/THCTensorMathCompareT.cu
new file mode 100644
index 0000000..4b59abf
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathCompareT.cu
@@ -0,0 +1,113 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathCompareT.cu"
+#else
+
+THC_API void
+THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THC_logicalTensor(state, self_, src1, src2,
+ TensorLTOp<typename TensorUtils<THCTensor>::DataType,
+ unsigned char>());
+}
+
+THC_API void
+THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THC_logicalTensor(state, self_, src1, src2,
+ TensorGTOp<typename TensorUtils<THCTensor>::DataType,
+ unsigned char>());
+}
+
+THC_API void
+THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THC_logicalTensor(state, self_, src1, src2,
+ TensorLEOp<typename TensorUtils<THCTensor>::DataType,
+ unsigned char>());
+}
+
+THC_API void
+THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THC_logicalTensor(state, self_, src1, src2,
+ TensorGEOp<typename TensorUtils<THCTensor>::DataType,
+ unsigned char>());
+}
+
+THC_API void
+THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THC_logicalTensor(state, self_, src1, src2,
+ TensorEQOp<typename TensorUtils<THCTensor>::DataType,
+ unsigned char>());
+}
+
+THC_API void
+THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THC_logicalTensor(state, self_, src1, src2,
+ TensorNEOp<typename TensorUtils<THCTensor>::DataType,
+ unsigned char>());
+}
+
+THC_API void
+THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THC_logicalTensor(state, self_, src1, src2,
+ TensorLTOp<typename TensorUtils<THCTensor>::DataType,
+ typename TensorUtils<THCTensor>::DataType>());
+}
+
+THC_API void
+THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THC_logicalTensor(state, self_, src1, src2,
+ TensorGTOp<typename TensorUtils<THCTensor>::DataType,
+ typename TensorUtils<THCTensor>::DataType>());
+}
+
+THC_API void
+THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THC_logicalTensor(state, self_, src1, src2,
+ TensorLEOp<typename TensorUtils<THCTensor>::DataType,
+ typename TensorUtils<THCTensor>::DataType>());
+}
+
+THC_API void
+THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THC_logicalTensor(state, self_, src1, src2,
+ TensorGEOp<typename TensorUtils<THCTensor>::DataType,
+ typename TensorUtils<THCTensor>::DataType>());
+}
+
+THC_API void
+THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THC_logicalTensor(state, self_, src1, src2,
+ TensorEQOp<typename TensorUtils<THCTensor>::DataType,
+ typename TensorUtils<THCTensor>::DataType>());
+}
+
+THC_API void
+THCTensor_(neTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2)
+{
+ THAssert(THCTensor_(checkGPU)(state, 3, self_, src1, src2));
+ THC_logicalTensor(state, self_, src1, src2,
+ TensorNEOp<typename TensorUtils<THCTensor>::DataType,
+ typename TensorUtils<THCTensor>::DataType>());
+}
+
+#endif
diff --git a/lib/THC/generic/THCTensorMathCompareT.h b/lib/THC/generic/THCTensorMathCompareT.h
new file mode 100644
index 0000000..0d76835
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathCompareT.h
@@ -0,0 +1,19 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathCompareT.h"
+#else
+
+THC_API void THCTensor_(ltTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(gtTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(leTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(geTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(eqTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(neTensor)(THCState *state, THCudaByteTensor *self_, THCTensor *src1, THCTensor *src2);
+
+THC_API void THCTensor_(ltTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(gtTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(leTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(geTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(eqTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2);
+THC_API void THCTensor_(neTensorT)(THCState *state, THCTensor *self_, THCTensor *src1, THCTensor *src2);
+
+#endif
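
The tensor-tensor comparisons mirror the value forms: the plain variants fill a CudaByteTensor and the T variants fill a same-typed tensor, with THC_logicalTensor expecting src1 and src2 to have the same number of elements. A sketch, assuming the standard torch.lt(a, b) wrapper:

    local a = torch.CudaTensor(8):uniform()
    local b = torch.CudaTensor(8):uniform()
    local m = torch.lt(a, b)  -- elementwise a < b as a CudaByteTensor (ltTensor)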
diff --git a/lib/THC/generic/THCTensorMathReduce.cu b/lib/THC/generic/THCTensorMathReduce.cu
new file mode 100644
index 0000000..e17013c
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathReduce.cu
@@ -0,0 +1,135 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathReduce.cu"
+#else
+
+THC_API void
+THCTensor_(sum)(THCState* state, THCTensor *self, THCTensor *src, long dimension) {
+ THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+ if (!THC_reduceDim(state, self, src,
+ thrust::identity<real>(),
+ ReduceAdd<real, real>(),
+ ScalarConvert<int, real>::to(0),
+ dimension)) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+
+ THCudaCheck(cudaGetLastError());
+}
+
+THC_API void
+THCTensor_(prod)(THCState* state, THCTensor *self, THCTensor *src, long dimension) {
+ THAssert(THCTensor_(checkGPU)(state, 2, self, src));
+ if (!THC_reduceDim(state, self, src,
+ thrust::identity<real>(),
+ ReduceMultiply<real, real>(),
+ ScalarConvert<int, real>::to(1),
+ dimension)) {
+ THArgCheck(false, 2, CUTORCH_DIM_WARNING);
+ }
+
+ THCudaCheck(cudaGetLastError());
+}
+
+THC_API accreal
+THCTensor_(sumall)(THCState *state, THCTensor *self) {
+ THAssert(THCTensor_(checkGPU)(state, 1, self));
+ accreal val;
+ if (!THC_reduceAll(state, self,
+ thrust::identity<real>(),
+ ReduceAdd<real, accreal>(),
+ ReduceAdd<accreal, accreal>(),
+ ScalarConvert<int, accreal>::to(0),
+ &val, 0)) {
+ THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+ }
+
+ THCudaCheck(cudaGetLastError());
+ return val;
+}
+
+THC_API accreal
+THCTensor_(prodall)(THCState *state, THCTensor *self) {
+ THAssert(THCTensor_(checkGPU)(state, 1, self));
+ accreal val;
+ if (!THC_reduceAll(state, self,
+ thrust::identity<real>(),
+ ReduceMultiply<real, accreal>(),
+ ReduceMultiply<accreal, accreal>(),
+ ScalarConvert<int, accreal>::to(1),
+ &val, 0)) {
+ THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+ }
+
+ THCudaCheck(cudaGetLastError());
+ return val;
+}
+
+THC_API real
+THCTensor_(minall)(THCState *state, THCTensor *self) {
+ THAssert(THCTensor_(checkGPU)(state, 1, self));
+ real val;
+ if (!THC_reduceAll(state, self,
+ thrust::identity<real>(),
+ ReduceMin<real>(),
+ ReduceMin<real>(),
+ THCNumerics<real>::max(), &val, 0)) {
+ THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+ }
+
+ THCudaCheck(cudaGetLastError());
+ return val;
+}
+
+THC_API real
+THCTensor_(maxall)(THCState *state, THCTensor *self) {
+ THAssert(THCTensor_(checkGPU)(state, 1, self));
+ real val;
+ if (!THC_reduceAll(state, self,
+ thrust::identity<real>(),
+ ReduceMax<real>(),
+ ReduceMax<real>(),
+ THCNumerics<real>::min(), &val, 0)) {
+ THArgCheck(false, 1, CUTORCH_DIM_WARNING);
+ }
+
+ THCudaCheck(cudaGetLastError());
+ return val;
+}
+
+THC_API void
+THCTensor_(max)(THCState *state,
+ THCTensor *values,
+ THCudaLongTensor *indices,
+ THCTensor *src,
+ long dimension) {
+ THAssert(THCTensor_(checkGPU)(state, 3, values, indices, src));
+
+ thrust::pair<typename TensorUtils<THCTensor>::DataType, long>
+ init =
+ thrust::make_pair<typename TensorUtils<THCTensor>::DataType, long>(
+ THCNumerics<typename TensorUtils<THCTensor>::DataType>::min(), 1);
+
+ return THC_reduceDimIndex(
+ state, values, indices, src, dimension, init,
+ MaxValuePair<typename TensorUtils<THCTensor>::DataType, long>());
+}
+
+THC_API void
+THCTensor_(min)(THCState *state,
+ THCTensor *values,
+ THCudaLongTensor *indices,
+ THCTensor *src,
+ long dimension) {
+ THAssert(THCTensor_(checkGPU)(state, 3, values, indices, src));
+
+ thrust::pair<typename TensorUtils<THCTensor>::DataType, long>
+ init =
+ thrust::make_pair<typename TensorUtils<THCTensor>::DataType, long>(
+ THCNumerics<typename TensorUtils<THCTensor>::DataType>::max(), 1);
+
+ return THC_reduceDimIndex(
+ state, values, indices, src, dimension, init,
+ MinValuePair<typename TensorUtils<THCTensor>::DataType, long>());
+}
+
+#endif
diff --git a/lib/THC/generic/THCTensorMathReduce.h b/lib/THC/generic/THCTensorMathReduce.h
new file mode 100644
index 0000000..f584d68
--- /dev/null
+++ b/lib/THC/generic/THCTensorMathReduce.h
@@ -0,0 +1,23 @@
+#ifndef THC_GENERIC_FILE
+#define THC_GENERIC_FILE "generic/THCTensorMathReduce.h"
+#else
+
+THC_API void THCTensor_(sum)(THCState *state, THCTensor *self, THCTensor *src, long dim);
+THC_API void THCTensor_(prod)(THCState *state, THCTensor *self, THCTensor *src, long dim);
+
+THC_API accreal THCTensor_(sumall)(THCState *state, THCTensor *self);
+THC_API accreal THCTensor_(prodall)(THCState *state, THCTensor *self);
+
+THC_API void THCTensor_(min)(THCState *state,
+ THCTensor *values,
+ THCudaLongTensor *indices,
+ THCTensor *src, long dim);
+THC_API void THCTensor_(max)(THCState *state,
+ THCTensor *values,
+ THCudaLongTensor *indices,
+ THCTensor *src, long dim);
+
+THC_API real THCTensor_(minall)(THCState *state, THCTensor *self);
+THC_API real THCTensor_(maxall)(THCState *state, THCTensor *self);
+
+#endif
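
Note that sumall/prodall accumulate and return accreal rather than real, while minall/maxall seed the reduction with THCNumerics<real>::max()/min(), and the dimension-wise min/max take a THCudaLongTensor for indices. A hedged Lua sketch of the default-type wrappers:

    local t = torch.CudaTensor(4, 4):fill(3)
    print(t:sum())         -- sumall over the whole tensor -> 48
    print(t:max())         -- maxall -> 3
    local v, i = t:max(2)  -- per-dimension max; indices are a long-typed tensor at the C level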
diff --git a/test/test.lua b/test/test.lua
index 143f92c..960944b 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -863,10 +863,10 @@ function test.allAndAny()
local size1 = chooseInt(10, 100)
local t = nil
if torch.uniform(0, 1) > 0.5 then
- t = torch.CudaTensor(size1):fill(1)
+ t = torch.CudaByteTensor(size1):fill(1)
else
local size2 = chooseInt(10, 100)
- t = torch.CudaTensor(size1, size2):fill(1)
+ t = torch.CudaByteTensor(size1, size2):fill(1)
if torch.uniform(0, 1) > 0.5 then
t = t:transpose(1, 2)
@@ -1323,7 +1323,7 @@ function test.cross()
local ndims = chooseInt(1, 10)
local crossdim = chooseInt(1, ndims)
sizes = {}
- for i = 1, ndims do
+ for i = 1, ndims do
sizes[i] = chooseInt(1, math.min(20, math.sqrt(nelems)))
nelems = nelems / sizes[i]
end
@@ -2359,7 +2359,7 @@ function test.maskedSelect()
local mask = torch.ByteTensor(n_row,n_col):bernoulli()
local y = x:maskedSelect(mask)
x=x:cuda()
- mask=mask:cuda()
+ mask=mask:cudaByte()
local y_cuda = x:maskedSelect(mask)
tester:assertTensorEq(y, y_cuda:float(), 0.00001, "Error in maskedSelect")
checkMultiDevice(x, 'maskedSelect', mask)
@@ -2369,7 +2369,7 @@ function test.maskedSelect()
local mask = torch.ByteTensor(n_row,n_col):bernoulli()
local y = x:t():maskedSelect(mask)
x=x:cuda()
- mask=mask:cuda()
+ mask=mask:cudaByte()
local y_cuda = x:t():maskedSelect(mask)
tester:assertTensorEq(y, y_cuda:float(), 0.00001,
"Error in maskedSelect non-contiguous")
@@ -2380,7 +2380,7 @@ function test.maskedSelect()
local y = torch.FloatTensor()
y:maskedSelect(x, mask)
x=x:cuda()
- mask=mask:cuda()
+ mask=mask:cudaByte()
local y_cuda = torch.CudaTensor()
y_cuda:maskedSelect(x, mask)
tester:assertTensorEq(y, y_cuda:float(), 0.00001,
@@ -2392,7 +2392,7 @@ function test.maskedSelect()
local y = torch.FloatTensor()
y:maskedSelect(x:t(), mask)
x=x:cuda()
- mask=mask:cuda()
+ mask=mask:cudaByte()
local y_cuda = torch.CudaTensor()
y_cuda:maskedSelect(x:t(), mask)
tester:assertTensorEq(y, y_cuda:float(), 0.00001,
@@ -2425,7 +2425,7 @@ function test.maskedCopy()
local mask = torch.ByteTensor(n_row,n_col):bernoulli()
y:maskedCopy(mask, x:clone())
local y_cuda=x:cuda():fill(-1)
- mask=mask:cuda()
+ mask=mask:cudaByte()
x=x:cuda()
y_cuda:maskedCopy(mask, x)
tester:assertTensorEq(y, y_cuda:float(), 0.00001,
@@ -2439,7 +2439,7 @@ function test.maskedCopy()
y:maskedCopy(mask, x:t())
local y_cuda=x:cuda():fill(-1)
x=x:cuda()
- mask=mask:cuda()
+ mask=mask:cudaByte()
y_cuda:maskedCopy(mask, x:t())
tester:assertTensorEq(y, y_cuda:float(), 0.00001,
"Error in maskedCopy (non-contiguous source)")
@@ -2451,7 +2451,7 @@ function test.maskedCopy()
y:t():maskedCopy(mask, x:t())
local y_cuda=x:cuda():fill(-1)
x=x:cuda()
- mask=mask:cuda()
+ mask=mask:cudaByte()
y_cuda:t():maskedCopy(mask, x:t())
tester:assertTensorEq(y, y_cuda:float(), 0.00001,
"Error in maskedCopy (non-contiguous dest)")
@@ -2501,7 +2501,7 @@ function test.maskedFill()
local mask = torch.ByteTensor(n_row,n_col):bernoulli()
x:maskedFill(mask, 334)
local x_cuda=gt:cuda()
- mask=mask:cuda()
+ mask=mask:cudaByte()
x_cuda:maskedFill(mask, 334)
tester:assertTensorEq(x, x_cuda:float(), 0.00001, "Error in maskedFill")
checkMultiDevice(x_cuda, 'maskedFill', mask, 334)
@@ -2511,7 +2511,7 @@ function test.maskedFill()
mask = mask:byte()
x:t():maskedFill(mask, 334)
local x_cuda = gt:cuda()
- mask=mask:cuda()
+ mask=mask:cudaByte()
x_cuda:t():maskedFill(mask, 334)
tester:assertTensorEq(x, x_cuda:float(), 0.00001,
"Error in maskedFill non-contiguous")
diff --git a/torch/generic/Tensor.c b/torch/generic/Tensor.c
index a4dbed9..97e80e9 100644
--- a/torch/generic/Tensor.c
+++ b/torch/generic/Tensor.c
@@ -531,7 +531,7 @@ static int torch_Tensor_(indexFill)(lua_State *L)
return 1;
}
-#endif
+#endif // THC_REAL_IS_FLOAT
static int torch_Tensor_(transpose)(lua_State *L)
{
@@ -655,10 +655,8 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
THCState *state = cutorch_getstate(L);
THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
THLongStorage *idx = NULL;
-#ifdef THC_REAL_IS_FLOAT
THByteTensor *mask;
- THCudaTensor *maskCuda;
-#endif
+ THCudaByteTensor *maskCuda;
if(lua_isnumber(L, 2))
{
@@ -849,16 +847,18 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
THCTensor_(free)(state, tensor);
lua_pushboolean(L, 1);
}
- // FIXME: pending generic implementation of
- // maskedFillByte/maskedCopyByte/maskedFill/maskedCopy
-#ifdef THC_REAL_IS_FLOAT
else if((mask = luaT_toudata(L, 2, "torch.ByteTensor")))
{
THCTensor *vals;
if (lua_isnumber(L, 3))
{
- THCTensor_(maskedFillByte)(state, tensor, mask,
- (real)(luaL_checknumber(L,3)));
+#ifdef THC_REAL_IS_HALF
+ real value = THC_float2half((float) luaL_checknumber(L, 3));
+#else
+ real value = (real) luaL_checknumber(L, 3);
+#endif
+
+ THCTensor_(maskedFillByte)(state, tensor, mask, value);
}
else if((vals = luaT_toudata(L, 3, torch_Tensor)))
{
@@ -869,13 +869,18 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
luaL_error(L,"number or tensor expected");
}
}
- else if((maskCuda = luaT_toudata(L, 2, "torch.CudaTensor")))
+ else if((maskCuda = luaT_toudata(L, 2, "torch.CudaByteTensor")))
{
THCTensor *vals;
if (lua_isnumber(L, 3))
{
- THCTensor_(maskedFill)(state, tensor, maskCuda,
- (real)(luaL_checknumber(L,3)));
+#ifdef THC_REAL_IS_HALF
+ real value = THC_float2half((float) luaL_checknumber(L, 3));
+#else
+ real value = (real) luaL_checknumber(L, 3);
+#endif
+
+ THCTensor_(maskedFill)(state, tensor, maskCuda, value);
}
else if((vals = luaT_toudata(L, 3, torch_Tensor)))
{
@@ -886,7 +891,6 @@ static int torch_Tensor_(__newindex__)(lua_State *L)
luaL_error(L,"number or tensor expected");
}
}
-#endif // THC_REAL_IS_FLOAT
else
{
lua_pushboolean(L, 0);
@@ -900,10 +904,8 @@ static int torch_Tensor_(__index__)(lua_State *L)
THCState *state = cutorch_getstate(L);
THCTensor *tensor = luaT_checkudata(L, 1, torch_Tensor);
THLongStorage *idx = NULL;
-#ifdef THC_REAL_IS_FLOAT
THByteTensor *mask;
- THCudaTensor *maskCuda;
-#endif
+ THCudaByteTensor *maskCuda;
if(lua_isnumber(L, 2))
{
@@ -1039,8 +1041,6 @@ static int torch_Tensor_(__index__)(lua_State *L)
lua_pushboolean(L, 1);
return 2;
}
- // FIXME: pending generic implementation of maskedSelectByte/maskedSelect
-#ifdef THC_REAL_IS_FLOAT
else if((mask = luaT_toudata(L, 2, "torch.ByteTensor")))
{
THCTensor *vals = THCTensor_(new)(state);
@@ -1049,7 +1049,7 @@ static int torch_Tensor_(__index__)(lua_State *L)
lua_pushboolean(L, 1);
return 2;
}
- else if((maskCuda = luaT_toudata(L, 2, "torch.CudaTensor")))
+ else if((maskCuda = luaT_toudata(L, 2, "torch.CudaByteTensor")))
{
THCTensor *vals = THCTensor_(new)(state);
THCTensor_(maskedSelect)(state, vals, tensor, maskCuda);
@@ -1057,7 +1057,6 @@ static int torch_Tensor_(__index__)(lua_State *L)
lua_pushboolean(L, 1);
return 2;
}
-#endif // THC_REAL_IS_FLOAT
else
{
lua_pushboolean(L, 0);
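
With the THC_REAL_IS_FLOAT guards removed, bracket indexing with a mask now works for every CUDA tensor type, and half-precision fill values are converted up front with THC_float2half. A hedged sketch of what this enables:

    local t = torch.CudaIntTensor(4, 4):fill(7)
    local mask = torch.ByteTensor(4, 4):bernoulli()
    t[mask] = 1                        -- CPU ByteTensor mask -> THCTensor_(maskedFillByte)
    t[mask:cudaByte()] = 42            -- CudaByteTensor mask -> THCTensor_(maskedFill)
    local picked = t[mask:cudaByte()]  -- __index__ -> THCTensor_(maskedSelect)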