diff options
author | Soumith Chintala <soumith@gmail.com> | 2017-01-01 20:34:46 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-01-01 20:34:46 +0300 |
commit | 306984ff623886b8c07c8934623bd94fb52ed983 (patch) | |
tree | 75b824745614ef5a84d684dd9321981914e4930b | |
parent | d41580eccefcbc1d11d404e0c4ae522560f8e263 (diff) | |
parent | be986fb5d8611e9cb158b69d0977564e41eb93f2 (diff) |
Merge branch 'master' into contiguous-cat-1d
43 files changed, 2899 insertions, 220 deletions
@@ -15,6 +15,7 @@ local function checkArgumentType(expected, actual, fn, ud, level) end if ok then + local Real2real = { Byte='unsigned char', Char='char', @@ -22,7 +23,8 @@ if ok then Int='int', Long='long', Float='float', - Double='double' + Double='double', + Half='THHalf' } -- Allocator @@ -34,6 +36,14 @@ typedef struct THAllocator { } THAllocator; ]] + -- Half + ffi.cdef[[ +typedef struct { + unsigned short x; +} __THHalf; +typedef __THHalf THHalf; +]] + -- Storage for Real, real in pairs(Real2real) do @@ -76,7 +86,7 @@ typedef struct THRealTensor long *size; long *stride; int nDimension; - + THRealStorage *storage; ptrdiff_t storageOffset; int refcount; @@ -88,7 +98,8 @@ typedef struct THRealTensor cdefs = cdefs:gsub('Real', Real):gsub('real', real) ffi.cdef(cdefs) - local Tensor = torch.getmetatable(string.format('torch.%sTensor', Real)) + local Tensor_type = string.format('torch.%sTensor', Real) + local Tensor = torch.getmetatable(Tensor_type) local Tensor_tt = ffi.typeof('TH' .. Real .. 'Tensor**') rawset(Tensor, @@ -107,75 +118,77 @@ typedef struct THRealTensor end) -- faster apply (contiguous case) - local apply = Tensor.apply - rawset(Tensor, - "apply", - function(self, func) - if self:isContiguous() and self.data then - local self_d = self:data() - for i=0,self:nElement()-1 do - local res = func(tonumber(self_d[i])) -- tonumber() required for long... - if res then - self_d[i] = res + if Tensor_type ~= 'torch.HalfTensor' then + local apply = Tensor.apply + rawset(Tensor, + "apply", + function(self, func) + if self:isContiguous() and self.data then + local self_d = self:data() + for i=0,self:nElement()-1 do + local res = func(tonumber(self_d[i])) -- tonumber() required for long... 
+ if res then + self_d[i] = res + end end + return self + else + return apply(self, func) end - return self - else - return apply(self, func) - end - end) - - -- faster map (contiguous case) - local map = Tensor.map - rawset(Tensor, - "map", - function(self, src, func) - checkArgument(torch.isTensor(src), "map", 1, "tensor expected") - checkArgumentType(self:type(), src:type(), "map", 1) - - if self:isContiguous() and src:isContiguous() and self.data and src.data then - local self_d = self:data() - local src_d = src:data() - assert(src:nElement() == self:nElement(), 'size mismatch') - for i=0,self:nElement()-1 do - local res = func(tonumber(self_d[i]), tonumber(src_d[i])) -- tonumber() required for long... - if res then - self_d[i] = res + end) + + -- faster map (contiguous case) + local map = Tensor.map + rawset(Tensor, + "map", + function(self, src, func) + checkArgument(torch.isTensor(src), "map", 1, "tensor expected") + checkArgumentType(self:type(), src:type(), "map", 1) + + if self:isContiguous() and src:isContiguous() and self.data and src.data then + local self_d = self:data() + local src_d = src:data() + assert(src:nElement() == self:nElement(), 'size mismatch') + for i=0,self:nElement()-1 do + local res = func(tonumber(self_d[i]), tonumber(src_d[i])) -- tonumber() required for long... 
+ if res then + self_d[i] = res + end end + return self + else + return map(self, src, func) end - return self - else - return map(self, src, func) - end - end) - - -- faster map2 (contiguous case) - local map2 = Tensor.map2 - rawset(Tensor, - "map2", - function(self, src1, src2, func) - checkArgument(torch.isTensor(src1), "map", 1, "tensor expected") - checkArgument(torch.isTensor(src2), "map", 2, "tensor expected") - checkArgumentType(self:type(), src1:type(), "map", 1) - checkArgumentType(self:type(), src2:type(), "map", 2) - - if self:isContiguous() and src1:isContiguous() and src2:isContiguous() and self.data and src1.data and src2.data then - local self_d = self:data() - local src1_d = src1:data() - local src2_d = src2:data() - assert(src1:nElement() == self:nElement(), 'size mismatch') - assert(src2:nElement() == self:nElement(), 'size mismatch') - for i=0,self:nElement()-1 do - local res = func(tonumber(self_d[i]), tonumber(src1_d[i]), tonumber(src2_d[i])) -- tonumber() required for long... - if res then - self_d[i] = res + end) + + -- faster map2 (contiguous case) + local map2 = Tensor.map2 + rawset(Tensor, + "map2", + function(self, src1, src2, func) + checkArgument(torch.isTensor(src1), "map", 1, "tensor expected") + checkArgument(torch.isTensor(src2), "map", 2, "tensor expected") + checkArgumentType(self:type(), src1:type(), "map", 1) + checkArgumentType(self:type(), src2:type(), "map", 2) + + if self:isContiguous() and src1:isContiguous() and src2:isContiguous() and self.data and src1.data and src2.data then + local self_d = self:data() + local src1_d = src1:data() + local src2_d = src2:data() + assert(src1:nElement() == self:nElement(), 'size mismatch') + assert(src2:nElement() == self:nElement(), 'size mismatch') + for i=0,self:nElement()-1 do + local res = func(tonumber(self_d[i]), tonumber(src1_d[i]), tonumber(src2_d[i])) -- tonumber() required for long... 
+ if res then + self_d[i] = res + end end + return self + else + return map2(self, src1, src2, func) end - return self - else - return map2(self, src1, src2, func) - end - end) + end) + end end -- torch.data @@ -7,3 +7,6 @@ #include "generic/Storage.c" #include "THGenerateAllTypes.h" + +#include "generic/Storage.c" +#include "THGenerateHalfType.h" @@ -7,3 +7,6 @@ #include "generic/Tensor.c" #include "THGenerateAllTypes.h" + +#include "generic/Tensor.c" +#include "THGenerateHalfType.h" @@ -5,14 +5,14 @@ local Storage = {} local Tensor = {} -- types -local types = {'Byte', 'Char', 'Short', 'Int', 'Long', 'Float', 'Double'} +local types = {'Byte', 'Char', 'Short', 'Int', 'Long', 'Float', 'Half', 'Double'} -- Lua 5.2 compatibility local log10 = math.log10 or function(x) return math.log(x, 10) end -- tostring() functions for Tensor and Storage local function Storage__printformat(self) - if self:size() == 0 then + if self:size() == 0 then return "", nil, 0 end local intMode = true @@ -277,6 +277,10 @@ function Tensor.double(self) return self:type('torch.DoubleTensor') end +function Tensor.half(self) + return self:type('torch.HalfTensor') +end + function Tensor.real(self) return self:type(torch.getdefaulttensortype()) end @@ -556,6 +560,14 @@ torch.permute = Tensor.permute for _,type in ipairs(types) do local metatable = torch.getmetatable('torch.' .. type .. 
'Tensor') for funcname, func in pairs(Tensor) do - rawset(metatable, funcname, func) + if funcname ~= 'totable' or type ~= 'Half' then + rawset(metatable, funcname, func) + else + local function Tensor__totable(self) + local host_tensor = self:float() + return host_tensor:totable() + end + rawset(torch.getmetatable('torch.HalfTensor'), 'totable', Tensor__totable) + end end end diff --git a/TensorMath.lua b/TensorMath.lua index 5971a7b..d816740 100644 --- a/TensorMath.lua +++ b/TensorMath.lua @@ -6,56 +6,7 @@ local interface = wrap.CInterface.new() local method = wrap.CInterface.new() local argtypes = wrap.CInterface.argtypes -argtypes['ptrdiff_t'] = { - - helpname = function(arg) - return 'ptrdiff_t' - end, - - declare = function(arg) - -- if it is a number we initialize here - local default = tonumber(tostring(arg.default)) or 0 - return string.format("%s arg%d = %g;", 'ptrdiff_t', arg.i, default) - end, - - check = function(arg, idx) - return string.format("lua_isnumber(L, %d)", idx) - end, - - read = function(arg, idx) - return string.format("arg%d = (%s)lua_tonumber(L, %d);", arg.i, 'ptrdiff_t', idx) - end, - - init = function(arg) - -- otherwise do it here - if arg.default then - local default = tostring(arg.default) - if not tonumber(default) then - return string.format("arg%d = %s;", arg.i, default) - end - end - end, - - carg = function(arg) - return string.format('arg%d', arg.i) - end, - - creturn = function(arg) - return string.format('arg%d', arg.i) - end, - - precall = function(arg) - if arg.returned then - return string.format('lua_pushnumber(L, (lua_Number)arg%d);', arg.i) - end - end, - - postcall = function(arg) - if arg.creturned then - return string.format('lua_pushnumber(L, (lua_Number)arg%d);', arg.i) - end - end -} +argtypes['ptrdiff_t'] = wrap.types.ptrdiff_t interface:print([[ #include "TH.h" @@ -216,6 +167,7 @@ local reals = {ByteTensor='unsigned char', IntTensor='int', LongTensor='long', FloatTensor='float', + HalfTensor='half', 
DoubleTensor='double'} local accreals = {ByteTensor='long', @@ -224,11 +176,12 @@ local accreals = {ByteTensor='long', IntTensor='long', LongTensor='long', FloatTensor='double', + HalfTensor='float', DoubleTensor='double'} for _,Tensor in ipairs({"ByteTensor", "CharTensor", "ShortTensor", "IntTensor", "LongTensor", - "FloatTensor", "DoubleTensor"}) do + "FloatTensor", "HalfTensor", "DoubleTensor"}) do local real = reals[Tensor] local accreal = accreals[Tensor] @@ -257,6 +210,7 @@ for _,Tensor in ipairs({"ByteTensor", "CharTensor", end end + if Tensor ~= 'HalfTensor' then wrap("zero", cname("zero"), {{name=Tensor, returned=true}}) @@ -1030,6 +984,7 @@ static void THTensor_random1__(THTensor *self, THGenerator *gen, long b) cname("nonzero"), {{name="IndexTensor", default=true, returned=true}, {name=Tensor}}) + end -- ~= HalfTensor if Tensor == 'ByteTensor' then -- Logical accumulators only apply to ByteTensor @@ -1089,6 +1044,14 @@ static void THTensor_random1__(THTensor *self, THGenerator *gen, long b) {name="double",default=0}, {name="double",default=0}}) + wrap("bhistc", + cname("bhistc"), + {{name=Tensor, default=true, returned=true}, + {name=Tensor}, + {name="long",default=100}, + {name="double",default=0}, + {name="double",default=0}}) + wrap("norm", cname("normall"), {{name=Tensor}, @@ -687,6 +687,7 @@ local typesMatching = { ['torch.LongStorage'] = torch.LongTensor, ['torch.FloatStorage'] = torch.FloatTensor, ['torch.DoubleStorage'] = torch.DoubleTensor, + ['torch.HalfStorage'] = torch.HalfTensor, } --[[ Tests for storage equality. diff --git a/doc/maths.md b/doc/maths.md index 44e5ea6..b4f1592 100755 --- a/doc/maths.md +++ b/doc/maths.md @@ -158,6 +158,43 @@ By default the elements are sorted into 100 equally spaced bins between the mini `y = torch.histc(x, n, min, max)` same as above with `n` bins and `[min, max]` as elements range. 
+<a name="torch.bhistc"></a> +### [res] torch.bhistc([res,] x [,nbins, min_value, max_value]) ### +<a name="torch.bhistc"></a> + +`y = torch.bhistc(x)` returns the histogram of the elements in 2d tensor `x` along the last dimension. +By default the elements are sorted into 100 equally spaced bins between the minimum and maximum values of `x`. + +`y = torch.bhistc(x, n)` same as above with `n` bins. + +`y = torch.bhistc(x, n, min, max)` same as above with `n` bins and `[min, max]` as elements range. + +```lua +x =torch.Tensor(3, 6) + +> x[1] = torch.Tensor{ 2, 4, 2, 2, 5, 4 } +> x[2] = torch.Tensor{ 3, 5, 1, 5, 3, 5 } +> x[3] = torch.Tensor{ 3, 4, 2, 5, 5, 1 } + +> x + 2 4 2 2 5 4 + 3 5 1 5 3 5 + 3 4 2 5 5 1 +[torch.DoubleTensor of size 3x6] + +> torch.bhistc(x, 5, 1, 5) + 0 3 0 2 1 + 1 0 2 0 3 + 1 1 1 1 2 +[torch.DoubleTensor of size 3x5] + +> y = torch.Tensor(1, 6):copy(x[1]) + +> torch.bhistc(y, 5) + 3 0 2 0 1 +[torch.DoubleTensor of size 1x5] +``` + <a name="torch.linspace"></a> ### [res] torch.linspace([res,] x1, x2, [,n]) ### <a name="torch.linspace"></a> diff --git a/generic/Storage.c b/generic/Storage.c index 134dc63..a6652a5 100644 --- a/generic/Storage.c +++ b/generic/Storage.c @@ -41,7 +41,7 @@ static int torch_Storage_(new)(lua_State *L) THStorage_(free)(storage); luaL_error(L, "element at index %d is not a number", i); } - THStorage_(set)(storage, i-1, (real)lua_tonumber(L, -1)); + THStorage_(set)(storage, i-1, LUA_NUMBER_TO_REAL(lua_tonumber(L, -1))); lua_pop(L, 1); } } @@ -131,6 +131,8 @@ static int torch_Storage_(copy)(lua_State *L) THStorage_(copyFloat)(storage, src); else if( (src = luaT_toudata(L, 2, "torch.DoubleStorage")) ) THStorage_(copyDouble)(storage, src); + else if( (src = luaT_toudata(L, 2, "torch.HalfStorage")) ) + THStorage_(copyHalf)(storage, src); else luaL_typerror(L, 2, "torch.*Storage"); lua_settop(L, 1); diff --git a/generic/Tensor.c b/generic/Tensor.c index abb7819..c2417fe 100644 --- a/generic/Tensor.c +++ b/generic/Tensor.c @@ 
-142,7 +142,7 @@ static int torch_Tensor_(new)(lua_State *L) THTensor_(free)(tensor); THError("invalid element (not a number)"); } - THStorage_(set)(THTensor_(storage)(tensor), si++, (real)lua_tonumber(L, -1)); + THStorage_(set)(THTensor_(storage)(tensor), si++, LUA_NUMBER_TO_REAL(lua_tonumber(L, -1))); lua_pop(L, 1); } @@ -384,6 +384,7 @@ static int torch_Tensor_(select)(lua_State *L) return 1; } +#ifndef TH_REAL_IS_HALF static int torch_Tensor_(indexSelect)(lua_State *L) { int narg = lua_gettop(L); @@ -567,6 +568,7 @@ static int torch_Tensor_(maskedFill)(lua_State *L) return 1; } +#endif static int torch_Tensor_(transpose)(lua_State *L) { @@ -675,6 +677,8 @@ static int torch_Tensor_(copy)(lua_State *L) THTensor_(copyFloat)(tensor, src); else if( (src = luaT_toudata(L, 2, "torch.DoubleTensor")) ) THTensor_(copyDouble)(tensor, src); + else if( (src = luaT_toudata(L, 2, "torch.HalfTensor")) ) + THTensor_(copyHalf)(tensor, src); else luaL_typerror(L, 2, "torch.*Tensor"); lua_settop(L, 1); @@ -700,10 +704,14 @@ static int torch_Tensor_(__newindex__)(lua_State *L) THArgCheck(index >= 0 && index < tensor->size[0], 2, "out of range"); THStorage_(set)(tensor->storage, tensor->storageOffset+index*tensor->stride[0], value); } else { +#ifndef TH_REAL_IS_HALF tensor = THTensor_(newWithTensor)(tensor); THTensor_(narrow)(tensor, NULL, 0, index, 1); THTensor_(fill)(tensor, value); THTensor_(free)(tensor); +#else + THError("fill on torch.HalfTensor not yet supported"); +#endif } } else if( (src = luaT_toudata(L, 3, torch_Tensor)) ) { tensor = THTensor_(newWithTensor)(tensor); @@ -745,6 +753,11 @@ static int torch_Tensor_(__newindex__)(lua_State *L) THTensor_(narrow)(tensor, NULL, 0, index, 1); THTensor_(copyDouble)(tensor, src); THTensor_(free)(tensor); + } else if( (src = luaT_toudata(L, 3, "torch.HalfTensor")) ) { + tensor = THTensor_(newWithTensor)(tensor); + THTensor_(narrow)(tensor, NULL, 0, index, 1); + THTensor_(copyHalf)(tensor, src); + THTensor_(free)(tensor); } else { 
luaL_typerror(L, 3, "torch.*Tensor"); } @@ -829,7 +842,11 @@ static int torch_Tensor_(__newindex__)(lua_State *L) /* doing a copy */ void *src; if (lua_isnumber(L,3)) { - THTensor_(fill)(tensor, lua_tonumber(L,3)); +#ifndef TH_REAL_IS_HALF + THTensor_(fill)(tensor, LUA_NUMBER_TO_REAL(lua_tonumber(L,3))); +#else + THError("fill on torch.HalfTensor not yet supported"); +#endif } else if( (src = luaT_toudata(L, 3, torch_Tensor)) ) { THTensor_(copy)(tensor, src); } else if( (src = luaT_toudata(L, 3, "torch.ByteTensor")) ) { @@ -846,6 +863,8 @@ static int torch_Tensor_(__newindex__)(lua_State *L) THTensor_(copyFloat)(tensor, src); } else if( (src = luaT_toudata(L, 3, "torch.DoubleTensor")) ) { THTensor_(copyDouble)(tensor, src); + } else if( (src = luaT_toudata(L, 3, "torch.HalfTensor")) ) { + THTensor_(copyHalf)(tensor, src); } else { luaL_typerror(L, 3, "torch.*Tensor"); } @@ -855,6 +874,7 @@ static int torch_Tensor_(__newindex__)(lua_State *L) } else if((mask = luaT_toudata(L, 2, "torch.ByteTensor"))) { +#ifndef TH_REAL_IS_HALF THTensor *vals; if (lua_isnumber(L, 3)) { @@ -868,6 +888,9 @@ static int torch_Tensor_(__newindex__)(lua_State *L) { THError("number or " torch_Tensor " expected"); } +#else + THError("ByteTensor indexing not yet supported with half types"); +#endif } else lua_pushboolean(L, 0); @@ -916,7 +939,7 @@ static int torch_Tensor_(__index__)(lua_State *L) THArgCheck((z >= 0) && (z < tensor->size[dim]), 2, "index out of bound"); index += z*tensor->stride[dim]; } - luaG_(pushreal)(L, (double)THStorage_(get)(THTensor_(storage)(tensor), index)); + luaG_(pushreal)(L, THStorage_(get)(THTensor_(storage)(tensor), index)); lua_pushboolean(L, 1); return 2; } @@ -987,11 +1010,16 @@ static int torch_Tensor_(__index__)(lua_State *L) } else if((mask = luaT_toudata(L, 2, "torch.ByteTensor"))) { +#ifndef TH_REAL_IS_HALF THTensor *vals = THTensor_(new)(); THTensor_(maskedSelect)(vals, tensor, mask); luaT_pushudata(L, vals, torch_Tensor); lua_pushboolean(L, 1); return 
2; +#else + THError("ByteTensor based indexing not yet supported with half types"); + return 0; +#endif } else { @@ -1131,6 +1159,7 @@ static void torch_Tensor_(c_readTensorStorageSizeStride)(lua_State *L, int index THArgCheck(0, index, "expecting number"); } +#ifndef TH_REAL_IS_HALF static int torch_Tensor_(apply)(lua_State *L) { THTensor *tensor = luaT_checkudata(L, 1, torch_Tensor); @@ -1143,7 +1172,7 @@ static int torch_Tensor_(apply)(lua_State *L) lua_call(L, 1, 1); if(lua_isnumber(L, 3)) { - *tensor_data = (real)lua_tonumber(L, 3); + *tensor_data = LUA_NUMBER_TO_REAL(lua_tonumber(L, 3)); lua_pop(L, 1); } else if(lua_isnil(L, 3)) @@ -1169,7 +1198,7 @@ static int torch_Tensor_(map)(lua_State *L) lua_call(L, 2, 1); if(lua_isnumber(L, 4)) { - *tensor_data = (real)lua_tonumber(L, 4); + *tensor_data = LUA_NUMBER_TO_REAL(lua_tonumber(L, 4)); lua_pop(L, 1); } else if(lua_isnil(L, 4)) @@ -1197,7 +1226,7 @@ static int torch_Tensor_(map2)(lua_State *L) lua_call(L, 3, 1); if(lua_isnumber(L, 5)) { - *tensor_data = (real)lua_tonumber(L, 5); + *tensor_data = LUA_NUMBER_TO_REAL(lua_tonumber(L, 5)); lua_pop(L, 1); } else if(lua_isnil(L, 5)) @@ -1208,6 +1237,7 @@ static int torch_Tensor_(map2)(lua_State *L) lua_settop(L, 1); return 1; } +#endif static int torch_Tensor_(factory)(lua_State *L) { @@ -1286,6 +1316,7 @@ static const struct luaL_Reg torch_Tensor_(_) [] = { {"narrow", torch_Tensor_(narrow)}, {"sub", torch_Tensor_(sub)}, {"select", torch_Tensor_(select)}, +#ifndef TH_REAL_IS_HALF {"index", torch_Tensor_(indexSelect)}, {"indexCopy", torch_Tensor_(indexCopy)}, {"indexAdd", torch_Tensor_(indexAdd)}, @@ -1293,6 +1324,7 @@ static const struct luaL_Reg torch_Tensor_(_) [] = { {"maskedSelect", torch_Tensor_(maskedSelect)}, {"maskedCopy", torch_Tensor_(maskedCopy)}, {"maskedFill", torch_Tensor_(maskedFill)}, +#endif {"transpose", torch_Tensor_(transpose)}, {"t", torch_Tensor_(t)}, {"unfold", torch_Tensor_(unfold)}, @@ -1302,9 +1334,11 @@ static const struct luaL_Reg 
torch_Tensor_(_) [] = { {"isSize", torch_Tensor_(isSize)}, {"nElement", torch_Tensor_(nElement)}, {"copy", torch_Tensor_(copy)}, +#ifndef TH_REAL_IS_HALF {"apply", torch_Tensor_(apply)}, {"map", torch_Tensor_(map)}, {"map2", torch_Tensor_(map2)}, +#endif {"read", torch_Tensor_(read)}, {"write", torch_Tensor_(write)}, {"__index__", torch_Tensor_(__index__)}, @@ -1318,7 +1352,10 @@ void torch_Tensor_(init)(lua_State *L) torch_Tensor_(new), torch_Tensor_(free), torch_Tensor_(factory)); luaT_setfuncs(L, torch_Tensor_(_), 0); lua_pop(L, 1); +#ifndef TH_REAL_IS_HALF THVector_(vectorDispatchInit)(); +#endif + } #endif diff --git a/generic/TensorOperator.c b/generic/TensorOperator.c index eba6c81..e131c57 100644 --- a/generic/TensorOperator.c +++ b/generic/TensorOperator.c @@ -14,7 +14,7 @@ static int torch_TensorOperator_(__add__)(lua_State *L) { r = THTensor_(new)(); luaT_pushudata(L, r, torch_Tensor); - + if(!tensor1 && tensor2) { THTensor_(resizeAs)(r, tensor2); @@ -49,7 +49,7 @@ static int torch_TensorOperator_(__sub__)(lua_State *L) { r = THTensor_(new)(); luaT_pushudata(L, r, torch_Tensor); - + if(!tensor1 && tensor2) { THTensor_(resizeAs)(r, tensor2); @@ -98,7 +98,7 @@ static int torch_TensorOperator_(__mul__)(lua_State *L) { r = THTensor_(new)(); luaT_pushudata(L, r, torch_Tensor); - + if(!tensor1 && tensor2) { THTensor_(resizeAs)(r, tensor2); @@ -115,7 +115,7 @@ static int torch_TensorOperator_(__mul__)(lua_State *L) { int dimt = tensor1->nDimension; int dims = tensor2->nDimension; - + if(dimt == 1 && dims == 1) lua_pushnumber(L, THTensor_(dot)(tensor1, tensor2)); /* ok, we wasted r, but who cares */ else if(dimt == 2 && dims == 1) @@ -131,7 +131,7 @@ static int torch_TensorOperator_(__mul__)(lua_State *L) THTensor_(addmm)(r, 1, r, 1, tensor1, tensor2); } else - luaL_error(L, "multiplication between %dD and %dD tensors not yet supported", tensor1->nDimension, tensor2->nDimension); + luaL_error(L, "multiplication between %dD and %dD tensors not yet supported", 
tensor1->nDimension, tensor2->nDimension); } } return 1; diff --git a/generic/luaG.h b/generic/luaG.h index 950eae9..f1ffce2 100644 --- a/generic/luaG.h +++ b/generic/luaG.h @@ -6,10 +6,24 @@ #define luaG_(NAME) TH_CONCAT_3(luaG_,Real,NAME) #endif -static void luaG_(pushreal)(lua_State *L, accreal n) { -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || LUA_VERSION_NUM < 503 - lua_pushnumber(L, (lua_Number)n); -#elif defined(TH_REAL_IS_BYTE) || defined(TH_REAL_IS_CHAR) || defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG) +#undef REAL_TO_LUA_NUMBER +#undef LUA_NUMBER_TO_REAL + +#if defined(TH_REAL_IS_HALF) +# define REAL_TO_LUA_NUMBER(n) (lua_Number)TH_half2float(n) +# define LUA_NUMBER_TO_REAL(n) TH_float2half((lua_Number)n) +#else +# define REAL_TO_LUA_NUMBER(n) (lua_Number)(n) +# define LUA_NUMBER_TO_REAL(n) (real)n +#endif + + + +static void luaG_(pushreal)(lua_State *L, real n) { +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) || LUA_VERSION_NUM < 503 + lua_pushnumber(L, REAL_TO_LUA_NUMBER(n)); +#elif defined(TH_REAL_IS_BYTE) || defined(TH_REAL_IS_CHAR) || defined(TH_REAL_IS_SHORT) \ + || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG) lua_pushinteger(L, (lua_Integer)n); #else #error "unhandled real type in luaG_pushreal" @@ -17,8 +31,8 @@ static void luaG_(pushreal)(lua_State *L, accreal n) { } static real luaG_(checkreal)(lua_State *L, int idx) { -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) - return (lua_Number)luaL_checknumber(L, idx); +#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_HALF) + return LUA_NUMBER_TO_REAL(luaL_checknumber(L, idx)); #elif defined(TH_REAL_IS_BYTE) || defined(TH_REAL_IS_CHAR) || defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG) int type = lua_type(L, idx); if (type == LUA_TSTRING) { @@ -38,8 +52,8 @@ static real luaG_(checkreal)(lua_State *L, int idx) { } static real 
luaG_(optreal)(lua_State *L, int idx, real n) { -#if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || LUA_VERSION_NUM < 503 - return (lua_Number)luaL_optnumber(L, idx, (lua_Number)n); +#if defined(TH_REAL_IS_HALF) || defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) || LUA_VERSION_NUM < 503 + return LUA_NUMBER_TO_REAL(luaL_optnumber(L, idx, REAL_TO_LUA_NUMBER(n))); #elif defined(TH_REAL_IS_BYTE) || defined(TH_REAL_IS_CHAR) || defined(TH_REAL_IS_SHORT) || defined(TH_REAL_IS_INT) || defined(TH_REAL_IS_LONG) return (lua_Integer)luaL_optinteger(L, idx, (lua_Integer)n); #else @@ -16,6 +16,7 @@ extern void torch_IntStorage_init(lua_State *L); extern void torch_LongStorage_init(lua_State *L); extern void torch_FloatStorage_init(lua_State *L); extern void torch_DoubleStorage_init(lua_State *L); +extern void torch_HalfStorage_init(lua_State *L); extern void torch_ByteTensor_init(lua_State *L); extern void torch_CharTensor_init(lua_State *L); @@ -24,6 +25,7 @@ extern void torch_IntTensor_init(lua_State *L); extern void torch_LongTensor_init(lua_State *L); extern void torch_FloatTensor_init(lua_State *L); extern void torch_DoubleTensor_init(lua_State *L); +extern void torch_HalfTensor_init(lua_State *L); extern void torch_ByteTensorOperator_init(lua_State *L); extern void torch_CharTensorOperator_init(lua_State *L); @@ -33,8 +35,10 @@ extern void torch_LongTensorOperator_init(lua_State *L); extern void torch_FloatTensorOperator_init(lua_State *L); extern void torch_DoubleTensorOperator_init(lua_State *L); + extern void torch_TensorMath_init(lua_State *L); + LUA_EXTERNC DLL_EXPORT int luaopen_libtorch(lua_State *L); int luaopen_libtorch(lua_State *L) @@ -45,7 +49,6 @@ int luaopen_libtorch(lua_State *L) lua_setglobal(L, "torch"); torch_utils_init(L); - torch_File_init(L); torch_ByteStorage_init(L); @@ -55,6 +58,7 @@ int luaopen_libtorch(lua_State *L) torch_LongStorage_init(L); torch_FloatStorage_init(L); torch_DoubleStorage_init(L); + torch_HalfStorage_init(L); 
torch_ByteTensor_init(L); torch_CharTensor_init(L); @@ -63,6 +67,7 @@ int luaopen_libtorch(lua_State *L) torch_LongTensor_init(L); torch_FloatTensor_init(L); torch_DoubleTensor_init(L); + torch_HalfTensor_init(L); torch_ByteTensorOperator_init(L); torch_CharTensorOperator_init(L); diff --git a/lib/TH/CMakeLists.txt b/lib/TH/CMakeLists.txt index e6cf91d..0b5f166 100644 --- a/lib/TH/CMakeLists.txt +++ b/lib/TH/CMakeLists.txt @@ -78,6 +78,21 @@ IF (CORTEXA9_FOUND) SET(CMAKE_C_FLAGS "-mcpu=cortex-a9 ${CMAKE_C_FLAGS}") ENDIF (CORTEXA9_FOUND) +INCLUDE(CheckIncludeFile) +CHECK_INCLUDE_FILE(cpuid.h HAVE_CPUID_H) +# Check for a cpuid intrinsic +IF(HAVE_CPUID_H) + CHECK_C_SOURCE_COMPILES("#include <cpuid.h> + int main() + { + unsigned int eax, ebx, ecx, edx; + return __get_cpuid(0, &eax, &ebx, &ecx, &edx); + }" HAVE_GCC_GET_CPUID) +ENDIF() +IF(HAVE_GCC_GET_CPUID) + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DHAVE_GCC_GET_CPUID") +ENDIF(HAVE_GCC_GET_CPUID) + FIND_PACKAGE(SSE) IF(C_SSE2_FOUND) SET(CMAKE_C_FLAGS "${C_SSE2_FLAGS} -DUSE_SSE2 ${CMAKE_C_FLAGS}") @@ -122,11 +137,11 @@ IF(C_AVX_FOUND) ENDIF(C_AVX_FOUND) SET(hdr - THGeneral.h THAllocator.h THStorage.h THTensor.h THTensorApply.h THBlas.h THMath.h - THLapack.h THLogAdd.h THRandom.h THVector.h THAtomic.h) + THGeneral.h THHalf.h THAllocator.h THStorage.h THTensor.h THTensorApply.h THBlas.h THMath.h + THLapack.h THLogAdd.h THRandom.h THVector.h THAtomic.h ) SET(src - THGeneral.c THAllocator.c THStorage.c THTensor.c THBlas.c THLapack.c + THGeneral.c THHalf.c THAllocator.c THStorage.c THTensor.c THBlas.c THLapack.c THLogAdd.c THRandom.c THFile.c THDiskFile.c THMemoryFile.c THAtomic.c THVector.c) SET(src ${src} ${hdr} ${simd}) @@ -135,9 +150,13 @@ if(BUILD_STATIC) ADD_LIBRARY(TH_static STATIC ${src}) endif() +IF(NOT TH_SO_VERSION) + SET(TH_SO_VERSION 0) +ENDIF(NOT TH_SO_VERSION) +MESSAGE(STATUS "TH_SO_VERSION: ${TH_SO_VERSION}") SET_TARGET_PROPERTIES(TH PROPERTIES - VERSION 0 - SOVERSION 0) + VERSION ${TH_SO_VERSION} + SOVERSION 
${TH_SO_VERSION}) CHECK_C_SOURCE_RUNS(" #include <stdatomic.h> @@ -320,6 +339,7 @@ INSTALL(FILES THFilePrivate.h ${CMAKE_CURRENT_BINARY_DIR}/THGeneral.h THGenerateAllTypes.h + THGenerateHalfType.h THGenerateFloatTypes.h THGenerateIntTypes.h THLapack.h @@ -333,6 +353,7 @@ INSTALL(FILES THTensorMacros.h THVector.h THAtomic.h + THHalf.h DESTINATION "${TH_INSTALL_INCLUDE_SUBDIR}/TH") INSTALL(FILES diff --git a/lib/TH/THDiskFile.c b/lib/TH/THDiskFile.c index 9d9cbae..01b1951 100644 --- a/lib/TH/THDiskFile.c +++ b/lib/TH/THDiskFile.c @@ -348,14 +348,14 @@ READ_WRITE_METHODS(int, Int, int ret = fscanf(dfself->handle, "%d", &data[i]); if(ret <= 0) break; else nread++, int ret = fprintf(dfself->handle, "%d", data[i]); if(ret <= 0) break; else nwrite++) -/*READ_WRITE_METHODS(long, Long, - int ret = fscanf(dfself->handle, "%ld", &data[i]); if(ret <= 0) break; else nread++, - int ret = fprintf(dfself->handle, "%ld", data[i]); if(ret <= 0) break; else nwrite++)*/ - READ_WRITE_METHODS(float, Float, int ret = fscanf(dfself->handle, "%g", &data[i]); if(ret <= 0) break; else nread++, int ret = fprintf(dfself->handle, "%.9g", data[i]); if(ret <= 0) break; else nwrite++) +READ_WRITE_METHODS(THHalf, Half, + float buf; int ret = fscanf(dfself->handle, "%g", &buf); if(ret <= 0) break; else { data[i]= TH_float2half(buf); nread++; }, + int ret = fprintf(dfself->handle, "%.9g", TH_half2float(data[i])); if(ret <= 0) break; else nwrite++) + READ_WRITE_METHODS(double, Double, int ret = fscanf(dfself->handle, "%lg", &data[i]); if(ret <= 0) break; else nread++, int ret = fprintf(dfself->handle, "%.17g", data[i]); if(ret <= 0) break; else nwrite++) @@ -618,6 +618,7 @@ THFile *THDiskFile_new(const char *name, const char *mode, int isQuiet) THDiskFile_readLong, THDiskFile_readFloat, THDiskFile_readDouble, + THDiskFile_readHalf, THDiskFile_readString, THDiskFile_writeByte, @@ -627,6 +628,7 @@ THFile *THDiskFile_new(const char *name, const char *mode, int isQuiet) THDiskFile_writeLong, 
THDiskFile_writeFloat, THDiskFile_writeDouble, + THDiskFile_writeHalf, THDiskFile_writeString, THDiskFile_synchronize, @@ -730,6 +732,7 @@ THFile *THPipeFile_new(const char *name, const char *mode, int isQuiet) THDiskFile_readLong, THDiskFile_readFloat, THDiskFile_readDouble, + THDiskFile_readHalf, THDiskFile_readString, THDiskFile_writeByte, @@ -739,6 +742,7 @@ THFile *THPipeFile_new(const char *name, const char *mode, int isQuiet) THDiskFile_writeLong, THDiskFile_writeFloat, THDiskFile_writeDouble, + THDiskFile_writeHalf, THDiskFile_writeString, THDiskFile_synchronize, diff --git a/lib/TH/THFile.c b/lib/TH/THFile.c index c8913af..3717b7b 100644 --- a/lib/TH/THFile.c +++ b/lib/TH/THFile.c @@ -19,6 +19,7 @@ IMPLEMENT_THFILE_RW(Int, int) IMPLEMENT_THFILE_RW(Long, long) IMPLEMENT_THFILE_RW(Float, float) IMPLEMENT_THFILE_RW(Double, double) +IMPLEMENT_THFILE_RW(Half, THHalf) size_t THFile_readStringRaw(THFile *self, const char *format, char **str_) { @@ -133,6 +134,7 @@ IMPLEMENT_THFILE_SCALAR(Int, int) IMPLEMENT_THFILE_SCALAR(Long, long) IMPLEMENT_THFILE_SCALAR(Float, float) IMPLEMENT_THFILE_SCALAR(Double, double) +IMPLEMENT_THFILE_SCALAR(Half, THHalf) #define IMPLEMENT_THFILE_STORAGE(TYPEC, TYPE) \ size_t THFile_read##TYPEC(THFile *self, TH##TYPEC##Storage *storage) \ @@ -152,3 +154,4 @@ IMPLEMENT_THFILE_STORAGE(Int, int) IMPLEMENT_THFILE_STORAGE(Long, long) IMPLEMENT_THFILE_STORAGE(Float, float) IMPLEMENT_THFILE_STORAGE(Double, double) +IMPLEMENT_THFILE_STORAGE(Half, THHalf) diff --git a/lib/TH/THFile.h b/lib/TH/THFile.h index 64dd2da..e097bdf 100644 --- a/lib/TH/THFile.h +++ b/lib/TH/THFile.h @@ -74,6 +74,13 @@ TH_API size_t THFile_writeFloatRaw(THFile *self, float *data, size_t n); TH_API size_t THFile_writeDoubleRaw(THFile *self, double *data, size_t n); TH_API size_t THFile_writeStringRaw(THFile *self, const char *str, size_t size); +TH_API THHalf THFile_readHalfScalar(THFile *self); +TH_API void THFile_writeHalfScalar(THFile *self, THHalf scalar); +TH_API 
size_t THFile_readHalf(THFile *self, THHalfStorage *storage); +TH_API size_t THFile_writeHalf(THFile *self, THHalfStorage *storage); +TH_API size_t THFile_readHalfRaw(THFile *self, THHalf* data, size_t size); +TH_API size_t THFile_writeHalfRaw(THFile *self, THHalf* data, size_t size); + TH_API void THFile_synchronize(THFile *self); TH_API void THFile_seek(THFile *self, size_t position); TH_API void THFile_seekEnd(THFile *self); diff --git a/lib/TH/THFilePrivate.h b/lib/TH/THFilePrivate.h index d268041..55169c3 100644 --- a/lib/TH/THFilePrivate.h +++ b/lib/TH/THFilePrivate.h @@ -1,3 +1,8 @@ +#include "THGeneral.h" + +#include "THHalf.h" + + struct THFile__ { struct THFileVTable *vtable; @@ -23,6 +28,7 @@ struct THFileVTable size_t (*readLong)(THFile *self, long *data, size_t n); size_t (*readFloat)(THFile *self, float *data, size_t n); size_t (*readDouble)(THFile *self, double *data, size_t n); + size_t (*readHalf)(THFile *self, THHalf *data, size_t n); size_t (*readString)(THFile *self, const char *format, char **str_); size_t (*writeByte)(THFile *self, unsigned char *data, size_t n); @@ -32,6 +38,7 @@ struct THFileVTable size_t (*writeLong)(THFile *self, long *data, size_t n); size_t (*writeFloat)(THFile *self, float *data, size_t n); size_t (*writeDouble)(THFile *self, double *data, size_t n); + size_t (*writeHalf)(THFile *self, THHalf *data, size_t n); size_t (*writeString)(THFile *self, const char *str, size_t size); void (*synchronize)(THFile *self); diff --git a/lib/TH/THGenerateAllTypes.h b/lib/TH/THGenerateAllTypes.h index 539629b..4a77081 100644 --- a/lib/TH/THGenerateAllTypes.h +++ b/lib/TH/THGenerateAllTypes.h @@ -8,7 +8,6 @@ #define THInf UCHAR_MAX #define TH_REAL_IS_BYTE #line 1 TH_GENERIC_FILE -/*#line 1 "THByteStorage.h"*/ #include TH_GENERIC_FILE #undef real #undef accreal diff --git a/lib/TH/THGenerateHalfType.h b/lib/TH/THGenerateHalfType.h new file mode 100644 index 0000000..9acc534 --- /dev/null +++ b/lib/TH/THGenerateHalfType.h @@ -0,0 +1,19 @@ 
+#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateHalfType.h" +#endif + +#include "THHalf.h" +#define real THHalf +#define accreal float +#define Real Half +#define THInf TH_HALF_MAX +#define TH_REAL_IS_HALF +#line 1 TH_GENERIC_FILE +#include TH_GENERIC_FILE +#undef real +#undef accreal +#undef Real +#undef THInf +#undef TH_REAL_IS_HALF + +#undef TH_GENERIC_FILE diff --git a/lib/TH/THHalf.c b/lib/TH/THHalf.c new file mode 100644 index 0000000..b0b9bc5 --- /dev/null +++ b/lib/TH/THHalf.c @@ -0,0 +1,138 @@ +#include "THHalf.h" + +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +// Host functions for converting between FP32 and FP16 formats +// Paulius Micikevicius (pauliusm@nvidia.com) + +float TH_half2float(THHalf h) +{ + unsigned sign = ((h.x >> 15) & 1); + unsigned exponent = ((h.x >> 10) & 0x1f); + unsigned mantissa = ((h.x & 0x3ff) << 13); + + if (exponent == 0x1f) { /* NaN or Inf */ + mantissa = (mantissa ? 
(sign = 0, 0x7fffff) : 0); + exponent = 0xff; + } else if (!exponent) { /* Denorm or Zero */ + if (mantissa) { + unsigned int msb; + exponent = 0x71; + do { + msb = (mantissa & 0x400000); + mantissa <<= 1; /* normalize */ + --exponent; + } while (!msb); + mantissa &= 0x7fffff; /* 1.mantissa is implicit */ + } + } else { + exponent += 0x70; + } + + int temp = ((sign << 31) | (exponent << 23) | mantissa); + + return *((float*)((void*)&temp)); +} + +THHalf TH_float2half(float f) +{ + THHalf ret; + + unsigned x = *((int*)(void*)(&f)); + unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1; + unsigned sign, exponent, mantissa; + + // Get rid of +NaN/-NaN case first. + if (u > 0x7f800000) { + ret.x = 0x7fffU; + return ret; + } + + sign = ((x >> 16) & 0x8000); + + // Get rid of +Inf/-Inf, +0/-0. + if (u > 0x477fefff) { + ret.x = sign | 0x7c00U; + return ret; + } + if (u < 0x33000001) { + ret.x = (sign | 0x0000); + return ret; + } + + exponent = ((u >> 23) & 0xff); + mantissa = (u & 0x7fffff); + + if (exponent > 0x70) { + shift = 13; + exponent -= 0x70; + } else { + shift = 0x7e - exponent; + exponent = 0; + mantissa |= 0x800000; + } + lsb = (1 << shift); + lsb_s1 = (lsb >> 1); + lsb_m1 = (lsb - 1); + + // Round to nearest even. 
+ remainder = (mantissa & lsb_m1); + mantissa >>= shift; + if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) { + ++mantissa; + if (!(mantissa & 0x3ff)) { + ++exponent; + mantissa = 0; + } + } + + ret.x = (sign | (exponent << 10) | mantissa); + return ret; +} diff --git a/lib/TH/THHalf.h b/lib/TH/THHalf.h new file mode 100644 index 0000000..0549d21 --- /dev/null +++ b/lib/TH/THHalf.h @@ -0,0 +1,40 @@ +#ifndef TH_HALF_H +#define TH_HALF_H + +#include "THGeneral.h" +#include <stdint.h> + +/* Neither built-in nor included from Cutorch, use our definition lifted from CUDA */ +#if defined(__GNUC__) +#define __thalign__(n) __attribute__((aligned(n))) +#elif defined(_WIN32) +#define __thalign__(n) __declspec(align(n)) +#else +#define __thalign__(n) +#endif + +typedef struct __thalign__(2){ + unsigned short x; +} __THHalf; + +typedef struct __thalign__(4) { + unsigned int x; +} __THHalf2; + +typedef __THHalf THHalf; +typedef __THHalf2 THHalf2; + +/* numeric limits */ + + +TH_API THHalf TH_float2half(float a); +TH_API float TH_half2float(THHalf a); + +#ifndef TH_HALF_BITS_TO_LITERAL +# define TH_HALF_BITS_TO_LITERAL(n) { n } +#endif + +#define TH_HALF_MAX TH_HALF_BITS_TO_LITERAL(0x7BFF) + +#undef __thalign__ +#endif diff --git a/lib/TH/THMemoryFile.c b/lib/TH/THMemoryFile.c index 8d97621..ecce6e1 100644 --- a/lib/TH/THMemoryFile.c +++ b/lib/TH/THMemoryFile.c @@ -332,16 +332,18 @@ READ_WRITE_METHODS(int, Int, nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%d", data[i]), 1) -/*READ_WRITE_METHODS(long, Long, - int nByteRead_; int ret = sscanf(mfself->storage->data+mfself->position, "%ld%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, - nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%ld", data[i]), - 1)*/ - READ_WRITE_METHODS(float, Float, int nByteRead_; int ret = 
sscanf(mfself->storage->data+mfself->position, "%g%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%.9g", data[i]), 1) +READ_WRITE_METHODS(THHalf, Half, + int nByteRead_; float buf; \ + int ret = sscanf(mfself->storage->data+mfself->position, "%g%n", &buf, &nByteRead_); \ + data[i] = TH_float2half(buf); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, + nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%.9g", TH_half2float(data[i])), + 1) + READ_WRITE_METHODS(double, Double, int nByteRead_; int ret = sscanf(mfself->storage->data+mfself->position, "%lg%n", &data[i], &nByteRead_); nByteRead = nByteRead_; if(ret <= 0) break; else nread++, nByteWritten = snprintf(mfself->storage->data+mfself->position, mfself->storage->size-mfself->position, "%.17g", data[i]), @@ -621,6 +623,7 @@ THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode) THMemoryFile_readLong, THMemoryFile_readFloat, THMemoryFile_readDouble, + THMemoryFile_readHalf, THMemoryFile_readString, THMemoryFile_writeByte, @@ -630,6 +633,7 @@ THFile *THMemoryFile_newWithStorage(THCharStorage *storage, const char *mode) THMemoryFile_writeLong, THMemoryFile_writeFloat, THMemoryFile_writeDouble, + THMemoryFile_writeHalf, THMemoryFile_writeString, THMemoryFile_synchronize, diff --git a/lib/TH/THStorage.c b/lib/TH/THStorage.c index d18488e..bb63a43 100644 --- a/lib/TH/THStorage.c +++ b/lib/TH/THStorage.c @@ -4,5 +4,11 @@ #include "generic/THStorage.c" #include "THGenerateAllTypes.h" +#include "generic/THStorage.c" +#include "THGenerateHalfType.h" + #include "generic/THStorageCopy.c" #include "THGenerateAllTypes.h" + +#include "generic/THStorageCopy.c" +#include "THGenerateHalfType.h" diff --git a/lib/TH/THStorage.h b/lib/TH/THStorage.h index 36ed507..9565e10 100644 --- a/lib/TH/THStorage.h +++ 
b/lib/TH/THStorage.h @@ -14,7 +14,13 @@ #include "generic/THStorage.h" #include "THGenerateAllTypes.h" +#include "generic/THStorage.h" +#include "THGenerateHalfType.h" + #include "generic/THStorageCopy.h" #include "THGenerateAllTypes.h" +#include "generic/THStorageCopy.h" +#include "THGenerateHalfType.h" + #endif diff --git a/lib/TH/THTensor.c b/lib/TH/THTensor.c index 2878fc9..37071df 100644 --- a/lib/TH/THTensor.c +++ b/lib/TH/THTensor.c @@ -11,9 +11,15 @@ #include "generic/THTensor.c" #include "THGenerateAllTypes.h" +#include "generic/THTensor.c" +#include "THGenerateHalfType.h" + #include "generic/THTensorCopy.c" #include "THGenerateAllTypes.h" +#include "generic/THTensorCopy.c" +#include "THGenerateHalfType.h" + #include "generic/THTensorRandom.c" #include "THGenerateAllTypes.h" diff --git a/lib/TH/THTensor.h b/lib/TH/THTensor.h index 6eddf9c..a155efd 100644 --- a/lib/TH/THTensor.h +++ b/lib/TH/THTensor.h @@ -16,9 +16,15 @@ typedef struct { #include "generic/THTensor.h" #include "THGenerateAllTypes.h" +#include "generic/THTensor.h" +#include "THGenerateHalfType.h" + #include "generic/THTensorCopy.h" #include "THGenerateAllTypes.h" +#include "generic/THTensorCopy.h" +#include "THGenerateHalfType.h" + #include "THTensorMacros.h" /* random numbers */ diff --git a/lib/TH/THVector.c b/lib/TH/THVector.c index 6179d89..907adbb 100644 --- a/lib/TH/THVector.c +++ b/lib/TH/THVector.c @@ -1,10 +1,15 @@ #include "THVector.h" + #include "generic/simd/simd.h" #ifdef __NEON__ #include "vector/NEON.c" #endif +#ifdef __PPC64__ +#include "vector/VSX.c" +#endif + #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \ || defined(USE_SSE4_1) || defined(USE_SSE4_2) #include "vector/SSE.c" diff --git a/lib/TH/generic/THBlas.c b/lib/TH/generic/THBlas.c index 6452f94..371df4d 100644 --- a/lib/TH/generic/THBlas.c +++ b/lib/TH/generic/THBlas.c @@ -2,6 +2,7 @@ #define TH_GENERIC_FILE "generic/THBlas.c" #else + #ifdef BLAS_F2C # define ffloat double #else @@ -24,8 +25,8 @@ 
TH_EXTERNC void dger_(int *m, int *n, double *alpha, double *x, int *incx, doubl TH_EXTERNC void sger_(int *m, int *n, float *alpha, float *x, int *incx, float *y, int *incy, float *a, int *lda); TH_EXTERNC void dgemm_(char *transa, char *transb, int *m, int *n, int *k, double *alpha, double *a, int *lda, double *b, int *ldb, double *beta, double *c, int *ldc); TH_EXTERNC void sgemm_(char *transa, char *transb, int *m, int *n, int *k, float *alpha, float *a, int *lda, float *b, int *ldb, float *beta, float *c, int *ldc); - - + + void THBlas_(swap)(long n, real *x, long incx, real *y, long incy) { @@ -182,9 +183,9 @@ void THBlas_(gemv)(char trans, long m, long n, real alpha, real *a, long lda, re { if(n == 1) lda = m; - + #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) - if( (m <= INT_MAX) && (n <= INT_MAX) && + if( (m <= INT_MAX) && (n <= INT_MAX) && (lda > 0) && (lda <= INT_MAX) && (incx > 0) && (incx <= INT_MAX) && (incy > 0) && (incy <= INT_MAX) ) @@ -224,7 +225,7 @@ void THBlas_(gemv)(char trans, long m, long n, real alpha, real *a, long lda, re { if(beta != 1) THBlas_(scal)(m, beta, y, incy); - + for(j = 0; j < n; j++) { real *column_ = a+lda*j; diff --git a/lib/TH/generic/THStorageCopy.c b/lib/TH/generic/THStorageCopy.c index 583e088..ce4b57e 100644 --- a/lib/TH/generic/THStorageCopy.c +++ b/lib/TH/generic/THStorageCopy.c @@ -15,16 +15,42 @@ void THStorage_(copy)(THStorage *storage, THStorage *src) THStorage_(rawCopy)(storage, src->data); } - #define IMPLEMENT_THStorage_COPY(TYPENAMESRC) \ void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ { \ - ptrdiff_t i; \ + ptrdiff_t i; \ + for(i = 0; i < storage->size; i++) \ + storage->data[i] = (real)src->data[i]; \ +} + +#define IMPLEMENT_THStorage_COPY_FROM_HALF(TYPENAMESRC) \ +void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ +{ \ + THArgCheck(storage->size == src->size, 2, "size mismatch"); \ + ptrdiff_t 
i; \ + for(i = 0; i < storage->size; i++) \ + storage->data[i] = (real)TH_half2float(src->data[i]); \ +} + +#define IMPLEMENT_THStorage_COPY_TO_HALF(TYPENAMESRC) \ +void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ +{ \ THArgCheck(storage->size == src->size, 2, "size mismatch"); \ - for(i = 0; i < storage->size; i++) \ - storage->data[i] = (real)src->data[i]; \ + ptrdiff_t i; \ + for(i = 0; i < storage->size; i++) \ + storage->data[i] = TH_float2half((float)(src->data[i])); \ } +#define IMPLEMENT_THStorage_COPY_TO_FROM_HALF(TYPENAMESRC) \ +void THStorage_(copy##TYPENAMESRC)(THStorage *storage, TH##TYPENAMESRC##Storage *src) \ +{ \ + THArgCheck(storage->size == src->size, 2, "size mismatch"); \ + ptrdiff_t i; \ + for(i = 0; i < storage->size; i++) \ + storage->data[i] = src->data[i]; \ +} + +#ifndef TH_REAL_IS_HALF IMPLEMENT_THStorage_COPY(Byte) IMPLEMENT_THStorage_COPY(Char) IMPLEMENT_THStorage_COPY(Short) @@ -32,5 +58,18 @@ IMPLEMENT_THStorage_COPY(Int) IMPLEMENT_THStorage_COPY(Long) IMPLEMENT_THStorage_COPY(Float) IMPLEMENT_THStorage_COPY(Double) +IMPLEMENT_THStorage_COPY_FROM_HALF(Half) +#else +/* only allow pass-through for Half */ +IMPLEMENT_THStorage_COPY_TO_FROM_HALF(Half) +IMPLEMENT_THStorage_COPY_TO_HALF(Byte) +IMPLEMENT_THStorage_COPY_TO_HALF(Char) +IMPLEMENT_THStorage_COPY_TO_HALF(Short) +IMPLEMENT_THStorage_COPY_TO_HALF(Int) +IMPLEMENT_THStorage_COPY_TO_HALF(Long) +IMPLEMENT_THStorage_COPY_TO_HALF(Float) +IMPLEMENT_THStorage_COPY_TO_HALF(Double) +#endif + #endif diff --git a/lib/TH/generic/THStorageCopy.h b/lib/TH/generic/THStorageCopy.h index f853a82..ce8a2a6 100644 --- a/lib/TH/generic/THStorageCopy.h +++ b/lib/TH/generic/THStorageCopy.h @@ -13,5 +13,6 @@ TH_API void THStorage_(copyInt)(THStorage *storage, struct THIntStorage *src); TH_API void THStorage_(copyLong)(THStorage *storage, struct THLongStorage *src); TH_API void THStorage_(copyFloat)(THStorage *storage, struct THFloatStorage *src); TH_API void 
THStorage_(copyDouble)(THStorage *storage, struct THDoubleStorage *src); +TH_API void THStorage_(copyHalf)(THStorage *storage, struct THHalfStorage *src); #endif diff --git a/lib/TH/generic/THTensorConv.c b/lib/TH/generic/THTensorConv.c index d98a2aa..1e21991 100644 --- a/lib/TH/generic/THTensorConv.c +++ b/lib/TH/generic/THTensorConv.c @@ -2,7 +2,6 @@ #define TH_GENERIC_FILE "generic/THTensorConv.c" #else - /* 2D Input, 2D kernel : convolve given image with the given kernel. */ @@ -775,7 +774,7 @@ void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, real *ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows; /* get input */ real *ptr_input = input_data + p*istride0 + i*istride1; - + /* do image, kernel convolution */ THTensor_(validXCorr2DRevptr)(ptr_output, alpha, @@ -1174,7 +1173,7 @@ void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTe real *ptr_weight = weight_data + k*kstride0 + i*kstride1; /* get input */ real *ptr_input = input_data + p*nInputPlane*nInputRows*nInputCols + i*nInputRows*nInputCols; - + /* do image, kernel convolution */ if (*vf == 'F') if (*xc == 'X') @@ -1955,5 +1954,4 @@ void THTensor_(conv3Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THT THTensor_(free)(input); THTensor_(free)(kernel); } - #endif diff --git a/lib/TH/generic/THTensorConv.h b/lib/TH/generic/THTensorConv.h index d215fcd..79866f3 100644 --- a/lib/TH/generic/THTensorConv.h +++ b/lib/TH/generic/THTensorConv.h @@ -2,7 +2,6 @@ #define TH_GENERIC_FILE "generic/THTensorConv.h" #else - TH_API void THTensor_(validXCorr2Dptr)(real *r_, real alpha, real *t_, long ir, long ic, @@ -66,7 +65,7 @@ TH_API void THTensor_(fullConv3Dptr)(real *r_, long st, long sr, long sc); TH_API void THTensor_(validXCorr3DRevptr)(real *r_, - real alpha, + real alpha, real *t_, long it, long ir, long ic, real *k_, long kt, long kr, long kc, long st, long sr, long sc); diff --git 
a/lib/TH/generic/THTensorCopy.c b/lib/TH/generic/THTensorCopy.c index ea6d6f1..d7228f6 100644 --- a/lib/TH/generic/THTensorCopy.c +++ b/lib/TH/generic/THTensorCopy.c @@ -13,6 +13,19 @@ void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, *tensor_data = (real)(*src_data);) \ } +#define IMPLEMENT_THTensor_COPY_TO_HALF(TYPENAMESRC, TYPE_SRC) \ +void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ +{ \ + TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, *tensor_data = TH_float2half((float)*src_data);) \ +} + +#define IMPLEMENT_THTensor_COPY_FROM_HALF(TYPENAMESRC, TYPE_SRC) \ +void THTensor_(copy##TYPENAMESRC)(THTensor *tensor, TH##TYPENAMESRC##Tensor *src) \ +{ \ + TH_TENSOR_APPLY2(real, tensor, TYPE_SRC, src, *tensor_data = (real)TH_half2float(*src_data);) \ +} + +#ifndef TH_REAL_IS_HALF IMPLEMENT_THTensor_COPY(Byte, unsigned char) IMPLEMENT_THTensor_COPY(Char, char) IMPLEMENT_THTensor_COPY(Short, short) @@ -20,5 +33,18 @@ IMPLEMENT_THTensor_COPY(Int, int) IMPLEMENT_THTensor_COPY(Long, long) IMPLEMENT_THTensor_COPY(Float, float) IMPLEMENT_THTensor_COPY(Double, double) +IMPLEMENT_THTensor_COPY_FROM_HALF(Half, THHalf) +#else +/* only allow pass-through for Half */ +IMPLEMENT_THTensor_COPY(Half, THHalf) +IMPLEMENT_THTensor_COPY_TO_HALF(Byte, unsigned char) +IMPLEMENT_THTensor_COPY_TO_HALF(Char, char) +IMPLEMENT_THTensor_COPY_TO_HALF(Short, short) +IMPLEMENT_THTensor_COPY_TO_HALF(Int, int) +IMPLEMENT_THTensor_COPY_TO_HALF(Long, long) +IMPLEMENT_THTensor_COPY_TO_HALF(Float, float) +IMPLEMENT_THTensor_COPY_TO_HALF(Double, double) + +#endif /* REAL_IS_HALF */ #endif diff --git a/lib/TH/generic/THTensorCopy.h b/lib/TH/generic/THTensorCopy.h index 8d03b22..b9e5bfc 100644 --- a/lib/TH/generic/THTensorCopy.h +++ b/lib/TH/generic/THTensorCopy.h @@ -12,5 +12,6 @@ TH_API void THTensor_(copyInt)(THTensor *tensor, struct THIntTensor *src); TH_API void THTensor_(copyLong)(THTensor 
*tensor, struct THLongTensor *src); TH_API void THTensor_(copyFloat)(THTensor *tensor, struct THFloatTensor *src); TH_API void THTensor_(copyDouble)(THTensor *tensor, struct THDoubleTensor *src); +TH_API void THTensor_(copyHalf)(THTensor *tensor, struct THHalfTensor *src); #endif diff --git a/lib/TH/generic/THTensorMath.c b/lib/TH/generic/THTensorMath.c index 9fc1577..78ee056 100644 --- a/lib/TH/generic/THTensorMath.c +++ b/lib/TH/generic/THTensorMath.c @@ -98,10 +98,15 @@ void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor) long i = 0; long dim; long div = 1; +#ifdef TH_REAL_IS_HALF +#define IS_NONZERO(val) (TH_half2float(val)!=0) +#else +#define IS_NONZERO(val) ((val)!=0) +#endif /* First Pass to determine size of subscripts */ TH_TENSOR_APPLY(real, tensor, - if (*tensor_data != 0) { + if IS_NONZERO(*tensor_data) { ++numel; }); #ifdef DEBUG @@ -112,7 +117,7 @@ void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor) /* Second pass populates subscripts */ subscript_data = THLongTensor_data(subscript); TH_TENSOR_APPLY(real, tensor, - if (*tensor_data != 0) { + if IS_NONZERO(*tensor_data) { div = 1; for (dim = tensor->nDimension - 1; dim >= 0; dim--) { @@ -396,6 +401,7 @@ accreal THTensor_(dot)(THTensor *tensor, THTensor *src) return sum; } + #undef th_isnan #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) #define th_isnan(val) \ @@ -2596,5 +2602,45 @@ void THTensor_(histc)(THTensor *hist, THTensor *tensor, long nbins, real minvalu ); } +void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue) +{ + THArgCheck(THTensor_(nDimension)(tensor) < 3, 2, "invalid dimension %d, the input must be a 2d tensor", THTensor_(nDimension)(tensor)); + + int dimension = 1; + THArgCheck(dimension >= 0 && dimension < THTensor_(nDimension)(tensor), 2, "invalid dimension %d", + dimension + TH_INDEX_BASE); + + real minval; + real maxval; + real *h_data; + + THTensor_(resize2d)(hist, tensor->size[0], nbins); + 
THTensor_(zero)(hist); + + minval = minvalue; + maxval = maxvalue; + if (minval == maxval) + { + minval = THTensor_(minall)(tensor); + maxval = THTensor_(maxall)(tensor); + } + if (minval == maxval) + { + minval = minval - 1; + maxval = maxval + 1; + } + + TH_TENSOR_DIM_APPLY2(real, tensor, real, hist, dimension, long i; + for(i = 0; i < tensor_size; i++) + { + if(tensor_data[i*tensor_stride] >= minval && tensor_data[i*tensor_stride] <= maxval) { + const int bin = (int)((tensor_data[i*tensor_stride]-minval) / (maxval-minval) * nbins); + hist_data[THMin(bin, nbins-1)] += 1; + } + } + ); +} + #endif /* floating point only part */ +#undef IS_NONZERO #endif diff --git a/lib/TH/generic/THTensorMath.h b/lib/TH/generic/THTensorMath.h index 87f1616..c656dfd 100644 --- a/lib/TH/generic/THTensorMath.h +++ b/lib/TH/generic/THTensorMath.h @@ -2,8 +2,6 @@ #define TH_GENERIC_FILE "generic/THTensorMath.h" #else - - TH_API void THTensor_(fill)(THTensor *r_, real value); TH_API void THTensor_(zero)(THTensor *r_); @@ -163,6 +161,7 @@ TH_API void THTensor_(norm)(THTensor *r_, THTensor *t, real value, int dimension TH_API void THTensor_(renorm)(THTensor *r_, THTensor *t, real value, int dimension, real maxnorm); TH_API accreal THTensor_(dist)(THTensor *a, THTensor *b, real value); TH_API void THTensor_(histc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue); +TH_API void THTensor_(bhistc)(THTensor *hist, THTensor *tensor, long nbins, real minvalue, real maxvalue); TH_API accreal THTensor_(meanall)(THTensor *self); TH_API accreal THTensor_(varall)(THTensor *self); @@ -173,7 +172,6 @@ TH_API void THTensor_(linspace)(THTensor *r_, real a, real b, long n); TH_API void THTensor_(logspace)(THTensor *r_, real a, real b, long n); TH_API void THTensor_(rand)(THTensor *r_, THGenerator *_generator, THLongStorage *size); TH_API void THTensor_(randn)(THTensor *r_, THGenerator *_generator, THLongStorage *size); - #endif #if defined(TH_REAL_IS_BYTE) diff --git 
a/lib/TH/generic/THVectorDispatch.c b/lib/TH/generic/THVectorDispatch.c index 6fd1d68..2436a12 100644 --- a/lib/TH/generic/THVectorDispatch.c +++ b/lib/TH/generic/THVectorDispatch.c @@ -20,6 +20,12 @@ static FunctionDescription THVector_(fill_DISPATCHTABLE)[] = { #endif #endif + #if defined(__PPC64__) + #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) + FUNCTION_IMPL(THVector_(fill_VSX), SIMDExtension_VSX), + #endif + #endif + #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \ || defined(USE_SSE4_1) || defined(USE_SSE4_2) #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) @@ -32,7 +38,6 @@ void THVector_(fill)(real *x, const real c, const ptrdiff_t n) { THVector_(fill_DISPATCHPTR)(x, c, n); } - static void (*THVector_(add_DISPATCHPTR))(real *, const real *, const real, const ptrdiff_t) = &THVector_(add_DEFAULT); static FunctionDescription THVector_(add_DISPATCHTABLE)[] = { #if defined(__NEON__) @@ -41,6 +46,12 @@ static FunctionDescription THVector_(add_DISPATCHTABLE)[] = { #endif #endif + #if defined(__PPC64__) + #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) + FUNCTION_IMPL(THVector_(add_VSX), SIMDExtension_VSX), + #endif + #endif + #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \ || defined(USE_SSE4_1) || defined(USE_SSE4_2) #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) @@ -63,6 +74,12 @@ static FunctionDescription THVector_(diff_DISPATCHTABLE)[] = { #endif #endif + #if defined(__PPC64__) + #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) + FUNCTION_IMPL(THVector_(diff_VSX), SIMDExtension_VSX), + #endif + #endif + #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \ || defined(USE_SSE4_1) || defined(USE_SSE4_2) #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) @@ -85,6 +102,12 @@ static FunctionDescription THVector_(scale_DISPATCHTABLE)[] = { #endif #endif + #if defined(__PPC64__) + #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) + 
FUNCTION_IMPL(THVector_(scale_VSX), SIMDExtension_VSX), + #endif + #endif + #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \ || defined(USE_SSE4_1) || defined(USE_SSE4_2) #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) @@ -107,6 +130,12 @@ static FunctionDescription THVector_(mul_DISPATCHTABLE)[] = { #endif #endif + #if defined(__PPC64__) + #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) + FUNCTION_IMPL(THVector_(mul_VSX), SIMDExtension_VSX), + #endif + #endif + #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \ || defined(USE_SSE4_1) || defined(USE_SSE4_2) #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT) diff --git a/lib/TH/generic/simd/simd.h b/lib/TH/generic/simd/simd.h index b059258..ee53e2c 100644 --- a/lib/TH/generic/simd/simd.h +++ b/lib/TH/generic/simd/simd.h @@ -2,8 +2,10 @@ #define TH_SIMD_INC #include <stdint.h> -#ifdef _MSC_VER +#if defined(_MSC_VER) #include <intrin.h> +#elif defined(HAVE_GCC_GET_CPUID) +#include <cpuid.h> #endif // Can be found on Intel ISA Reference for CPUID @@ -40,6 +42,8 @@ enum SIMDExtensions { #if defined(__NEON__) SIMDExtension_NEON = 0x1, +#elif defined(__PPC64__) + SIMDExtension_VSX = 0x1, #else SIMDExtension_AVX2 = 0x1, SIMDExtension_AVX = 0x2, @@ -48,26 +52,57 @@ enum SIMDExtensions SIMDExtension_DEFAULT = 0x0 }; -#if defined(__NEON__) + +#if defined(__arm__) + + #if defined(__NEON__) static inline uint32_t detectHostSIMDExtensions() { return SIMDExtension_NEON; } -#else // x86 + #else //ARM without NEON + +static inline uint32_t detectHostSIMDExtensions() +{ + return SIMDExtension_DEFAULT; +} + + #endif + +#elif defined(__PPC64__) + + #if defined(__VSX__) + +static inline uint32_t detectHostSIMDExtensions() +{ + return SIMDExtension_VSX; +} -#if defined(__arm__) //ARM without NEON + #else static inline uint32_t detectHostSIMDExtensions() { return SIMDExtension_DEFAULT; } + + #endif -#else +#else // x86 static inline void cpuid(uint32_t *eax, uint32_t 
*ebx, uint32_t *ecx, uint32_t *edx) { -#ifndef _MSC_VER +#if defined(_MSC_VER) + uint32_t cpuInfo[4]; + __cpuid(cpuInfo, *eax); + *eax = cpuInfo[0]; + *ebx = cpuInfo[1]; + *ecx = cpuInfo[2]; + *edx = cpuInfo[3]; +#elif defined(HAVE_GCC_GET_CPUID) + uint32_t level = *eax; + __get_cpuid (level, eax, ebx, ecx, edx); +#else uint32_t a = *eax, b, c, d; asm volatile ( "cpuid\n\t" : "+a"(a), "=b"(b), "=c"(c), "=d"(d) ); @@ -75,13 +110,6 @@ static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t * *ebx = b; *ecx = c; *edx = d; -#else - uint32_t cpuInfo[4]; - __cpuid(cpuInfo, *eax); - *eax = cpuInfo[0]; - *ebx = cpuInfo[1]; - *ecx = cpuInfo[2]; - *edx = cpuInfo[3]; #endif } @@ -106,7 +134,6 @@ static inline uint32_t detectHostSIMDExtensions() return hostSimdExts; } -#endif // end x86 SIMD extension detection code -#endif // end __arm__ +#endif // end SIMD extension detection code #endif diff --git a/lib/TH/vector/VSX.c b/lib/TH/vector/VSX.c new file mode 100644 index 0000000..14f14a7 --- /dev/null +++ b/lib/TH/vector/VSX.c @@ -0,0 +1,1915 @@ +#ifdef __PPC64__ + +#include <altivec.h> +#include <stddef.h> + + +//-------------------------------------------------------------------------------------------------- +// THDoubleVector_fill_VSX was tested on Power8: +// +// Unrolling 128 elements is 20% faster than unrolling 64 elements. +// Unrolling 64 elements is faster than unrolling any lesser number of elements. 
+//-------------------------------------------------------------------------------------------------- +static void THDoubleVector_fill_VSX(double *x, const double c, const ptrdiff_t n) +{ + ptrdiff_t i; + + double val[2] = {c, c}; + vector double fp64vec2 = vec_xl(0, val); + + for (i = 0; i <= n-128; i += 128) + { + vec_xst(fp64vec2, 0, x+(i )); + vec_xst(fp64vec2, 0, x+(i+2 )); + vec_xst(fp64vec2, 0, x+(i+4 )); + vec_xst(fp64vec2, 0, x+(i+6 )); + vec_xst(fp64vec2, 0, x+(i+8 )); + vec_xst(fp64vec2, 0, x+(i+10 )); + vec_xst(fp64vec2, 0, x+(i+12 )); + vec_xst(fp64vec2, 0, x+(i+14 )); + vec_xst(fp64vec2, 0, x+(i+16 )); + vec_xst(fp64vec2, 0, x+(i+18 )); + vec_xst(fp64vec2, 0, x+(i+20 )); + vec_xst(fp64vec2, 0, x+(i+22 )); + vec_xst(fp64vec2, 0, x+(i+24 )); + vec_xst(fp64vec2, 0, x+(i+26 )); + vec_xst(fp64vec2, 0, x+(i+28 )); + vec_xst(fp64vec2, 0, x+(i+30 )); + vec_xst(fp64vec2, 0, x+(i+32 )); + vec_xst(fp64vec2, 0, x+(i+34 )); + vec_xst(fp64vec2, 0, x+(i+36 )); + vec_xst(fp64vec2, 0, x+(i+38 )); + vec_xst(fp64vec2, 0, x+(i+40 )); + vec_xst(fp64vec2, 0, x+(i+42 )); + vec_xst(fp64vec2, 0, x+(i+44 )); + vec_xst(fp64vec2, 0, x+(i+46 )); + vec_xst(fp64vec2, 0, x+(i+48 )); + vec_xst(fp64vec2, 0, x+(i+50 )); + vec_xst(fp64vec2, 0, x+(i+52 )); + vec_xst(fp64vec2, 0, x+(i+54 )); + vec_xst(fp64vec2, 0, x+(i+56 )); + vec_xst(fp64vec2, 0, x+(i+58 )); + vec_xst(fp64vec2, 0, x+(i+60 )); + vec_xst(fp64vec2, 0, x+(i+62 )); + vec_xst(fp64vec2, 0, x+(i+64 )); + vec_xst(fp64vec2, 0, x+(i+66 )); + vec_xst(fp64vec2, 0, x+(i+68 )); + vec_xst(fp64vec2, 0, x+(i+70 )); + vec_xst(fp64vec2, 0, x+(i+72 )); + vec_xst(fp64vec2, 0, x+(i+74 )); + vec_xst(fp64vec2, 0, x+(i+76 )); + vec_xst(fp64vec2, 0, x+(i+78 )); + vec_xst(fp64vec2, 0, x+(i+80 )); + vec_xst(fp64vec2, 0, x+(i+82 )); + vec_xst(fp64vec2, 0, x+(i+84 )); + vec_xst(fp64vec2, 0, x+(i+86 )); + vec_xst(fp64vec2, 0, x+(i+88 )); + vec_xst(fp64vec2, 0, x+(i+90 )); + vec_xst(fp64vec2, 0, x+(i+92 )); + vec_xst(fp64vec2, 0, x+(i+94 )); + 
vec_xst(fp64vec2, 0, x+(i+96 )); + vec_xst(fp64vec2, 0, x+(i+98 )); + vec_xst(fp64vec2, 0, x+(i+100)); + vec_xst(fp64vec2, 0, x+(i+102)); + vec_xst(fp64vec2, 0, x+(i+104)); + vec_xst(fp64vec2, 0, x+(i+106)); + vec_xst(fp64vec2, 0, x+(i+108)); + vec_xst(fp64vec2, 0, x+(i+110)); + vec_xst(fp64vec2, 0, x+(i+112)); + vec_xst(fp64vec2, 0, x+(i+114)); + vec_xst(fp64vec2, 0, x+(i+116)); + vec_xst(fp64vec2, 0, x+(i+118)); + vec_xst(fp64vec2, 0, x+(i+120)); + vec_xst(fp64vec2, 0, x+(i+122)); + vec_xst(fp64vec2, 0, x+(i+124)); + vec_xst(fp64vec2, 0, x+(i+126)); + } + for (; i <= n-16; i += 16) + { + vec_xst(fp64vec2, 0, x+(i )); + vec_xst(fp64vec2, 0, x+(i+2 )); + vec_xst(fp64vec2, 0, x+(i+4 )); + vec_xst(fp64vec2, 0, x+(i+6 )); + vec_xst(fp64vec2, 0, x+(i+8 )); + vec_xst(fp64vec2, 0, x+(i+10 )); + vec_xst(fp64vec2, 0, x+(i+12 )); + vec_xst(fp64vec2, 0, x+(i+14 )); + } + for (; i <= n-2; i += 2) + vec_xst(fp64vec2, 0, x+(i )); + for (; i < n; i++) + x[i] = c; +} + + +//-------------------------------------------------------------------------------------------------- +// THDoubleVector_add_VSX was tested on Power8: +// +// Max speedup achieved when unrolling 24 elements. +// When unrolling 32 elements, the performance was the same as for 24. +// When unrolling 16 elements, performance was not as good as for 24. +// Unrolling 24 elements was 43% faster than unrolling 4 elements (2.8 sec vs 4.0 sec). +// Unrolling 24 elements was about 8% faster than unrolling 16 elements (2.8 sec vs 3.0 sec). 
+//-------------------------------------------------------------------------------------------------- +static void THDoubleVector_add_VSX(double *y, const double *x, const double c, const ptrdiff_t n) +{ + ptrdiff_t i; + vector double c_fp64vec2; + vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; + vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2; + vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2; + vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2; + + double val[2] = {c, c}; + c_fp64vec2 = vec_xl(0, val); + + for (i = 0; i <= n-24; i += 24) + { + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + x4_fp64vec2 = vec_xl(0, x+(i+8 )); + x5_fp64vec2 = vec_xl(0, x+(i+10)); + x6_fp64vec2 = vec_xl(0, x+(i+12)); + x7_fp64vec2 = vec_xl(0, x+(i+14)); + x8_fp64vec2 = vec_xl(0, x+(i+16)); + x9_fp64vec2 = vec_xl(0, x+(i+18)); + x10_fp64vec2 = vec_xl(0, x+(i+20)); + x11_fp64vec2 = vec_xl(0, x+(i+22)); + + y0_fp64vec2 = vec_xl(0, y+(i )); + y1_fp64vec2 = vec_xl(0, y+(i+2 )); + y2_fp64vec2 = vec_xl(0, y+(i+4 )); + y3_fp64vec2 = vec_xl(0, y+(i+6 )); + y4_fp64vec2 = vec_xl(0, y+(i+8 )); + y5_fp64vec2 = vec_xl(0, y+(i+10)); + y6_fp64vec2 = vec_xl(0, y+(i+12)); + y7_fp64vec2 = vec_xl(0, y+(i+14)); + y8_fp64vec2 = vec_xl(0, y+(i+16)); + y9_fp64vec2 = vec_xl(0, y+(i+18)); + y10_fp64vec2 = vec_xl(0, y+(i+20)); + y11_fp64vec2 = vec_xl(0, y+(i+22)); + + y0_fp64vec2 = vec_madd(c_fp64vec2, x0_fp64vec2, y0_fp64vec2 ); + y1_fp64vec2 = vec_madd(c_fp64vec2, x1_fp64vec2, y1_fp64vec2 ); + y2_fp64vec2 = vec_madd(c_fp64vec2, x2_fp64vec2, y2_fp64vec2 ); + y3_fp64vec2 = vec_madd(c_fp64vec2, x3_fp64vec2, y3_fp64vec2 ); + y4_fp64vec2 = vec_madd(c_fp64vec2, x4_fp64vec2, y4_fp64vec2 ); + y5_fp64vec2 = vec_madd(c_fp64vec2, x5_fp64vec2, 
y5_fp64vec2 ); + y6_fp64vec2 = vec_madd(c_fp64vec2, x6_fp64vec2, y6_fp64vec2 ); + y7_fp64vec2 = vec_madd(c_fp64vec2, x7_fp64vec2, y7_fp64vec2 ); + y8_fp64vec2 = vec_madd(c_fp64vec2, x8_fp64vec2, y8_fp64vec2 ); + y9_fp64vec2 = vec_madd(c_fp64vec2, x9_fp64vec2, y9_fp64vec2 ); + y10_fp64vec2 = vec_madd(c_fp64vec2, x10_fp64vec2, y10_fp64vec2); + y11_fp64vec2 = vec_madd(c_fp64vec2, x11_fp64vec2, y11_fp64vec2); + + vec_xst(y0_fp64vec2, 0, y+(i )); + vec_xst(y1_fp64vec2, 0, y+(i+2 )); + vec_xst(y2_fp64vec2, 0, y+(i+4 )); + vec_xst(y3_fp64vec2, 0, y+(i+6 )); + vec_xst(y4_fp64vec2, 0, y+(i+8 )); + vec_xst(y5_fp64vec2, 0, y+(i+10)); + vec_xst(y6_fp64vec2, 0, y+(i+12)); + vec_xst(y7_fp64vec2, 0, y+(i+14)); + vec_xst(y8_fp64vec2, 0, y+(i+16)); + vec_xst(y9_fp64vec2, 0, y+(i+18)); + vec_xst(y10_fp64vec2, 0, y+(i+20)); + vec_xst(y11_fp64vec2, 0, y+(i+22)); + } + for (; i <= n-8; i += 8) + { + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + + y0_fp64vec2 = vec_xl(0, y+(i )); + y1_fp64vec2 = vec_xl(0, y+(i+2 )); + y2_fp64vec2 = vec_xl(0, y+(i+4 )); + y3_fp64vec2 = vec_xl(0, y+(i+6 )); + + y0_fp64vec2 = vec_madd(c_fp64vec2, x0_fp64vec2, y0_fp64vec2 ); + y1_fp64vec2 = vec_madd(c_fp64vec2, x1_fp64vec2, y1_fp64vec2 ); + y2_fp64vec2 = vec_madd(c_fp64vec2, x2_fp64vec2, y2_fp64vec2 ); + y3_fp64vec2 = vec_madd(c_fp64vec2, x3_fp64vec2, y3_fp64vec2 ); + + vec_xst(y0_fp64vec2, 0, y+(i )); + vec_xst(y1_fp64vec2, 0, y+(i+2 )); + vec_xst(y2_fp64vec2, 0, y+(i+4 )); + vec_xst(y3_fp64vec2, 0, y+(i+6 )); + } + for (; i <= n-2; i += 2) + { + x0_fp64vec2 = vec_xl(0, x+(i )); + y0_fp64vec2 = vec_xl(0, y+(i )); + y0_fp64vec2 = vec_madd(c_fp64vec2, x0_fp64vec2, y0_fp64vec2 ); + vec_xst(y0_fp64vec2, 0, y+(i )); + } + for (; i < n; i++) + y[i] = (c * x[i]) + y[i]; +} + + +static void THDoubleVector_diff_VSX(double *z, const double *x, const double *y, const ptrdiff_t n) { + ptrdiff_t i; + + vector double 
xz0_fp64vec2, xz1_fp64vec2, xz2_fp64vec2, xz3_fp64vec2, xz4_fp64vec2, xz5_fp64vec2, xz6_fp64vec2, xz7_fp64vec2; + vector double xz8_fp64vec2, xz9_fp64vec2, xz10_fp64vec2, xz11_fp64vec2; + vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; + vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2; + + for (i = 0; i <= n-24; i += 24) + { + xz0_fp64vec2 = vec_xl(0, x+(i )); + xz1_fp64vec2 = vec_xl(0, x+(i+2 )); + xz2_fp64vec2 = vec_xl(0, x+(i+4 )); + xz3_fp64vec2 = vec_xl(0, x+(i+6 )); + xz4_fp64vec2 = vec_xl(0, x+(i+8 )); + xz5_fp64vec2 = vec_xl(0, x+(i+10)); + xz6_fp64vec2 = vec_xl(0, x+(i+12)); + xz7_fp64vec2 = vec_xl(0, x+(i+14)); + xz8_fp64vec2 = vec_xl(0, x+(i+16)); + xz9_fp64vec2 = vec_xl(0, x+(i+18)); + xz10_fp64vec2 = vec_xl(0, x+(i+20)); + xz11_fp64vec2 = vec_xl(0, x+(i+22)); + + y0_fp64vec2 = vec_xl(0, y+(i )); + y1_fp64vec2 = vec_xl(0, y+(i+2 )); + y2_fp64vec2 = vec_xl(0, y+(i+4 )); + y3_fp64vec2 = vec_xl(0, y+(i+6 )); + y4_fp64vec2 = vec_xl(0, y+(i+8 )); + y5_fp64vec2 = vec_xl(0, y+(i+10)); + y6_fp64vec2 = vec_xl(0, y+(i+12)); + y7_fp64vec2 = vec_xl(0, y+(i+14)); + y8_fp64vec2 = vec_xl(0, y+(i+16)); + y9_fp64vec2 = vec_xl(0, y+(i+18)); + y10_fp64vec2 = vec_xl(0, y+(i+20)); + y11_fp64vec2 = vec_xl(0, y+(i+22)); + + xz0_fp64vec2 = vec_sub(xz0_fp64vec2, y0_fp64vec2 ); + xz1_fp64vec2 = vec_sub(xz1_fp64vec2, y1_fp64vec2 ); + xz2_fp64vec2 = vec_sub(xz2_fp64vec2, y2_fp64vec2 ); + xz3_fp64vec2 = vec_sub(xz3_fp64vec2, y3_fp64vec2 ); + xz4_fp64vec2 = vec_sub(xz4_fp64vec2, y4_fp64vec2 ); + xz5_fp64vec2 = vec_sub(xz5_fp64vec2, y5_fp64vec2 ); + xz6_fp64vec2 = vec_sub(xz6_fp64vec2, y6_fp64vec2 ); + xz7_fp64vec2 = vec_sub(xz7_fp64vec2, y7_fp64vec2 ); + xz8_fp64vec2 = vec_sub(xz8_fp64vec2, y8_fp64vec2 ); + xz9_fp64vec2 = vec_sub(xz9_fp64vec2, y9_fp64vec2 ); + xz10_fp64vec2 = vec_sub(xz10_fp64vec2, y10_fp64vec2); + xz11_fp64vec2 = vec_sub(xz11_fp64vec2, y11_fp64vec2); + + vec_xst(xz0_fp64vec2, 
0, z+(i )); + vec_xst(xz1_fp64vec2, 0, z+(i+2 )); + vec_xst(xz2_fp64vec2, 0, z+(i+4 )); + vec_xst(xz3_fp64vec2, 0, z+(i+6 )); + vec_xst(xz4_fp64vec2, 0, z+(i+8 )); + vec_xst(xz5_fp64vec2, 0, z+(i+10)); + vec_xst(xz6_fp64vec2, 0, z+(i+12)); + vec_xst(xz7_fp64vec2, 0, z+(i+14)); + vec_xst(xz8_fp64vec2, 0, z+(i+16)); + vec_xst(xz9_fp64vec2, 0, z+(i+18)); + vec_xst(xz10_fp64vec2, 0, z+(i+20)); + vec_xst(xz11_fp64vec2, 0, z+(i+22)); + } + for (; i <= n-8; i += 8) + { + xz0_fp64vec2 = vec_xl(0, x+(i )); + xz1_fp64vec2 = vec_xl(0, x+(i+2 )); + xz2_fp64vec2 = vec_xl(0, x+(i+4 )); + xz3_fp64vec2 = vec_xl(0, x+(i+6 )); + + y0_fp64vec2 = vec_xl(0, y+(i )); + y1_fp64vec2 = vec_xl(0, y+(i+2 )); + y2_fp64vec2 = vec_xl(0, y+(i+4 )); + y3_fp64vec2 = vec_xl(0, y+(i+6 )); + + xz0_fp64vec2 = vec_sub(xz0_fp64vec2, y0_fp64vec2 ); + xz1_fp64vec2 = vec_sub(xz1_fp64vec2, y1_fp64vec2 ); + xz2_fp64vec2 = vec_sub(xz2_fp64vec2, y2_fp64vec2 ); + xz3_fp64vec2 = vec_sub(xz3_fp64vec2, y3_fp64vec2 ); + + vec_xst(xz0_fp64vec2, 0, z+(i )); + vec_xst(xz1_fp64vec2, 0, z+(i+2 )); + vec_xst(xz2_fp64vec2, 0, z+(i+4 )); + vec_xst(xz3_fp64vec2, 0, z+(i+6 )); + } + for (; i <= n-2; i += 2) + { + xz0_fp64vec2 = vec_xl(0, x+(i )); + y0_fp64vec2 = vec_xl(0, y+(i )); + xz0_fp64vec2 = vec_sub(xz0_fp64vec2, y0_fp64vec2 ); + vec_xst(xz0_fp64vec2, 0, z+(i )); + } + for (; i < n; i++) + z[i] = x[i] - y[i]; +} + + +static void THDoubleVector_scale_VSX(double *y, const double c, const ptrdiff_t n) +{ + ptrdiff_t i; + + vector double c_fp64vec2; + double val[2] = {c, c}; + c_fp64vec2 = vec_xl(0, val); + + vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; + vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2, y12_fp64vec2, y13_fp64vec2, y14_fp64vec2, y15_fp64vec2; + + for (i = 0; i <= n-32; i += 32) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + y1_fp64vec2 = vec_xl(0, y+(i+2 )); + y2_fp64vec2 = vec_xl(0, y+(i+4 )); + y3_fp64vec2 = vec_xl(0, 
y+(i+6 )); + y4_fp64vec2 = vec_xl(0, y+(i+8 )); + y5_fp64vec2 = vec_xl(0, y+(i+10)); + y6_fp64vec2 = vec_xl(0, y+(i+12)); + y7_fp64vec2 = vec_xl(0, y+(i+14)); + y8_fp64vec2 = vec_xl(0, y+(i+16)); + y9_fp64vec2 = vec_xl(0, y+(i+18)); + y10_fp64vec2 = vec_xl(0, y+(i+20)); + y11_fp64vec2 = vec_xl(0, y+(i+22)); + y12_fp64vec2 = vec_xl(0, y+(i+24)); + y13_fp64vec2 = vec_xl(0, y+(i+26)); + y14_fp64vec2 = vec_xl(0, y+(i+28)); + y15_fp64vec2 = vec_xl(0, y+(i+30)); + + y0_fp64vec2 = vec_mul(y0_fp64vec2, c_fp64vec2); + y1_fp64vec2 = vec_mul(y1_fp64vec2, c_fp64vec2); + y2_fp64vec2 = vec_mul(y2_fp64vec2, c_fp64vec2); + y3_fp64vec2 = vec_mul(y3_fp64vec2, c_fp64vec2); + y4_fp64vec2 = vec_mul(y4_fp64vec2, c_fp64vec2); + y5_fp64vec2 = vec_mul(y5_fp64vec2, c_fp64vec2); + y6_fp64vec2 = vec_mul(y6_fp64vec2, c_fp64vec2); + y7_fp64vec2 = vec_mul(y7_fp64vec2, c_fp64vec2); + y8_fp64vec2 = vec_mul(y8_fp64vec2, c_fp64vec2); + y9_fp64vec2 = vec_mul(y9_fp64vec2, c_fp64vec2); + y10_fp64vec2 = vec_mul(y10_fp64vec2, c_fp64vec2); + y11_fp64vec2 = vec_mul(y11_fp64vec2, c_fp64vec2); + y12_fp64vec2 = vec_mul(y12_fp64vec2, c_fp64vec2); + y13_fp64vec2 = vec_mul(y13_fp64vec2, c_fp64vec2); + y14_fp64vec2 = vec_mul(y14_fp64vec2, c_fp64vec2); + y15_fp64vec2 = vec_mul(y15_fp64vec2, c_fp64vec2); + + vec_xst(y0_fp64vec2, 0, y+(i )); + vec_xst(y1_fp64vec2, 0, y+(i+2 )); + vec_xst(y2_fp64vec2, 0, y+(i+4 )); + vec_xst(y3_fp64vec2, 0, y+(i+6 )); + vec_xst(y4_fp64vec2, 0, y+(i+8 )); + vec_xst(y5_fp64vec2, 0, y+(i+10)); + vec_xst(y6_fp64vec2, 0, y+(i+12)); + vec_xst(y7_fp64vec2, 0, y+(i+14)); + vec_xst(y8_fp64vec2, 0, y+(i+16)); + vec_xst(y9_fp64vec2, 0, y+(i+18)); + vec_xst(y10_fp64vec2, 0, y+(i+20)); + vec_xst(y11_fp64vec2, 0, y+(i+22)); + vec_xst(y12_fp64vec2, 0, y+(i+24)); + vec_xst(y13_fp64vec2, 0, y+(i+26)); + vec_xst(y14_fp64vec2, 0, y+(i+28)); + vec_xst(y15_fp64vec2, 0, y+(i+30)); + } + for (; i <= n-8; i += 8) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + y1_fp64vec2 = vec_xl(0, y+(i+2 )); + y2_fp64vec2 = 
vec_xl(0, y+(i+4 )); + y3_fp64vec2 = vec_xl(0, y+(i+6 )); + + y0_fp64vec2 = vec_mul(y0_fp64vec2, c_fp64vec2); + y1_fp64vec2 = vec_mul(y1_fp64vec2, c_fp64vec2); + y2_fp64vec2 = vec_mul(y2_fp64vec2, c_fp64vec2); + y3_fp64vec2 = vec_mul(y3_fp64vec2, c_fp64vec2); + + vec_xst(y0_fp64vec2, 0, y+(i )); + vec_xst(y1_fp64vec2, 0, y+(i+2 )); + vec_xst(y2_fp64vec2, 0, y+(i+4 )); + vec_xst(y3_fp64vec2, 0, y+(i+6 )); + } + for (; i <= n-2; i += 2) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + y0_fp64vec2 = vec_mul(y0_fp64vec2, c_fp64vec2); + vec_xst(y0_fp64vec2, 0, y+(i )); + } + for (; i < n; i++) + y[i] = y[i] * c; +} + + +static void THDoubleVector_mul_VSX(double *y, const double *x, const ptrdiff_t n) +{ + ptrdiff_t i; + + vector double y0_fp64vec2, y1_fp64vec2, y2_fp64vec2, y3_fp64vec2, y4_fp64vec2, y5_fp64vec2, y6_fp64vec2, y7_fp64vec2; + vector double y8_fp64vec2, y9_fp64vec2, y10_fp64vec2, y11_fp64vec2; + vector double x0_fp64vec2, x1_fp64vec2, x2_fp64vec2, x3_fp64vec2, x4_fp64vec2, x5_fp64vec2, x6_fp64vec2, x7_fp64vec2; + vector double x8_fp64vec2, x9_fp64vec2, x10_fp64vec2, x11_fp64vec2; + + + for (i = 0; i <= n-24; i += 24) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + y1_fp64vec2 = vec_xl(0, y+(i+2 )); + y2_fp64vec2 = vec_xl(0, y+(i+4 )); + y3_fp64vec2 = vec_xl(0, y+(i+6 )); + y4_fp64vec2 = vec_xl(0, y+(i+8 )); + y5_fp64vec2 = vec_xl(0, y+(i+10)); + y6_fp64vec2 = vec_xl(0, y+(i+12)); + y7_fp64vec2 = vec_xl(0, y+(i+14)); + y8_fp64vec2 = vec_xl(0, y+(i+16)); + y9_fp64vec2 = vec_xl(0, y+(i+18)); + y10_fp64vec2 = vec_xl(0, y+(i+20)); + y11_fp64vec2 = vec_xl(0, y+(i+22)); + + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + x4_fp64vec2 = vec_xl(0, x+(i+8 )); + x5_fp64vec2 = vec_xl(0, x+(i+10)); + x6_fp64vec2 = vec_xl(0, x+(i+12)); + x7_fp64vec2 = vec_xl(0, x+(i+14)); + x8_fp64vec2 = vec_xl(0, x+(i+16)); + x9_fp64vec2 = vec_xl(0, x+(i+18)); + x10_fp64vec2 = vec_xl(0, x+(i+20)); + 
x11_fp64vec2 = vec_xl(0, x+(i+22)); + + y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2); + y1_fp64vec2 = vec_mul(y1_fp64vec2, x1_fp64vec2); + y2_fp64vec2 = vec_mul(y2_fp64vec2, x2_fp64vec2); + y3_fp64vec2 = vec_mul(y3_fp64vec2, x3_fp64vec2); + y4_fp64vec2 = vec_mul(y4_fp64vec2, x4_fp64vec2); + y5_fp64vec2 = vec_mul(y5_fp64vec2, x5_fp64vec2); + y6_fp64vec2 = vec_mul(y6_fp64vec2, x6_fp64vec2); + y7_fp64vec2 = vec_mul(y7_fp64vec2, x7_fp64vec2); + y8_fp64vec2 = vec_mul(y8_fp64vec2, x8_fp64vec2); + y9_fp64vec2 = vec_mul(y9_fp64vec2, x9_fp64vec2); + y10_fp64vec2 = vec_mul(y10_fp64vec2, x10_fp64vec2); + y11_fp64vec2 = vec_mul(y11_fp64vec2, x11_fp64vec2); + + vec_xst(y0_fp64vec2, 0, y+(i )); + vec_xst(y1_fp64vec2, 0, y+(i+2 )); + vec_xst(y2_fp64vec2, 0, y+(i+4 )); + vec_xst(y3_fp64vec2, 0, y+(i+6 )); + vec_xst(y4_fp64vec2, 0, y+(i+8 )); + vec_xst(y5_fp64vec2, 0, y+(i+10)); + vec_xst(y6_fp64vec2, 0, y+(i+12)); + vec_xst(y7_fp64vec2, 0, y+(i+14)); + vec_xst(y8_fp64vec2, 0, y+(i+16)); + vec_xst(y9_fp64vec2, 0, y+(i+18)); + vec_xst(y10_fp64vec2, 0, y+(i+20)); + vec_xst(y11_fp64vec2, 0, y+(i+22)); + } + for (; i <= n-8; i += 8) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + y1_fp64vec2 = vec_xl(0, y+(i+2 )); + y2_fp64vec2 = vec_xl(0, y+(i+4 )); + y3_fp64vec2 = vec_xl(0, y+(i+6 )); + + x0_fp64vec2 = vec_xl(0, x+(i )); + x1_fp64vec2 = vec_xl(0, x+(i+2 )); + x2_fp64vec2 = vec_xl(0, x+(i+4 )); + x3_fp64vec2 = vec_xl(0, x+(i+6 )); + + y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2); + y1_fp64vec2 = vec_mul(y1_fp64vec2, x1_fp64vec2); + y2_fp64vec2 = vec_mul(y2_fp64vec2, x2_fp64vec2); + y3_fp64vec2 = vec_mul(y3_fp64vec2, x3_fp64vec2); + + vec_xst(y0_fp64vec2, 0, y+(i )); + vec_xst(y1_fp64vec2, 0, y+(i+2 )); + vec_xst(y2_fp64vec2, 0, y+(i+4 )); + vec_xst(y3_fp64vec2, 0, y+(i+6 )); + } + for (; i <= n-2; i += 2) + { + y0_fp64vec2 = vec_xl(0, y+(i )); + x0_fp64vec2 = vec_xl(0, x+(i )); + y0_fp64vec2 = vec_mul(y0_fp64vec2, x0_fp64vec2); + vec_xst(y0_fp64vec2, 0, y+(i )); + } + for (; i < n; 
i++) + y[i] = y[i] * x[i]; +} + + + + + + + +static void THFloatVector_fill_VSX(float *x, const float c, const ptrdiff_t n) +{ + ptrdiff_t i; + + float val[4] = {c, c, c, c}; + vector float fp32vec4 = vec_xl(0, val); + + for (i = 0; i <= n-256; i += 256) + { + vec_xst(fp32vec4, 0, x+(i )); + vec_xst(fp32vec4, 0, x+(i+4 )); + vec_xst(fp32vec4, 0, x+(i+8 )); + vec_xst(fp32vec4, 0, x+(i+12 )); + vec_xst(fp32vec4, 0, x+(i+16 )); + vec_xst(fp32vec4, 0, x+(i+20 )); + vec_xst(fp32vec4, 0, x+(i+24 )); + vec_xst(fp32vec4, 0, x+(i+28 )); + vec_xst(fp32vec4, 0, x+(i+32 )); + vec_xst(fp32vec4, 0, x+(i+36 )); + vec_xst(fp32vec4, 0, x+(i+40 )); + vec_xst(fp32vec4, 0, x+(i+44 )); + vec_xst(fp32vec4, 0, x+(i+48 )); + vec_xst(fp32vec4, 0, x+(i+52 )); + vec_xst(fp32vec4, 0, x+(i+56 )); + vec_xst(fp32vec4, 0, x+(i+60 )); + vec_xst(fp32vec4, 0, x+(i+64 )); + vec_xst(fp32vec4, 0, x+(i+68 )); + vec_xst(fp32vec4, 0, x+(i+72 )); + vec_xst(fp32vec4, 0, x+(i+76 )); + vec_xst(fp32vec4, 0, x+(i+80 )); + vec_xst(fp32vec4, 0, x+(i+84 )); + vec_xst(fp32vec4, 0, x+(i+88 )); + vec_xst(fp32vec4, 0, x+(i+92 )); + vec_xst(fp32vec4, 0, x+(i+96 )); + vec_xst(fp32vec4, 0, x+(i+100)); + vec_xst(fp32vec4, 0, x+(i+104)); + vec_xst(fp32vec4, 0, x+(i+108)); + vec_xst(fp32vec4, 0, x+(i+112)); + vec_xst(fp32vec4, 0, x+(i+116)); + vec_xst(fp32vec4, 0, x+(i+120)); + vec_xst(fp32vec4, 0, x+(i+124)); + vec_xst(fp32vec4, 0, x+(i+128)); + vec_xst(fp32vec4, 0, x+(i+132)); + vec_xst(fp32vec4, 0, x+(i+136)); + vec_xst(fp32vec4, 0, x+(i+140)); + vec_xst(fp32vec4, 0, x+(i+144)); + vec_xst(fp32vec4, 0, x+(i+148)); + vec_xst(fp32vec4, 0, x+(i+152)); + vec_xst(fp32vec4, 0, x+(i+156)); + vec_xst(fp32vec4, 0, x+(i+160)); + vec_xst(fp32vec4, 0, x+(i+164)); + vec_xst(fp32vec4, 0, x+(i+168)); + vec_xst(fp32vec4, 0, x+(i+172)); + vec_xst(fp32vec4, 0, x+(i+176)); + vec_xst(fp32vec4, 0, x+(i+180)); + vec_xst(fp32vec4, 0, x+(i+184)); + vec_xst(fp32vec4, 0, x+(i+188)); + vec_xst(fp32vec4, 0, x+(i+192)); + vec_xst(fp32vec4, 0, 
x+(i+196)); + vec_xst(fp32vec4, 0, x+(i+200)); + vec_xst(fp32vec4, 0, x+(i+204)); + vec_xst(fp32vec4, 0, x+(i+208)); + vec_xst(fp32vec4, 0, x+(i+212)); + vec_xst(fp32vec4, 0, x+(i+216)); + vec_xst(fp32vec4, 0, x+(i+220)); + vec_xst(fp32vec4, 0, x+(i+224)); + vec_xst(fp32vec4, 0, x+(i+228)); + vec_xst(fp32vec4, 0, x+(i+232)); + vec_xst(fp32vec4, 0, x+(i+236)); + vec_xst(fp32vec4, 0, x+(i+240)); + vec_xst(fp32vec4, 0, x+(i+244)); + vec_xst(fp32vec4, 0, x+(i+248)); + vec_xst(fp32vec4, 0, x+(i+252)); + } + for (; i <= n-32; i += 32) + { + vec_xst(fp32vec4, 0, x+(i )); + vec_xst(fp32vec4, 0, x+(i+4 )); + vec_xst(fp32vec4, 0, x+(i+8 )); + vec_xst(fp32vec4, 0, x+(i+12 )); + vec_xst(fp32vec4, 0, x+(i+16 )); + vec_xst(fp32vec4, 0, x+(i+20 )); + vec_xst(fp32vec4, 0, x+(i+24 )); + vec_xst(fp32vec4, 0, x+(i+28 )); + } + for (; i <= n-4; i += 4) + vec_xst(fp32vec4, 0, x+(i )); + for (; i < n; i++) + x[i] = c; +} + + +static void THFloatVector_add_VSX(float *y, const float *x, const float c, const ptrdiff_t n) +{ + ptrdiff_t i; + vector float c_fp32vec4; + vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4; + vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4; + vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4; + vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4; + + float val[4] = {c, c, c, c}; + c_fp32vec4 = vec_xl(0, val); + + for (i = 0; i <= n-48; i += 48) + { + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12)); + x4_fp32vec4 = vec_xl(0, x+(i+16)); + x5_fp32vec4 = vec_xl(0, x+(i+20)); + x6_fp32vec4 = vec_xl(0, x+(i+24)); + x7_fp32vec4 = vec_xl(0, x+(i+28)); + x8_fp32vec4 = vec_xl(0, x+(i+32)); + x9_fp32vec4 = vec_xl(0, x+(i+36)); + x10_fp32vec4 = vec_xl(0, x+(i+40)); + x11_fp32vec4 = vec_xl(0, x+(i+44)); + + y0_fp32vec4 = 
vec_xl(0, y+(i )); + y1_fp32vec4 = vec_xl(0, y+(i+4 )); + y2_fp32vec4 = vec_xl(0, y+(i+8 )); + y3_fp32vec4 = vec_xl(0, y+(i+12)); + y4_fp32vec4 = vec_xl(0, y+(i+16)); + y5_fp32vec4 = vec_xl(0, y+(i+20)); + y6_fp32vec4 = vec_xl(0, y+(i+24)); + y7_fp32vec4 = vec_xl(0, y+(i+28)); + y8_fp32vec4 = vec_xl(0, y+(i+32)); + y9_fp32vec4 = vec_xl(0, y+(i+36)); + y10_fp32vec4 = vec_xl(0, y+(i+40)); + y11_fp32vec4 = vec_xl(0, y+(i+44)); + + y0_fp32vec4 = vec_madd(c_fp32vec4, x0_fp32vec4, y0_fp32vec4 ); + y1_fp32vec4 = vec_madd(c_fp32vec4, x1_fp32vec4, y1_fp32vec4 ); + y2_fp32vec4 = vec_madd(c_fp32vec4, x2_fp32vec4, y2_fp32vec4 ); + y3_fp32vec4 = vec_madd(c_fp32vec4, x3_fp32vec4, y3_fp32vec4 ); + y4_fp32vec4 = vec_madd(c_fp32vec4, x4_fp32vec4, y4_fp32vec4 ); + y5_fp32vec4 = vec_madd(c_fp32vec4, x5_fp32vec4, y5_fp32vec4 ); + y6_fp32vec4 = vec_madd(c_fp32vec4, x6_fp32vec4, y6_fp32vec4 ); + y7_fp32vec4 = vec_madd(c_fp32vec4, x7_fp32vec4, y7_fp32vec4 ); + y8_fp32vec4 = vec_madd(c_fp32vec4, x8_fp32vec4, y8_fp32vec4 ); + y9_fp32vec4 = vec_madd(c_fp32vec4, x9_fp32vec4, y9_fp32vec4 ); + y10_fp32vec4 = vec_madd(c_fp32vec4, x10_fp32vec4, y10_fp32vec4); + y11_fp32vec4 = vec_madd(c_fp32vec4, x11_fp32vec4, y11_fp32vec4); + + vec_xst(y0_fp32vec4, 0, y+(i )); + vec_xst(y1_fp32vec4, 0, y+(i+4 )); + vec_xst(y2_fp32vec4, 0, y+(i+8 )); + vec_xst(y3_fp32vec4, 0, y+(i+12)); + vec_xst(y4_fp32vec4, 0, y+(i+16)); + vec_xst(y5_fp32vec4, 0, y+(i+20)); + vec_xst(y6_fp32vec4, 0, y+(i+24)); + vec_xst(y7_fp32vec4, 0, y+(i+28)); + vec_xst(y8_fp32vec4, 0, y+(i+32)); + vec_xst(y9_fp32vec4, 0, y+(i+36)); + vec_xst(y10_fp32vec4, 0, y+(i+40)); + vec_xst(y11_fp32vec4, 0, y+(i+44)); + } + for (; i <= n-16; i += 16) + { + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12)); + + y0_fp32vec4 = vec_xl(0, y+(i )); + y1_fp32vec4 = vec_xl(0, y+(i+4 )); + y2_fp32vec4 = vec_xl(0, y+(i+8 )); + y3_fp32vec4 = vec_xl(0, y+(i+12)); + + 
y0_fp32vec4 = vec_madd(c_fp32vec4, x0_fp32vec4, y0_fp32vec4 ); + y1_fp32vec4 = vec_madd(c_fp32vec4, x1_fp32vec4, y1_fp32vec4 ); + y2_fp32vec4 = vec_madd(c_fp32vec4, x2_fp32vec4, y2_fp32vec4 ); + y3_fp32vec4 = vec_madd(c_fp32vec4, x3_fp32vec4, y3_fp32vec4 ); + + vec_xst(y0_fp32vec4, 0, y+(i )); + vec_xst(y1_fp32vec4, 0, y+(i+4 )); + vec_xst(y2_fp32vec4, 0, y+(i+8 )); + vec_xst(y3_fp32vec4, 0, y+(i+12)); + } + for (; i <= n-4; i += 4) + { + x0_fp32vec4 = vec_xl(0, x+(i )); + y0_fp32vec4 = vec_xl(0, y+(i )); + y0_fp32vec4 = vec_madd(c_fp32vec4, x0_fp32vec4, y0_fp32vec4 ); + vec_xst(y0_fp32vec4, 0, y+(i )); + } + for (; i < n; i++) + y[i] = (c * x[i]) + y[i]; +} + + + + +static void THFloatVector_diff_VSX(float *z, const float *x, const float *y, const ptrdiff_t n) { + ptrdiff_t i; + + vector float xz0_fp32vec4, xz1_fp32vec4, xz2_fp32vec4, xz3_fp32vec4, xz4_fp32vec4, xz5_fp32vec4, xz6_fp32vec4, xz7_fp32vec4; + vector float xz8_fp32vec4, xz9_fp32vec4, xz10_fp32vec4, xz11_fp32vec4; + vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4; + vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4; + + for (i = 0; i <= n-48; i += 48) + { + xz0_fp32vec4 = vec_xl(0, x+(i )); + xz1_fp32vec4 = vec_xl(0, x+(i+4 )); + xz2_fp32vec4 = vec_xl(0, x+(i+8 )); + xz3_fp32vec4 = vec_xl(0, x+(i+12)); + xz4_fp32vec4 = vec_xl(0, x+(i+16)); + xz5_fp32vec4 = vec_xl(0, x+(i+20)); + xz6_fp32vec4 = vec_xl(0, x+(i+24)); + xz7_fp32vec4 = vec_xl(0, x+(i+28)); + xz8_fp32vec4 = vec_xl(0, x+(i+32)); + xz9_fp32vec4 = vec_xl(0, x+(i+36)); + xz10_fp32vec4 = vec_xl(0, x+(i+40)); + xz11_fp32vec4 = vec_xl(0, x+(i+44)); + + y0_fp32vec4 = vec_xl(0, y+(i )); + y1_fp32vec4 = vec_xl(0, y+(i+4 )); + y2_fp32vec4 = vec_xl(0, y+(i+8 )); + y3_fp32vec4 = vec_xl(0, y+(i+12)); + y4_fp32vec4 = vec_xl(0, y+(i+16)); + y5_fp32vec4 = vec_xl(0, y+(i+20)); + y6_fp32vec4 = vec_xl(0, y+(i+24)); + y7_fp32vec4 = vec_xl(0, y+(i+28)); + y8_fp32vec4 = vec_xl(0, 
y+(i+32)); + y9_fp32vec4 = vec_xl(0, y+(i+36)); + y10_fp32vec4 = vec_xl(0, y+(i+40)); + y11_fp32vec4 = vec_xl(0, y+(i+44)); + + xz0_fp32vec4 = vec_sub(xz0_fp32vec4, y0_fp32vec4 ); + xz1_fp32vec4 = vec_sub(xz1_fp32vec4, y1_fp32vec4 ); + xz2_fp32vec4 = vec_sub(xz2_fp32vec4, y2_fp32vec4 ); + xz3_fp32vec4 = vec_sub(xz3_fp32vec4, y3_fp32vec4 ); + xz4_fp32vec4 = vec_sub(xz4_fp32vec4, y4_fp32vec4 ); + xz5_fp32vec4 = vec_sub(xz5_fp32vec4, y5_fp32vec4 ); + xz6_fp32vec4 = vec_sub(xz6_fp32vec4, y6_fp32vec4 ); + xz7_fp32vec4 = vec_sub(xz7_fp32vec4, y7_fp32vec4 ); + xz8_fp32vec4 = vec_sub(xz8_fp32vec4, y8_fp32vec4 ); + xz9_fp32vec4 = vec_sub(xz9_fp32vec4, y9_fp32vec4 ); + xz10_fp32vec4 = vec_sub(xz10_fp32vec4, y10_fp32vec4); + xz11_fp32vec4 = vec_sub(xz11_fp32vec4, y11_fp32vec4); + + vec_xst(xz0_fp32vec4, 0, z+(i )); + vec_xst(xz1_fp32vec4, 0, z+(i+4 )); + vec_xst(xz2_fp32vec4, 0, z+(i+8 )); + vec_xst(xz3_fp32vec4, 0, z+(i+12)); + vec_xst(xz4_fp32vec4, 0, z+(i+16)); + vec_xst(xz5_fp32vec4, 0, z+(i+20)); + vec_xst(xz6_fp32vec4, 0, z+(i+24)); + vec_xst(xz7_fp32vec4, 0, z+(i+28)); + vec_xst(xz8_fp32vec4, 0, z+(i+32)); + vec_xst(xz9_fp32vec4, 0, z+(i+36)); + vec_xst(xz10_fp32vec4, 0, z+(i+40)); + vec_xst(xz11_fp32vec4, 0, z+(i+44)); + } + for (; i <= n-16; i += 16) + { + xz0_fp32vec4 = vec_xl(0, x+(i )); + xz1_fp32vec4 = vec_xl(0, x+(i+4 )); + xz2_fp32vec4 = vec_xl(0, x+(i+8 )); + xz3_fp32vec4 = vec_xl(0, x+(i+12)); + + y0_fp32vec4 = vec_xl(0, y+(i )); + y1_fp32vec4 = vec_xl(0, y+(i+4 )); + y2_fp32vec4 = vec_xl(0, y+(i+8 )); + y3_fp32vec4 = vec_xl(0, y+(i+12)); + + xz0_fp32vec4 = vec_sub(xz0_fp32vec4, y0_fp32vec4 ); + xz1_fp32vec4 = vec_sub(xz1_fp32vec4, y1_fp32vec4 ); + xz2_fp32vec4 = vec_sub(xz2_fp32vec4, y2_fp32vec4 ); + xz3_fp32vec4 = vec_sub(xz3_fp32vec4, y3_fp32vec4 ); + + vec_xst(xz0_fp32vec4, 0, z+(i )); + vec_xst(xz1_fp32vec4, 0, z+(i+4 )); + vec_xst(xz2_fp32vec4, 0, z+(i+8 )); + vec_xst(xz3_fp32vec4, 0, z+(i+12)); + } + for (; i <= n-4; i += 4) + { + xz0_fp32vec4 = 
vec_xl(0, x+(i )); + y0_fp32vec4 = vec_xl(0, y+(i )); + xz0_fp32vec4 = vec_sub(xz0_fp32vec4, y0_fp32vec4 ); + vec_xst(xz0_fp32vec4, 0, z+(i )); + } + for (; i < n; i++) + z[i] = x[i] - y[i]; +} + + +static void THFloatVector_scale_VSX(float *y, const float c, const ptrdiff_t n) +{ + ptrdiff_t i; + + vector float c_fp32vec4; + float val[4] = {c, c, c, c}; + c_fp32vec4 = vec_xl(0, val); + + vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4; + vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4, y12_fp32vec4, y13_fp32vec4, y14_fp32vec4, y15_fp32vec4; + + for (i = 0; i <= n-64; i += 64) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + y1_fp32vec4 = vec_xl(0, y+(i+4 )); + y2_fp32vec4 = vec_xl(0, y+(i+8 )); + y3_fp32vec4 = vec_xl(0, y+(i+12)); + y4_fp32vec4 = vec_xl(0, y+(i+16)); + y5_fp32vec4 = vec_xl(0, y+(i+20)); + y6_fp32vec4 = vec_xl(0, y+(i+24)); + y7_fp32vec4 = vec_xl(0, y+(i+28)); + y8_fp32vec4 = vec_xl(0, y+(i+32)); + y9_fp32vec4 = vec_xl(0, y+(i+36)); + y10_fp32vec4 = vec_xl(0, y+(i+40)); + y11_fp32vec4 = vec_xl(0, y+(i+44)); + y12_fp32vec4 = vec_xl(0, y+(i+48)); + y13_fp32vec4 = vec_xl(0, y+(i+52)); + y14_fp32vec4 = vec_xl(0, y+(i+56)); + y15_fp32vec4 = vec_xl(0, y+(i+60)); + + y0_fp32vec4 = vec_mul(y0_fp32vec4, c_fp32vec4); + y1_fp32vec4 = vec_mul(y1_fp32vec4, c_fp32vec4); + y2_fp32vec4 = vec_mul(y2_fp32vec4, c_fp32vec4); + y3_fp32vec4 = vec_mul(y3_fp32vec4, c_fp32vec4); + y4_fp32vec4 = vec_mul(y4_fp32vec4, c_fp32vec4); + y5_fp32vec4 = vec_mul(y5_fp32vec4, c_fp32vec4); + y6_fp32vec4 = vec_mul(y6_fp32vec4, c_fp32vec4); + y7_fp32vec4 = vec_mul(y7_fp32vec4, c_fp32vec4); + y8_fp32vec4 = vec_mul(y8_fp32vec4, c_fp32vec4); + y9_fp32vec4 = vec_mul(y9_fp32vec4, c_fp32vec4); + y10_fp32vec4 = vec_mul(y10_fp32vec4, c_fp32vec4); + y11_fp32vec4 = vec_mul(y11_fp32vec4, c_fp32vec4); + y12_fp32vec4 = vec_mul(y12_fp32vec4, c_fp32vec4); + y13_fp32vec4 = vec_mul(y13_fp32vec4, c_fp32vec4); + y14_fp32vec4 = 
vec_mul(y14_fp32vec4, c_fp32vec4); + y15_fp32vec4 = vec_mul(y15_fp32vec4, c_fp32vec4); + + vec_xst(y0_fp32vec4, 0, y+(i )); + vec_xst(y1_fp32vec4, 0, y+(i+4 )); + vec_xst(y2_fp32vec4, 0, y+(i+8 )); + vec_xst(y3_fp32vec4, 0, y+(i+12)); + vec_xst(y4_fp32vec4, 0, y+(i+16)); + vec_xst(y5_fp32vec4, 0, y+(i+20)); + vec_xst(y6_fp32vec4, 0, y+(i+24)); + vec_xst(y7_fp32vec4, 0, y+(i+28)); + vec_xst(y8_fp32vec4, 0, y+(i+32)); + vec_xst(y9_fp32vec4, 0, y+(i+36)); + vec_xst(y10_fp32vec4, 0, y+(i+40)); + vec_xst(y11_fp32vec4, 0, y+(i+44)); + vec_xst(y12_fp32vec4, 0, y+(i+48)); + vec_xst(y13_fp32vec4, 0, y+(i+52)); + vec_xst(y14_fp32vec4, 0, y+(i+56)); + vec_xst(y15_fp32vec4, 0, y+(i+60)); + } + for (; i <= n-16; i += 16) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + y1_fp32vec4 = vec_xl(0, y+(i+4 )); + y2_fp32vec4 = vec_xl(0, y+(i+8 )); + y3_fp32vec4 = vec_xl(0, y+(i+12)); + + y0_fp32vec4 = vec_mul(y0_fp32vec4, c_fp32vec4); + y1_fp32vec4 = vec_mul(y1_fp32vec4, c_fp32vec4); + y2_fp32vec4 = vec_mul(y2_fp32vec4, c_fp32vec4); + y3_fp32vec4 = vec_mul(y3_fp32vec4, c_fp32vec4); + + vec_xst(y0_fp32vec4, 0, y+(i )); + vec_xst(y1_fp32vec4, 0, y+(i+4 )); + vec_xst(y2_fp32vec4, 0, y+(i+8 )); + vec_xst(y3_fp32vec4, 0, y+(i+12)); + } + for (; i <= n-4; i += 4) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + y0_fp32vec4 = vec_mul(y0_fp32vec4, c_fp32vec4); + vec_xst(y0_fp32vec4, 0, y+(i )); + } + for (; i < n; i++) + y[i] = y[i] * c; +} + + + +static void THFloatVector_mul_VSX(float *y, const float *x, const ptrdiff_t n) +{ + ptrdiff_t i; + + vector float y0_fp32vec4, y1_fp32vec4, y2_fp32vec4, y3_fp32vec4, y4_fp32vec4, y5_fp32vec4, y6_fp32vec4, y7_fp32vec4; + vector float y8_fp32vec4, y9_fp32vec4, y10_fp32vec4, y11_fp32vec4; + vector float x0_fp32vec4, x1_fp32vec4, x2_fp32vec4, x3_fp32vec4, x4_fp32vec4, x5_fp32vec4, x6_fp32vec4, x7_fp32vec4; + vector float x8_fp32vec4, x9_fp32vec4, x10_fp32vec4, x11_fp32vec4; + + + for (i = 0; i <= n-48; i += 48) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + y1_fp32vec4 = 
vec_xl(0, y+(i+4 )); + y2_fp32vec4 = vec_xl(0, y+(i+8 )); + y3_fp32vec4 = vec_xl(0, y+(i+12)); + y4_fp32vec4 = vec_xl(0, y+(i+16)); + y5_fp32vec4 = vec_xl(0, y+(i+20)); + y6_fp32vec4 = vec_xl(0, y+(i+24)); + y7_fp32vec4 = vec_xl(0, y+(i+28)); + y8_fp32vec4 = vec_xl(0, y+(i+32)); + y9_fp32vec4 = vec_xl(0, y+(i+36)); + y10_fp32vec4 = vec_xl(0, y+(i+40)); + y11_fp32vec4 = vec_xl(0, y+(i+44)); + + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12)); + x4_fp32vec4 = vec_xl(0, x+(i+16)); + x5_fp32vec4 = vec_xl(0, x+(i+20)); + x6_fp32vec4 = vec_xl(0, x+(i+24)); + x7_fp32vec4 = vec_xl(0, x+(i+28)); + x8_fp32vec4 = vec_xl(0, x+(i+32)); + x9_fp32vec4 = vec_xl(0, x+(i+36)); + x10_fp32vec4 = vec_xl(0, x+(i+40)); + x11_fp32vec4 = vec_xl(0, x+(i+44)); + + y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4); + y1_fp32vec4 = vec_mul(y1_fp32vec4, x1_fp32vec4); + y2_fp32vec4 = vec_mul(y2_fp32vec4, x2_fp32vec4); + y3_fp32vec4 = vec_mul(y3_fp32vec4, x3_fp32vec4); + y4_fp32vec4 = vec_mul(y4_fp32vec4, x4_fp32vec4); + y5_fp32vec4 = vec_mul(y5_fp32vec4, x5_fp32vec4); + y6_fp32vec4 = vec_mul(y6_fp32vec4, x6_fp32vec4); + y7_fp32vec4 = vec_mul(y7_fp32vec4, x7_fp32vec4); + y8_fp32vec4 = vec_mul(y8_fp32vec4, x8_fp32vec4); + y9_fp32vec4 = vec_mul(y9_fp32vec4, x9_fp32vec4); + y10_fp32vec4 = vec_mul(y10_fp32vec4, x10_fp32vec4); + y11_fp32vec4 = vec_mul(y11_fp32vec4, x11_fp32vec4); + + vec_xst(y0_fp32vec4, 0, y+(i )); + vec_xst(y1_fp32vec4, 0, y+(i+4 )); + vec_xst(y2_fp32vec4, 0, y+(i+8 )); + vec_xst(y3_fp32vec4, 0, y+(i+12)); + vec_xst(y4_fp32vec4, 0, y+(i+16)); + vec_xst(y5_fp32vec4, 0, y+(i+20)); + vec_xst(y6_fp32vec4, 0, y+(i+24)); + vec_xst(y7_fp32vec4, 0, y+(i+28)); + vec_xst(y8_fp32vec4, 0, y+(i+32)); + vec_xst(y9_fp32vec4, 0, y+(i+36)); + vec_xst(y10_fp32vec4, 0, y+(i+40)); + vec_xst(y11_fp32vec4, 0, y+(i+44)); + } + for (; i <= n-16; i += 16) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + y1_fp32vec4 = 
vec_xl(0, y+(i+4 )); + y2_fp32vec4 = vec_xl(0, y+(i+8 )); + y3_fp32vec4 = vec_xl(0, y+(i+12)); + + x0_fp32vec4 = vec_xl(0, x+(i )); + x1_fp32vec4 = vec_xl(0, x+(i+4 )); + x2_fp32vec4 = vec_xl(0, x+(i+8 )); + x3_fp32vec4 = vec_xl(0, x+(i+12)); + + y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4); + y1_fp32vec4 = vec_mul(y1_fp32vec4, x1_fp32vec4); + y2_fp32vec4 = vec_mul(y2_fp32vec4, x2_fp32vec4); + y3_fp32vec4 = vec_mul(y3_fp32vec4, x3_fp32vec4); + + vec_xst(y0_fp32vec4, 0, y+(i )); + vec_xst(y1_fp32vec4, 0, y+(i+4 )); + vec_xst(y2_fp32vec4, 0, y+(i+8 )); + vec_xst(y3_fp32vec4, 0, y+(i+12)); + } + for (; i <= n-4; i += 4) + { + y0_fp32vec4 = vec_xl(0, y+(i )); + x0_fp32vec4 = vec_xl(0, x+(i )); + y0_fp32vec4 = vec_mul(y0_fp32vec4, x0_fp32vec4); + vec_xst(y0_fp32vec4, 0, y+(i )); + } + for (; i < n; i++) + y[i] = y[i] * x[i]; +} + + + + + +//------------------------------------------------ +// +// Testing for correctness and performance +// +// If you want to run these tests, compile this +// file with -DRUN_VSX_TESTS on a Power machine, +// and then run the executable that is generated. +// +//------------------------------------------------ +// +// Example passing run (from a Power8 machine): +// +// $ gcc VSX.c -O2 -D RUN_VSX_TESTS -o vsxtest +// $ ./vsxtest +// +// standardDouble_fill() test took 0.34604 seconds +// THDoubleVector_fill_VSX() test took 0.15663 seconds +// All assertions PASSED for THDoubleVector_fill_VSX() test. +// +// standardFloat_fill() test took 0.32901 seconds +// THFloatVector_fill_VSX() test took 0.07830 seconds +// All assertions PASSED for THFloatVector_fill_VSX() test. +// +// standardDouble_add() test took 0.51602 seconds +// THDoubleVector_add_VSX() test took 0.31384 seconds +// All assertions PASSED for THDoubleVector_add_VSX() test. +// +// standardFloat_add() test took 0.39845 seconds +// THFloatVector_add_VSX() test took 0.14544 seconds +// All assertions PASSED for THFloatVector_add_VSX() test. 
//
// standardDouble_diff() test took 0.48219 seconds
// THDoubleVector_diff_VSX() test took 0.31708 seconds
// All assertions PASSED for THDoubleVector_diff_VSX() test.
//
// standardFloat_diff() test took 0.60340 seconds
// THFloatVector_diff_VSX() test took 0.17083 seconds
// All assertions PASSED for THFloatVector_diff_VSX() test.
//
// standardDouble_scale() test took 0.33157 seconds
// THDoubleVector_scale_VSX() test took 0.19075 seconds
// All assertions PASSED for THDoubleVector_scale_VSX() test.
//
// standardFloat_scale() test took 0.33008 seconds
// THFloatVector_scale_VSX() test took 0.09741 seconds
// All assertions PASSED for THFloatVector_scale_VSX() test.
//
// standardDouble_mul() test took 0.50986 seconds
// THDoubleVector_mul_VSX() test took 0.30939 seconds
// All assertions PASSED for THDoubleVector_mul_VSX() test.
//
// standardFloat_mul() test took 0.40241 seconds
// THFloatVector_mul_VSX() test took 0.14346 seconds
// All assertions PASSED for THFloatVector_mul_VSX() test.
//
// Finished running all tests. All tests PASSED.
+// +//------------------------------------------------ +#ifdef RUN_VSX_TESTS + +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <assert.h> +#include <math.h> + +#define VSX_PERF_NUM_TEST_ELEMENTS 100000000 +#define VSX_FUNC_NUM_TEST_ELEMENTS 2507 + +void test_THDoubleVector_fill_VSX(); + +static void standardDouble_fill(double *x, const double c, const ptrdiff_t n) +{ + for (ptrdiff_t i = 0; i < n; i++) + x[i] = c; +} + +static void standardFloat_fill(float *x, const float c, const ptrdiff_t n) +{ + for (ptrdiff_t i = 0; i < n; i++) + x[i] = c; +} + +static void standardDouble_add(double *y, const double *x, const double c, const ptrdiff_t n) +{ + for (ptrdiff_t i = 0; i < n; i++) + y[i] += c * x[i]; +} + +static void standardFloat_add(float *y, const float *x, const float c, const ptrdiff_t n) +{ + for (ptrdiff_t i = 0; i < n; i++) + y[i] += c * x[i]; +} + +static void standardDouble_diff(double *z, const double *x, const double *y, const ptrdiff_t n) +{ + for (ptrdiff_t i = 0; i < n; i++) + z[i] = x[i] - y[i]; +} + +static void standardFloat_diff(float *z, const float *x, const float *y, const ptrdiff_t n) +{ + for (ptrdiff_t i = 0; i < n; i++) + z[i] = x[i] - y[i]; +} + +static void standardDouble_scale(double *y, const double c, const ptrdiff_t n) +{ + for (ptrdiff_t i = 0; i < n; i++) + y[i] *= c; +} + +static void standardFloat_scale(float *y, const float c, const ptrdiff_t n) +{ + for (ptrdiff_t i = 0; i < n; i++) + y[i] *= c; +} + +static void standardDouble_mul(double *y, const double *x, const ptrdiff_t n) +{ + for (ptrdiff_t i = 0; i < n; i++) + y[i] *= x[i]; +} + +static void standardFloat_mul(float *y, const float *x, const ptrdiff_t n) +{ + for (ptrdiff_t i = 0; i < n; i++) + y[i] *= x[i]; +} + +double randDouble() +{ + return (double)(rand()%100)/(double)(rand()%100) * (rand()%2 ? -1.0 : 1.0); +} + +int near(double a, double b) +{ + int aClass = fpclassify(a); + int bClass = fpclassify(b); + + if(aClass != bClass) // i.e. 
is it NAN, infinite, or finite...? + return 0; + + if(aClass == FP_INFINITE) // if it is infinite, the sign must be the same, i.e. positive infinity is not near negative infinity + return (signbit(a) == signbit(b)); + else if(aClass == FP_NORMAL) // if it is a normal number then check the magnitude of the difference between the numbers + return fabs(a - b) < 0.001; + else // if both number are of the same class as each other and are of any other class (i.e. such as NAN), then they are near to each other. + return 1; +} + +void test_THDoubleVector_fill_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + double *x_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *x_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + + double yVal0 = 17.2; + double yVal1 = 8.2; + double yVal2 = 5.1; + double yVal3 = -0.9; + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardDouble_fill(x_standard, yVal0, VSX_PERF_NUM_TEST_ELEMENTS ); + standardDouble_fill(x_standard, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardDouble_fill(x_standard, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardDouble_fill(x_standard, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardDouble_fill() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THDoubleVector_fill_VSX(x_optimized, yVal0, VSX_PERF_NUM_TEST_ELEMENTS ); + THDoubleVector_fill_VSX(x_optimized, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1); + THDoubleVector_fill_VSX(x_optimized, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2); + THDoubleVector_fill_VSX(x_optimized, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THDoubleVector_fill_VSX() test took %.5lf 
seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + yVal0 += 1.0; + yVal1 += 1.0; + yVal2 += 1.0; + yVal3 -= 1.0; + + standardDouble_fill( x_standard, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS); + THDoubleVector_fill_VSX(x_optimized, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + assert(x_optimized[i] == yVal0); + + standardDouble_fill( x_standard+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THDoubleVector_fill_VSX(x_optimized+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardDouble_fill( x_standard+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THDoubleVector_fill_VSX(x_optimized+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardDouble_fill( x_standard+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THDoubleVector_fill_VSX(x_optimized+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardDouble_fill( x_standard+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THDoubleVector_fill_VSX(x_optimized+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardDouble_fill( x_standard+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THDoubleVector_fill_VSX(x_optimized+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + assert(x_optimized[i] == x_standard[i]); + printf("All assertions PASSED for THDoubleVector_fill_VSX() test.\n\n"); + + + free(x_standard); + free(x_optimized); +} + + +void test_THFloatVector_fill_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + float *x_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *x_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + + float yVal0 = 17.2; + float yVal1 = 8.2; + float yVal2 = 5.1; + float yVal3 = -0.9; + + //------------------------------------------------- + // Performance Test + 
//------------------------------------------------- + start = clock(); + standardFloat_fill(x_standard, yVal0, VSX_PERF_NUM_TEST_ELEMENTS ); + standardFloat_fill(x_standard, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardFloat_fill(x_standard, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardFloat_fill(x_standard, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardFloat_fill() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THFloatVector_fill_VSX(x_optimized, yVal0, VSX_PERF_NUM_TEST_ELEMENTS ); + THFloatVector_fill_VSX(x_optimized, yVal1, VSX_PERF_NUM_TEST_ELEMENTS-1); + THFloatVector_fill_VSX(x_optimized, yVal2, VSX_PERF_NUM_TEST_ELEMENTS-2); + THFloatVector_fill_VSX(x_optimized, yVal3, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THFloatVector_fill_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + yVal0 += 1.0; + yVal1 += 1.0; + yVal2 += 1.0; + yVal3 -= 1.0; + + standardFloat_fill( x_standard, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS); + THFloatVector_fill_VSX(x_optimized, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + assert(x_optimized[i] == yVal0); + + standardFloat_fill( x_standard+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THFloatVector_fill_VSX(x_optimized+1, yVal1, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardFloat_fill( x_standard+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THFloatVector_fill_VSX(x_optimized+2, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardFloat_fill( x_standard+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THFloatVector_fill_VSX(x_optimized+3, yVal3, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardFloat_fill( x_standard+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + 
THFloatVector_fill_VSX(x_optimized+517, yVal0, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardFloat_fill( x_standard+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THFloatVector_fill_VSX(x_optimized+517+r, yVal2, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + assert(x_optimized[i] == x_standard[i]); + printf("All assertions PASSED for THFloatVector_fill_VSX() test.\n\n"); + + + free(x_standard); + free(x_optimized); +} + +void test_THDoubleVector_add_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double c = (double)randDouble(); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = randDouble(); + double yVal = randDouble(); + y_standard[i] = yVal; + y_optimized[i] = yVal; + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardDouble_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + standardDouble_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardDouble_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardDouble_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardDouble_add() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THDoubleVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + THDoubleVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + THDoubleVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + THDoubleVector_add_VSX(y_optimized, x, c, 
VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THDoubleVector_add_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardDouble_add( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THDoubleVector_add_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardDouble_add( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THDoubleVector_add_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardDouble_add( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THDoubleVector_add_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardDouble_add( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THDoubleVector_add_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardDouble_add( y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THDoubleVector_add_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(y_optimized[i], y_standard[i])) + printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); + assert(near(y_optimized[i], y_standard[i])); + } + printf("All assertions PASSED for THDoubleVector_add_VSX() test.\n\n"); + + + free(y_standard); + free(y_optimized); + free(x); +} + + +void test_THFloatVector_add_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float c = (float)randDouble(); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = 
(float)randDouble(); + float yVal = (float)randDouble(); + y_standard[i] = yVal; + y_optimized[i] = yVal; + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardFloat_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + standardFloat_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardFloat_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardFloat_add(y_standard, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardFloat_add() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THFloatVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS ); + THFloatVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + THFloatVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + THFloatVector_add_VSX(y_optimized, x, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THFloatVector_add_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardFloat_add( y_standard+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THFloatVector_add_VSX(y_optimized+1, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardFloat_add( y_standard+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THFloatVector_add_VSX(y_optimized+2, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardFloat_add( y_standard+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THFloatVector_add_VSX(y_optimized+3, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardFloat_add( y_standard+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THFloatVector_add_VSX(y_optimized+517, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardFloat_add( 
y_standard+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THFloatVector_add_VSX(y_optimized+517+r, x, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(y_optimized[i], y_standard[i])) + printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); + assert(near(y_optimized[i], y_standard[i])); + } + printf("All assertions PASSED for THFloatVector_add_VSX() test.\n\n"); + + + free(y_standard); + free(y_optimized); + free(x); +} + +void test_THDoubleVector_diff_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + double *z_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *z_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *y = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = randDouble(); + y[i] = randDouble(); + double zVal = randDouble(); + z_standard[i] = zVal; + z_optimized[i] = zVal; + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardDouble_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS ); + standardDouble_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardDouble_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardDouble_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardDouble_diff() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THDoubleVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS ); + THDoubleVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-1); + THDoubleVector_diff_VSX(z_optimized, y, x, 
VSX_PERF_NUM_TEST_ELEMENTS-2); + THDoubleVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THDoubleVector_diff_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardDouble_diff( z_standard+1, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THDoubleVector_diff_VSX(z_optimized+1, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardDouble_diff( z_standard+2, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THDoubleVector_diff_VSX(z_optimized+2, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardDouble_diff( z_standard+3, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THDoubleVector_diff_VSX(z_optimized+3, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardDouble_diff( z_standard+517, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THDoubleVector_diff_VSX(z_optimized+517, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardDouble_diff( z_standard+517+r, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THDoubleVector_diff_VSX(z_optimized+517+r, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(z_optimized[i], z_standard[i])) + printf("%d %f %f\n", i, z_optimized[i], z_standard[i]); + assert(near(z_optimized[i], z_standard[i])); + } + printf("All assertions PASSED for THDoubleVector_diff_VSX() test.\n\n"); + + + free(z_standard); + free(z_optimized); + free(y); + free(x); +} + + +void test_THFloatVector_diff_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + float *z_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *z_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *y = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *x = (float 
*)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = (float)randDouble(); + y[i] = (float)randDouble(); + float zVal = (float)randDouble(); + z_standard[i] = zVal; + z_optimized[i] = zVal; + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardFloat_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS ); + standardFloat_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardFloat_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardFloat_diff(z_standard, y, x, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardFloat_diff() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THFloatVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS ); + THFloatVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-1); + THFloatVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-2); + THFloatVector_diff_VSX(z_optimized, y, x, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THFloatVector_diff_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardFloat_diff( z_standard+1, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THFloatVector_diff_VSX(z_optimized+1, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardFloat_diff( z_standard+2, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THFloatVector_diff_VSX(z_optimized+2, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardFloat_diff( z_standard+3, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THFloatVector_diff_VSX(z_optimized+3, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardFloat_diff( 
z_standard+517, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THFloatVector_diff_VSX(z_optimized+517, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardFloat_diff( z_standard+517+r, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THFloatVector_diff_VSX(z_optimized+517+r, y, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(z_optimized[i], z_standard[i])) + printf("%d %f %f\n", i, z_optimized[i], z_standard[i]); + assert(near(z_optimized[i], z_standard[i])); + } + printf("All assertions PASSED for THFloatVector_diff_VSX() test.\n\n"); + + + free(z_standard); + free(z_optimized); + free(y); + free(x); +} + + +void test_THDoubleVector_scale_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double c = randDouble(); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + double yVal = randDouble(); + y_standard[i] = yVal; + y_optimized[i] = yVal; + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardDouble_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS ); + standardDouble_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardDouble_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardDouble_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardDouble_scale() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THDoubleVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS ); + THDoubleVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + THDoubleVector_scale_VSX(y_optimized, 
c, VSX_PERF_NUM_TEST_ELEMENTS-2); + THDoubleVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THDoubleVector_scale_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardDouble_scale( y_standard+1, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THDoubleVector_scale_VSX(y_optimized+1, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardDouble_scale( y_standard+2, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THDoubleVector_scale_VSX(y_optimized+2, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardDouble_scale( y_standard+3, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THDoubleVector_scale_VSX(y_optimized+3, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardDouble_scale( y_standard+517, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THDoubleVector_scale_VSX(y_optimized+517, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardDouble_scale( y_standard+517+r, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THDoubleVector_scale_VSX(y_optimized+517+r, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(y_optimized[i], y_standard[i])) + printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); + assert(near(y_optimized[i], y_standard[i])); + } + printf("All assertions PASSED for THDoubleVector_scale_VSX() test.\n\n"); + + + free(y_standard); + free(y_optimized); +} + + +void test_THFloatVector_scale_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float c = (float)randDouble(); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + float yVal = 
(float)randDouble(); + y_standard[i] = yVal; + y_optimized[i] = yVal; + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardFloat_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS ); + standardFloat_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardFloat_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardFloat_scale(y_standard, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardFloat_scale() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THFloatVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS ); + THFloatVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-1); + THFloatVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-2); + THFloatVector_scale_VSX(y_optimized, c, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THFloatVector_scale_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardFloat_scale( y_standard+1, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THFloatVector_scale_VSX(y_optimized+1, c, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardFloat_scale( y_standard+2, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THFloatVector_scale_VSX(y_optimized+2, c, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardFloat_scale( y_standard+3, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THFloatVector_scale_VSX(y_optimized+3, c, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardFloat_scale( y_standard+517, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THFloatVector_scale_VSX(y_optimized+517, c, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardFloat_scale( y_standard+517+r, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); 
+ THFloatVector_scale_VSX(y_optimized+517+r, c, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(y_optimized[i], y_standard[i])) + printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); + assert(near(y_optimized[i], y_standard[i])); + } + printf("All assertions PASSED for THFloatVector_scale_VSX() test.\n\n"); + + + free(y_standard); + free(y_optimized); +} + +void test_THDoubleVector_mul_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + double *y_standard = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *y_optimized = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + double *x = (double *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(double)); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = randDouble(); + double yVal = randDouble(); + y_standard[i] = yVal; + y_optimized[i] = yVal; + } + + + //------------------------------------------------- + // Performance Test + //------------------------------------------------- + start = clock(); + standardDouble_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS ); + standardDouble_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardDouble_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardDouble_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardDouble_mul() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THDoubleVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS ); + THDoubleVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-1); + THDoubleVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-2); + THDoubleVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + 
printf("THDoubleVector_mul_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardDouble_mul( y_standard+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THDoubleVector_mul_VSX(y_optimized+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardDouble_mul( y_standard+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THDoubleVector_mul_VSX(y_optimized+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardDouble_mul( y_standard+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THDoubleVector_mul_VSX(y_optimized+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardDouble_mul( y_standard+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THDoubleVector_mul_VSX(y_optimized+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardDouble_mul( y_standard+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THDoubleVector_mul_VSX(y_optimized+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(y_optimized[i], y_standard[i])) + printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); + assert(near(y_optimized[i], y_standard[i])); + } + printf("All assertions PASSED for THDoubleVector_mul_VSX() test.\n\n"); + + + free(y_standard); + free(y_optimized); + free(x); +} + + +void test_THFloatVector_mul_VSX() +{ + clock_t start, end; + double elapsedSeconds_optimized, elapsedSeconds_standard; + + float *y_standard = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *y_optimized = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + float *x = (float *)malloc(VSX_PERF_NUM_TEST_ELEMENTS*sizeof(float)); + + // Initialize randomly + for(int i = 0; i < VSX_PERF_NUM_TEST_ELEMENTS; i++) + { + x[i] = (float)randDouble(); + float yVal = (float)randDouble(); + y_standard[i] = yVal; + y_optimized[i] = yVal; + } + + + //------------------------------------------------- + // Performance Test + 
//------------------------------------------------- + start = clock(); + standardFloat_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS ); + standardFloat_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-1); + standardFloat_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-2); + standardFloat_mul(y_standard, x, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_standard = (double)(end - start) / CLOCKS_PER_SEC; + printf("standardFloat_mul() test took %.5lf seconds\n", elapsedSeconds_standard); + + start = clock(); + THFloatVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS ); + THFloatVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-1); + THFloatVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-2); + THFloatVector_mul_VSX(y_optimized, x, VSX_PERF_NUM_TEST_ELEMENTS-3); + end = clock(); + + elapsedSeconds_optimized = (double)(end - start) / CLOCKS_PER_SEC; + printf("THFloatVector_mul_VSX() test took %.5lf seconds\n", elapsedSeconds_optimized); + + + //------------------------------------------------- + // Correctness Test + //------------------------------------------------- + standardFloat_mul( y_standard+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2); + THFloatVector_mul_VSX(y_optimized+1, x, VSX_FUNC_NUM_TEST_ELEMENTS-2); + standardFloat_mul( y_standard+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4); + THFloatVector_mul_VSX(y_optimized+2, x, VSX_FUNC_NUM_TEST_ELEMENTS-4); + standardFloat_mul( y_standard+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6); + THFloatVector_mul_VSX(y_optimized+3, x, VSX_FUNC_NUM_TEST_ELEMENTS-6); + standardFloat_mul( y_standard+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + THFloatVector_mul_VSX(y_optimized+517, x, VSX_FUNC_NUM_TEST_ELEMENTS-1029); + int r = rand() % 258; + standardFloat_mul( y_standard+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + THFloatVector_mul_VSX(y_optimized+517+r, x, VSX_FUNC_NUM_TEST_ELEMENTS-(1029+r+100)); + for(int i = 0; i < VSX_FUNC_NUM_TEST_ELEMENTS; i++) + { + if(!near(y_optimized[i], y_standard[i])) + 
printf("%d %f %f\n", i, y_optimized[i], y_standard[i]); + assert(near(y_optimized[i], y_standard[i])); + } + printf("All assertions PASSED for THFloatVector_mul_VSX() test.\n\n"); + + + free(y_standard); + free(y_optimized); + free(x); +} + + + +int main() +{ + printf("\n"); + + + // First test utility functions + + assert(!near(0.1, -0.1)); + assert(!near(0.1f, -0.1f)); + assert(!near(9, 10)); + assert(near(0.1, 0.1000001)); + assert(near(0.1f, 0.1000001f)); + assert(near(100.764, 100.764)); + assert(!near(NAN, 0.0)); + assert(!near(-9.5, NAN)); + assert(!near(NAN, 100)); + assert(!near(-0.0, NAN)); + assert(near(NAN, NAN)); + assert(near(INFINITY, INFINITY)); + assert(near(-INFINITY, -INFINITY)); + assert(!near(INFINITY, NAN)); + assert(!near(0, INFINITY)); + assert(!near(-999.4324, INFINITY)); + assert(!near(INFINITY, 982374.1)); + assert(!near(-INFINITY, INFINITY)); + + + + // Then test each vectorized function + + test_THDoubleVector_fill_VSX(); + test_THFloatVector_fill_VSX(); + + test_THDoubleVector_add_VSX(); + test_THFloatVector_add_VSX(); + + test_THDoubleVector_diff_VSX(); + test_THFloatVector_diff_VSX(); + + test_THDoubleVector_scale_VSX(); + test_THFloatVector_scale_VSX(); + + test_THDoubleVector_mul_VSX(); + test_THFloatVector_mul_VSX(); + + + printf("Finished runnning all tests. 
All tests PASSED.\n"); + return 0; +} + + +#endif // defined RUN_VSX_TESTS + +#endif // defined __PPC64__ + diff --git a/test/test.lua b/test/test.lua index eb7cf0a..e7e26e4 100644 --- a/test/test.lua +++ b/test/test.lua @@ -19,6 +19,35 @@ local function maxdiff(x,y) end end +-- workarounds for non-existant functions +function torch.HalfTensor:__sub(other) + return (self:real() - other:real()):half() +end + +function torch.HalfTensor:mean(dim) + return self:real():mean(dim):half() +end + +function torch.HalfTensor:abs() + return self:real():abs():half() +end + +function torch.HalfTensor:max() + return self:real():max() +end + +function torch.HalfTensor:add(a, b) + return (self:real():add(a, b:real())):half() +end + +function torch.HalfTensor:reshape(a, b) + return (self:real():reshape(a, b)):half() +end + +function torch.HalfTensor:fill(a) + return self:real():fill(a):half() +end + function torchtest.dot() local types = { ['torch.DoubleTensor'] = 1e-8, -- for ddot @@ -1355,6 +1384,18 @@ function torchtest.histc() local z = torch.Tensor{ 0, 3, 0, 2, 1 } mytester:assertTensorEq(y,z,precision,'error in torch.histc') end +function torchtest.bhistc() + local x = torch.Tensor(3, 6) + x[1] = torch.Tensor{ 2, 4, 2, 2, 5, 4 } + x[2] = torch.Tensor{ 3, 5, 1, 5, 3, 5 } + x[3] = torch.Tensor{ 3, 4, 2, 5, 5, 1 } + local y = torch.bhistc(x, 5, 1, 5) -- nbins, min, max + local z = torch.Tensor(3, 5) + z[1] = torch.Tensor{ 0, 3, 0, 2, 1 } + z[2] = torch.Tensor{ 1, 0, 2, 0, 3 } + z[3] = torch.Tensor{ 1, 1, 1, 1, 2 } + mytester:assertTensorEq(y,z,precision,'error in torch.bhistc in last dimension') +end function torchtest.ones() local mx = torch.ones(msize,msize) local mxx = torch.Tensor() @@ -3103,7 +3144,13 @@ function torchtest.isTypeOfPattern() end function torchtest.isTensor() - local t = torch.randn(3,4) + for k,v in ipairs({"real", "half"}) do + torchtest_isTensor(torch.getmetatable(torch.Tensor():type())[v]) + end +end + +function torchtest_isTensor(func) + local t = 
func(torch.randn(3,4)) mytester:assert(torch.isTensor(t), 'error in isTensor') mytester:assert(torch.isTensor(t[1]), 'error in isTensor for subTensor') mytester:assert(not torch.isTensor(t[1][2]), 'false positive in isTensor') @@ -3111,14 +3158,26 @@ function torchtest.isTensor() end function torchtest.isStorage() + for k,v in ipairs({"real", "half"}) do + torchtest_isStorage(torch.getmetatable(torch.Tensor():type())[v]) + end +end + +function torchtest_isStorage(func) local t = torch.randn(3,4) mytester:assert(torch.isStorage(t:storage()), 'error in isStorage') mytester:assert(not torch.isStorage(t), 'false positive in isStorage') end function torchtest.view() - local tensor = torch.rand(15) - local template = torch.rand(3,5) + for k,v in ipairs({"real", "half"}) do + torchtest_view(torch.getmetatable(torch.Tensor():type())[v]) + end +end + +function torchtest_view(func) + local tensor = func(torch.rand(15)) + local template = func(torch.rand(3,5)) local target = template:size():totable() mytester:assertTableEq(tensor:viewAs(template):size():totable(), target, 'Error in viewAs') mytester:assertTableEq(tensor:view(3,5):size():totable(), target, 'Error in view') @@ -3129,7 +3188,7 @@ function torchtest.view() tensor_view:fill(torch.rand(1)[1]) mytester:asserteq((tensor_view-tensor):abs():max(), 0, 'Error in view') - local target_tensor = torch.Tensor() + local target_tensor = func(torch.Tensor()) mytester:assertTableEq(target_tensor:viewAs(tensor, template):size():totable(), target, 'Error in viewAs') mytester:assertTableEq(target_tensor:view(tensor, 3,5):size():totable(), target, 'Error in view') mytester:assertTableEq(target_tensor:view(tensor, torch.LongStorage{3,5}):size():totable(), target, 'Error in view using LongStorage') @@ -3140,9 +3199,15 @@ function torchtest.view() end function torchtest.expand() - local result = torch.Tensor() - local tensor = torch.rand(8,1) - local template = torch.rand(8,5) + for k,v in ipairs({"real", "half"}) do + 
torchtest_expand(torch.getmetatable(torch.Tensor():type())[v]) + end +end + +function torchtest_expand(func) + local result = func(torch.Tensor()) + local tensor = func(torch.rand(8,1)) + local template = func(torch.rand(8,5)) local target = template:size():totable() mytester:assertTableEq(tensor:expandAs(template):size():totable(), target, 'Error in expandAs') mytester:assertTableEq(tensor:expand(8,5):size():totable(), target, 'Error in expand') @@ -3157,8 +3222,14 @@ function torchtest.expand() end function torchtest.repeatTensor() - local result = torch.Tensor() - local tensor = torch.rand(8,4) + for k,v in ipairs({"real", "half"}) do + torchtest_repeatTensor(torch.getmetatable(torch.Tensor():type())[v]) + end +end + +function torchtest_repeatTensor(func, mean) + local result = func(torch.Tensor()) + local tensor = func(torch.rand(8,4)) local size = {3,1,1} local sizeStorage = torch.LongStorage(size) local target = {3,8,4} @@ -3172,10 +3243,16 @@ function torchtest.repeatTensor() end function torchtest.isSameSizeAs() - local t1 = torch.Tensor(3, 4, 9, 10) - local t2 = torch.Tensor(3, 4) - local t3 = torch.Tensor(1, 9, 3, 3) - local t4 = torch.Tensor(3, 4, 9, 10) + for k,v in ipairs({"real", "half"}) do + torchtest_isSameSizeAs(torch.getmetatable(torch.Tensor():type())[v]) + end +end + +function torchtest_isSameSizeAs(func) + local t1 = func(torch.Tensor(3, 4, 9, 10)) + local t2 = func(torch.Tensor(3, 4)) + local t3 = func(torch.Tensor(1, 9, 3, 3)) + local t4 = func(torch.Tensor(3, 4, 9, 10)) mytester:assert(t1:isSameSizeAs(t2) == false, "wrong answer ") mytester:assert(t1:isSameSizeAs(t3) == false, "wrong answer ") @@ -3183,15 +3260,21 @@ function torchtest.isSameSizeAs() end function torchtest.isSetTo() - local t1 = torch.Tensor(3, 4, 9, 10) - local t2 = torch.Tensor(3, 4, 9, 10) - local t3 = torch.Tensor():set(t1) + for k,v in ipairs({"real", "half"}) do + torchtest_isSetTo(torch.getmetatable(torch.Tensor():type())[v]) + end +end + +function 
torchtest_isSetTo(func) + local t1 = func(torch.Tensor(3, 4, 9, 10)) + local t2 = func(torch.Tensor(3, 4, 9, 10)) + local t3 = func(torch.Tensor()):set(t1) local t4 = t3:reshape(12, 90) mytester:assert(t1:isSetTo(t2) == false, "tensors do not share storage") mytester:assert(t1:isSetTo(t3) == true, "tensor is set to other") mytester:assert(t3:isSetTo(t1) == true, "isSetTo should be symmetric") mytester:assert(t1:isSetTo(t4) == false, "tensors have different view") - mytester:assert(not torch.Tensor():isSetTo(torch.Tensor()), + mytester:assert(not func(torch.Tensor()):isSetTo(func(torch.Tensor())), "Tensors with no storages should not appear to be set " .. "to each other") end @@ -3229,7 +3312,13 @@ function torchtest.equal() end function torchtest.isSize() - local t1 = torch.Tensor(3, 4, 5) + for k,v in ipairs({"real", "half"}) do + torchtest_isSize(torch.getmetatable(torch.Tensor():type())[v]) + end +end + +function torchtest_isSize(func) + local t1 = func(torch.Tensor(3, 4, 5)) local s1 = torch.LongStorage({3, 4, 5}) local s2 = torch.LongStorage({5, 4, 3}) @@ -3246,6 +3335,7 @@ function torchtest.elementSize() local long = torch.LongStorage():elementSize() local float = torch.FloatStorage():elementSize() local double = torch.DoubleStorage():elementSize() + local half = torch.HalfStorage():elementSize() mytester:asserteq(byte, torch.ByteTensor():elementSize()) mytester:asserteq(char, torch.CharTensor():elementSize()) @@ -3254,6 +3344,7 @@ function torchtest.elementSize() mytester:asserteq(long, torch.LongTensor():elementSize()) mytester:asserteq(float, torch.FloatTensor():elementSize()) mytester:asserteq(double, torch.DoubleTensor():elementSize()) + mytester:asserteq(half, torch.HalfTensor():elementSize()) mytester:assertne(byte, 0) mytester:assertne(char, 0) @@ -3262,6 +3353,7 @@ function torchtest.elementSize() mytester:assertne(long, 0) mytester:assertne(float, 0) mytester:assertne(double, 0) + mytester:assertne(half, 0) -- These tests are portable, not 
necessarily strict for your system. mytester:asserteq(byte, 1) @@ -3272,11 +3364,18 @@ function torchtest.elementSize() mytester:assert(long >= 4) mytester:assert(long >= int) mytester:assert(double >= float) + mytester:assert(half <= float) end function torchtest.split() + for k,v in ipairs({"real", "half"}) do + torchtest_split(torch.getmetatable(torch.Tensor():type())[v]) + end +end + +function torchtest_split(func) local result = {} - local tensor = torch.rand(7,4) + local tensor = func(torch.rand(7,4)) local splitSize = 3 local targetSize = {{3,4},{3,4},{1,4}} local dim = 1 @@ -3301,8 +3400,14 @@ function torchtest.split() end function torchtest.chunk() + for k,v in ipairs({"real", "half"}) do + torchtest_chunk(torch.getmetatable(torch.Tensor():type())[v]) + end +end + +function torchtest_chunk(func) local result = {} - local tensor = torch.rand(4,7) + local tensor = func(torch.rand(4,7)) local nChunk = 3 local targetSize = {{4,3},{4,3},{4,1}} local dim = 2 @@ -3322,24 +3427,34 @@ function torchtest.chunk() end end -function torchtest.totable() +function torchtest.table() + local convStorage = { + ['real'] = 'FloatStorage', + ['half'] = 'HalfStorage' + } + for k,v in ipairs(convStorage) do + torchtest_totable(torch.getmetatable(torch.Tensor():type())[k], v) + end +end + +function torchtest_totable(func, storageType) local table0D = {} - local tensor0D = torch.Tensor(table0D) + local tensor0D = func(torch.Tensor(table0D)) mytester:assertTableEq(torch.totable(tensor0D), table0D, 'tensor0D:totable incorrect') local table1D = {1, 2, 3} - local tensor1D = torch.Tensor(table1D) - local storage = torch.Storage(table1D) + local tensor1D = func(torch.Tensor(table1D)) + local storage = torch[storageType](table1D) mytester:assertTableEq(tensor1D:totable(), table1D, 'tensor1D:totable incorrect') mytester:assertTableEq(storage:totable(), table1D, 'storage:totable incorrect') mytester:assertTableEq(torch.totable(tensor1D), table1D, 'torch.totable incorrect for Tensors') 
mytester:assertTableEq(torch.totable(storage), table1D, 'torch.totable incorrect for Storages') local table2D = {{1, 2}, {3, 4}} - local tensor2D = torch.Tensor(table2D) + local tensor2D = func(torch.Tensor(table2D)) mytester:assertTableEq(tensor2D:totable(), table2D, 'tensor2D:totable incorrect') - local tensor3D = torch.Tensor({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}) + local tensor3D = func(torch.Tensor({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}})) local tensorNonContig = tensor3D:select(2, 2) mytester:assert(not tensorNonContig:isContiguous(), 'invalid test') mytester:assertTableEq(tensorNonContig:totable(), {{3, 4}, {7, 8}}, @@ -3347,6 +3462,12 @@ function torchtest.totable() end function torchtest.permute() + for k,v in ipairs({"real", "half"}) do + torchtest_permute(torch.getmetatable(torch.Tensor():type())[v]) + end +end + +function torchtest_permute(func) local orig = {1,2,3,4,5,6,7} local perm = torch.randperm(7):totable() local x = torch.Tensor(unpack(orig)):fill(0) diff --git a/test/test_half.lua b/test/test_half.lua new file mode 100644 index 0000000..bf3830b --- /dev/null +++ b/test/test_half.lua @@ -0,0 +1,55 @@ +local mytester +local torchtest = torch.TestSuite() + +-- Lua 5.2 compatibility +local loadstring = loadstring or load +local unpack = unpack or table.unpack + +function torchtest.easy() + local x=torch.randn(5, 6):half() + mytester:assert(x:isContiguous(), 'x should be contiguous') + mytester:assert(x:dim() == 2, 'x should have dim of 2') + mytester:assert(x:nDimension() == 2, 'x should have nDimension of 2') + mytester:assert(x:nElement() == 5 * 6, 'x should have 30 elements') + local stride = x:stride() + local expectedStride = torch.LongStorage{6,1} + for i=1,stride:size() do + mytester:assert(stride[i] == expectedStride[i], "stride is wrong") + end + + x=x:t() + mytester:assert(not x:isContiguous(), 'x transpose should not be contiguous') + x=x:transpose(1,2) + mytester:assert(x:isContiguous(), 'x should be contiguous after 2 transposes') + + 
local y=torch.HalfTensor() + y:resizeAs(x:t()):copy(x:t()) + mytester:assert(x:isContiguous(), 'after resize and copy, x should be contiguous') + mytester:assertTensorEq(y, x:t(), 0.001, 'copy broken after resizeAs') + local z=torch.HalfTensor() + z:resize(6, 5):copy(x:t()) + mytester:assertTensorEq(y, x:t(), 0.001, 'copy broken after resize') +end + +function torchtest.narrowSub() + local x = torch.randn(5, 6):half() + local narrow = x:narrow(1, 2, 3) + local sub = x:sub(2, 4) + mytester:assertTensorEq(narrow, sub, 0.001, 'narrow not equal to sub') +end + +function torchtest.selectClone() + local x = torch.zeros(5, 6) + x:select(1,2):fill(2) + x=x:half() + local y=x:clone() + mytester:assertTensorEq(x, y, 0.001, 'not equal after select and clone') + x:select(1,1):fill(3) + mytester:assert(y[1][1] == 0, 'clone broken') +end + +torch.setheaptracking(true) +math.randomseed(os.time()) +mytester = torch.Tester() +mytester:add(torchtest) +mytester:run(tests) diff --git a/test/test_writeObject.lua b/test/test_writeObject.lua index 1013a96..52bcb71 100644 --- a/test/test_writeObject.lua +++ b/test/test_writeObject.lua @@ -4,6 +4,9 @@ local myTester = torch.Tester() local tests = torch.TestSuite() +function torch.HalfTensor:norm() + return self:real():norm() +end -- checks that an object can be written and unwritten -- returns false if an error occurs @@ -66,7 +69,13 @@ function tests.test_a_recursive_closure() end function tests.test_a_tensor() - local x = torch.rand(5, 10) + for k,v in ipairs({"real", "half"}) do + tests_test_a_tensor(torch.getmetatable(torch.Tensor():type())[v]) + end +end + +function tests_test_a_tensor(func) + local x = func(torch.rand(5, 10)) local xcopy = serializeAndDeserialize(x) myTester:assert(x:norm() == xcopy:norm(), 'tensors should be the same') end diff --git a/torchcwrap.lua b/torchcwrap.lua index ab0df43..551bd05 100644 --- a/torchcwrap.lua +++ b/torchcwrap.lua @@ -202,7 +202,7 @@ types.IndexTensor = { } for _,typename in 
ipairs({"ByteTensor", "CharTensor", "ShortTensor", "IntTensor", "LongTensor", - "FloatTensor", "DoubleTensor"}) do + "FloatTensor", "HalfTensor", "DoubleTensor"}) do types[typename] = { @@ -460,3 +460,56 @@ types.charoption = { postcall = function(arg) end } + +for _,typename in ipairs({"ptrdiff_t", "size_t"}) do + types[typename] = { + + helpname = function(arg) + return typename + end, + + declare = function(arg) + -- if it is a number we initialize here + local default = tonumber(tostring(arg.default)) or 0 + return string.format("%s arg%d = %g;", typename, arg.i, default) + end, + + check = function(arg, idx) + return string.format("lua_isnumber(L, %d)", idx) + end, + + read = function(arg, idx) + return string.format("arg%d = (%s)lua_tonumber(L, %d);", arg.i, typename, idx) + end, + + init = function(arg) + -- otherwise do it here + if arg.default then + local default = tostring(arg.default) + if not tonumber(default) then + return string.format("arg%d = %s;", arg.i, default) + end + end + end, + + carg = function(arg) + return string.format('arg%d', arg.i) + end, + + creturn = function(arg) + return string.format('arg%d', arg.i) + end, + + precall = function(arg) + if arg.returned then + return string.format('lua_pushnumber(L, (lua_Number)arg%d);', arg.i) + end + end, + + postcall = function(arg) + if arg.creturned then + return string.format('lua_pushnumber(L, (lua_Number)arg%d);', arg.i) + end + end + } +end |